Merge release-4-6 into master
author    Roland Schulz <roland@utk.edu>
          Wed, 13 Feb 2013 21:01:16 +0000 (16:01 -0500)
committer Roland Schulz <roland@utk.edu>
          Wed, 13 Feb 2013 21:01:16 +0000 (16:01 -0500)
Conflicts (all trivial):
        CMakeLists.txt
        admin/mkhtml
        share/template/CMakeLists.txt
        src/gromacs/gmxlib/nonbonded/nb_generic_adress.c
        src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre

Manual changes:
        CMakeLists.txt: Reverted change to CPACK_PACKAGE_VERSION_PATCH

Change-Id: Ib38fbbdd5d3d7e531d4e1db0229d6305b84bbd6b

20 files changed:
CMakeLists.txt
admin/mkhtml
share/html/online.html
share/html/online/getting_started.html
src/gromacs/gmxlib/nonbonded/nb_generic_adress.c
src/gromacs/gmxlib/nonbonded/nb_generic_cg.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre
src/gromacs/gmxlib/nrnb.c
src/gromacs/gmxpreprocess/readir.c
src/gromacs/legacyheaders/types/nrnb.h
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/gromacs/mdlib/nbnxn_internal.h
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/ns.c
src/gromacs/mdlib/pull_rotation.c

diff --cc CMakeLists.txt
Simple merge
diff --cc admin/mkhtml
index 99dd21278214496b7c82ee080c9e1a4196fef8ad,432739a816a0ec557cad549a6cd2263c7f47172a..eb6d41c10b5a1358bced3d40ee879bf5d64e9ec8
@@@ -85,7 -85,7 +85,7 @@@ cat >> $HTMLIDX <<EO
  <br>
  EOD
  foreach program ( $PROGRAMS )
-   if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) ) then
 -  if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) && ( $program != "luck" ) && ( $program != "demux.pl" ) && ( $program != "xplor2gmx.pl" ) ) then  
++  if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) && ( $program != "demux.pl" ) && ( $program != "xplor2gmx.pl" ) ) then  
      echo "<br><a href="$MANDIR/$program.html">$program</a>" >> $HTMLIDX
    endif
  end
diff --cc share/html/online.html
index 2da5590a0862102e9432f4ce7c7767ef1ef71e73,8993da80abeff90123f5c524213e0b92cffd95fe..cb41520370bc7e5e89b82b20d831c0b3ff2ee103
@@@ -78,7 -85,10 +85,9 @@@ Sat 19 Jan 2013</B></td
  <br><a href=online/g_hbond.html>g_hbond</a>
  <br><a href=online/g_helix.html>g_helix</a>
  <br><a href=online/g_helixorient.html>g_helixorient</a>
+ <br><a href=online/g_hydorder.html>g_hydorder</a>
+ <br><a href=online/g_kinetics.html>g_kinetics</a>
  <br><a href=online/g_lie.html>g_lie</a>
 -<br><a href=online/g_luck.html>g_luck</a>
  <br><a href=online/g_mdmat.html>g_mdmat</a>
  <br><a href=online/g_membed.html>g_membed</a>
  <br><a href=online/g_mindist.html>g_mindist</a>
diff --cc src/gromacs/gmxlib/nonbonded/nb_generic_adress.c
index 7a96c75a317640b9b0788665cfa1e9a7fbbcf87a,0000000000000000000000000000000000000000..81bb26ed695a5621e4462121c5180f5dc6466155
mode 100644,000000..100644
--- /dev/null
@@@ -1,518 -1,0 +1,516 @@@
-  * Copyright (c) 2012, The GROMACS development team,
-  * check out http://www.gromacs.org for more information.
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2009 Christoph Junghans, Brad Lambeth.
 + * Copyright (c) 2011 Christoph Junghans, Sebastian Fritsch
-     /* Estimate flops, average for generic kernel:
-      * 12 flops per outer iteration
-      * 50 flops per inner iteration
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "typedefs.h"
 +#include "nb_generic_adress.h"
 +#include "nrnb.h"
 +
 +#include "nonbonded.h"
 +#include "nb_kernel.h"
 +
 +#define ALMOST_ZERO 1e-30
 +#define ALMOST_ONE 1-(1e-30)
 +void
 +gmx_nb_generic_adress_kernel(t_nblist *                nlist,
 +                             rvec *                    xx,
 +                             rvec *                    ff,
 +                             t_forcerec *              fr,
 +                             t_mdatoms *               mdatoms,
 +                             nb_kernel_data_t *        kernel_data,
 +                             t_nrnb *                  nrnb)
 +{
 +    int           nri, ntype, table_nelements, ielec, ivdw;
 +    real          facel, gbtabscale;
 +    int           n, ii, is3, ii3, k, nj0, nj1, jnr, j3, ggid, nnn, n0;
 +    real          shX, shY, shZ;
 +    real          fscal, felec, fvdw, velec, vvdw, tx, ty, tz;
 +    real          rinvsq;
 +    real          iq;
 +    real          qq, vctot;
 +    int           nti, nvdwparam;
 +    int           tj;
 +    real          rt, r, eps, eps2, Y, F, Geps, Heps2, VV, FF, Fp, fijD, fijR;
 +    real          rinvsix;
 +    real          vvdwtot;
 +    real          vvdw_rep, vvdw_disp;
 +    real          ix, iy, iz, fix, fiy, fiz;
 +    real          jx, jy, jz;
 +    real          dx, dy, dz, rsq, rinv;
 +    real          c6, c12, cexp1, cexp2, br;
 +    real *        charge;
 +    real *        shiftvec;
 +    real *        vdwparam;
 +    int *         shift;
 +    int *         type;
 +    real *        fshift;
 +    real *        velecgrp;
 +    real *        vvdwgrp;
 +    real          tabscale;
 +    real *        VFtab;
 +    real *        x;
 +    real *        f;
 +    int           ewitab;
 +    real          ewtabscale, eweps, sh_ewald, ewrt, ewtabhalfspace;
 +    real *        ewtab;
 +    real          rcoulomb2, rvdw, rvdw2, sh_invrc6;
 +    real          rcutoff, rcutoff2;
 +    real          rswitch_elec, rswitch_vdw, d, d2, sw, dsw, rinvcorr;
 +    real          elec_swV3, elec_swV4, elec_swV5, elec_swF2, elec_swF3, elec_swF4;
 +    real          vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
 +    gmx_bool      bExactElecCutoff, bExactVdwCutoff, bExactCutoff;
 +
 +    real    *     wf;
 +    real          weight_cg1;
 +    real          weight_cg2;
 +    real          weight_product;
 +    real          hybscal; /* the multiplier applied to the force for hybrid interactions */
 +    real          force_cap;
 +    gmx_bool      bCG;
 +    int           egp_nr;
 +
 +    wf                  = mdatoms->wf;
 +
 +    force_cap           = fr->adress_ex_forcecap;
 +
 +    x                   = xx[0];
 +    f                   = ff[0];
 +    ielec               = nlist->ielec;
 +    ivdw                = nlist->ivdw;
 +
 +    fshift              = fr->fshift[0];
 +    velecgrp            = kernel_data->energygrp_elec;
 +    vvdwgrp             = kernel_data->energygrp_vdw;
 +    tabscale            = kernel_data->table_elec_vdw->scale;
 +    VFtab               = kernel_data->table_elec_vdw->data;
 +
 +    sh_ewald            = fr->ic->sh_ewald;
 +    ewtab               = fr->ic->tabq_coul_FDV0;
 +    ewtabscale          = fr->ic->tabq_scale;
 +    ewtabhalfspace      = 0.5/ewtabscale;
 +
 +    rcoulomb2           = fr->rcoulomb*fr->rcoulomb;
 +    rvdw                = fr->rvdw;
 +    rvdw2               = rvdw*rvdw;
 +    sh_invrc6           = fr->ic->sh_invrc6;
 +
 +    if (fr->coulomb_modifier == eintmodPOTSWITCH)
 +    {
 +        d               = fr->rcoulomb-fr->rcoulomb_switch;
 +        elec_swV3       = -10.0/(d*d*d);
 +        elec_swV4       =  15.0/(d*d*d*d);
 +        elec_swV5       =  -6.0/(d*d*d*d*d);
 +        elec_swF2       = -30.0/(d*d*d);
 +        elec_swF3       =  60.0/(d*d*d*d);
 +        elec_swF4       = -30.0/(d*d*d*d*d);
 +    }
 +    else
 +    {
 +        /* Avoid warnings from stupid compilers (looking at you, Clang!) */
 +        elec_swV3 = elec_swV4 = elec_swV5 = elec_swF2 = elec_swF3 = elec_swF4 = 0.0;
 +    }
 +    if (fr->vdw_modifier == eintmodPOTSWITCH)
 +    {
 +        d               = fr->rvdw-fr->rvdw_switch;
 +        vdw_swV3        = -10.0/(d*d*d);
 +        vdw_swV4        =  15.0/(d*d*d*d);
 +        vdw_swV5        =  -6.0/(d*d*d*d*d);
 +        vdw_swF2        = -30.0/(d*d*d);
 +        vdw_swF3        =  60.0/(d*d*d*d);
 +        vdw_swF4        = -30.0/(d*d*d*d*d);
 +    }
 +    else
 +    {
 +        /* Avoid warnings from stupid compilers (looking at you, Clang!) */
 +        vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = 0.0;
 +    }
 +
 +    bExactElecCutoff    = (fr->coulomb_modifier != eintmodNONE) || fr->eeltype == eelRF_ZERO;
 +    bExactVdwCutoff     = (fr->vdw_modifier != eintmodNONE);
 +    bExactCutoff        = bExactElecCutoff || bExactVdwCutoff;
 +
 +    if (bExactCutoff)
 +    {
 +        rcutoff  = ( fr->rcoulomb > fr->rvdw ) ? fr->rcoulomb : fr->rvdw;
 +        rcutoff2 = rcutoff*rcutoff;
 +    }
 +    else
 +    {
 +        /* Fix warnings for stupid compilers */
 +        rcutoff = rcutoff2 = 1e30;
 +    }
 +
 +    /* avoid compiler warnings for cases that cannot happen */
 +    nnn                 = 0;
 +    eps                 = 0.0;
 +    eps2                = 0.0;
 +
 +    /* 3 VdW parameters for buckingham, otherwise 2 */
 +    nvdwparam           = (ivdw == GMX_NBKERNEL_VDW_BUCKINGHAM) ? 3 : 2;
 +    table_nelements     = 12;
 +
 +    charge              = mdatoms->chargeA;
 +    type                = mdatoms->typeA;
 +    facel               = fr->epsfac;
 +    shiftvec            = fr->shift_vec[0];
 +    vdwparam            = fr->nbfp;
 +    ntype               = fr->ntype;
 +
 +    for (n = 0; (n < nlist->nri); n++)
 +    {
 +        is3              = 3*nlist->shift[n];
 +        shX              = shiftvec[is3];
 +        shY              = shiftvec[is3+1];
 +        shZ              = shiftvec[is3+2];
 +        nj0              = nlist->jindex[n];
 +        nj1              = nlist->jindex[n+1];
 +        ii               = nlist->iinr[n];
 +        ii3              = 3*ii;
 +        ix               = shX + x[ii3+0];
 +        iy               = shY + x[ii3+1];
 +        iz               = shZ + x[ii3+2];
 +        iq               = facel*charge[ii];
 +        nti              = nvdwparam*ntype*type[ii];
 +        vctot            = 0;
 +        vvdwtot          = 0;
 +        fix              = 0;
 +        fiy              = 0;
 +        fiz              = 0;
 +
 +        /* We need to find out if this i atom is part of an
 +           all-atom or CG energy group  */
 +        egp_nr = mdatoms->cENER[ii];
 +        bCG    = !fr->adress_group_explicit[egp_nr];
 +
 +        weight_cg1       = wf[ii];
 +
 +        if ((!bCG) && weight_cg1 < ALMOST_ZERO)
 +        {
 +            continue;
 +        }
 +
 +        for (k = nj0; (k < nj1); k++)
 +        {
 +            jnr              = nlist->jjnr[k];
 +            weight_cg2       = wf[jnr];
 +            weight_product   = weight_cg1*weight_cg2;
 +
 +            if (weight_product < ALMOST_ZERO)
 +            {
 +                /* if it's an explicit loop, skip this atom */
 +                if (!bCG)
 +                {
 +                    continue;
 +                }
 +                else /* if it's a coarse grained loop, include this atom */
 +                {
 +                    hybscal = 1.0;
 +                }
 +            }
 +            else if (weight_product >= ALMOST_ONE)
 +            {
 +
 +                /* if it's an explicit loop, include this atom */
 +                if (!bCG)
 +                {
 +                    hybscal = 1.0;
 +                }
 +                else  /* if it's a coarse grained loop, skip this atom */
 +                {
 +                    continue;
 +                }
 +            }
 +            /* both have double identity, get hybrid scaling factor */
 +            else
 +            {
 +                hybscal = weight_product;
 +
 +                if (bCG)
 +                {
 +                    hybscal = 1.0 - hybscal;
 +                }
 +            }
 +
 +            j3               = 3*jnr;
 +            jx               = x[j3+0];
 +            jy               = x[j3+1];
 +            jz               = x[j3+2];
 +            dx               = ix - jx;
 +            dy               = iy - jy;
 +            dz               = iz - jz;
 +            rsq              = dx*dx+dy*dy+dz*dz;
 +            rinv             = gmx_invsqrt(rsq);
 +            rinvsq           = rinv*rinv;
 +            felec            = 0;
 +            fvdw             = 0;
 +            velec            = 0;
 +            vvdw             = 0;
 +
 +            if (bExactCutoff && rsq > rcutoff2)
 +            {
 +                continue;
 +            }
 +
 +            if (ielec == GMX_NBKERNEL_ELEC_CUBICSPLINETABLE || ivdw == GMX_NBKERNEL_VDW_CUBICSPLINETABLE)
 +            {
 +                r                = rsq*rinv;
 +                rt               = r*tabscale;
 +                n0               = rt;
 +                eps              = rt-n0;
 +                eps2             = eps*eps;
 +                nnn              = table_nelements*n0;
 +            }
 +
 +            /* Coulomb interaction. ielec==0 means no interaction */
 +            if (ielec != GMX_NBKERNEL_ELEC_NONE)
 +            {
 +                qq               = iq*charge[jnr];
 +
 +                switch (ielec)
 +                {
 +                    case GMX_NBKERNEL_ELEC_NONE:
 +                        break;
 +
 +                    case GMX_NBKERNEL_ELEC_COULOMB:
 +                        /* Vanilla cutoff coulomb */
 +                        velec            = qq*rinv;
 +                        felec            = velec*rinvsq;
 +                        break;
 +
 +                    case GMX_NBKERNEL_ELEC_REACTIONFIELD:
 +                        /* Reaction-field */
 +                        velec            = qq*(rinv+fr->k_rf*rsq-fr->c_rf);
 +                        felec            = qq*(rinv*rinvsq-2.0*fr->k_rf);
 +                        break;
 +
 +                    case GMX_NBKERNEL_ELEC_CUBICSPLINETABLE:
 +                        /* Tabulated coulomb */
 +                        Y                = VFtab[nnn];
 +                        F                = VFtab[nnn+1];
 +                        Geps             = eps*VFtab[nnn+2];
 +                        Heps2            = eps2*VFtab[nnn+3];
 +                        Fp               = F+Geps+Heps2;
 +                        VV               = Y+eps*Fp;
 +                        FF               = Fp+Geps+2.0*Heps2;
 +                        velec            = qq*VV;
 +                        felec            = -qq*FF*tabscale*rinv;
 +                        break;
 +
 +                    case GMX_NBKERNEL_ELEC_GENERALIZEDBORN:
 +                        /* GB */
 +                        gmx_fatal(FARGS, "Death & horror! GB generic interaction not implemented.\n");
 +                        break;
 +
 +                    case GMX_NBKERNEL_ELEC_EWALD:
 +                        ewrt             = rsq*rinv*ewtabscale;
 +                        ewitab           = ewrt;
 +                        eweps            = ewrt-ewitab;
 +                        ewitab           = 4*ewitab;
 +                        felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
 +                        rinvcorr         = (fr->coulomb_modifier == eintmodPOTSHIFT) ? rinv-fr->ic->sh_ewald : rinv;
 +                        velec            = qq*(rinvcorr-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
 +                        felec            = qq*rinv*(rinvsq-felec);
 +                        break;
 +
 +                    default:
 +                        gmx_fatal(FARGS, "Death & horror! No generic coulomb interaction for ielec=%d.\n", ielec);
 +                        break;
 +                }
 +                if (fr->coulomb_modifier == eintmodPOTSWITCH)
 +                {
 +                    d                = rsq*rinv-fr->rcoulomb_switch;
 +                    d                = (d > 0.0) ? d : 0.0;
 +                    d2               = d*d;
 +                    sw               = 1.0+d2*d*(elec_swV3+d*(elec_swV4+d*elec_swV5));
 +                    dsw              = d2*(elec_swF2+d*(elec_swF3+d*elec_swF4));
 +                    /* Apply switch function. Note that felec=f/r since it will be multiplied
 +                     * by the i-j displacement vector. This means felec'=f'/r=-(v*sw)'/r=
 +                     * -(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=felec*sw-v*dsw/r
 +                     */
 +                    felec            = felec*sw - rinv*velec*dsw;
 +                    /* Once we have used velec to update felec we can modify velec too */
 +                    velec           *= sw;
 +                }
 +                if (bExactElecCutoff)
 +                {
 +                    felec            = (rsq <= rcoulomb2) ? felec : 0.0;
 +                    velec            = (rsq <= rcoulomb2) ? velec : 0.0;
 +                }
 +                vctot           += velec;
 +            } /* End of coulomb interactions */
 +
 +
 +            /* VdW interaction. ivdw==0 means no interaction */
 +            if (ivdw != GMX_NBKERNEL_VDW_NONE)
 +            {
 +                tj               = nti+nvdwparam*type[jnr];
 +
 +                switch (ivdw)
 +                {
 +                    case GMX_NBKERNEL_VDW_NONE:
 +                        break;
 +
 +                    case GMX_NBKERNEL_VDW_LENNARDJONES:
 +                        /* Vanilla Lennard-Jones cutoff */
 +                        c6               = vdwparam[tj];
 +                        c12              = vdwparam[tj+1];
 +                        rinvsix          = rinvsq*rinvsq*rinvsq;
 +                        vvdw_disp        = c6*rinvsix;
 +                        vvdw_rep         = c12*rinvsix*rinvsix;
 +                        fvdw             = (vvdw_rep-vvdw_disp)*rinvsq;
 +                        if (fr->vdw_modifier == eintmodPOTSHIFT)
 +                        {
 +                            vvdw             = (vvdw_rep-c12*sh_invrc6*sh_invrc6)*(1.0/12.0)-(vvdw_disp-c6*sh_invrc6)*(1.0/6.0);
 +                        }
 +                        else
 +                        {
 +                            vvdw             = vvdw_rep/12.0-vvdw_disp/6.0;
 +                        }
 +                        break;
 +
 +                    case GMX_NBKERNEL_VDW_BUCKINGHAM:
 +                        /* Buckingham */
 +                        c6               = vdwparam[tj];
 +                        cexp1            = vdwparam[tj+1];
 +                        cexp2            = vdwparam[tj+2];
 +
 +                        rinvsix          = rinvsq*rinvsq*rinvsq;
 +                        vvdw_disp        = c6*rinvsix;
 +                        br               = cexp2*rsq*rinv;
 +                        vvdw_rep         = cexp1*exp(-br);
 +                        fvdw             = (br*vvdw_rep-vvdw_disp)*rinvsq;
 +                        if (fr->vdw_modifier == eintmodPOTSHIFT)
 +                        {
 +                            vvdw             = (vvdw_rep-cexp1*exp(-cexp2*rvdw))-(vvdw_disp-c6*sh_invrc6)/6.0;
 +                        }
 +                        else
 +                        {
 +                            vvdw             = vvdw_rep-vvdw_disp/6.0;
 +                        }
 +                        break;
 +
 +                    case GMX_NBKERNEL_VDW_CUBICSPLINETABLE:
 +                        /* Tabulated VdW */
 +                        c6               = vdwparam[tj];
 +                        c12              = vdwparam[tj+1];
 +                        Y                = VFtab[nnn+4];
 +                        F                = VFtab[nnn+5];
 +                        Geps             = eps*VFtab[nnn+6];
 +                        Heps2            = eps2*VFtab[nnn+7];
 +                        Fp               = F+Geps+Heps2;
 +                        VV               = Y+eps*Fp;
 +                        FF               = Fp+Geps+2.0*Heps2;
 +                        vvdw_disp        = c6*VV;
 +                        fijD             = c6*FF;
 +                        Y                = VFtab[nnn+8];
 +                        F                = VFtab[nnn+9];
 +                        Geps             = eps*VFtab[nnn+10];
 +                        Heps2            = eps2*VFtab[nnn+11];
 +                        Fp               = F+Geps+Heps2;
 +                        VV               = Y+eps*Fp;
 +                        FF               = Fp+Geps+2.0*Heps2;
 +                        vvdw_rep         = c12*VV;
 +                        fijR             = c12*FF;
 +                        fvdw             = -(fijD+fijR)*tabscale*rinv;
 +                        vvdw             = vvdw_disp + vvdw_rep;
 +                        break;
 +
 +                    default:
 +                        gmx_fatal(FARGS, "Death & horror! No generic VdW interaction for ivdw=%d.\n", ivdw);
 +                        break;
 +                }
 +                if (fr->vdw_modifier == eintmodPOTSWITCH)
 +                {
 +                    d                = rsq*rinv-fr->rvdw_switch;
 +                    d                = (d > 0.0) ? d : 0.0;
 +                    d2               = d*d;
 +                    sw               = 1.0+d2*d*(vdw_swV3+d*(vdw_swV4+d*vdw_swV5));
 +                    dsw              = d2*(vdw_swF2+d*(vdw_swF3+d*vdw_swF4));
 +                    /* See coulomb interaction for the force-switch formula */
 +                    fvdw             = fvdw*sw - rinv*vvdw*dsw;
 +                    vvdw            *= sw;
 +                }
 +                if (bExactVdwCutoff)
 +                {
 +                    fvdw             = (rsq <= rvdw2) ? fvdw : 0.0;
 +                    vvdw             = (rsq <= rvdw2) ? vvdw : 0.0;
 +                }
 +                vvdwtot         += vvdw;
 +            } /* end VdW interactions */
 +
 +            fscal            = felec+fvdw;
 +
 +            if (!bCG && force_cap > 0 && (fabs(fscal) > force_cap))
 +            {
 +                fscal = force_cap*fscal/fabs(fscal);
 +            }
 +
 +            fscal           *= hybscal;
 +
 +            tx               = fscal*dx;
 +            ty               = fscal*dy;
 +            tz               = fscal*dz;
 +            fix              = fix + tx;
 +            fiy              = fiy + ty;
 +            fiz              = fiz + tz;
 +            f[j3+0]          = f[j3+0] - tx;
 +            f[j3+1]          = f[j3+1] - ty;
 +            f[j3+2]          = f[j3+2] - tz;
 +        }
 +
 +        f[ii3+0]         = f[ii3+0] + fix;
 +        f[ii3+1]         = f[ii3+1] + fiy;
 +        f[ii3+2]         = f[ii3+2] + fiz;
 +        fshift[is3]      = fshift[is3]+fix;
 +        fshift[is3+1]    = fshift[is3+1]+fiy;
 +        fshift[is3+2]    = fshift[is3+2]+fiz;
 +        ggid             = nlist->gid[n];
 +        velecgrp[ggid]  += vctot;
 +        vvdwgrp[ggid]   += vvdwtot;
 +    }
-     inc_nrnb(nrnb, eNR_NBKERNEL_GENERIC, nlist->nri*12 + nlist->jindex[n]*50);
++    /* Estimate flops, average for generic adress kernel:
++     * 14 flops per outer iteration
++     * 54 flops per inner iteration
 +     */
++    inc_nrnb(nrnb, eNR_NBKERNEL_GENERIC_ADRESS, nlist->nri*14 + nlist->jindex[n]*54);
 +}
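Note on the kernel above: the AdResS hybrid weighting it implements can be summarized as follows. This is a restatement using the kernel's own symbols, where w_i and w_j are the per-atom weights read from mdatoms->wf and hybscal is the factor applied to the scalar force fscal:

\[
s \;=\;
\begin{cases}
w_i w_j, & \text{explicit (all-atom) neighbor list},\\
1 - w_i w_j, & \text{coarse-grained neighbor list},
\end{cases}
\qquad
\mathbf{F}_{ij} \;=\; s\,\bigl(f_{\mathrm{elec}} + f_{\mathrm{vdw}}\bigr)\,\mathbf{r}_{ij},
\]

where felec and fvdw are already divided by r, so multiplying by the displacement vector (dx, dy, dz) yields the Cartesian force. Pairs with w_i w_j near zero are skipped in explicit loops and pairs with w_i w_j near one are skipped in coarse-grained loops, matching the continue statements above; for explicit i atoms, |fscal| is additionally clamped to fr->adress_ex_forcecap before the scaling is applied.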
diff --cc src/gromacs/gmxlib/nonbonded/nb_generic_cg.c
index a7befbec0616522ec3e031b7ba7ba300ef933455,0000000000000000000000000000000000000000..cc24267d1162f1420adc2cc1cf58f8c6c3d950ca
mode 100644,000000..100644
--- /dev/null
@@@ -1,331 -1,0 +1,331 @@@
-     inc_nrnb(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist->nri*12 + nlist->jindex[n]*100);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "typedefs.h"
 +#include "nb_generic_cg.h"
 +#include "nonbonded.h"
 +#include "nb_kernel.h"
 +#include "nrnb.h"
 +
 +void
 +gmx_nb_generic_cg_kernel(t_nblist *                nlist,
 +                         rvec *                    xx,
 +                         rvec *                    ff,
 +                         t_forcerec *              fr,
 +                         t_mdatoms *               mdatoms,
 +                         nb_kernel_data_t *        kernel_data,
 +                         t_nrnb *                  nrnb)
 +{
 +    int           nri, ntype, table_nelements, ielec, ivdw;
 +    real          facel, gbtabscale;
 +    int           n, is3, i3, k, nj0, nj1, j3, ggid, nnn, n0;
 +    int           ai0, ai1, ai, aj0, aj1, aj;
 +    real          shX, shY, shZ;
 +    real          fscal, tx, ty, tz;
 +    real          rinvsq;
 +    real          iq;
 +    real          qq, vcoul, krsq, vctot;
 +    int           nti, nvdwparam;
 +    int           tj;
 +    real          rt, r, eps, eps2, Y, F, Geps, Heps2, VV, FF, Fp, fijD, fijR;
 +    real          rinvsix;
 +    real          Vvdwtot;
 +    real          Vvdw_rep, Vvdw_disp;
 +    real          ix, iy, iz, fix, fiy, fiz;
 +    real          jx, jy, jz;
 +    real          dx, dy, dz, rsq, rinv;
 +    real          c6, c12, cexp1, cexp2, br;
 +    real *        charge;
 +    real *        shiftvec;
 +    real *        vdwparam;
 +    int *         shift;
 +    int *         type;
 +    t_excl *      excl;
 +    real *        fshift;
 +    real *        Vc;
 +    real *        Vvdw;
 +    real          tabscale;
 +    real *        VFtab;
 +    real *        x;
 +    real *        f;
 +
 +    x                   = xx[0];
 +    f                   = ff[0];
 +    ielec               = nlist->ielec;
 +    ivdw                = nlist->ivdw;
 +
 +    fshift              = fr->fshift[0];
 +    Vc                  = kernel_data->energygrp_elec;
 +    Vvdw                = kernel_data->energygrp_vdw;
 +    tabscale            = kernel_data->table_elec_vdw->scale;
 +    VFtab               = kernel_data->table_elec_vdw->data;
 +
 +    /* avoid compiler warnings for cases that cannot happen */
 +    nnn                 = 0;
 +    vcoul               = 0.0;
 +    eps                 = 0.0;
 +    eps2                = 0.0;
 +
 +    /* 3 VdW parameters for buckingham, otherwise 2 */
 +    nvdwparam           = (nlist->ivdw == 2) ? 3 : 2;
 +    table_nelements     = (ielec == 3) ? 4 : 0;
 +    table_nelements    += (ivdw == 3) ? 8 : 0;
 +
 +    charge              = mdatoms->chargeA;
 +    type                = mdatoms->typeA;
 +    facel               = fr->epsfac;
 +    shiftvec            = fr->shift_vec[0];
 +    vdwparam            = fr->nbfp;
 +    ntype               = fr->ntype;
 +
 +    for (n = 0; (n < nlist->nri); n++)
 +    {
 +        is3              = 3*nlist->shift[n];
 +        shX              = shiftvec[is3];
 +        shY              = shiftvec[is3+1];
 +        shZ              = shiftvec[is3+2];
 +        nj0              = nlist->jindex[n];
 +        nj1              = nlist->jindex[n+1];
 +        ai0              = nlist->iinr[n];
 +        ai1              = nlist->iinr_end[n];
 +        vctot            = 0;
 +        Vvdwtot          = 0;
 +        fix              = 0;
 +        fiy              = 0;
 +        fiz              = 0;
 +
 +        for (k = nj0; (k < nj1); k++)
 +        {
 +            aj0              = nlist->jjnr[k];
 +            aj1              = nlist->jjnr_end[k];
 +            excl             = &nlist->excl[k*MAX_CGCGSIZE];
 +
 +            for (ai = ai0; (ai < ai1); ai++)
 +            {
 +                i3               = ai*3;
 +                ix               = shX + x[i3+0];
 +                iy               = shY + x[i3+1];
 +                iz               = shZ + x[i3+2];
 +                iq               = facel*charge[ai];
 +                nti              = nvdwparam*ntype*type[ai];
 +
 +                /* Note that this code currently calculates
 +                 * all LJ and Coulomb interactions,
 +                 * even if the LJ parameters or charges are zero.
 +                 * If required, this can be optimized.
 +                 */
 +
 +                for (aj = aj0; (aj < aj1); aj++)
 +                {
 +                    /* Check if this interaction is excluded */
 +                    if (excl[aj-aj0] & (1<<(ai-ai0)))
 +                    {
 +                        continue;
 +                    }
 +
 +                    j3               = aj*3;
 +                    jx               = x[j3+0];
 +                    jy               = x[j3+1];
 +                    jz               = x[j3+2];
 +                    dx               = ix - jx;
 +                    dy               = iy - jy;
 +                    dz               = iz - jz;
 +                    rsq              = dx*dx+dy*dy+dz*dz;
 +                    rinv             = gmx_invsqrt(rsq);
 +                    rinvsq           = rinv*rinv;
 +                    fscal            = 0;
 +
 +                    if (ielec == 3 || ivdw == 3)
 +                    {
 +                        r                = rsq*rinv;
 +                        rt               = r*tabscale;
 +                        n0               = rt;
 +                        eps              = rt-n0;
 +                        eps2             = eps*eps;
 +                        nnn              = table_nelements*n0;
 +                    }
 +
 +                    /* Coulomb interaction. ielec==0 means no interaction */
 +                    if (ielec > 0)
 +                    {
 +                        qq               = iq*charge[aj];
 +
 +                        switch (ielec)
 +                        {
 +                            case 1:
 +                                /* Vanilla cutoff coulomb */
 +                                vcoul            = qq*rinv;
 +                                fscal            = vcoul*rinvsq;
 +                                break;
 +
 +                            case 2:
 +                                /* Reaction-field */
 +                                krsq             = fr->k_rf*rsq;
 +                                vcoul            = qq*(rinv+krsq-fr->c_rf);
 +                                fscal            = qq*(rinv-2.0*krsq)*rinvsq;
 +                                break;
 +
 +                            case 3:
 +                                /* Tabulated coulomb */
 +                                Y                = VFtab[nnn];
 +                                F                = VFtab[nnn+1];
 +                                Geps             = eps*VFtab[nnn+2];
 +                                Heps2            = eps2*VFtab[nnn+3];
 +                                nnn             += 4;
 +                                Fp               = F+Geps+Heps2;
 +                                VV               = Y+eps*Fp;
 +                                FF               = Fp+Geps+2.0*Heps2;
 +                                vcoul            = qq*VV;
 +                                fscal            = -qq*FF*tabscale*rinv;
 +                                break;
 +
 +                            case 4:
 +                                /* GB */
 +                                gmx_fatal(FARGS, "Death & horror! GB generic interaction not implemented.\n");
 +                                break;
 +
 +                            default:
 +                                gmx_fatal(FARGS, "Death & horror! No generic coulomb interaction for ielec=%d.\n", ielec);
 +                                break;
 +                        }
 +                        vctot            = vctot+vcoul;
 +                    }  /* End of coulomb interactions */
 +
 +
 +                    /* VdW interaction. ivdw==0 means no interaction */
 +                    if (ivdw > 0)
 +                    {
 +                        tj               = nti+nvdwparam*type[aj];
 +
 +                        switch (ivdw)
 +                        {
 +                            case 1:
 +                                /* Vanilla Lennard-Jones cutoff */
 +                                c6               = vdwparam[tj];
 +                                c12              = vdwparam[tj+1];
 +
 +                                rinvsix          = rinvsq*rinvsq*rinvsq;
 +                                Vvdw_disp        = c6*rinvsix;
 +                                Vvdw_rep         = c12*rinvsix*rinvsix;
 +                                fscal           += (12.0*Vvdw_rep-6.0*Vvdw_disp)*rinvsq;
 +                                Vvdwtot          = Vvdwtot+Vvdw_rep-Vvdw_disp;
 +                                break;
 +
 +                            case 2:
 +                                /* Buckingham */
 +                                c6               = vdwparam[tj];
 +                                cexp1            = vdwparam[tj+1];
 +                                cexp2            = vdwparam[tj+2];
 +
 +                                rinvsix          = rinvsq*rinvsq*rinvsq;
 +                                Vvdw_disp        = c6*rinvsix;
 +                                br               = cexp2*rsq*rinv;
 +                                Vvdw_rep         = cexp1*exp(-br);
 +                                fscal           += (br*Vvdw_rep-6.0*Vvdw_disp)*rinvsq;
 +                                Vvdwtot          = Vvdwtot+Vvdw_rep-Vvdw_disp;
 +                                break;
 +
 +                            case 3:
 +                                /* Tabulated VdW */
 +                                c6               = vdwparam[tj];
 +                                c12              = vdwparam[tj+1];
 +
 +                                Y                = VFtab[nnn];
 +                                F                = VFtab[nnn+1];
 +                                Geps             = eps*VFtab[nnn+2];
 +                                Heps2            = eps2*VFtab[nnn+3];
 +                                Fp               = F+Geps+Heps2;
 +                                VV               = Y+eps*Fp;
 +                                FF               = Fp+Geps+2.0*Heps2;
 +                                Vvdw_disp        = c6*VV;
 +                                fijD             = c6*FF;
 +                                nnn             += 4;
 +                                Y                = VFtab[nnn];
 +                                F                = VFtab[nnn+1];
 +                                Geps             = eps*VFtab[nnn+2];
 +                                Heps2            = eps2*VFtab[nnn+3];
 +                                Fp               = F+Geps+Heps2;
 +                                VV               = Y+eps*Fp;
 +                                FF               = Fp+Geps+2.0*Heps2;
 +                                Vvdw_rep         = c12*VV;
 +                                fijR             = c12*FF;
 +                                fscal           += -(fijD+fijR)*tabscale*rinv;
 +                                Vvdwtot          = Vvdwtot + Vvdw_disp + Vvdw_rep;
 +                                break;
 +
 +                            default:
 +                                gmx_fatal(FARGS, "Death & horror! No generic VdW interaction for ivdw=%d.\n", ivdw);
 +                                break;
 +                        }
 +                    }  /* end VdW interactions */
 +
 +
 +                    tx               = fscal*dx;
 +                    ty               = fscal*dy;
 +                    tz               = fscal*dz;
 +                    f[i3+0]         += tx;
 +                    f[i3+1]         += ty;
 +                    f[i3+2]         += tz;
 +                    f[j3+0]         -= tx;
 +                    f[j3+1]         -= ty;
 +                    f[j3+2]         -= tz;
 +                    fix             += tx;
 +                    fiy             += ty;
 +                    fiz             += tz;
 +                }
 +            }
 +        }
 +
 +        fshift[is3]     += fix;
 +        fshift[is3+1]   += fiy;
 +        fshift[is3+2]   += fiz;
 +        ggid             = nlist->gid[n];
 +        Vc[ggid]        += vctot;
 +        Vvdw[ggid]      += Vvdwtot;
 +    }
 +    /* Estimate flops, average for generic cg kernel:
 +     * 12  flops per outer iteration
 +     * 100 flops per inner iteration
 +     */
++    inc_nrnb(nrnb, eNR_NBKERNEL_GENERIC_CG, nlist->nri*12 + nlist->jindex[n]*100);
 +}
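Note on the tabulated interactions: both generic kernels above evaluate cubic-spline tables with the same four-entry-per-point layout. As a worked restatement of the code, with Y, F, G, H the table entries at index n0, eps the fractional table position, and s the table scale:

\[
F_p = F + \varepsilon G + \varepsilon^2 H,\qquad
V = c\,(Y + \varepsilon F_p),\qquad
\tilde{F} = F_p + \varepsilon G + 2\varepsilon^2 H,\qquad
f = -\,c\,\tilde{F}\,\frac{s}{r},
\]

with c = q_i q_j for the Coulomb table and c = c6 or c12 for the dispersion and repulsion tables; the 1/r factor again leaves a scalar force ready for multiplication by the displacement vector.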
diff --cc src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c
index 8188a7c0d95315a13b42890eb4159d355b5a7ded,0000000000000000000000000000000000000000..7117127ba7dd20dd5dedfeeb781fedd317c32589
mode 100644,000000..100644
--- /dev/null
@@@ -1,959 -1,0 +1,959 @@@
-             dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +/*
 + * Note: this file was generated by the Gromacs avx_256_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_pd();
 +        vgbsum           = _mm256_setzero_pd();
 +        vvdwsum          = _mm256_setzero_pd();
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm256_mul_pd(c6_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm256_mul_pd(c12_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            vvdw             = _mm256_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 91 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate the table index by multiplying r by the table scale and truncating to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate the Generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
-             dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The scratch pointers make sure that compilers which take gmx_restrict seriously (e.g. icc 13) cannot mis-optimize this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm256_mul_pd(c6_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm256_mul_pd(c12_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            vvdw             = _mm256_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*92);
 +}
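
For reference, the repeated Y/F/G/H blocks above are a cubic spline table lookup:
each table bin stores four coefficients, and the kernel evaluates the value and
derivative of V(eps) = Y + F*eps + G*eps^2 + H*eps^3 in Horner form. A minimal
scalar sketch of the same arithmetic (eval_cspline_table and its arguments are
illustrative names, not part of the generated kernel API):

    static void eval_cspline_table(const double *tab, int idx, double eps,
                                   double *VV, double *FF)
    {
        /* One bin holds four packed coefficients, as loaded with
         * _mm256_load_pd(tab + index) and transposed in the kernel above.
         */
        double Y = tab[idx+0], F = tab[idx+1], G = tab[idx+2], H = tab[idx+3];
        double Heps = eps*H;
        double Fp   = F + eps*(G + Heps);   /* F + G*eps + H*eps^2 */

        *VV = Y + eps*Fp;                   /* Y + F*eps + G*eps^2 + H*eps^3 */
        *FF = Fp + eps*(G + 2.0*Heps);      /* dV/deps = F + 2*G*eps + 3*H*eps^2 */
    }

The repulsion coefficients sit four table entries past the dispersion entry of
the same bin, hence the vfitab = _mm_add_epi32(vfitab,ifour) step between the
two spline blocks; the final force flips the sign via _mm256_xor_pd(signbit,...)
and rescales by vftabscale and 1/r.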
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate the table index by multiplying r by the table scale and truncating to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate the Generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 81 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Each element will be negative for non-real (padding) atoms.
 +             * The mask is 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate the table index by multiplying r by the table scale and truncating to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate the Generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The scratch pointers make sure that compilers which take gmx_restrict seriously (e.g. icc 13) cannot mis-optimize this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*82);
 +}
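
The tail blocks above handle a neighbor list whose length is not a multiple of
the 4-wide AVX register: the padding entries in jjnr are negative, and
dummy_mask is built from their sign bits so that masked quantities can be
cleared before they are accumulated. The merge also corrects these blocks to
use _mm256_andnot_pd instead of the single-precision _mm256_andnot_ps on
__m256d data. A reduced two-lane SSE2 sketch of the same masking idea
(clear_dummy_lanes is an illustrative name, not kernel code):

    #include <emmintrin.h>

    /* Zero the lanes of val that correspond to padded (negative) j indices,
     * mirroring val = _mm256_andnot_pd(dummy_mask,val) in the kernels above.
     */
    static __m128d clear_dummy_lanes(__m128d val, const int jnr[2])
    {
        __m128i idx = _mm_set_epi32(0, 0, jnr[1], jnr[0]);
        __m128i neg = _mm_cmplt_epi32(idx, _mm_setzero_si128()); /* all-ones where jnr < 0 */
        __m128i m64 = _mm_unpacklo_epi32(neg, neg);              /* widen 32-bit lanes to 64-bit */
        return _mm_andnot_pd(_mm_castsi128_pd(m64), val);
    }

The kernels perform the 32-to-64-bit widening with _mm_permute_ps and
gmx_mm256_set_m128 before casting to __m256d, as shown in the tail blocks above.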
index 9edb654bddf411c67fef4e81b81692ab21941775,0000000000000000000000000000000000000000..78519efa590b9e251fbc643495c5da6b8dcc42b6
mode 100644,000000..100644
--- /dev/null
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c
@@@ -1,857 -1,0 +1,857 @@@
-             dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +/*
 + * Note: this file was generated by the Gromacs avx_256_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_pd();
 +        vgbsum           = _mm256_setzero_pd();
 +        vvdwsum          = _mm256_setzero_pd();
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate the Generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 70 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Each element will be negative for non-real (padding) atoms.
 +             * The mask is 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate the Generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
-             dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The scratch pointers make sure that compilers which take gmx_restrict seriously (e.g. icc 13) cannot mis-optimize this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*71);
 +}
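
The table lookup in the kernels above and below is a cubic-spline evaluation: each table point stores four coefficients (Y, F, G, H), the index is r*gbscale truncated to an integer (the <<2 shift selects the 4-entry stride), and gbeps is the fractional remainder within the bin. A minimal scalar sketch of the same arithmetic, using a hypothetical toy table (the real layout and scale come from fr->gbtab), is:

    #include <math.h>
    #include <stdio.h>

    /* Scalar analogue of the SIMD lookup: tab stores Y,F,G,H per table point. */
    static void gb_table_lookup(const double *tab, double r, double scale,
                                double *vv, double *ff)
    {
        double rt  = r * scale;          /* table coordinate                    */
        int    idx = (int)rt;            /* truncate to the table index         */
        double eps = rt - floor(rt);     /* fractional offset within the bin    */
        const double *p = tab + 4*idx;   /* matches the gbitab<<2 indexing      */
        double Y = p[0], F = p[1], G = p[2], H = p[3];
        double Heps = eps * H;
        double Fp   = F + eps*(G + Heps);
        *vv = Y + eps*Fp;                /* spline value, VV in the kernels     */
        *ff = Fp + eps*(G + 2.0*Heps);   /* d(value)/d(eps), FF in the kernels  */
    }

    int main(void)
    {
        /* hypothetical two-point table, for illustration only */
        double tab[8] = { 1.0, -0.5, 0.1, 0.01, 0.6, -0.4, 0.08, 0.008 };
        double vv, ff;
        gb_table_lookup(tab, 0.3, 2.0, &vv, &ff);   /* rt = 0.6: idx 0, eps 0.6 */
        printf("VV = %g, FF = %g\n", vv, ff);
        return 0;
    }

The kernels then form vgb = gbqqfactor*VV and fgb = gbqqfactor*FF*gbscale, where the extra gbscale is the chain-rule factor converting the derivative with respect to the table coordinate back to a derivative with respect to r.
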
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure of multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 63 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure of multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that even compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw this code up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
 +}
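
For reference, the Lennard-Jones arithmetic in the two kernels above reduces to the scalar form below. The 1/12 and 1/6 factors in the potential, with no matching 12 and 6 in the force, suggest that the c6/c12 pair loaded from fr->nbfp already carries the conventional prefactors folded in; take that as a reading of this generated code, not a documented contract.

    /* Scalar sketch of the kernels' 12-6 arithmetic (VF variant). */
    static void lj_12_6(double rinvsq, double c6, double c12,
                        double *vvdw, double *fscal)
    {
        double rinvsix = rinvsq * rinvsq * rinvsq;   /* 1/r^6 */
        double vvdw6   = c6  * rinvsix;
        double vvdw12  = c12 * rinvsix * rinvsix;
        *vvdw  = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
        *fscal = (vvdw12 - vvdw6) * rinvsq;  /* scale dx,dy,dz by this for the force */
    }

The force-only kernel folds the same expression into fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq, which is algebraically identical but skips the two potential terms.
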
index 40a10c2a31b68e6df0e6f62714d968bed522866a,0000000000000000000000000000000000000000..0d8e8bb2382b0439d12003c4672de8c8d9a91c0b
mode 100644,000000..100644
--- /dev/null
@@@ -1,762 -1,0 +1,762 @@@
-             dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +/*
 + * Note: this file was generated by the Gromacs avx_256_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_pd();
 +        vgbsum           = _mm256_setzero_pd();
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure of multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 57 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure of multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
-             dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that even compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw this code up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*58);
 +}
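
The masked tail loops in these kernels depend on the neighbor list being padded to a multiple of four with negative j indices. A scalar analogue of the trick, with hypothetical names, is below: clamp the index to 0 so the load is always valid, then drop that lane's contribution. The SIMD code does the dropping branch-free via _mm256_andnot_pd with dummy_mask.

    /* Scalar analogue of the SIMD tail handling; padded jjnr entries are < 0. */
    static double masked_tail_sum(const int *jjnr, int jidx, int jidx_end,
                                  const double *values)
    {
        double sum = 0.0;
        for (; jidx < jidx_end; jidx++)
        {
            int    jnrlist = jjnr[jidx];
            int    jnr     = (jnrlist >= 0) ? jnrlist : 0; /* safe load index */
            double v       = values[jnr];
            if (jnrlist < 0)
            {
                v = 0.0;           /* andnot(dummy_mask, v) in the kernels    */
            }
            sum += v;
        }
        return sum;
    }
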
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure of multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 55 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure of multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that even compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw this code up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*56);
 +}
index 2f0e86719395d773f98f8e170853deca4475dff1,0000000000000000000000000000000000000000..b62ea7ffb30309bd16e946cc92387aa0387df583
mode 100644,000000..100644
--- /dev/null
@@@ -1,1050 -1,0 +1,1050 @@@
-             dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
  +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
  +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
  +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particles that have electrostatics */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
  +/* ## PARTICLES_VDW_I[]/     Arrays with lists of i/j particles that have VdW            */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
  +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
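  +/* ##                                                                                    */
  +/* ## As a purely hypothetical illustration (not actual script output): a Water3 vs.     */
  +/* ## Particle kernel could have GEOMETRY_I='Water3', GEOMETRY_J='Particle',             */
  +/* ## PARTICLES_I=[0,1,2], PARTICLES_J=[0] and PAIRS_IJ=[(0,0),(1,0),(2,0)], so each     */
  +/* ## '#for I,J in PAIRS_IJ' block below expands into three interaction blocks.          */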
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    real *           vdwioffsetptr{I};
 +    __m256d          ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
 +    __m256d          jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m256d          dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
  +    /* Sign-bit mask for doubles: only bit 63 of each 64-bit lane may be set */
  +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set_epi32(0x80000000,0,0x80000000,0,0x80000000,0,0x80000000,0) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm256_set1_pd(fr->ic->k_rf);
 +    krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
 +    crf              = _mm256_set1_pd(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
 +    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff);
 +    beta2            = _mm256_mul_pd(beta,beta);
 +    beta3            = _mm256_mul_pd(beta,beta2);
 +
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
  +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm256_set1_pd(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm256_mul_pd(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
 +    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
 +    rvdw             = _mm256_set1_pd(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm256_set1_pd(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm256_set1_pd(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm256_set1_pd(d_scalar);
 +    swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                    &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm256_setzero_pd();
 +        fiy{I}             = _mm256_setzero_pd();
 +        fiz{I}             = _mm256_setzero_pd();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+{I}]));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm256_set1_pd(invsqrta[inr+{I}]);
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm256_setzero_pd();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm256_setzero_pd();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm256_setzero_pd();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm256_setzero_pd();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
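  +        /* ## The 'Loop' round only runs while all four jjnr entries are real           */
  +        /* ## (jjnr[jidx+3]>=0); the neighborlist is padded with negative indices,      */
  +        /* ## and the 'Epilogue' round handles the last partial quad via dummy_mask.    */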
 +        /* #define INNERFLOPS 0 */
 +
 +            /* Get j neighbor index, and coordinate index */
 +            /* #if ROUND =='Loop' */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            /* #else */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
  +            /* Entries in jjnr are negative for non-real (padding) atoms.
  +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
  +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
  +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
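  +            /* Duplicate each 32-bit comparison result into a full 64-bit lane so the
  +             * mask can be applied directly to double-precision data.
  +             */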
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            /* #endif */
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            /* #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
  +                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #else                                */
 +            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                                 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
 +                                                 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm256_sub_pd(ix{I},jx{J});
 +            dy{I}{J}             = _mm256_sub_pd(iy{I},jy{J});
 +            dz{I}{J}             = _mm256_sub_pd(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm256_calc_rsq_pd(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm256_invsqrt_pd(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm256_inv_pd(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm256_mul_pd(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            jq{J}              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+{J},charge+jnrB+{J},
 +                                                                 charge+jnrC+{J},charge+jnrD+{J});
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            isaj{J}            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
 +                                                                 invsqrta+jnrC+{J},invsqrta+jnrD+{J});
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
 +            vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm256_setzero_pd();
 +            fjy{J}             = _mm256_setzero_pd();
 +            fjz{J}             = _mm256_setzero_pd();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
  +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
  +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm256_any_lt(rsq{I}{J},rcutoff2))
 +            {
  +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in the template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm256_mul_pd(rsq{I}{J},rinv{I}{J});
 +            /*         #if ROUND == 'Epilogue' */
 +            r{I}{J}              = _mm256_andnot_pd(dummy_mask,r{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm256_mul_pd(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr{I}+vdwjidx{J}A,
 +                                            vdwioffsetptr{I}+vdwjidx{J}B,
 +                                            vdwioffsetptr{I}+vdwjidx{J}C,
 +                                            vdwioffsetptr{I}+vdwjidx{J}D,
 +                                            &c6_{I}{J},&c12_{I}{J});
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
  +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm256_mul_pd(r{I}{J},vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
  +            /*             ## 3 tables, 4 values per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
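  +            /*             ## (i + (i<<1)) << 2 == 12*i, avoiding a SIMD integer multiply */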
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
  +            /*             ## 1 table, 4 values per point: multiply index by 4  */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
  +            /*             ## 2 tables, 4 values per point: multiply index by 8 */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm256_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(_mm256_add_pd(rinv{I}{J},_mm256_mul_pd(krf,rsq{I}{J})),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(_mm256_mul_pd(rinv{I}{J},rinvsq{I}{J}),krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai{I},isaj{J});
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq{I}{J},_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
  +            /* Calculate generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r{I}{J},gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r{I}{J})));
 +            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            /*                 #endif */
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /*                 #if ROUND == 'Loop' */
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            /*                 #else */
  +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            /*                 #endif */
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj{J},isaj{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+12 */
 +            /*             #endif */
 +            velec            = _mm256_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv{I}{J}),fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
  +            /* Calculate Ewald table index by multiplying r with scale and truncating to integer */
 +            ewrt             = _mm256_mul_pd(r{I}{J},ewtabscale);
 +            ewitab           = _mm256_cvttpd_epi32(ewrt);
 +            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            ewitab           = _mm_slli_epi32(ewitab,2);
 +            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
 +            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
 +            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
 +            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
 +            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
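  +            /* The tabulated force is interpolated linearly within the bin; velec below
  +             * recovers the potential from the force table by trapezoidal integration
  +             * over the fractional bin, hence the ewtabhalfspace factor.
  +             */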
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
 +            velec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(_mm256_sub_pd(rinv{I}{J},sh_ewald),velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+7 */
 +            /*                 #else */
 +            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
 +            velec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(rinv{I}{J},velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+6 */
 +            /*                 #endif */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(_mm256_mul_pd(qq{I}{J},rinv{I}{J}),_mm256_sub_pd(rinvsq{I}{J},felec));
 +            /*                      #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
 +                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
 +                                            &ewtabF,&ewtabFn);
 +            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
 +            felec            = _mm256_mul_pd(_mm256_mul_pd(qq{I}{J},rinv{I}{J}),_mm256_sub_pd(rinvsq{I}{J},felec));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            velec            = _mm256_mul_pd(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq{I}{J},FF),_mm256_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm256_mul_pd(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm256_mul_pd(c12_{I}{J},_mm256_mul_pd(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_{I}{J},_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 +                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_{I}{J},rinvsix),c6_{I}{J}),_mm256_mul_pd(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm256_mul_pd(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm256_mul_pd(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm256_add_pd(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm256_sub_pd(r{I}{J},rswitch);
 +            d                = _mm256_max_pd(d,_mm256_setzero_pd());
 +            d2               = _mm256_mul_pd(d,d);
 +            sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
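  +            /* sw is the quintic switch 1 - 10*x^3 + 15*x^4 - 6*x^5, x = d/(rcutoff-rswitch):
  +             * it goes from 1 at rswitch to 0 at rcutoff with vanishing first and second
  +             * derivatives at both endpoints; dsw below is d(sw)/dr.
  +             */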
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
  +            /* fscal' = f'/r = -(v*sw)'/r = -(v'*sw + v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv{I}{J},_mm256_mul_pd(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv{I}{J},_mm256_mul_pd(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm256_mul_pd(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm256_mul_pd(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
  +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            cutoff_mask      = _mm256_cmp_pd(rsq{I}{J},rcutoff2,_CMP_LT_OQ);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm256_and_pd(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            /*             #endif */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm256_and_pd(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            /*             #endif */
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
  +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
 +            /*             #endif */
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
  +            /*         #endif */
 +
  +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            fscal            = _mm256_and_pd(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +            /*             #endif */
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx{I}{J});
 +            ty               = _mm256_mul_pd(fscal,dy{I}{J});
 +            tz               = _mm256_mul_pd(fscal,dz{I}{J});
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm256_add_pd(fix{I},tx);
 +            fiy{I}             = _mm256_add_pd(fiy{I},ty);
 +            fiz{I}             = _mm256_add_pd(fiz{I},tz);
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm256_add_pd(fjx{J},tx);
 +            fjy{J}             = _mm256_add_pd(fjy{J},ty);
 +            fjz{J}             = _mm256_add_pd(fjz{J},tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
  +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
  +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*    ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if GEOMETRY_I != 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            /* #endif */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
 +                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+12     */
 +            /*     #else                                */
 +            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
 +                                                      fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                                 f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai{I},isai{I}));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
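 +
 +/* ## For illustration only (an assumed expansion, not part of the template):
 + * ## for a Water3-Water3 Coulomb+LJ potential-and-force kernel, the suffix
 + * ## defines above turn the flop-accounting call into something like
 + * ##
 + * ##     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*30 + inneriter*350);
 + * ##
 + * ## where 30 and 350 stand in for the (hypothetical) totals accumulated in
 + * ## the OUTERFLOPS and INNERFLOPS counters by the directives above.
 + */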
index b0cb50e45c30462e07197d40c11fdf6cc1ad3595,0000000000000000000000000000000000000000..3a402136edc83b7e0731cd54a666d023d1a1cea5
mode 100644,000000..100644
--- /dev/null
@@@ -1,530 -1,0 +1,532 @@@
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "types/commrec.h"
 +#include "sysstuff.h"
 +#include "gmx_fatal.h"
 +#include "names.h"
 +#include "macros.h"
 +#include "nrnb.h"
 +#include "main.h"
 +#include "smalloc.h"
 +#include "copyrite.h"
 +
 +typedef struct {
 +    const char *name;
 +    int         flop;
 +} t_nrnb_data;
 +
 +
 +static const t_nrnb_data nbdata[eNRNB] = {
 +    /* These are re-used for different NB kernels, since there are so many.
 +     * The actual number of flops is set dynamically.
 +     */
 +    { "NB VdW [V&F]",                    1 },
 +    { "NB VdW [F]",                      1 },
 +    { "NB Elec. [V&F]",                  1 },
 +    { "NB Elec. [F]",                    1 },
 +    { "NB Elec. [W3,V&F]",               1 },
 +    { "NB Elec. [W3,F]",                 1 },
 +    { "NB Elec. [W3-W3,V&F]",            1 },
 +    { "NB Elec. [W3-W3,F]",              1 },
 +    { "NB Elec. [W4,V&F]",               1 },
 +    { "NB Elec. [W4,F]",                 1 },
 +    { "NB Elec. [W4-W4,V&F]",            1 },
 +    { "NB Elec. [W4-W4,F]",              1 },
 +    { "NB VdW & Elec. [V&F]",            1 },
 +    { "NB VdW & Elec. [F]",              1 },
 +    { "NB VdW & Elec. [W3,V&F]",         1 },
 +    { "NB VdW & Elec. [W3,F]",           1 },
 +    { "NB VdW & Elec. [W3-W3,V&F]",      1 },
 +    { "NB VdW & Elec. [W3-W3,F]",        1 },
 +    { "NB VdW & Elec. [W4,V&F]",         1 },
 +    { "NB VdW & Elec. [W4,F]",           1 },
 +    { "NB VdW & Elec. [W4-W4,V&F]",      1 },
 +    { "NB VdW & Elec. [W4-W4,F]",        1 },
 +
 +    { "NB Generic kernel",               1 },
++    { "NB Generic charge grp kernel",    1 },
++    { "NB Generic AdResS kernel",        1 },
 +    { "NB Free energy kernel",           1 },
 +    { "NB All-vs-all",                   1 },
 +    { "NB All-vs-all, GB",               1 },
 +
 +    { "Pair Search distance check",      9 }, /* nbnxn pair dist. check */
 +    /* nbnxn kernel flops are based on inner-loops without exclusion checks.
 +     * Plain Coulomb runs through the RF kernels, except with CUDA.
 +     * invsqrt is counted as 6 flops: 1 for _mm_rsqrt_ps + 5 for iteration.
 +     * The flops are equal for plain-C, x86 SIMD and CUDA, except for:
 +     * - plain-C kernel uses one flop more for Coulomb-only (F) than listed
 +     * - x86 SIMD LJ geom-comb.rule kernels (fastest) use 2 more flops
 +     * - x86 SIMD LJ LB-comb.rule kernels (fast) use 3 (8 for F+E) more flops
 +     * - GPU always does exclusions, which requires 2-4 flops, but as invsqrt
 +     *   is always counted as 6 flops, this roughly compensates.
 +     */
 +    { "NxN RF Elec. + VdW [F]",         38 }, /* nbnxn kernel LJ+RF, no ener */
 +    { "NxN RF Elec. + VdW [V&F]",       54 },
 +    { "NxN QSTab Elec. + VdW [F]",      41 }, /* nbnxn kernel LJ+tab, no en */
 +    { "NxN QSTab Elec. + VdW [V&F]",    59 },
 +    { "NxN Ewald Elec. + VdW [F]",      66 }, /* nbnxn kernel LJ+Ewald, no en */
 +    { "NxN Ewald Elec. + VdW [V&F]",   107 },
 +    { "NxN VdW [F]",                    33 }, /* nbnxn kernel LJ, no ener */
 +    { "NxN VdW [V&F]",                  43 },
 +    { "NxN RF Electrostatics [F]",      31 }, /* nbnxn kernel RF, no ener */
 +    { "NxN RF Electrostatics [V&F]",    36 },
 +    { "NxN QSTab Elec. [F]",            34 }, /* nbnxn kernel tab, no ener */
 +    { "NxN QSTab Elec. [V&F]",          41 },
 +    { "NxN Ewald Elec. [F]",            61 }, /* nbnxn kernel Ewald, no ener */
 +    { "NxN Ewald Elec. [V&F]",          84 },
 +    { "1,4 nonbonded interactions",     90 },
 +    { "Born radii (Still)",             47 },
 +    { "Born radii (HCT/OBC)",          183 },
 +    { "Born force chain rule",          15 },
 +    { "All-vs-All Still radii",          1 },
 +    { "All-vs-All HCT/OBC radii",        1 },
 +    { "All-vs-All Born chain rule",      1 },
 +    { "Calc Weights",                   36 },
 +    { "Spread Q",                        6 },
 +    { "Spread Q Bspline",                2 },
 +    { "Gather F",                      23  },
 +    { "Gather F Bspline",              6   },
 +    { "3D-FFT",                        8   },
 +    { "Convolution",                   4   },
 +    { "Solve PME",                     64  },
 +    { "NS-Pairs",                      21  },
 +    { "Reset In Box",                  3   },
 +    { "Shift-X",                       6   },
 +    { "CG-CoM",                        3   },
 +    { "Sum Forces",                    1   },
 +    { "Bonds",                         59  },
 +    { "G96Bonds",                      44  },
 +    { "FENE Bonds",                    58  },
 +    { "Tab. Bonds",                    62  },
 +    { "Restraint Potential",           86  },
 +    { "Linear Angles",                 57  },
 +    { "Angles",                        168 },
 +    { "G96Angles",                     150 },
 +    { "Quartic Angles",                160 },
 +    { "Tab. Angles",                   169 },
 +    { "Propers",                       229 },
 +    { "Impropers",                     208 },
 +    { "RB-Dihedrals",                  247 },
 +    { "Four. Dihedrals",               247 },
 +    { "Tab. Dihedrals",                227 },
 +    { "Dist. Restr.",                  200 },
 +    { "Orient. Restr.",                200 },
 +    { "Dihedral Restr.",               200 },
 +    { "Pos. Restr.",                   50  },
 +    { "Flat-bottom posres",            50  },
 +    { "Angle Restr.",                  191 },
 +    { "Angle Restr. Z",                164 },
 +    { "Morse Potent.",                 83  },
 +    { "Cubic Bonds",                   54  },
 +    { "Walls",                         31  },
 +    { "Polarization",                  59  },
 +    { "Anharmonic Polarization",       72  },
 +    { "Water Pol.",                    62  },
 +    { "Thole Pol.",                    296 },
 +    { "Virial",                        18  },
 +    { "Update",                        31  },
 +    { "Ext.ens. Update",               54  },
 +    { "Stop-CM",                       10  },
 +    { "P-Coupling",                    6   },
 +    { "Calc-Ekin",                     27  },
 +    { "Lincs",                         60  },
 +    { "Lincs-Mat",                     4   },
 +    { "Shake",                         30  },
 +    { "Constraint-V",                   8  },
 +    { "Shake-Init",                    10  },
 +    { "Constraint-Vir",                24  },
 +    { "Settle",                        323 },
 +    { "Virtual Site 2",                23  },
 +    { "Virtual Site 3",                37  },
 +    { "Virtual Site 3fd",              95  },
 +    { "Virtual Site 3fad",             176 },
 +    { "Virtual Site 3out",             87  },
 +    { "Virtual Site 4fd",              110 },
 +    { "Virtual Site 4fdn",             254 },
 +    { "Virtual Site N",                 15 },
 +    { "Mixed Generalized Born stuff",   10 }
 +};
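 +
 +/* The entries above with a flop count of 1 are placeholders: for those
 + * kernels the real cost is folded into the count itself by the caller.
 + * A sketch (with assumed per-kernel constants OUTERFLOPS/INNERFLOPS, in the
 + * style of the generated kernels):
 + *
 + *     inc_nrnb(nrnb, eNR_NBKERNEL_ELEC_VDW_VF,
 + *              outeriter*OUTERFLOPS + inneriter*INNERFLOPS);
 + *
 + * so multiplying by nbdata[].flop == 1 in print_flop() still yields the true
 + * flop total.
 + */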
 +
 +
 +void init_nrnb(t_nrnb *nrnb)
 +{
 +    int i;
 +
 +    for (i = 0; (i < eNRNB); i++)
 +    {
 +        nrnb->n[i] = 0.0;
 +    }
 +}
 +
 +void cp_nrnb(t_nrnb *dest, t_nrnb *src)
 +{
 +    int i;
 +
 +    for (i = 0; (i < eNRNB); i++)
 +    {
 +        dest->n[i] = src->n[i];
 +    }
 +}
 +
 +void add_nrnb(t_nrnb *dest, t_nrnb *s1, t_nrnb *s2)
 +{
 +    int i;
 +
 +    for (i = 0; (i < eNRNB); i++)
 +    {
 +        dest->n[i] = s1->n[i]+s2->n[i];
 +    }
 +}
 +
 +void print_nrnb(FILE *out, t_nrnb *nrnb)
 +{
 +    int i;
 +
 +    for (i = 0; (i < eNRNB); i++)
 +    {
 +        if (nrnb->n[i] > 0)
 +        {
 +            fprintf(out, " %-26s %10.0f.\n", nbdata[i].name, nrnb->n[i]);
 +        }
 +    }
 +}
 +
 +void _inc_nrnb(t_nrnb *nrnb, int enr, int inc, char *file, int line)
 +{
 +    nrnb->n[enr] += inc;
 +#ifdef DEBUG_NRNB
 +    printf("nrnb %15s(%2d) incremented with %8d from file %s line %d\n",
 +           nbdata[enr].name, enr, inc, file, line);
 +#endif
 +}
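 +
 +/* A minimal sketch of how the inc_nrnb() wrapper used throughout the code is
 + * assumed to forward to _inc_nrnb() (the actual macro lives in nrnb.h), so
 + * that the DEBUG_NRNB printout above can report the call site:
 + *
 + *     #define inc_nrnb(nrnb, enr, inc) \
 + *         _inc_nrnb(nrnb, enr, inc, __FILE__, __LINE__)
 + */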
 +
 +void print_flop(FILE *out, t_nrnb *nrnb, double *nbfs, double *mflop)
 +{
 +    int           i;
 +    double        mni, frac, tfrac, tflop;
 +    const char   *myline = "-----------------------------------------------------------------------------";
 +
 +    *nbfs = 0.0;
 +    for (i = 0; (i < eNR_NBKERNEL_ALLVSALLGB); i++)
 +    {
 +        if (strstr(nbdata[i].name, "W3-W3") != NULL)
 +        {
 +            *nbfs += 9e-6*nrnb->n[i];
 +        }
 +        else if (strstr(nbdata[i].name, "W3") != NULL)
 +        {
 +            *nbfs += 3e-6*nrnb->n[i];
 +        }
 +        else if (strstr(nbdata[i].name, "W4-W4") != NULL)
 +        {
 +            *nbfs += 10e-6*nrnb->n[i];
 +        }
 +        else if (strstr(nbdata[i].name, "W4") != NULL)
 +        {
 +            *nbfs += 4e-6*nrnb->n[i];
 +        }
 +        else
 +        {
 +            *nbfs += 1e-6*nrnb->n[i];
 +        }
 +    }
 +    tflop = 0;
 +    for (i = 0; (i < eNRNB); i++)
 +    {
 +        tflop += 1e-6*nrnb->n[i]*nbdata[i].flop;
 +    }
 +
 +    if (tflop == 0)
 +    {
 +        if (out)
 +        {
 +            fprintf(out, "No MEGA Flopsen this time\n");
 +        }
 +        return;
 +    }
 +    if (out)
 +    {
 +        fprintf(out, "\n\tM E G A - F L O P S   A C C O U N T I N G\n\n");
 +    }
 +
 +    if (out)
 +    {
 +        fprintf(out, " NB=Group-cutoff nonbonded kernels    NxN=N-by-N cluster Verlet kernels\n");
 +        fprintf(out, " RF=Reaction-Field  VdW=Van der Waals  QSTab=quadratic-spline table\n");
 +        fprintf(out, " W3=SPC/TIP3p  W4=TIP4p (single or pairs)\n");
 +        fprintf(out, " V&F=Potential and force  V=Potential only  F=Force only\n\n");
 +
 +        fprintf(out, " %-32s %16s %15s  %7s\n",
 +                "Computing:", "M-Number", "M-Flops", "% Flops");
 +        fprintf(out, "%s\n", myline);
 +    }
 +    *mflop = 0.0;
 +    tfrac  = 0.0;
 +    for (i = 0; (i < eNRNB); i++)
 +    {
 +        mni     = 1e-6*nrnb->n[i];
 +        *mflop += mni*nbdata[i].flop;
 +        frac    = 100.0*mni*nbdata[i].flop/tflop;
 +        tfrac  += frac;
 +        if (out && mni != 0)
 +        {
 +            fprintf(out, " %-32s %16.6f %15.3f  %6.1f\n",
 +                    nbdata[i].name, mni, mni*nbdata[i].flop, frac);
 +        }
 +    }
 +    if (out)
 +    {
 +        fprintf(out, "%s\n", myline);
 +        fprintf(out, " %-32s %16s %15.3f  %6.1f\n",
 +                "Total", "", *mflop, tfrac);
 +        fprintf(out, "%s\n\n", myline);
 +    }
 +}
 +
 +void print_perf(FILE *out, double nodetime, double realtime, int nprocs,
 +                gmx_large_int_t nsteps, real delta_t,
 +                double nbfs, double mflop,
 +                int omp_nth_pp)
 +{
 +    real runtime;
 +
 +    fprintf(out, "\n");
 +
 +    if (realtime > 0)
 +    {
 +        fprintf(out, "%12s %12s %12s %10s\n", "", "Core t (s)", "Wall t (s)", "(%)");
 +        fprintf(out, "%12s %12.3f %12.3f %10.1f\n", "Time:",
 +                nodetime, realtime, 100.0*nodetime/realtime);
 +        /* only print day-hour-sec format if realtime is more than 30 min */
 +        if (realtime > 30*60)
 +        {
 +            fprintf(out, "%12s %12s", "", "");
 +            pr_difftime(out, realtime);
 +        }
 +        if (delta_t > 0)
 +        {
 +            mflop   = mflop/realtime;
 +            runtime = nsteps*delta_t;
 +
 +            if (getenv("GMX_DETAILED_PERF_STATS") == NULL)
 +            {
 +                fprintf(out, "%12s %12s %12s\n",
 +                        "", "(ns/day)", "(hour/ns)");
 +                fprintf(out, "%12s %12.3f %12.3f\n", "Performance:",
 +                        runtime*24*3.6/realtime, 1000*realtime/(3600*runtime));
 +            }
 +            else
 +            {
 +                fprintf(out, "%12s %12s %12s %12s %12s\n",
 +                        "", "(Mnbf/s)", (mflop > 1000) ? "(GFlops)" : "(MFlops)",
 +                        "(ns/day)", "(hour/ns)");
 +                fprintf(out, "%12s %12.3f %12.3f %12.3f %12.3f\n", "Performance:",
 +                        nbfs/realtime, (mflop > 1000) ? (mflop/1000) : mflop,
 +                        runtime*24*3.6/realtime, 1000*realtime/(3600*runtime));
 +            }
 +        }
 +        else
 +        {
 +            if (getenv("GMX_DETAILED_PERF_STATS") == NULL)
 +            {
 +                fprintf(out, "%12s %14s\n",
 +                        "", "(steps/hour)");
 +                fprintf(out, "%12s %14.1f\n", "Performance:",
 +                        nsteps*3600.0/realtime);
 +            }
 +            else
 +            {
 +                fprintf(out, "%12s %12s %12s %14s\n",
 +                        "", "(Mnbf/s)", (mflop > 1000) ? "(GFlops)" : "(MFlops)",
 +                        "(steps/hour)");
 +                fprintf(out, "%12s %12.3f %12.3f %14.1f\n", "Performance:",
 +                        nbfs/realtime, (mflop > 1000) ? (mflop/1000) : mflop,
 +                        nsteps*3600.0/realtime);
 +            }
 +        }
 +    }
 +}
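 +
 +/* Unit bookkeeping for the performance figures above: runtime is in ps and
 + * realtime in s, so
 + *
 + *     ns/day  = runtime*1e-3*86400/realtime    = runtime*24*3.6/realtime
 + *     hour/ns = (realtime/3600)/(runtime*1e-3) = 1000*realtime/(3600*runtime)
 + *
 + * For example (assumed numbers), 500000 steps of 2 fs give runtime = 1000 ps;
 + * if that took 8640 s of wall time, the output reports 1000*86.4/8640 =
 + * 10 ns/day.
 + */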
 +
 +int cost_nrnb(int enr)
 +{
 +    return nbdata[enr].flop;
 +}
 +
 +const char *nrnb_str(int enr)
 +{
 +    return nbdata[enr].name;
 +}
 +
 +static const int    force_index[] = {
 +    eNR_BONDS,  eNR_ANGLES,  eNR_PROPER, eNR_IMPROPER,
 +    eNR_RB,     eNR_DISRES,  eNR_ORIRES, eNR_POSRES,
 +    eNR_FBPOSRES, eNR_NS,
 +};
 +#define NFORCE_INDEX asize(force_index)
 +
 +static const int    constr_index[] = {
 +    eNR_SHAKE,     eNR_SHAKE_RIJ, eNR_SETTLE,       eNR_UPDATE,       eNR_PCOUPL,
 +    eNR_CONSTR_VIR, eNR_CONSTR_V
 +};
 +#define NCONSTR_INDEX asize(constr_index)
 +
 +static double pr_av(FILE *log, t_commrec *cr,
 +                    double fav, double ftot[], const char *title)
 +{
 +    int    i, perc;
 +    double dperc, unb;
 +
 +    unb = 0;
 +    if (fav > 0)
 +    {
 +        fav /= cr->nnodes - cr->npmenodes;
 +        fprintf(log, "\n %-26s", title);
 +        for (i = 0; (i < cr->nnodes); i++)
 +        {
 +            dperc = (100.0*ftot[i])/fav;
 +            unb   = max(unb, dperc);
 +            perc  = dperc;
 +            fprintf(log, "%3d ", perc);
 +        }
 +        if (unb > 0)
 +        {
 +            perc = 10000.0/unb;
 +            fprintf(log, "%6d%%\n\n", perc);
 +        }
 +        else
 +        {
 +            fprintf(log, "\n\n");
 +        }
 +    }
 +    return unb;
 +}
 +
 +void pr_load(FILE *log, t_commrec *cr, t_nrnb nrnb[])
 +{
 +    int     i, j, perc;
 +    double  dperc, unb, uf, us;
 +    double *ftot, fav;
 +    double *stot, sav;
 +    t_nrnb *av;
 +
 +    snew(av, 1);
 +    snew(ftot, cr->nnodes);
 +    snew(stot, cr->nnodes);
 +    init_nrnb(av);
 +    for (i = 0; (i < cr->nnodes); i++)
 +    {
 +        add_nrnb(av, av, &(nrnb[i]));
 +        /* Cost due to forces */
 +        for (j = 0; (j < eNR_NBKERNEL_ALLVSALLGB); j++)
 +        {
 +            ftot[i] += nrnb[i].n[j]*cost_nrnb(j);
 +        }
 +        for (j = 0; (j < NFORCE_INDEX); j++)
 +        {
 +            ftot[i] += nrnb[i].n[force_index[j]]*cost_nrnb(force_index[j]);
 +        }
 +        /* Due to shake */
 +        for (j = 0; (j < NCONSTR_INDEX); j++)
 +        {
 +            stot[i] += nrnb[i].n[constr_index[j]]*cost_nrnb(constr_index[j]);
 +        }
 +    }
 +    for (j = 0; (j < eNRNB); j++)
 +    {
 +        av->n[j] = av->n[j]/(double)(cr->nnodes - cr->npmenodes);
 +    }
 +
 +    fprintf(log, "\nDetailed load balancing info in percentage of average\n");
 +
 +    fprintf(log, " Type                 NODE:");
 +    for (i = 0; (i < cr->nnodes); i++)
 +    {
 +        fprintf(log, "%3d ", i);
 +    }
 +    fprintf(log, "Scaling\n");
 +    fprintf(log, "---------------------------");
 +    for (i = 0; (i < cr->nnodes); i++)
 +    {
 +        fprintf(log, "----");
 +    }
 +    fprintf(log, "-------\n");
 +
 +    for (j = 0; (j < eNRNB); j++)
 +    {
 +        unb = 100.0;
 +        if (av->n[j] > 0)
 +        {
 +            fprintf(log, " %-26s", nrnb_str(j));
 +            for (i = 0; (i < cr->nnodes); i++)
 +            {
 +                dperc = (100.0*nrnb[i].n[j])/av->n[j];
 +                unb   = max(unb, dperc);
 +                perc  = dperc;
 +                fprintf(log, "%3d ", perc);
 +            }
 +            if (unb > 0)
 +            {
 +                perc = 10000.0/unb;
 +                fprintf(log, "%6d%%\n", perc);
 +            }
 +            else
 +            {
 +                fprintf(log, "\n");
 +            }
 +        }
 +    }
 +    fav = sav = 0;
 +    for (i = 0; (i < cr->nnodes); i++)
 +    {
 +        fav += ftot[i];
 +        sav += stot[i];
 +    }
 +    uf = pr_av(log, cr, fav, ftot, "Total Force");
 +    us = pr_av(log, cr, sav, stot, "Total Constr.");
 +
 +    unb = (uf*fav+us*sav)/(fav+sav);
 +    if (unb > 0)
 +    {
 +        unb = 10000.0/unb;
 +        fprintf(log, "\nTotal Scaling: %.0f%% of max performance\n\n", unb);
 +    }
 +}
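 +
 +/* Reading the load table printed above: each column is one node's count as a
 + * percentage of the average, and the final "Scaling" figure is 10000/unb,
 + * where unb is the busiest node's percentage. For example (assumed numbers),
 + * if the most loaded node sits at 125% of the average, the reported scaling
 + * is 10000/125 = 80% of ideal.
 + */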
index 57eb8d9d0ad58b8f6f373f1bef82fd828a55a83d,0000000000000000000000000000000000000000..2bcc288e5d9708a9bddbddb4e700ff906d409b34
mode 100644,000000..100644
--- /dev/null
@@@ -1,3866 -1,0 +1,3866 @@@
-                 "The switch/shift interaction settings are just for compatibility; you will get better"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <ctype.h>
 +#include <stdlib.h>
 +#include <limits.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "index.h"
 +#include "symtab.h"
 +#include "string2.h"
 +#include "readinp.h"
 +#include "warninp.h"
 +#include "readir.h"
 +#include "toputil.h"
 +#include "index.h"
 +#include "network.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "mtop_util.h"
 +#include "chargegroup.h"
 +#include "inputrec.h"
 +
 +#define MAXPTR 254
 +#define NOGID  255
 +#define MAXLAMBDAS 1024
 +
 +/* Resource parameters
 + * Do not change any of these until you have read the instructions
 + * in readinp.h. Some cpps do not accept spaces after the backslash
 + * (like the C shell), which will give you a very weird compiler
 + * message.
 + */
 +
 +static char tcgrps[STRLEN], tau_t[STRLEN], ref_t[STRLEN],
 +            acc[STRLEN], accgrps[STRLEN], freeze[STRLEN], frdim[STRLEN],
 +            energy[STRLEN], user1[STRLEN], user2[STRLEN], vcm[STRLEN], xtc_grps[STRLEN],
 +            couple_moltype[STRLEN], orirefitgrp[STRLEN], egptable[STRLEN], egpexcl[STRLEN],
 +            wall_atomtype[STRLEN], wall_density[STRLEN], deform[STRLEN], QMMM[STRLEN];
 +static char   fep_lambda[efptNR][STRLEN];
 +static char   lambda_weights[STRLEN];
 +static char **pull_grp;
 +static char **rot_grp;
 +static char   anneal[STRLEN], anneal_npoints[STRLEN],
 +              anneal_time[STRLEN], anneal_temp[STRLEN];
 +static char   QMmethod[STRLEN], QMbasis[STRLEN], QMcharge[STRLEN], QMmult[STRLEN],
 +              bSH[STRLEN], CASorbitals[STRLEN], CASelectrons[STRLEN], SAon[STRLEN],
 +              SAoff[STRLEN], SAsteps[STRLEN], bTS[STRLEN], bOPT[STRLEN];
 +static char efield_x[STRLEN], efield_xt[STRLEN], efield_y[STRLEN],
 +            efield_yt[STRLEN], efield_z[STRLEN], efield_zt[STRLEN];
 +
 +enum {
 +    egrptpALL,         /* All particles have to be a member of a group.     */
 +    egrptpALL_GENREST, /* A rest group with name is generated for particles *
 +                        * that are not part of any group.                   */
 +    egrptpPART,        /* As egrptpALL_GENREST, but no name is generated    *
 +                        * for the rest group.                               */
 +    egrptpONE          /* Merge all selected groups into one group,         *
 +                        * make a rest group for the remaining particles.    */
 +};
 +
 +
 +void init_ir(t_inputrec *ir, t_gromppopts *opts)
 +{
 +    snew(opts->include, STRLEN);
 +    snew(opts->define, STRLEN);
 +    snew(ir->fepvals, 1);
 +    snew(ir->expandedvals, 1);
 +    snew(ir->simtempvals, 1);
 +}
 +
 +static void GetSimTemps(int ntemps, t_simtemp *simtemp, double *temperature_lambdas)
 +{
 +
 +    int i;
 +
 +    for (i = 0; i < ntemps; i++)
 +    {
 +        /* simple linear scaling -- allows more control */
 +        if (simtemp->eSimTempScale == esimtempLINEAR)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*temperature_lambdas[i];
 +        }
 +        else if (simtemp->eSimTempScale == esimtempGEOMETRIC)  /* should give roughly equal acceptance for constant heat capacity . . . */
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low * pow(simtemp->simtemp_high/simtemp->simtemp_low, (1.0*i)/(ntemps-1));
 +        }
 +        else if (simtemp->eSimTempScale == esimtempEXPONENTIAL)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*((exp(temperature_lambdas[i])-1)/(exp(1.0)-1));
 +        }
 +        else
 +        {
 +            gmx_fatal(FARGS, "eSimTempScale=%d not defined", simtemp->eSimTempScale);
 +        }
 +    }
 +}
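 +
 +/* A worked example of the three scalings above, with assumed inputs
 + * simtemp_low = 300 K, simtemp_high = 600 K and temperature_lambdas =
 + * (0, 0.5, 1) for ntemps = 3:
 + *
 + *     linear:      300, 450, 600 K
 + *     geometric:   300*2^(i/2)               -> 300, ~424, 600 K
 + *     exponential: 300 + 300*(e^l - 1)/(e-1) -> 300, ~413, 600 K
 + */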
 +
 +
 +
 +static void _low_check(gmx_bool b, char *s, warninp_t wi)
 +{
 +    if (b)
 +    {
 +        warning_error(wi, s);
 +    }
 +}
 +
 +static void check_nst(const char *desc_nst, int nst,
 +                      const char *desc_p, int *p,
 +                      warninp_t wi)
 +{
 +    char buf[STRLEN];
 +
 +    if (*p > 0 && *p % nst != 0)
 +    {
 +        /* Round up to the next multiple of nst */
 +        *p = ((*p)/nst + 1)*nst;
 +        sprintf(buf, "%s should be a multiple of %s, changing %s to %d\n",
 +                desc_p, desc_nst, desc_p, *p);
 +        warning(wi, buf);
 +    }
 +}
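 +
 +/* Example: with nst = nstlist = 10 and *p = nstcalcenergy = 25, the rounding
 + * above gives (25/10 + 1)*10 = 30 and issues a warning; values that are
 + * already a multiple of nst are left untouched. */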
 +
 +static gmx_bool ir_NVE(const t_inputrec *ir)
 +{
 +    return ((ir->eI == eiMD || EI_VV(ir->eI)) && ir->etc == etcNO);
 +}
 +
 +static int lcd(int n1, int n2)
 +{
 +    int d, i;
 +
 +    d = 1;
 +    for (i = 2; (i <= n1 && i <= n2); i++)
 +    {
 +        if (n1 % i == 0 && n2 % i == 0)
 +        {
 +            d = i;
 +        }
 +    }
 +
 +    return d;
 +}
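 +
 +/* Despite its name, lcd() above returns the largest (greatest) common divisor
 + * of n1 and n2 by trial division. A minimal equivalent using Euclid's
 + * algorithm, shown only for comparison (not used in this file):
 + *
 + *     static int gcd(int a, int b)
 + *     {
 + *         while (b != 0)
 + *         {
 + *             int t = b;
 + *             b     = a % b;
 + *             a     = t;
 + *         }
 + *         return a;
 + *     }
 + */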
 +
 +static void process_interaction_modifier(const t_inputrec *ir, int *eintmod)
 +{
 +    if (*eintmod == eintmodPOTSHIFT_VERLET)
 +    {
 +        if (ir->cutoff_scheme == ecutsVERLET)
 +        {
 +            *eintmod = eintmodPOTSHIFT;
 +        }
 +        else
 +        {
 +            *eintmod = eintmodNONE;
 +        }
 +    }
 +}
 +
 +void check_ir(const char *mdparin, t_inputrec *ir, t_gromppopts *opts,
 +              warninp_t wi)
 +/* Check internal consistency */
 +{
 +    /* Strange macro: first one fills the err_buf, and then one can check
 +     * the condition, which will print the message and increase the error
 +     * counter.
 +     */
 +#define CHECK(b) _low_check(b, err_buf, wi)
 +    char        err_buf[256], warn_buf[STRLEN];
 +    int         i, j;
 +    int         ns_type  = 0;
 +    real        dt_coupl = 0;
 +    real        dt_pcoupl;
 +    int         nstcmin;
 +    t_lambda   *fep    = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +
 +    set_warning_line(wi, mdparin, -1);
 +
 +    /* BASIC CUT-OFF STUFF */
 +    if (ir->rcoulomb < 0)
 +    {
 +        warning_error(wi, "rcoulomb should be >= 0");
 +    }
 +    if (ir->rvdw < 0)
 +    {
 +        warning_error(wi, "rvdw should be >= 0");
 +    }
 +    if (ir->rlist < 0 &&
 +        !(ir->cutoff_scheme == ecutsVERLET && ir->verletbuf_drift > 0))
 +    {
 +        warning_error(wi, "rlist should be >= 0");
 +    }
 +
 +    process_interaction_modifier(ir, &ir->coulomb_modifier);
 +    process_interaction_modifier(ir, &ir->vdw_modifier);
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* BASIC CUT-OFF STUFF */
 +        if (ir->rlist == 0 ||
 +            !((EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > ir->rlist) ||
 +              (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype)    && ir->rvdw     > ir->rlist)))
 +        {
 +            /* No switched potential and/or no twin-range:
 +             * we can set the long-range cut-off to the maximum of the other cut-offs.
 +             */
 +            ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
 +        }
 +        else if (ir->rlistlong < 0)
 +        {
 +            ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
 +            sprintf(warn_buf, "rlistlong was not set, setting it to %g (no buffer)",
 +                    ir->rlistlong);
 +            warning(wi, warn_buf);
 +        }
 +        if (ir->rlistlong == 0 && ir->ePBC != epbcNONE)
 +        {
 +            warning_error(wi, "Can not have an infinite cut-off with PBC");
 +        }
 +        if (ir->rlistlong > 0 && (ir->rlist == 0 || ir->rlistlong < ir->rlist))
 +        {
 +            warning_error(wi, "rlistlong can not be shorter than rlist");
 +        }
 +        if (IR_TWINRANGE(*ir) && ir->nstlist <= 0)
 +        {
 +            warning_error(wi, "Can not have nstlist<=0 with twin-range interactions");
 +        }
 +    }
 +
 +    if (ir->rlistlong == ir->rlist)
 +    {
 +        ir->nstcalclr = 0;
 +    }
 +    else if (ir->rlistlong > ir->rlist && ir->nstcalclr == 0)
 +    {
 +        warning_error(wi, "With different cutoffs for electrostatics and VdW, nstcalclr must be -1 or a positive number");
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        real rc_max;
 +
 +        /* Normal Verlet type neighbor-list, currently only limited feature support */
 +        if (inputrec2nboundeddim(ir) < 3)
 +        {
 +            warning_error(wi, "With Verlet lists only full pbc or pbc=xy with walls is supported");
 +        }
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            warning_error(wi, "With Verlet lists rcoulomb!=rvdw is not supported");
 +        }
 +        if (ir->vdwtype != evdwCUT)
 +        {
 +            warning_error(wi, "With Verlet lists only cut-off LJ interactions are supported");
 +        }
 +        if (!(ir->coulombtype == eelCUT ||
 +              (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC) ||
 +              EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD))
 +        {
 +            warning_error(wi, "With Verlet lists only cut-off, reaction-field, PME and Ewald electrostatics are supported");
 +        }
 +
 +        if (ir->nstlist <= 0)
 +        {
 +            warning_error(wi, "With Verlet lists nstlist should be larger than 0");
 +        }
 +
 +        if (ir->nstlist < 10)
 +        {
 +            warning_note(wi, "With Verlet lists the optimal nstlist is >= 10, with GPUs >= 20. Note that with the Verlet scheme, nstlist has no effect on the accuracy of your simulation.");
 +        }
 +
 +        rc_max = max(ir->rvdw, ir->rcoulomb);
 +
 +        if (ir->verletbuf_drift <= 0)
 +        {
 +            if (ir->verletbuf_drift == 0)
 +            {
 +                warning_error(wi, "Can not have an energy drift of exactly 0");
 +            }
 +
 +            if (ir->rlist < rc_max)
 +            {
 +                warning_error(wi, "With verlet lists rlist can not be smaller than rvdw or rcoulomb");
 +            }
 +
 +            if (ir->rlist == rc_max && ir->nstlist > 1)
 +            {
 +                warning_note(wi, "rlist is equal to rvdw and/or rcoulomb: there is no explicit Verlet buffer. The cluster pair list does have a buffering effect, but choosing a larger rlist might be necessary for good energy conservation.");
 +            }
 +        }
 +        else
 +        {
 +            if (ir->rlist > rc_max)
 +            {
 +                warning_note(wi, "You have set rlist larger than the interaction cut-off, but you also have verlet-buffer-drift > 0. Will set rlist using verlet-buffer-drift.");
 +            }
 +
 +            if (ir->nstlist == 1)
 +            {
 +                /* No buffer required */
 +                ir->rlist = rc_max;
 +            }
 +            else
 +            {
 +                if (EI_DYNAMICS(ir->eI))
 +                {
 +                    if (EI_MD(ir->eI) && ir->etc == etcNO)
 +                    {
 +                        warning_error(wi, "Temperature coupling is required for calculating rlist using the energy drift with verlet-buffer-drift > 0. Either use temperature coupling or set rlist yourself together with verlet-buffer-drift = -1.");
 +                    }
 +
 +                    if (inputrec2nboundeddim(ir) < 3)
 +                    {
 +                        warning_error(wi, "The box volume is required for calculating rlist from the energy drift with verlet-buffer-drift > 0. You are using at least one unbounded dimension, so no volume can be computed. Either use a finite box, or set rlist yourself together with verlet-buffer-drift = -1.");
 +                    }
 +                    /* Set rlist temporarily so we can continue processing */
 +                    ir->rlist = rc_max;
 +                }
 +                else
 +                {
 +                    /* Set the buffer to 5% of the cut-off */
 +                    ir->rlist = 1.05*rc_max;
 +                }
 +            }
 +        }
 +
 +        /* No twin-range calculations with Verlet lists */
 +        ir->rlistlong = ir->rlist;
 +    }
 +
 +    if (ir->nstcalclr == -1)
 +    {
 +        /* if rlist=rlistlong, this will later be changed to nstcalclr=0 */
 +        ir->nstcalclr = ir->nstlist;
 +    }
 +    else if (ir->nstcalclr > 0)
 +    {
 +        if (ir->nstlist > 0 && (ir->nstlist % ir->nstcalclr != 0))
 +        {
 +            warning_error(wi, "nstlist must be evenly divisible by nstcalclr. Use nstcalclr = -1 to automatically follow nstlist");
 +        }
 +    }
 +    else if (ir->nstcalclr < -1)
 +    {
 +        warning_error(wi, "nstcalclr must be a positive number (divisor of nstcalclr), or -1 to follow nstlist.");
 +    }
 +
 +    if (EEL_PME(ir->coulombtype) && ir->rcoulomb > ir->rvdw && ir->nstcalclr > 1)
 +    {
 +        warning_error(wi, "When used with PME, the long-range component of twin-range interactions must be updated every step (nstcalclr)");
 +    }
 +
 +    /* GENERAL INTEGRATOR STUFF */
 +    if (!(ir->eI == eiMD || EI_VV(ir->eI)))
 +    {
 +        ir->etc = etcNO;
 +    }
 +    if (ir->eI == eiVVAK)
 +    {
 +        sprintf(warn_buf, "Integrator method %s is implemented primarily for validation purposes; for molecular dynamics, you should probably be using %s or %s", ei_names[eiVVAK], ei_names[eiMD], ei_names[eiVV]);
 +        warning_note(wi, warn_buf);
 +    }
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        ir->epc = epcNO;
 +    }
 +    if (EI_DYNAMICS(ir->eI))
 +    {
 +        if (ir->nstcalcenergy < 0)
 +        {
 +            ir->nstcalcenergy = ir_optimal_nstcalcenergy(ir);
 +            if (ir->nstenergy != 0 && ir->nstenergy < ir->nstcalcenergy)
 +            {
 +                /* nstcalcenergy larger than nstenergy does not make sense.
 +                 * We ideally want nstcalcenergy=nstenergy.
 +                 */
 +                if (ir->nstlist > 0)
 +                {
 +                    ir->nstcalcenergy = lcd(ir->nstenergy, ir->nstlist);
 +                }
 +                else
 +                {
 +                    ir->nstcalcenergy = ir->nstenergy;
 +                }
 +            }
 +        }
 +        else if ( (ir->nstenergy > 0 && ir->nstcalcenergy > ir->nstenergy) ||
 +                  (ir->efep != efepNO && ir->fepvals->nstdhdl > 0 &&
 +                   (ir->nstcalcenergy > ir->fepvals->nstdhdl) ) )
 +
 +        {
 +            const char *nsten    = "nstenergy";
 +            const char *nstdh    = "nstdhdl";
 +            const char *min_name = nsten;
 +            int         min_nst  = ir->nstenergy;
 +
 +            /* find the smallest of ( nstenergy, nstdhdl ) */
 +            if (ir->efep != efepNO && ir->fepvals->nstdhdl > 0 &&
 +                (ir->fepvals->nstdhdl < ir->nstenergy) )
 +            {
 +                min_nst  = ir->fepvals->nstdhdl;
 +                min_name = nstdh;
 +            }
 +            /* If the user sets nstenergy small, we should respect that */
 +            sprintf(warn_buf,
 +                    "Setting nstcalcenergy (%d) equal to %s (%d)",
 +                    ir->nstcalcenergy, min_name, min_nst);
 +            warning_note(wi, warn_buf);
 +            ir->nstcalcenergy = min_nst;
 +        }
 +
 +        if (ir->epc != epcNO)
 +        {
 +            if (ir->nstpcouple < 0)
 +            {
 +                ir->nstpcouple = ir_optimal_nstpcouple(ir);
 +            }
 +        }
 +        if (IR_TWINRANGE(*ir))
 +        {
 +            check_nst("nstlist", ir->nstlist,
 +                      "nstcalcenergy", &ir->nstcalcenergy, wi);
 +            if (ir->epc != epcNO)
 +            {
 +                check_nst("nstlist", ir->nstlist,
 +                          "nstpcouple", &ir->nstpcouple, wi);
 +            }
 +        }
 +
 +        if (ir->nstcalcenergy > 0)
 +        {
 +            if (ir->efep != efepNO)
 +            {
 +                /* nstdhdl should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy", ir->nstcalcenergy,
 +                          "nstdhdl", &ir->fepvals->nstdhdl, wi);
 +                /* nstexpanded should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy", ir->nstcalcenergy,
 +                          "nstexpanded", &ir->expandedvals->nstexpanded, wi);
 +            }
 +            /* for storing exact averages nstenergy should be
 +             * a multiple of nstcalcenergy
 +             */
 +            check_nst("nstcalcenergy", ir->nstcalcenergy,
 +                      "nstenergy", &ir->nstenergy, wi);
 +        }
 +    }
 +
 +    /* LD STUFF */
 +    if ((EI_SD(ir->eI) || ir->eI == eiBD) &&
 +        ir->bContinuation && ir->ld_seed != -1)
 +    {
 +        warning_note(wi, "You are doing a continuation with SD or BD, make sure that ld_seed is different from the previous run (using ld_seed=-1 will ensure this)");
 +    }
 +
 +    /* TPI STUFF */
 +    if (EI_TPI(ir->eI))
 +    {
 +        sprintf(err_buf, "TPI only works with pbc = %s", epbc_names[epbcXYZ]);
 +        CHECK(ir->ePBC != epbcXYZ);
 +        sprintf(err_buf, "TPI only works with ns = %s", ens_names[ensGRID]);
 +        CHECK(ir->ns_type != ensGRID);
 +        sprintf(err_buf, "with TPI nstlist should be larger than zero");
 +        CHECK(ir->nstlist <= 0);
 +        sprintf(err_buf, "TPI does not work with full electrostatics other than PME");
 +        CHECK(EEL_FULL(ir->coulombtype) && !EEL_PME(ir->coulombtype));
 +    }
 +
 +    /* SHAKE / LINCS */
 +    if ( (opts->nshake > 0) && (opts->bMorse) )
 +    {
 +        sprintf(warn_buf,
 +                "Using morse bond-potentials while constraining bonds is useless");
 +        warning(wi, warn_buf);
 +    }
 +
 +    /* verify simulated tempering options */
 +
 +    if (ir->bSimTemp)
 +    {
 +        gmx_bool bAllTempZero = TRUE;
 +        for (i = 0; i < fep->n_lambda; i++)
 +        {
 +            sprintf(err_buf, "Entry %d for %s must be between 0 and 1, instead is %g", i, efpt_names[efptTEMPERATURE], fep->all_lambda[efptTEMPERATURE][i]);
 +            CHECK((fep->all_lambda[efptTEMPERATURE][i] < 0) || (fep->all_lambda[efptTEMPERATURE][i] > 1));
 +            if (fep->all_lambda[efptTEMPERATURE][i] > 0)
 +            {
 +                bAllTempZero = FALSE;
 +            }
 +        }
 +        sprintf(err_buf, "if simulated tempering is on, temperature-lambdas may not be all zero");
 +        CHECK(bAllTempZero == TRUE);
 +
 +        sprintf(err_buf, "Simulated tempering is currently only compatible with md-vv");
 +        CHECK(ir->eI != eiVV);
 +
 +        /* check compatibility of the temperature coupling with simulated tempering */
 +
 +        if (ir->etc == etcNOSEHOOVER)
 +        {
 +            sprintf(warn_buf, "Nose-Hoover based temperature control such as [%s] my not be entirelyconsistent with simulated tempering", etcoupl_names[ir->etc]);
 +            warning_note(wi, warn_buf);
 +        }
 +
 +        /* check that the temperatures make sense */
 +
 +        sprintf(err_buf, "Higher simulated tempering temperature (%g) must be >= than the simulated tempering lower temperature (%g)", ir->simtempvals->simtemp_high, ir->simtempvals->simtemp_low);
 +        CHECK(ir->simtempvals->simtemp_high <= ir->simtempvals->simtemp_low);
 +
 +        sprintf(err_buf, "Higher simulated tempering temperature (%g) must be >= zero", ir->simtempvals->simtemp_high);
 +        CHECK(ir->simtempvals->simtemp_high <= 0);
 +
 +        sprintf(err_buf, "Lower simulated tempering temperature (%g) must be >= zero", ir->simtempvals->simtemp_low);
 +        CHECK(ir->simtempvals->simtemp_low <= 0);
 +    }
 +
 +    /* verify free energy options */
 +
 +    if (ir->efep != efepNO)
 +    {
 +        fep = ir->fepvals;
 +        sprintf(err_buf, "The soft-core power is %d and can only be 1 or 2",
 +                fep->sc_power);
 +        CHECK(fep->sc_alpha != 0 && fep->sc_power != 1 && fep->sc_power != 2);
 +
 +        sprintf(err_buf, "The soft-core sc-r-power is %d and can only be 6 or 48",
 +                (int)fep->sc_r_power);
 +        CHECK(fep->sc_alpha != 0 && fep->sc_r_power != 6.0 && fep->sc_r_power != 48.0);
 +
 +        /* check validity of options */
 +        if (fep->n_lambda > 0 && ir->rlist < max(ir->rvdw, ir->rcoulomb))
 +        {
 +            sprintf(warn_buf,
 +                    "For foreign lambda free energy differences it is assumed that the soft-core interactions have no effect beyond the neighborlist cut-off");
 +            warning(wi, warn_buf);
 +        }
 +
 +        sprintf(err_buf, "Can't use postive delta-lambda (%g) if initial state/lambda does not start at zero", fep->delta_lambda);
 +        CHECK(fep->delta_lambda > 0 && ((fep->init_fep_state > 0) ||  (fep->init_lambda > 0)));
 +
 +        sprintf(err_buf, "Can't use postive delta-lambda (%g) with expanded ensemble simulations", fep->delta_lambda);
 +        CHECK(fep->delta_lambda > 0 && (ir->efep == efepEXPANDED));
 +
 +        sprintf(err_buf, "Free-energy not implemented for Ewald");
 +        CHECK(ir->coulombtype == eelEWALD);
 +
 +        /* check validity of lambda inputs */
 +        if (fep->n_lambda == 0)
 +        {
 +            /* Clear output in case of no states:*/
 +            sprintf(err_buf, "init-lambda-state set to %d: no lambda states are defined.", fep->init_fep_state);
 +            CHECK((fep->init_fep_state >= 0) && (fep->n_lambda == 0));
 +        }
 +        else
 +        {
 +            sprintf(err_buf, "initial thermodynamic state %d does not exist, only goes to %d", fep->init_fep_state, fep->n_lambda-1);
 +            CHECK((fep->init_fep_state >= fep->n_lambda));
 +        }
 +
 +        sprintf(err_buf, "Lambda state must be set, either with init-lambda-state or with init-lambda");
 +        CHECK((fep->init_fep_state < 0) && (fep->init_lambda < 0));
 +
 +        sprintf(err_buf, "init-lambda=%g while init-lambda-state=%d. Lambda state must be set either with init-lambda-state or with init-lambda, but not both",
 +                fep->init_lambda, fep->init_fep_state);
 +        CHECK((fep->init_fep_state >= 0) && (fep->init_lambda >= 0));
 +
 +
 +
 +        if ((fep->init_lambda >= 0) && (fep->delta_lambda == 0))
 +        {
 +            int n_lambda_terms;
 +            n_lambda_terms = 0;
 +            for (i = 0; i < efptNR; i++)
 +            {
 +                if (fep->separate_dvdl[i])
 +                {
 +                    n_lambda_terms++;
 +                }
 +            }
 +            if (n_lambda_terms > 1)
 +            {
 +                sprintf(warn_buf, "If lambda vector states (fep-lambdas, coul-lambdas etc.) are set, don't use init-lambda to set lambda state (except for slow growth). Use init-lambda-state instead.");
 +                warning(wi, warn_buf);
 +            }
 +
 +            if (n_lambda_terms < 2 && fep->n_lambda > 0)
 +            {
 +                warning_note(wi,
 +                             "init-lambda is deprecated for setting lambda state (except for slow growth). Use init-lambda-state instead.");
 +            }
 +        }
 +
 +        for (j = 0; j < efptNR; j++)
 +        {
 +            for (i = 0; i < fep->n_lambda; i++)
 +            {
 +                sprintf(err_buf, "Entry %d for %s must be between 0 and 1, instead is %g", i, efpt_names[j], fep->all_lambda[j][i]);
 +                CHECK((fep->all_lambda[j][i] < 0) || (fep->all_lambda[j][i] > 1));
 +            }
 +        }
 +
 +        if ((fep->sc_alpha > 0) && (!fep->bScCoul))
 +        {
 +            for (i = 0; i < fep->n_lambda; i++)
 +            {
 +                sprintf(err_buf, "For state %d, vdw-lambdas (%f) is changing with vdw softcore, while coul-lambdas (%f) is nonzero without coulomb softcore: this will lead to crashes, and is not supported.", i, fep->all_lambda[efptVDW][i],
 +                        fep->all_lambda[efptCOUL][i]);
 +                CHECK((fep->sc_alpha > 0) &&
 +                      (((fep->all_lambda[efptCOUL][i] > 0.0) &&
 +                        (fep->all_lambda[efptCOUL][i] < 1.0)) &&
 +                       ((fep->all_lambda[efptVDW][i] > 0.0) &&
 +                        (fep->all_lambda[efptVDW][i] < 1.0))));
 +            }
 +        }
 +
 +        if ((fep->bScCoul) && (EEL_PME(ir->coulombtype)))
 +        {
 +            sprintf(warn_buf, "With coulomb soft core, the reciprocal space calculation will not necessarily cancel.  It may be necessary to decrease the reciprocal space energy, and increase the cutoff radius to get sufficiently close matches to energies with free energy turned off.");
 +            warning(wi, warn_buf);
 +        }
 +
 +        /*  Free Energy Checks -- In an ideal world, slow growth and FEP would
 +            be treated differently, but that's the next step */
 +
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            for (j = 0; j < fep->n_lambda; j++)
 +            {
 +                sprintf(err_buf, "%s[%d] must be between 0 and 1", efpt_names[i], j);
 +                CHECK((fep->all_lambda[i][j] < 0) || (fep->all_lambda[i][j] > 1));
 +            }
 +        }
 +    }
 +
 +    if ((ir->bSimTemp) || (ir->efep == efepEXPANDED))
 +    {
 +        fep    = ir->fepvals;
 +        expand = ir->expandedvals;
 +
 +        /* checking equilibration of weights inputs for validity */
 +
 +        sprintf(err_buf, "weight-equil-number-all-lambda (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_n_at_lam, elmceq_names[elmceqNUMATLAM]);
 +        CHECK((expand->equil_n_at_lam > 0) && (expand->elmceq != elmceqNUMATLAM));
 +
 +        sprintf(err_buf, "weight-equil-number-samples (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_samples, elmceq_names[elmceqSAMPLES]);
 +        CHECK((expand->equil_samples > 0) && (expand->elmceq != elmceqSAMPLES));
 +
 +        sprintf(err_buf, "weight-equil-number-steps (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_steps, elmceq_names[elmceqSTEPS]);
 +        CHECK((expand->equil_steps > 0) && (expand->elmceq != elmceqSTEPS));
 +
 +        sprintf(err_buf, "weight-equil-wl-delta (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_samples, elmceq_names[elmceqWLDELTA]);
 +        CHECK((expand->equil_wl_delta > 0) && (expand->elmceq != elmceqWLDELTA));
 +
 +        sprintf(err_buf, "weight-equil-count-ratio (%f) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_ratio, elmceq_names[elmceqRATIO]);
 +        CHECK((expand->equil_ratio > 0) && (expand->elmceq != elmceqRATIO));
 +
 +        sprintf(err_buf, "weight-equil-number-all-lambda (%d) must be a positive integer if lmc-weights-equil=%s",
 +                expand->equil_n_at_lam, elmceq_names[elmceqNUMATLAM]);
 +        CHECK((expand->equil_n_at_lam <= 0) && (expand->elmceq == elmceqNUMATLAM));
 +
 +        sprintf(err_buf, "weight-equil-number-samples (%d) must be a positive integer if lmc-weights-equil=%s",
 +                expand->equil_samples, elmceq_names[elmceqSAMPLES]);
 +        CHECK((expand->equil_samples <= 0) && (expand->elmceq == elmceqSAMPLES));
 +
 +        sprintf(err_buf, "weight-equil-number-steps (%d) must be a positive integer if lmc-weights-equil=%s",
 +                expand->equil_steps, elmceq_names[elmceqSTEPS]);
 +        CHECK((expand->equil_steps <= 0) && (expand->elmceq == elmceqSTEPS));
 +
 +        sprintf(err_buf, "weight-equil-wl-delta (%f) must be > 0 if lmc-weights-equil=%s",
 +                expand->equil_wl_delta, elmceq_names[elmceqWLDELTA]);
 +        CHECK((expand->equil_wl_delta <= 0) && (expand->elmceq == elmceqWLDELTA));
 +
 +        sprintf(err_buf, "weight-equil-count-ratio (%f) must be > 0 if lmc-weights-equil=%s",
 +                expand->equil_ratio, elmceq_names[elmceqRATIO]);
 +        CHECK((expand->equil_ratio <= 0) && (expand->elmceq == elmceqRATIO));
 +
 +        sprintf(err_buf, "lmc-weights-equil=%s only possible when lmc-stats = %s or lmc-stats %s",
 +                elmceq_names[elmceqWLDELTA], elamstats_names[elamstatsWL], elamstats_names[elamstatsWWL]);
 +        CHECK((expand->elmceq == elmceqWLDELTA) && (!EWL(expand->elamstats)));
 +
 +        sprintf(err_buf, "lmc-repeats (%d) must be greater than 0", expand->lmc_repeats);
 +        CHECK((expand->lmc_repeats <= 0));
 +        sprintf(err_buf, "minimum-var-min (%d) must be greater than 0", expand->minvarmin);
 +        CHECK((expand->minvarmin <= 0));
 +        sprintf(err_buf, "weight-c-range (%d) must be greater or equal to 0", expand->c_range);
 +        CHECK((expand->c_range < 0));
 +        sprintf(err_buf, "init-lambda-state (%d) must be zero if lmc-forced-nstart (%d)> 0 and lmc-move != 'no'",
 +                fep->init_fep_state, expand->lmc_forced_nstart);
 +        CHECK((fep->init_fep_state != 0) && (expand->lmc_forced_nstart > 0) && (expand->elmcmove != elmcmoveNO));
 +        sprintf(err_buf, "lmc-forced-nstart (%d) must not be negative", expand->lmc_forced_nstart);
 +        CHECK((expand->lmc_forced_nstart < 0));
 +        sprintf(err_buf, "init-lambda-state (%d) must be in the interval [0,number of lambdas)", fep->init_fep_state);
 +        CHECK((fep->init_fep_state < 0) || (fep->init_fep_state >= fep->n_lambda));
 +
 +        sprintf(err_buf, "init-wl-delta (%f) must be greater than or equal to 0", expand->init_wl_delta);
 +        CHECK((expand->init_wl_delta < 0));
 +        sprintf(err_buf, "wl-ratio (%f) must be between 0 and 1", expand->wl_ratio);
 +        CHECK((expand->wl_ratio <= 0) || (expand->wl_ratio >= 1));
 +        sprintf(err_buf, "wl-scale (%f) must be between 0 and 1", expand->wl_scale);
 +        CHECK((expand->wl_scale <= 0) || (expand->wl_scale >= 1));
 +
 +        /* if there is no temperature control, we need to specify an MC temperature */
 +        sprintf(err_buf, "If there is no temperature control, and lmc-mcmove!= 'no',mc_temperature must be set to a positive number");
 +        if (expand->nstTij > 0)
 +        {
 +            sprintf(err_buf, "nst-transition-matrix (%d) must be an integer multiple of nstlog (%d)",
 +                    expand->nstTij, ir->nstlog);
 +            CHECK((mod(expand->nstTij, ir->nstlog) != 0));
 +        }
 +    }
 +
 +    /* PBC/WALLS */
 +    sprintf(err_buf, "walls only work with pbc=%s", epbc_names[epbcXY]);
 +    CHECK(ir->nwall && ir->ePBC != epbcXY);
 +
 +    /* VACUUM STUFF */
 +    if (ir->ePBC != epbcXYZ && ir->nwall != 2)
 +    {
 +        if (ir->ePBC == epbcNONE)
 +        {
 +            if (ir->epc != epcNO)
 +            {
 +                warning(wi, "Turning off pressure coupling for vacuum system");
 +                ir->epc = epcNO;
 +            }
 +        }
 +        else
 +        {
 +            sprintf(err_buf, "Can not have pressure coupling with pbc=%s",
 +                    epbc_names[ir->ePBC]);
 +            CHECK(ir->epc != epcNO);
 +        }
 +        sprintf(err_buf, "Can not have Ewald with pbc=%s", epbc_names[ir->ePBC]);
 +        CHECK(EEL_FULL(ir->coulombtype));
 +
 +        sprintf(err_buf, "Can not have dispersion correction with pbc=%s",
 +                epbc_names[ir->ePBC]);
 +        CHECK(ir->eDispCorr != edispcNO);
 +    }
 +
 +    if (ir->rlist == 0.0)
 +    {
 +        sprintf(err_buf, "can only have neighborlist cut-off zero (=infinite)\n"
 +                "with coulombtype = %s or coulombtype = %s\n"
 +                "without periodic boundary conditions (pbc = %s) and\n"
 +                "rcoulomb and rvdw set to zero",
 +                eel_names[eelCUT], eel_names[eelUSER], epbc_names[epbcNONE]);
 +        CHECK(((ir->coulombtype != eelCUT) && (ir->coulombtype != eelUSER)) ||
 +              (ir->ePBC     != epbcNONE) ||
 +              (ir->rcoulomb != 0.0)      || (ir->rvdw != 0.0));
 +
 +        if (ir->nstlist < 0)
 +        {
 +            warning_error(wi, "Can not have heuristic neighborlist updates without cut-off");
 +        }
 +        if (ir->nstlist > 0)
 +        {
 +            warning_note(wi, "Simulating without cut-offs is usually (slightly) faster with nstlist=0, nstype=simple and particle decomposition");
 +        }
 +    }
 +
 +    /* COMM STUFF */
 +    if (ir->nstcomm == 0)
 +    {
 +        ir->comm_mode = ecmNO;
 +    }
 +    if (ir->comm_mode != ecmNO)
 +    {
 +        if (ir->nstcomm < 0)
 +        {
 +            warning(wi, "If you want to remove the rotation around the center of mass, you should set comm_mode = Angular instead of setting nstcomm < 0. nstcomm is modified to its absolute value");
 +            ir->nstcomm = abs(ir->nstcomm);
 +        }
 +
 +        if (ir->nstcalcenergy > 0 && ir->nstcomm < ir->nstcalcenergy)
 +        {
 +            warning_note(wi, "nstcomm < nstcalcenergy defeats the purpose of nstcalcenergy, setting nstcomm to nstcalcenergy");
 +            ir->nstcomm = ir->nstcalcenergy;
 +        }
 +
 +        if (ir->comm_mode == ecmANGULAR)
 +        {
 +            sprintf(err_buf, "Can not remove the rotation around the center of mass with periodic molecules");
 +            CHECK(ir->bPeriodicMols);
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                warning(wi, "Removing the rotation around the center of mass in a periodic system (this is not a problem when you have only one molecule).");
 +            }
 +        }
 +    }
 +
 +    if (EI_STATE_VELOCITY(ir->eI) && ir->ePBC == epbcNONE && ir->comm_mode != ecmANGULAR)
 +    {
 +        warning_note(wi, "Tumbling and or flying ice-cubes: We are not removing rotation around center of mass in a non-periodic system. You should probably set comm_mode = ANGULAR.");
 +    }
 +
 +    sprintf(err_buf, "Twin-range neighbour searching (NS) with simple NS"
 +            " algorithm not implemented");
 +    CHECK(((ir->rcoulomb > ir->rlist) || (ir->rvdw > ir->rlist))
 +          && (ir->ns_type == ensSIMPLE));
 +
 +    /* TEMPERATURE COUPLING */
 +    if (ir->etc == etcYES)
 +    {
 +        ir->etc = etcBERENDSEN;
 +        warning_note(wi, "Old option for temperature coupling given: "
 +                     "changing \"yes\" to \"Berendsen\"\n");
 +    }
 +
 +    if ((ir->etc == etcNOSEHOOVER) || (ir->epc == epcMTTK))
 +    {
 +        if (ir->opts.nhchainlength < 1)
 +        {
 +            sprintf(warn_buf, "number of Nose-Hoover chains (currently %d) cannot be less than 1,reset to 1\n", ir->opts.nhchainlength);
 +            ir->opts.nhchainlength = 1;
 +            warning(wi, warn_buf);
 +        }
 +
 +        if (ir->etc == etcNOSEHOOVER && !EI_VV(ir->eI) && ir->opts.nhchainlength > 1)
 +        {
 +            warning_note(wi, "leapfrog does not yet support Nose-Hoover chains, nhchainlength reset to 1");
 +            ir->opts.nhchainlength = 1;
 +        }
 +    }
 +    else
 +    {
 +        ir->opts.nhchainlength = 0;
 +    }
 +
 +    if (ir->eI == eiVVAK)
 +    {
 +        sprintf(err_buf, "%s implemented primarily for validation, and requires nsttcouple = 1 and nstpcouple = 1.",
 +                ei_names[eiVVAK]);
 +        CHECK((ir->nsttcouple != 1) || (ir->nstpcouple != 1));
 +    }
 +
 +    if (ETC_ANDERSEN(ir->etc))
 +    {
 +        sprintf(err_buf, "%s temperature control not supported for integrator %s.", etcoupl_names[ir->etc], ei_names[ir->eI]);
 +        CHECK(!(EI_VV(ir->eI)));
 +
 +        for (i = 0; i < ir->opts.ngtc; i++)
 +        {
 +            sprintf(err_buf, "all tau_t must currently be equal using Andersen temperature control, violated for group %d", i);
 +            CHECK(ir->opts.tau_t[0] != ir->opts.tau_t[i]);
 +            sprintf(err_buf, "all tau_t must be postive using Andersen temperature control, tau_t[%d]=%10.6f",
 +                    i, ir->opts.tau_t[i]);
 +            CHECK(ir->opts.tau_t[i] < 0);
 +        }
 +        if (ir->nstcomm > 0 && (ir->etc == etcANDERSEN))
 +        {
 +            sprintf(warn_buf, "Center of mass removal not necessary for %s.  All velocities of coupled groups are rerandomized periodically, so flying ice cube errors will not occur.", etcoupl_names[ir->etc]);
 +            warning_note(wi, warn_buf);
 +        }
 +
 +        sprintf(err_buf, "nstcomm must be 1, not %d for %s, as velocities of atoms in coupled groups are randomized every time step", ir->nstcomm, etcoupl_names[ir->etc]);
 +        CHECK(ir->nstcomm > 1 && (ir->etc == etcANDERSEN));
 +
 +        for (i = 0; i < ir->opts.ngtc; i++)
 +        {
 +            int nsteps = (int)(ir->opts.tau_t[i]/ir->delta_t);
 +            sprintf(err_buf, "tau_t/delta_t for group %d for temperature control method %s must be a multiple of nstcomm (%d), as velocities of atoms in coupled groups are randomized every time step. The input tau_t (%8.3f) leads to %d steps per randomization", i, etcoupl_names[ir->etc], ir->nstcomm, ir->opts.tau_t[i], nsteps);
 +            CHECK((nsteps % ir->nstcomm) && (ir->etc == etcANDERSENMASSIVE));
 +        }
 +    }
 +    if (ir->etc == etcBERENDSEN)
 +    {
 +        sprintf(warn_buf, "The %s thermostat does not generate the correct kinetic energy distribution. You might want to consider using the %s thermostat.",
 +                ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
 +        warning_note(wi, warn_buf);
 +    }
 +
 +    if ((ir->etc == etcNOSEHOOVER || ETC_ANDERSEN(ir->etc))
 +        && ir->epc == epcBERENDSEN)
 +    {
 +        sprintf(warn_buf, "Using Berendsen pressure coupling invalidates the "
 +                "true ensemble for the thermostat");
 +        warning(wi, warn_buf);
 +    }
 +
 +    /* PRESSURE COUPLING */
 +    if (ir->epc == epcISOTROPIC)
 +    {
 +        ir->epc = epcBERENDSEN;
 +        warning_note(wi, "Old option for pressure coupling given: "
 +                     "changing \"Isotropic\" to \"Berendsen\"\n");
 +    }
 +
 +    if (ir->epc != epcNO)
 +    {
 +        dt_pcoupl = ir->nstpcouple*ir->delta_t;
 +
 +        sprintf(err_buf, "tau-p must be > 0 instead of %g\n", ir->tau_p);
 +        CHECK(ir->tau_p <= 0);
 +
 +        if (ir->tau_p/dt_pcoupl < pcouple_min_integration_steps(ir->epc))
 +        {
 +            sprintf(warn_buf, "For proper integration of the %s barostat, tau-p (%g) should be at least %d times larger than nstpcouple*dt (%g)",
 +                    EPCOUPLTYPE(ir->epc), ir->tau_p, pcouple_min_integration_steps(ir->epc), dt_pcoupl);
 +            warning(wi, warn_buf);
 +        }
 +
 +        sprintf(err_buf, "compressibility must be > 0 when using pressure"
 +                " coupling %s\n", EPCOUPLTYPE(ir->epc));
 +        CHECK(ir->compress[XX][XX] < 0 || ir->compress[YY][YY] < 0 ||
 +              ir->compress[ZZ][ZZ] < 0 ||
 +              (trace(ir->compress) == 0 && ir->compress[YY][XX] <= 0 &&
 +               ir->compress[ZZ][XX] <= 0 && ir->compress[ZZ][YY] <= 0));
 +
 +        if (epcPARRINELLORAHMAN == ir->epc && opts->bGenVel)
 +        {
 +            sprintf(warn_buf,
 +                    "You are generating velocities so I am assuming you "
 +                    "are equilibrating a system. You are using "
 +                    "%s pressure coupling, but this can be "
 +                    "unstable for equilibration. If your system crashes, try "
 +                    "equilibrating first with Berendsen pressure coupling. If "
 +                    "you are not equilibrating the system, you can probably "
 +                    "ignore this warning.",
 +                    epcoupl_names[ir->epc]);
 +            warning(wi, warn_buf);
 +        }
 +    }
 +
 +    if (EI_VV(ir->eI))
 +    {
 +        if (ir->epc > epcNO)
 +        {
 +            if ((ir->epc != epcBERENDSEN) && (ir->epc != epcMTTK))
 +            {
 +                warning_error(wi, "for md-vv and md-vv-avek, can only use Berendsen and Martyna-Tuckerman-Tobias-Klein (MTTK) equations for pressure control; MTTK is equivalent to Parrinello-Rahman.");
 +            }
 +        }
 +    }
 +
 +    /* ELECTROSTATICS */
 +    /* More checks are in triple check (grompp.c) */
 +
 +    if (ir->coulombtype == eelSWITCH)
 +    {
 +        sprintf(warn_buf, "coulombtype = %s is only for testing purposes and can lead to serious "
 +                "artifacts, advice: use coulombtype = %s",
 +                eel_names[ir->coulombtype],
 +                eel_names[eelRF_ZERO]);
 +        warning(wi, warn_buf);
 +    }
 +
 +    if (ir->epsilon_r != 1 && ir->implicit_solvent == eisGBSA)
 +    {
 +        sprintf(warn_buf, "epsilon-r = %g with GB implicit solvent, will use this value for inner dielectric", ir->epsilon_r);
 +        warning_note(wi, warn_buf);
 +    }
 +
 +    if (EEL_RF(ir->coulombtype) && ir->epsilon_rf == 1 && ir->epsilon_r != 1)
 +    {
 +        sprintf(warn_buf, "epsilon-r = %g and epsilon-rf = 1 with reaction field, proceeding assuming old format and exchanging epsilon-r and epsilon-rf", ir->epsilon_r);
 +        warning(wi, warn_buf);
 +        ir->epsilon_rf = ir->epsilon_r;
 +        ir->epsilon_r  = 1.0;
 +    }
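 +    /* Example: an old-format input with epsilon-r = 78 and epsilon-rf = 1 is
 +       reinterpreted by the block above as epsilon-r = 1, epsilon-rf = 78. */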
 +
 +    if (getenv("GALACTIC_DYNAMICS") == NULL)
 +    {
 +        sprintf(err_buf, "epsilon-r must be >= 0 instead of %g\n", ir->epsilon_r);
 +        CHECK(ir->epsilon_r < 0);
 +    }
 +
 +    if (EEL_RF(ir->coulombtype))
 +    {
 +        /* reaction field (at the cut-off) */
 +
 +        if (ir->coulombtype == eelRF_ZERO)
 +        {
 +            sprintf(warn_buf, "With coulombtype = %s, epsilon-rf must be 0, assuming you meant epsilon_rf=0",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->epsilon_rf != 0);
 +            ir->epsilon_rf = 0.0;
 +        }
 +
 +        sprintf(err_buf, "epsilon-rf must be >= epsilon-r");
 +        CHECK((ir->epsilon_rf < ir->epsilon_r && ir->epsilon_rf != 0) ||
 +              (ir->epsilon_r == 0));
 +        if (ir->epsilon_rf == ir->epsilon_r)
 +        {
 +            sprintf(warn_buf, "Using epsilon-rf = epsilon-r with %s does not make sense",
 +                    eel_names[ir->coulombtype]);
 +            warning(wi, warn_buf);
 +        }
 +    }
 +    /* Allow rlist>rcoulomb for tabulated long range stuff. This just
 +     * means the interaction is zero outside rcoulomb, but it helps to
 +     * provide accurate energy conservation.
 +     */
 +    if (EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype))
 +    {
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            sprintf(err_buf,
 +                    "With coulombtype = %s rcoulomb_switch must be < rcoulomb. Or, better: Use the potential modifier options!",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->rcoulomb_switch >= ir->rcoulomb);
 +        }
 +    }
 +    else if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype))
 +    {
 +        if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            sprintf(err_buf, "With coulombtype = %s, rcoulomb should be >= rlist unless you use a potential modifier",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->rlist > ir->rcoulomb);
 +        }
 +    }
 +
 +    if (ir->coulombtype == eelSWITCH || ir->coulombtype == eelSHIFT ||
 +        ir->vdwtype == evdwSWITCH || ir->vdwtype == evdwSHIFT)
 +    {
 +        sprintf(warn_buf,
++                "The switch/shift interaction settings are just for compatibility; you will get better "
 +                "performance from applying potential modifiers to your interactions!\n");
 +        warning_note(wi, warn_buf);
 +    }
 +
 +    if (EEL_FULL(ir->coulombtype))
 +    {
 +        if (ir->coulombtype == eelPMESWITCH || ir->coulombtype == eelPMEUSER ||
 +            ir->coulombtype == eelPMEUSERSWITCH)
 +        {
 +            sprintf(err_buf, "With coulombtype = %s, rcoulomb must be <= rlist",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->rcoulomb > ir->rlist);
 +        }
 +        else if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            if (ir->coulombtype == eelPME || ir->coulombtype == eelP3M_AD)
 +            {
 +                sprintf(err_buf,
 +                        "With coulombtype = %s (without modifier), rcoulomb must be equal to rlist,\n"
 +                        "or rlistlong if nstcalclr=1. For optimal energy conservation,consider using\n"
 +                        "a potential modifier.", eel_names[ir->coulombtype]);
 +                if (ir->nstcalclr == 1)
 +                {
 +                    CHECK(ir->rcoulomb != ir->rlist && ir->rcoulomb != ir->rlistlong);
 +                }
 +                else
 +                {
 +                    CHECK(ir->rcoulomb != ir->rlist);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        if (ir->pme_order < 3)
 +        {
 +            warning_error(wi, "pme-order can not be smaller than 3");
 +        }
 +    }
 +
 +    if (ir->nwall == 2 && EEL_FULL(ir->coulombtype))
 +    {
 +        if (ir->ewald_geometry == eewg3D)
 +        {
 +            sprintf(warn_buf, "With pbc=%s you should use ewald-geometry=%s",
 +                    epbc_names[ir->ePBC], eewg_names[eewg3DC]);
 +            warning(wi, warn_buf);
 +        }
 +        /* This check avoids extra pbc coding for exclusion corrections */
 +        sprintf(err_buf, "wall-ewald-zfac should be >= 2");
 +        CHECK(ir->wall_ewald_zfac < 2);
 +    }
 +
 +    if (EVDW_SWITCHED(ir->vdwtype))
 +    {
 +        sprintf(err_buf, "With vdwtype = %s rvdw-switch must be < rvdw. Or, better - use a potential modifier.",
 +                evdw_names[ir->vdwtype]);
 +        CHECK(ir->rvdw_switch >= ir->rvdw);
 +    }
 +    else if (ir->vdwtype == evdwCUT)
 +    {
 +        if (ir->cutoff_scheme == ecutsGROUP && ir->vdw_modifier == eintmodNONE)
 +        {
 +            sprintf(err_buf, "With vdwtype = %s, rvdw must be >= rlist unless you use a potential modifier", evdw_names[ir->vdwtype]);
 +            CHECK(ir->rlist > ir->rvdw);
 +        }
 +    }
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype)
 +            && (ir->rlistlong <= ir->rcoulomb))
 +        {
 +            sprintf(warn_buf, "For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rcoulomb.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi, warn_buf);
 +        }
 +        if (EVDW_SWITCHED(ir->vdwtype) && (ir->rlistlong <= ir->rvdw))
 +        {
 +            sprintf(warn_buf, "For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rvdw.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi, warn_buf);
 +        }
 +    }
 +
 +    if (ir->vdwtype == evdwUSER && ir->eDispCorr != edispcNO)
 +    {
 +        warning_note(wi, "You have selected user tables with dispersion correction, the dispersion will be corrected to -C6/r^6 beyond rvdw_switch (the tabulated interaction between rvdw_switch and rvdw will not be double counted). Make sure that you really want dispersion correction to -C6/r^6.");
 +    }
 +
 +    if (ir->nstlist == -1)
 +    {
 +        sprintf(err_buf, "With nstlist=-1 rvdw and rcoulomb should be smaller than rlist to account for diffusion and possibly charge-group radii");
 +        CHECK(ir->rvdw >= ir->rlist || ir->rcoulomb >= ir->rlist);
 +    }
 +    sprintf(err_buf, "nstlist can not be smaller than -1");
 +    CHECK(ir->nstlist < -1);
 +
 +    if (ir->eI == eiLBFGS && (ir->coulombtype == eelCUT || ir->vdwtype == evdwCUT)
 +        && ir->rvdw != 0)
 +    {
 +        warning(wi, "For efficient BFGS minimization, use switch/shift/pme instead of cut-off.");
 +    }
 +
 +    if (ir->eI == eiLBFGS && ir->nbfgscorr <= 0)
 +    {
 +        warning(wi, "Using L-BFGS with nbfgscorr<=0 just gets you steepest descent.");
 +    }
 +
 +    /* ENERGY CONSERVATION */
 +    if (ir_NVE(ir) && ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        if (!EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > 0 && ir->vdw_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf, "You are using a cut-off for VdW interactions with NVE, for good energy conservation use vdwtype = %s (possibly with DispCorr)",
 +                    evdw_names[evdwSHIFT]);
 +            warning_note(wi, warn_buf);
 +        }
 +        if (!EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > 0 && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf, "You are using a cut-off for electrostatics with NVE, for good energy conservation use coulombtype = %s or %s",
 +                    eel_names[eelPMESWITCH], eel_names[eelRF_ZERO]);
 +            warning_note(wi, warn_buf);
 +        }
 +    }
 +
 +    /* IMPLICIT SOLVENT */
 +    if (ir->coulombtype == eelGB_NOTUSED)
 +    {
 +        ir->coulombtype      = eelCUT;
 +        ir->implicit_solvent = eisGBSA;
 +        fprintf(stderr, "Note: Old option for generalized born electrostatics given:\n"
 +                "Changing coulombtype from \"generalized-born\" to \"cut-off\" and instead\n"
 +                "setting implicit-solvent value to \"GBSA\" in input section.\n");
 +    }
 +
 +    if (ir->sa_algorithm == esaSTILL)
 +    {
 +        sprintf(err_buf, "Still SA algorithm not available yet, use %s or %s instead\n", esa_names[esaAPPROX], esa_names[esaNO]);
 +        CHECK(ir->sa_algorithm == esaSTILL);
 +    }
 +
 +    if (ir->implicit_solvent == eisGBSA)
 +    {
 +        sprintf(err_buf, "With GBSA implicit solvent, rgbradii must be equal to rlist.");
 +        CHECK(ir->rgbradii != ir->rlist);
 +
 +        if (ir->coulombtype != eelCUT)
 +        {
 +            sprintf(err_buf, "With GBSA, coulombtype must be equal to %s\n", eel_names[eelCUT]);
 +            CHECK(ir->coulombtype != eelCUT);
 +        }
 +        if (ir->vdwtype != evdwCUT)
 +        {
 +            sprintf(err_buf, "With GBSA, vdw-type must be equal to %s\n", evdw_names[evdwCUT]);
 +            CHECK(ir->vdwtype != evdwCUT);
 +        }
 +        if (ir->nstgbradii < 1)
 +        {
 +            sprintf(warn_buf, "Using GBSA with nstgbradii<1, setting nstgbradii=1");
 +            warning_note(wi, warn_buf);
 +            ir->nstgbradii = 1;
 +        }
 +        if (ir->sa_algorithm == esaNO)
 +        {
 +            sprintf(warn_buf, "No SA (non-polar) calculation requested together with GB. Are you sure this is what you want?\n");
 +            warning_note(wi, warn_buf);
 +        }
 +        if (ir->sa_surface_tension < 0 && ir->sa_algorithm != esaNO)
 +        {
 +            sprintf(warn_buf, "Value of sa_surface_tension is < 0. Changing it to 2.05016 or 2.25936 kJ/nm^2/mol for Still and HCT/OBC respectively\n");
 +            warning_note(wi, warn_buf);
 +
 +            if (ir->gb_algorithm == egbSTILL)
 +            {
 +                ir->sa_surface_tension = 0.0049 * CAL2JOULE * 100;
 +            }
 +            else
 +            {
 +                ir->sa_surface_tension = 0.0054 * CAL2JOULE * 100;
 +            }
 +        }
 +        if (ir->sa_surface_tension == 0 && ir->sa_algorithm != esaNO)
 +        {
 +            sprintf(err_buf, "Surface tension set to 0 while SA-calculation requested\n");
 +            CHECK(ir->sa_surface_tension == 0 && ir->sa_algorithm != esaNO);
 +        }
 +
 +    }
 +
 +    if (ir->bAdress)
 +    {
 +        if (ir->cutoff_scheme != ecutsGROUP)
 +        {
 +            warning_error(wi, "AdresS simulation supports only cutoff-scheme=group");
 +        }
 +        if (!EI_SD(ir->eI))
 +        {
 +            warning_error(wi, "AdresS simulation supports only stochastic dynamics");
 +        }
 +        if (ir->epc != epcNO)
 +        {
 +            warning_error(wi, "AdresS simulation does not support pressure coupling");
 +        }
 +        if (EEL_FULL(ir->coulombtype))
 +        {
 +            warning_error(wi, "AdresS simulation does not support long-range electrostatics");
 +        }
 +    }
 +}
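 +
 +/* A note on the idiom used throughout the checks above: err_buf is formatted
 +   first, then the CHECK(b) macro (defined earlier in this file) reports it as
 +   a grompp error when b is true.  A minimal sketch of the pattern:
 +
 +       sprintf(err_buf, "tau-p must be > 0 instead of %g", ir->tau_p);
 +       CHECK(ir->tau_p <= 0);
 +*/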
 +
 +/* count the number of text elements separated by whitespace in a string.
 +    str = the input string
 +    maxptr = the maximum number of allowed elements
 +    ptr = the output array of pointers to the first character of each element,
 +          or NULL to only count the elements
 +    returns: the number of elements. */
 +int str_nelem(const char *str, int maxptr, char *ptr[])
 +{
 +    int   np = 0;
 +    char *copy0, *copy;
 +
 +    copy0 = strdup(str);
 +    copy  = copy0;
 +    ltrim(copy);
 +    while (*copy != '\0')
 +    {
 +        if (np >= maxptr)
 +        {
 +            gmx_fatal(FARGS, "Too many groups on line: '%s' (max is %d)",
 +                      str, maxptr);
 +        }
 +        if (ptr)
 +        {
 +            ptr[np] = copy;
 +        }
 +        np++;
 +        while ((*copy != '\0') && !isspace(*copy))
 +        {
 +            copy++;
 +        }
 +        if (*copy != '\0')
 +        {
 +            *copy = '\0';
 +            copy++;
 +        }
 +        ltrim(copy);
 +    }
 +    if (ptr == NULL)
 +    {
 +        sfree(copy0);
 +    }
 +
 +    return np;
 +}
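 +
 +/* Illustrative example (hypothetical input): with str = " SOL NA CL " and
 +   maxptr >= 3, str_nelem() returns 3 and ptr[0..2] point at "SOL", "NA" and
 +   "CL" inside an internal copy of str; that copy is only freed in the
 +   count-only case (ptr == NULL), otherwise it stays alive to back the
 +   returned substrings. */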
 +
 +/* interpret a number of doubles from a string and put them in an array,
 +   after allocating space for them.
 +   str = the input string
 +   n = output: the number of doubles read
 +   r = output: the newly allocated array of doubles (freed by the caller). */
 +static void parse_n_real(char *str, int *n, real **r)
 +{
 +    char *ptr[MAXPTR];
 +    int   i;
 +
 +    *n = str_nelem(str, MAXPTR, ptr);
 +
 +    snew(*r, *n);
 +    for (i = 0; i < *n; i++)
 +    {
 +        (*r)[i] = strtod(ptr[i], NULL);
 +    }
 +}
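 +
 +/* Illustrative example (hypothetical input): parse_n_real("0.0 0.5 1.0", &n, &v)
 +   sets n = 3 and allocates v = {0.0, 0.5, 1.0}; freeing v is the caller's job. */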
 +
 +static void do_fep_params(t_inputrec *ir, char fep_lambda[][STRLEN], char weights[STRLEN])
 +{
 +
 +    int         i, j, max_n_lambda, nweights, nfep[efptNR];
 +    t_lambda   *fep    = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +    real      **count_fep_lambdas;
 +    gmx_bool    bOneLambda = TRUE;
 +
 +    snew(count_fep_lambdas, efptNR);
 +
 +    /* FEP input processing */
 +    /* first, identify the number of lambda values for each type.
 +       All that are nonzero must have the same number */
 +
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        parse_n_real(fep_lambda[i], &(nfep[i]), &(count_fep_lambdas[i]));
 +    }
 +
 +    /* now, determine the number of components.  All must be either zero, or equal. */
 +
 +    max_n_lambda = 0;
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        if (nfep[i] > max_n_lambda)
 +        {
 +            max_n_lambda = nfep[i];  /* here's a nonzero one.  All of them
 +                                        must have the same number if it's not zero. */
 +            break;
 +        }
 +    }
 +
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        if (nfep[i] == 0)
 +        {
 +            ir->fepvals->separate_dvdl[i] = FALSE;
 +        }
 +        else if (nfep[i] == max_n_lambda)
 +        {
 +            if (i != efptTEMPERATURE)  /* we treat this differently -- not really a reason to compute the derivative with
 +                                          respect to the temperature currently */
 +            {
 +                ir->fepvals->separate_dvdl[i] = TRUE;
 +            }
 +        }
 +        else
 +        {
 +            gmx_fatal(FARGS, "Number of lambdas (%d) for FEP type %s not equal to number of other types (%d)",
 +                      nfep[i], efpt_names[i], max_n_lambda);
 +        }
 +    }
 +    /* we don't print out dhdl if the temperature is changing, since we can't correctly define dhdl in this case */
 +    ir->fepvals->separate_dvdl[efptTEMPERATURE] = FALSE;
 +
 +    /* the number of lambdas is the number we've read in, which is either zero
 +       or the same for all */
 +    fep->n_lambda = max_n_lambda;
 +
 +    /* allocate space for the array of lambda values */
 +    snew(fep->all_lambda, efptNR);
 +    /* if only init_lambda is given (no lambda vector), still report dH/dl for the fep component */
 +    if ((fep->init_lambda > 0) && (fep->n_lambda == 0))
 +    {
 +        ir->fepvals->separate_dvdl[efptFEP] = TRUE;
 +    }
 +    /* otherwise allocate the space for all of the lambdas, and transfer the data */
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        snew(fep->all_lambda[i], fep->n_lambda);
 +        if (nfep[i] > 0)  /* if it's zero, then the count_fep_lambda arrays
 +                             are zero */
 +        {
 +            for (j = 0; j < fep->n_lambda; j++)
 +            {
 +                fep->all_lambda[i][j] = (double)count_fep_lambdas[i][j];
 +            }
 +            sfree(count_fep_lambdas[i]);
 +        }
 +    }
 +    sfree(count_fep_lambdas);
 +
 +    /* "fep-vals" is either zero or the full number. If zero, we'll need to define fep-lambdas for internal
 +       bookkeeping -- for now, init_lambda */
 +
 +    if ((nfep[efptFEP] == 0) && (fep->init_lambda >= 0))
 +    {
 +        for (i = 0; i < fep->n_lambda; i++)
 +        {
 +            fep->all_lambda[efptFEP][i] = fep->init_lambda;
 +        }
 +    }
 +
 +    /* check to see if only a single component lambda is defined, and soft core is defined.
 +       In this case, turn on coulomb soft core */
 +
 +    if (max_n_lambda == 0)
 +    {
 +        bOneLambda = TRUE;
 +    }
 +    else
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            if ((nfep[i] != 0) && (i != efptFEP))
 +            {
 +                bOneLambda = FALSE;
 +            }
 +        }
 +    }
 +    if ((bOneLambda) && (fep->sc_alpha > 0))
 +    {
 +        fep->bScCoul = TRUE;
 +    }
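 +    /* Note: when only fep-lambdas is in use there is no separate coul-lambdas
 +       vector to control the Coulomb part independently, so sc_alpha > 0 must
 +       soften Coulomb as well; that is what bScCoul = TRUE achieves here. */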
 +
 +    /* Fill in the others with the efptFEP if they are not explicitly
 +       specified (i.e. nfep[i] == 0).  This means if fep is not defined,
 +       they are all zero. */
 +
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        if ((nfep[i] == 0) && (i != efptFEP))
 +        {
 +            for (j = 0; j < fep->n_lambda; j++)
 +            {
 +                fep->all_lambda[i][j] = fep->all_lambda[efptFEP][j];
 +            }
 +        }
 +    }
 +
 +
 +    /* sanity check: with sc_r_power = 48 the soft-core term has a very different scale, so sc_alpha must lie in a much smaller range */
 +    if (fep->sc_r_power == 48)
 +    {
 +        if (fep->sc_alpha > 0.1)
 +        {
 +            gmx_fatal(FARGS, "sc_alpha (%f) for sc_r_power = 48 should usually be between 0.001 and 0.004", fep->sc_alpha);
 +        }
 +    }
 +
 +    expand = ir->expandedvals;
 +    /* now read in the weights */
 +    parse_n_real(weights, &nweights, &(expand->init_lambda_weights));
 +    if (nweights == 0)
 +    {
 +        expand->bInit_weights = FALSE;
 +        snew(expand->init_lambda_weights, fep->n_lambda); /* initialize to zero */
 +    }
 +    else if (nweights != fep->n_lambda)
 +    {
 +        gmx_fatal(FARGS, "Number of weights (%d) is not equal to number of lambda values (%d)",
 +                  nweights, fep->n_lambda);
 +    }
 +    else
 +    {
 +        expand->bInit_weights = TRUE;
 +    }
 +    if ((expand->nstexpanded < 0) && (ir->efep != efepNO))
 +    {
 +        expand->nstexpanded = fep->nstdhdl;
 +        /* if you don't specify nstexpanded when doing expanded ensemble free energy calcs, it is set to nstdhdl */
 +    }
 +    if ((expand->nstexpanded < 0) && ir->bSimTemp)
 +    {
 +        expand->nstexpanded = 2*(int)(ir->opts.tau_t[0]/ir->delta_t);
 +        /* if you don't specify nstexpanded when doing expanded ensemble simulated tempering, it is set to
 +           2*tau_t just to be careful so it's not too frequent  */
 +    }
 +}
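 +
 +/* After do_fep_params() every column of fep->all_lambda holds fep->n_lambda
 +   entries: e.g. (hypothetical input) fep-lambdas = "0 0.5 1" with no
 +   coul-lambdas given yields all_lambda[efptCOUL] = {0, 0.5, 1}, copied from
 +   the efptFEP column as described above. */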
 +
 +
 +static void do_simtemp_params(t_inputrec *ir)
 +{
 +
 +    snew(ir->simtempvals->temperatures, ir->fepvals->n_lambda);
 +    GetSimTemps(ir->fepvals->n_lambda, ir->simtempvals, ir->fepvals->all_lambda[efptTEMPERATURE]);
 +
 +    return;
 +}
 +
 +static void do_wall_params(t_inputrec *ir,
 +                           char *wall_atomtype, char *wall_density,
 +                           t_gromppopts *opts)
 +{
 +    int    nstr, i;
 +    char  *names[MAXPTR];
 +    double dbl;
 +
 +    opts->wall_atomtype[0] = NULL;
 +    opts->wall_atomtype[1] = NULL;
 +
 +    ir->wall_atomtype[0] = -1;
 +    ir->wall_atomtype[1] = -1;
 +    ir->wall_density[0]  = 0;
 +    ir->wall_density[1]  = 0;
 +
 +    if (ir->nwall > 0)
 +    {
 +        nstr = str_nelem(wall_atomtype, MAXPTR, names);
 +        if (nstr != ir->nwall)
 +        {
 +            gmx_fatal(FARGS, "Expected %d elements for wall_atomtype, found %d",
 +                      ir->nwall, nstr);
 +        }
 +        for (i = 0; i < ir->nwall; i++)
 +        {
 +            opts->wall_atomtype[i] = strdup(names[i]);
 +        }
 +
 +        if (ir->wall_type == ewt93 || ir->wall_type == ewt104)
 +        {
 +            nstr = str_nelem(wall_density, MAXPTR, names);
 +            if (nstr != ir->nwall)
 +            {
 +                gmx_fatal(FARGS, "Expected %d elements for wall-density, found %d", ir->nwall, nstr);
 +            }
 +            for (i = 0; i < ir->nwall; i++)
 +            {
 +                sscanf(names[i], "%lf", &dbl);
 +                if (dbl <= 0)
 +                {
 +                    gmx_fatal(FARGS, "wall-density[%d] = %f\n", i, dbl);
 +                }
 +                ir->wall_density[i] = dbl;
 +            }
 +        }
 +    }
 +}
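 +
 +/* Illustrative mdp fragment consumed by do_wall_params() (hypothetical
 +   values): nwall = 2, wall-atomtype = "C C", wall-density = "100 100".
 +   Each string must supply exactly nwall entries; wall-density is only
 +   parsed for the 9-3 and 10-4 wall types. */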
 +
 +static void add_wall_energrps(gmx_groups_t *groups, int nwall, t_symtab *symtab)
 +{
 +    int     i;
 +    t_grps *grps;
 +    char    str[STRLEN];
 +
 +    if (nwall > 0)
 +    {
 +        srenew(groups->grpname, groups->ngrpname+nwall);
 +        grps = &(groups->grps[egcENER]);
 +        srenew(grps->nm_ind, grps->nr+nwall);
 +        for (i = 0; i < nwall; i++)
 +        {
 +            sprintf(str, "wall%d", i);
 +            groups->grpname[groups->ngrpname] = put_symtab(symtab, str);
 +            grps->nm_ind[grps->nr++]          = groups->ngrpname++;
 +        }
 +    }
 +}
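 +
 +/* add_wall_energrps() appends one energy group per wall, named "wall0",
 +   "wall1", ..., to the egcENER group list, so wall interactions can be
 +   resolved per energy group later on. */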
 +
 +void read_expandedparams(int *ninp_p, t_inpfile **inp_p,
 +                         t_expanded *expand, warninp_t wi)
 +{
 +    int        ninp, nerror = 0;
 +    t_inpfile *inp;
 +
 +    ninp   = *ninp_p;
 +    inp    = *inp_p;
 +
 +    /* read expanded ensemble parameters */
 +    CCTYPE ("expanded ensemble variables");
 +    ITYPE ("nstexpanded", expand->nstexpanded, -1);
 +    EETYPE("lmc-stats", expand->elamstats, elamstats_names);
 +    EETYPE("lmc-move", expand->elmcmove, elmcmove_names);
 +    EETYPE("lmc-weights-equil", expand->elmceq, elmceq_names);
 +    ITYPE ("weight-equil-number-all-lambda", expand->equil_n_at_lam, -1);
 +    ITYPE ("weight-equil-number-samples", expand->equil_samples, -1);
 +    ITYPE ("weight-equil-number-steps", expand->equil_steps, -1);
 +    RTYPE ("weight-equil-wl-delta", expand->equil_wl_delta, -1);
 +    RTYPE ("weight-equil-count-ratio", expand->equil_ratio, -1);
 +    CCTYPE("Seed for Monte Carlo in lambda space");
 +    ITYPE ("lmc-seed", expand->lmc_seed, -1);
 +    RTYPE ("mc-temperature", expand->mc_temp, -1);
 +    ITYPE ("lmc-repeats", expand->lmc_repeats, 1);
 +    ITYPE ("lmc-gibbsdelta", expand->gibbsdeltalam, -1);
 +    ITYPE ("lmc-forced-nstart", expand->lmc_forced_nstart, 0);
 +    EETYPE("symmetrized-transition-matrix", expand->bSymmetrizedTMatrix, yesno_names);
 +    ITYPE("nst-transition-matrix", expand->nstTij, -1);
 +    ITYPE ("mininum-var-min", expand->minvarmin, 100); /*default is reasonable */
 +    ITYPE ("weight-c-range", expand->c_range, 0);      /* default is just C=0 */
 +    RTYPE ("wl-scale", expand->wl_scale, 0.8);
 +    RTYPE ("wl-ratio", expand->wl_ratio, 0.8);
 +    RTYPE ("init-wl-delta", expand->init_wl_delta, 1.0);
 +    EETYPE("wl-oneovert", expand->bWLoneovert, yesno_names);
 +
 +    *ninp_p   = ninp;
 +    *inp_p    = inp;
 +
 +    return;
 +}
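 +
 +/* Note the defaults above: the weight-equil-* thresholds, lmc-seed and
 +   nst-transition-matrix all start at -1 (disabled), wl-scale and wl-ratio
 +   at 0.8, and init-wl-delta at 1.0; the expanded-ensemble checks earlier in
 +   this file validate these values only when expanded ensemble is active. */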
 +
 +void get_ir(const char *mdparin, const char *mdparout,
 +            t_inputrec *ir, t_gromppopts *opts,
 +            warninp_t wi)
 +{
 +    char       *dumstr[2];
 +    double      dumdub[2][6];
 +    t_inpfile  *inp;
 +    const char *tmp;
 +    int         i, j, m, ninp;
 +    char        warn_buf[STRLEN];
 +    t_lambda   *fep    = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +
 +    inp = read_inpfile(mdparin, &ninp, NULL, wi);
 +
 +    snew(dumstr[0], STRLEN);
 +    snew(dumstr[1], STRLEN);
 +
 +    /* remove the following deprecated commands */
 +    REM_TYPE("title");
 +    REM_TYPE("cpp");
 +    REM_TYPE("domain-decomposition");
 +    REM_TYPE("andersen-seed");
 +    REM_TYPE("dihre");
 +    REM_TYPE("dihre-fc");
 +    REM_TYPE("dihre-tau");
 +    REM_TYPE("nstdihreout");
 +    REM_TYPE("nstcheckpoint");
 +
 +    /* replace the following commands with the clearer new versions */
 +    REPL_TYPE("unconstrained-start", "continuation");
 +    REPL_TYPE("foreign-lambda", "fep-lambdas");
 +
 +    CCTYPE ("VARIOUS PREPROCESSING OPTIONS");
 +    CTYPE ("Preprocessor information: use cpp syntax.");
 +    CTYPE ("e.g.: -I/home/joe/doe -I/home/mary/roe");
 +    STYPE ("include", opts->include,  NULL);
 +    CTYPE ("e.g.: -DPOSRES -DFLEXIBLE (note these variable names are case sensitive)");
 +    STYPE ("define",  opts->define,   NULL);
 +
 +    CCTYPE ("RUN CONTROL PARAMETERS");
 +    EETYPE("integrator",  ir->eI,         ei_names);
 +    CTYPE ("Start time and timestep in ps");
 +    RTYPE ("tinit",   ir->init_t, 0.0);
 +    RTYPE ("dt",      ir->delta_t,    0.001);
 +    STEPTYPE ("nsteps",   ir->nsteps,     0);
 +    CTYPE ("For exact run continuation or redoing part of a run");
 +    STEPTYPE ("init-step", ir->init_step,  0);
 +    CTYPE ("Part index is updated automatically on checkpointing (keeps files separate)");
 +    ITYPE ("simulation-part", ir->simulation_part, 1);
 +    CTYPE ("mode for center of mass motion removal");
 +    EETYPE("comm-mode",   ir->comm_mode,  ecm_names);
 +    CTYPE ("number of steps for center of mass motion removal");
 +    ITYPE ("nstcomm", ir->nstcomm,    100);
 +    CTYPE ("group(s) for center of mass motion removal");
 +    STYPE ("comm-grps",   vcm,            NULL);
 +
 +    CCTYPE ("LANGEVIN DYNAMICS OPTIONS");
 +    CTYPE ("Friction coefficient (amu/ps) and random seed");
 +    RTYPE ("bd-fric",     ir->bd_fric,    0.0);
 +    ITYPE ("ld-seed",     ir->ld_seed,    1993);
 +
 +    /* Em stuff */
 +    CCTYPE ("ENERGY MINIMIZATION OPTIONS");
 +    CTYPE ("Force tolerance and initial step-size");
 +    RTYPE ("emtol",       ir->em_tol,     10.0);
 +    RTYPE ("emstep",      ir->em_stepsize, 0.01);
 +    CTYPE ("Max number of iterations in relax-shells");
 +    ITYPE ("niter",       ir->niter,      20);
 +    CTYPE ("Step size (ps^2) for minimization of flexible constraints");
 +    RTYPE ("fcstep",      ir->fc_stepsize, 0);
 +    CTYPE ("Frequency of steepest descents steps when doing CG");
 +    ITYPE ("nstcgsteep",  ir->nstcgsteep, 1000);
 +    ITYPE ("nbfgscorr",   ir->nbfgscorr,  10);
 +
 +    CCTYPE ("TEST PARTICLE INSERTION OPTIONS");
 +    RTYPE ("rtpi",    ir->rtpi,   0.05);
 +
 +    /* Output options */
 +    CCTYPE ("OUTPUT CONTROL OPTIONS");
 +    CTYPE ("Output frequency for coords (x), velocities (v) and forces (f)");
 +    ITYPE ("nstxout", ir->nstxout,    0);
 +    ITYPE ("nstvout", ir->nstvout,    0);
 +    ITYPE ("nstfout", ir->nstfout,    0);
 +    ir->nstcheckpoint = 1000;
 +    CTYPE ("Output frequency for energies to log file and energy file");
 +    ITYPE ("nstlog",  ir->nstlog, 1000);
 +    ITYPE ("nstcalcenergy", ir->nstcalcenergy, 100);
 +    ITYPE ("nstenergy",   ir->nstenergy,  1000);
 +    CTYPE ("Output frequency and precision for .xtc file");
 +    ITYPE ("nstxtcout",   ir->nstxtcout,  0);
 +    RTYPE ("xtc-precision", ir->xtcprec,   1000.0);
 +    CTYPE ("This selects the subset of atoms for the .xtc file. You can");
 +    CTYPE ("select multiple groups. By default all atoms will be written.");
 +    STYPE ("xtc-grps",    xtc_grps,       NULL);
 +    CTYPE ("Selection of energy groups");
 +    STYPE ("energygrps",  energy,         NULL);
 +
 +    /* Neighbor searching */
 +    CCTYPE ("NEIGHBORSEARCHING PARAMETERS");
 +    CTYPE ("cut-off scheme (group: using charge groups, Verlet: particle based cut-offs)");
 +    EETYPE("cutoff-scheme",     ir->cutoff_scheme,    ecutscheme_names);
 +    CTYPE ("nblist update frequency");
 +    ITYPE ("nstlist", ir->nstlist,    10);
 +    CTYPE ("ns algorithm (simple or grid)");
 +    EETYPE("ns-type",     ir->ns_type,    ens_names);
 +    /* set ndelta to the optimal value of 2 */
 +    ir->ndelta = 2;
 +    CTYPE ("Periodic boundary conditions: xyz, no, xy");
 +    EETYPE("pbc",         ir->ePBC,       epbc_names);
 +    EETYPE("periodic-molecules", ir->bPeriodicMols, yesno_names);
 +    CTYPE ("Allowed energy drift due to the Verlet buffer in kJ/mol/ps per atom,");
 +    CTYPE ("a value of -1 means: use rlist");
 +    RTYPE("verlet-buffer-drift", ir->verletbuf_drift,    0.005);
 +    CTYPE ("nblist cut-off");
 +    RTYPE ("rlist",   ir->rlist,  1.0);
 +    CTYPE ("long-range cut-off for switched potentials");
 +    RTYPE ("rlistlong",   ir->rlistlong,  -1);
 +    ITYPE ("nstcalclr",   ir->nstcalclr,  -1);
 +
 +    /* Electrostatics */
 +    CCTYPE ("OPTIONS FOR ELECTROSTATICS AND VDW");
 +    CTYPE ("Method for doing electrostatics");
 +    EETYPE("coulombtype", ir->coulombtype,    eel_names);
 +    EETYPE("coulomb-modifier",    ir->coulomb_modifier,    eintmod_names);
 +    CTYPE ("cut-off lengths");
 +    RTYPE ("rcoulomb-switch", ir->rcoulomb_switch,    0.0);
 +    RTYPE ("rcoulomb",    ir->rcoulomb,   1.0);
 +    CTYPE ("Relative dielectric constant for the medium and the reaction field");
 +    RTYPE ("epsilon-r",   ir->epsilon_r,  1.0);
 +    RTYPE ("epsilon-rf",  ir->epsilon_rf, 0.0);
 +    CTYPE ("Method for doing Van der Waals");
 +    EETYPE("vdw-type",    ir->vdwtype,    evdw_names);
 +    EETYPE("vdw-modifier",    ir->vdw_modifier,    eintmod_names);
 +    CTYPE ("cut-off lengths");
 +    RTYPE ("rvdw-switch", ir->rvdw_switch,    0.0);
 +    RTYPE ("rvdw",    ir->rvdw,   1.0);
 +    CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
 +    EETYPE("DispCorr",    ir->eDispCorr,  edispc_names);
 +    CTYPE ("Extension of the potential lookup tables beyond the cut-off");
 +    RTYPE ("table-extension", ir->tabext, 1.0);
 +    CTYPE ("Separate tables between energy group pairs");
 +    STYPE ("energygrp-table", egptable,   NULL);
 +    CTYPE ("Spacing for the PME/PPPM FFT grid");
 +    RTYPE ("fourierspacing", ir->fourier_spacing, 0.12);
 +    CTYPE ("FFT grid size, when a value is 0 fourierspacing will be used");
 +    ITYPE ("fourier-nx",  ir->nkx,         0);
 +    ITYPE ("fourier-ny",  ir->nky,         0);
 +    ITYPE ("fourier-nz",  ir->nkz,         0);
 +    CTYPE ("EWALD/PME/PPPM parameters");
 +    ITYPE ("pme-order",   ir->pme_order,   4);
 +    RTYPE ("ewald-rtol",  ir->ewald_rtol, 0.00001);
 +    EETYPE("ewald-geometry", ir->ewald_geometry, eewg_names);
 +    RTYPE ("epsilon-surface", ir->epsilon_surface, 0.0);
 +    EETYPE("optimize-fft", ir->bOptFFT,  yesno_names);
 +
 +    CCTYPE("IMPLICIT SOLVENT ALGORITHM");
 +    EETYPE("implicit-solvent", ir->implicit_solvent, eis_names);
 +
 +    CCTYPE ("GENERALIZED BORN ELECTROSTATICS");
 +    CTYPE ("Algorithm for calculating Born radii");
 +    EETYPE("gb-algorithm", ir->gb_algorithm, egb_names);
 +    CTYPE ("Frequency of calculating the Born radii inside rlist");
 +    ITYPE ("nstgbradii", ir->nstgbradii, 1);
 +    CTYPE ("Cutoff for Born radii calculation; the contribution from atoms");
 +    CTYPE ("between rlist and rgbradii is updated every nstlist steps");
 +    RTYPE ("rgbradii",  ir->rgbradii, 1.0);
 +    CTYPE ("Dielectric coefficient of the implicit solvent");
 +    RTYPE ("gb-epsilon-solvent", ir->gb_epsilon_solvent, 80.0);
 +    CTYPE ("Salt concentration in M for Generalized Born models");
 +    RTYPE ("gb-saltconc",  ir->gb_saltconc, 0.0);
 +    CTYPE ("Scaling factors used in the OBC GB model. Default values are OBC(II)");
 +    RTYPE ("gb-obc-alpha", ir->gb_obc_alpha, 1.0);
 +    RTYPE ("gb-obc-beta", ir->gb_obc_beta, 0.8);
 +    RTYPE ("gb-obc-gamma", ir->gb_obc_gamma, 4.85);
 +    RTYPE ("gb-dielectric-offset", ir->gb_dielectric_offset, 0.009);
 +    EETYPE("sa-algorithm", ir->sa_algorithm, esa_names);
 +    CTYPE ("Surface tension (kJ/mol/nm^2) for the SA (nonpolar surface) part of GBSA");
 +    CTYPE ("The value -1 will set default value for Still/HCT/OBC GB-models.");
 +    RTYPE ("sa-surface-tension", ir->sa_surface_tension, -1);
 +
 +    /* Coupling stuff */
 +    CCTYPE ("OPTIONS FOR WEAK COUPLING ALGORITHMS");
 +    CTYPE ("Temperature coupling");
 +    EETYPE("tcoupl",  ir->etc,        etcoupl_names);
 +    ITYPE ("nsttcouple", ir->nsttcouple,  -1);
 +    ITYPE("nh-chain-length",     ir->opts.nhchainlength, NHCHAINLENGTH);
 +    EETYPE("print-nose-hoover-chain-variables", ir->bPrintNHChains, yesno_names);
 +    CTYPE ("Groups to couple separately");
 +    STYPE ("tc-grps",     tcgrps,         NULL);
 +    CTYPE ("Time constant (ps) and reference temperature (K)");
 +    STYPE ("tau-t",   tau_t,      NULL);
 +    STYPE ("ref-t",   ref_t,      NULL);
 +    CTYPE ("pressure coupling");
 +    EETYPE("pcoupl",  ir->epc,        epcoupl_names);
 +    EETYPE("pcoupltype",  ir->epct,       epcoupltype_names);
 +    ITYPE ("nstpcouple", ir->nstpcouple,  -1);
 +    CTYPE ("Time constant (ps), compressibility (1/bar) and reference P (bar)");
 +    RTYPE ("tau-p",   ir->tau_p,  1.0);
 +    STYPE ("compressibility", dumstr[0],  NULL);
 +    STYPE ("ref-p",       dumstr[1],      NULL);
 +    CTYPE ("Scaling of reference coordinates, No, All or COM");
 +    EETYPE ("refcoord-scaling", ir->refcoord_scaling, erefscaling_names);
 +
 +    /* QMMM */
 +    CCTYPE ("OPTIONS FOR QMMM calculations");
 +    EETYPE("QMMM", ir->bQMMM, yesno_names);
 +    CTYPE ("Groups treated Quantum Mechanically");
 +    STYPE ("QMMM-grps",  QMMM,          NULL);
 +    CTYPE ("QM method");
 +    STYPE("QMmethod",     QMmethod, NULL);
 +    CTYPE ("QMMM scheme");
 +    EETYPE("QMMMscheme",  ir->QMMMscheme,    eQMMMscheme_names);
 +    CTYPE ("QM basisset");
 +    STYPE("QMbasis",      QMbasis, NULL);
 +    CTYPE ("QM charge");
 +    STYPE ("QMcharge",    QMcharge, NULL);
 +    CTYPE ("QM multiplicity");
 +    STYPE ("QMmult",      QMmult, NULL);
 +    CTYPE ("Surface Hopping");
 +    STYPE ("SH",          bSH, NULL);
 +    CTYPE ("CAS space options");
 +    STYPE ("CASorbitals",      CASorbitals,   NULL);
 +    STYPE ("CASelectrons",     CASelectrons,  NULL);
 +    STYPE ("SAon", SAon, NULL);
 +    STYPE ("SAoff", SAoff, NULL);
 +    STYPE ("SAsteps",  SAsteps, NULL);
 +    CTYPE ("Scale factor for MM charges");
 +    RTYPE ("MMChargeScaleFactor", ir->scalefactor, 1.0);
 +    CTYPE ("Optimization of QM subsystem");
 +    STYPE ("bOPT",          bOPT, NULL);
 +    STYPE ("bTS",          bTS, NULL);
 +
 +    /* Simulated annealing */
 +    CCTYPE("SIMULATED ANNEALING");
 +    CTYPE ("Type of annealing for each temperature group (no/single/periodic)");
 +    STYPE ("annealing",   anneal,      NULL);
 +    CTYPE ("Number of time points to use for specifying annealing in each group");
 +    STYPE ("annealing-npoints", anneal_npoints, NULL);
 +    CTYPE ("List of times at the annealing points for each group");
 +    STYPE ("annealing-time",       anneal_time,       NULL);
 +    CTYPE ("Temp. at each annealing point, for each group.");
 +    STYPE ("annealing-temp",  anneal_temp,  NULL);
 +
 +    /* Startup run */
 +    CCTYPE ("GENERATE VELOCITIES FOR STARTUP RUN");
 +    EETYPE("gen-vel",     opts->bGenVel,  yesno_names);
 +    RTYPE ("gen-temp",    opts->tempi,    300.0);
 +    ITYPE ("gen-seed",    opts->seed,     173529);
 +
 +    /* Shake stuff */
 +    CCTYPE ("OPTIONS FOR BONDS");
 +    EETYPE("constraints", opts->nshake,   constraints);
 +    CTYPE ("Type of constraint algorithm");
 +    EETYPE("constraint-algorithm",  ir->eConstrAlg, econstr_names);
 +    CTYPE ("Do not constrain the start configuration");
 +    EETYPE("continuation", ir->bContinuation, yesno_names);
 +    CTYPE ("Use successive overrelaxation to reduce the number of shake iterations");
 +    EETYPE("Shake-SOR", ir->bShakeSOR, yesno_names);
 +    CTYPE ("Relative tolerance of shake");
 +    RTYPE ("shake-tol", ir->shake_tol, 0.0001);
 +    CTYPE ("Highest order in the expansion of the constraint coupling matrix");
 +    ITYPE ("lincs-order", ir->nProjOrder, 4);
 +    CTYPE ("Number of iterations in the final step of LINCS. 1 is fine for");
 +    CTYPE ("normal simulations, but use 2 to conserve energy in NVE runs.");
 +    CTYPE ("For energy minimization with constraints it should be 4 to 8.");
 +    ITYPE ("lincs-iter", ir->nLincsIter, 1);
 +    CTYPE ("Lincs will write a warning to the stderr if in one step a bond");
 +    CTYPE ("rotates over more degrees than");
 +    RTYPE ("lincs-warnangle", ir->LincsWarnAngle, 30.0);
 +    CTYPE ("Convert harmonic bonds to morse potentials");
 +    EETYPE("morse",       opts->bMorse, yesno_names);
 +
 +    /* Energy group exclusions */
 +    CCTYPE ("ENERGY GROUP EXCLUSIONS");
 +    CTYPE ("Pairs of energy groups for which all non-bonded interactions are excluded");
 +    STYPE ("energygrp-excl", egpexcl,     NULL);
 +
 +    /* Walls */
 +    CCTYPE ("WALLS");
 +    CTYPE ("Number of walls, type, atom types, densities and box-z scale factor for Ewald");
 +    ITYPE ("nwall", ir->nwall, 0);
 +    EETYPE("wall-type",     ir->wall_type,   ewt_names);
 +    RTYPE ("wall-r-linpot", ir->wall_r_linpot, -1);
 +    STYPE ("wall-atomtype", wall_atomtype, NULL);
 +    STYPE ("wall-density",  wall_density,  NULL);
 +    RTYPE ("wall-ewald-zfac", ir->wall_ewald_zfac, 3);
 +
 +    /* COM pulling */
 +    CCTYPE("COM PULLING");
 +    CTYPE("Pull type: no, umbrella, constraint or constant-force");
 +    EETYPE("pull",          ir->ePull, epull_names);
 +    if (ir->ePull != epullNO)
 +    {
 +        snew(ir->pull, 1);
 +        pull_grp = read_pullparams(&ninp, &inp, ir->pull, &opts->pull_start, wi);
 +    }
 +
 +    /* Enforced rotation */
 +    CCTYPE("ENFORCED ROTATION");
 +    CTYPE("Enforced rotation: No or Yes");
 +    EETYPE("rotation",       ir->bRot, yesno_names);
 +    if (ir->bRot)
 +    {
 +        snew(ir->rot, 1);
 +        rot_grp = read_rotparams(&ninp, &inp, ir->rot, wi);
 +    }
 +
 +    /* Refinement */
 +    CCTYPE("NMR refinement stuff");
 +    CTYPE ("Distance restraints type: No, Simple or Ensemble");
 +    EETYPE("disre",       ir->eDisre,     edisre_names);
 +    CTYPE ("Force weighting of pairs in one distance restraint: Conservative or Equal");
 +    EETYPE("disre-weighting", ir->eDisreWeighting, edisreweighting_names);
 +    CTYPE ("Use sqrt of the time averaged times the instantaneous violation");
 +    EETYPE("disre-mixed", ir->bDisreMixed, yesno_names);
 +    RTYPE ("disre-fc",    ir->dr_fc,  1000.0);
 +    RTYPE ("disre-tau",   ir->dr_tau, 0.0);
 +    CTYPE ("Output frequency for pair distances to energy file");
 +    ITYPE ("nstdisreout", ir->nstdisreout, 100);
 +    CTYPE ("Orientation restraints: No or Yes");
 +    EETYPE("orire",       opts->bOrire,   yesno_names);
 +    CTYPE ("Orientation restraints force constant and tau for time averaging");
 +    RTYPE ("orire-fc",    ir->orires_fc,  0.0);
 +    RTYPE ("orire-tau",   ir->orires_tau, 0.0);
 +    STYPE ("orire-fitgrp", orirefitgrp,    NULL);
 +    CTYPE ("Output frequency for trace(SD) and S to energy file");
 +    ITYPE ("nstorireout", ir->nstorireout, 100);
 +
 +    /* free energy variables */
 +    CCTYPE ("Free energy variables");
 +    EETYPE("free-energy", ir->efep, efep_names);
 +    STYPE ("couple-moltype",  couple_moltype,  NULL);
 +    EETYPE("couple-lambda0", opts->couple_lam0, couple_lam);
 +    EETYPE("couple-lambda1", opts->couple_lam1, couple_lam);
 +    EETYPE("couple-intramol", opts->bCoupleIntra, yesno_names);
 +
 +    RTYPE ("init-lambda", fep->init_lambda, -1); /* start with -1 so
 +                                                    we can recognize if
 +                                                    it was not entered */
 +    ITYPE ("init-lambda-state", fep->init_fep_state, -1);
 +    RTYPE ("delta-lambda", fep->delta_lambda, 0.0);
 +    ITYPE ("nstdhdl", fep->nstdhdl, 50);
 +    STYPE ("fep-lambdas", fep_lambda[efptFEP], NULL);
 +    STYPE ("mass-lambdas", fep_lambda[efptMASS], NULL);
 +    STYPE ("coul-lambdas", fep_lambda[efptCOUL], NULL);
 +    STYPE ("vdw-lambdas", fep_lambda[efptVDW], NULL);
 +    STYPE ("bonded-lambdas", fep_lambda[efptBONDED], NULL);
 +    STYPE ("restraint-lambdas", fep_lambda[efptRESTRAINT], NULL);
 +    STYPE ("temperature-lambdas", fep_lambda[efptTEMPERATURE], NULL);
 +    ITYPE ("calc-lambda-neighbors", fep->lambda_neighbors, 1);
 +    STYPE ("init-lambda-weights", lambda_weights, NULL);
 +    EETYPE("dhdl-print-energy", fep->bPrintEnergy, yesno_names);
 +    RTYPE ("sc-alpha", fep->sc_alpha, 0.0);
 +    ITYPE ("sc-power", fep->sc_power, 1);
 +    RTYPE ("sc-r-power", fep->sc_r_power, 6.0);
 +    RTYPE ("sc-sigma", fep->sc_sigma, 0.3);
 +    EETYPE("sc-coul", fep->bScCoul, yesno_names);
 +    ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +    RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +    EETYPE("separate-dhdl-file", fep->separate_dhdl_file,
 +           separate_dhdl_file_names);
 +    EETYPE("dhdl-derivatives", fep->dhdl_derivatives, dhdl_derivatives_names);
 +    ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +    RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +
 +    /* Non-equilibrium MD stuff */
 +    CCTYPE("Non-equilibrium MD stuff");
 +    STYPE ("acc-grps",    accgrps,        NULL);
 +    STYPE ("accelerate",  acc,            NULL);
 +    STYPE ("freezegrps",  freeze,         NULL);
 +    STYPE ("freezedim",   frdim,          NULL);
 +    RTYPE ("cos-acceleration", ir->cos_accel, 0);
 +    STYPE ("deform",      deform,         NULL);
 +
 +    /* simulated tempering variables */
 +    CCTYPE("simulated tempering variables");
 +    EETYPE("simulated-tempering", ir->bSimTemp, yesno_names);
 +    EETYPE("simulated-tempering-scaling", ir->simtempvals->eSimTempScale, esimtemp_names);
 +    RTYPE("sim-temp-low", ir->simtempvals->simtemp_low, 300.0);
 +    RTYPE("sim-temp-high", ir->simtempvals->simtemp_high, 300.0);
 +
 +    /* expanded ensemble variables */
 +    if (ir->efep == efepEXPANDED || ir->bSimTemp)
 +    {
 +        read_expandedparams(&ninp, &inp, expand, wi);
 +    }
 +
 +    /* Electric fields */
 +    CCTYPE("Electric fields");
 +    CTYPE ("Format is number of terms (int) and for all terms an amplitude (real)");
 +    CTYPE ("and a phase angle (real)");
 +    STYPE ("E-x",     efield_x,   NULL);
 +    STYPE ("E-xt",    efield_xt,  NULL);
 +    STYPE ("E-y",     efield_y,   NULL);
 +    STYPE ("E-yt",    efield_yt,  NULL);
 +    STYPE ("E-z",     efield_z,   NULL);
 +    STYPE ("E-zt",    efield_zt,  NULL);
 +
 +    /* AdResS defined thingies */
 +    CCTYPE ("AdResS parameters");
 +    EETYPE("adress",       ir->bAdress, yesno_names);
 +    if (ir->bAdress)
 +    {
 +        snew(ir->adress, 1);
 +        read_adressparams(&ninp, &inp, ir->adress, wi);
 +    }
 +
 +    /* User defined thingies */
 +    CCTYPE ("User defined thingies");
 +    STYPE ("user1-grps",  user1,          NULL);
 +    STYPE ("user2-grps",  user2,          NULL);
 +    ITYPE ("userint1",    ir->userint1,   0);
 +    ITYPE ("userint2",    ir->userint2,   0);
 +    ITYPE ("userint3",    ir->userint3,   0);
 +    ITYPE ("userint4",    ir->userint4,   0);
 +    RTYPE ("userreal1",   ir->userreal1,  0);
 +    RTYPE ("userreal2",   ir->userreal2,  0);
 +    RTYPE ("userreal3",   ir->userreal3,  0);
 +    RTYPE ("userreal4",   ir->userreal4,  0);
 +#undef CTYPE
 +
 +    write_inpfile(mdparout, ninp, inp, FALSE, wi);
 +    for (i = 0; (i < ninp); i++)
 +    {
 +        sfree(inp[i].name);
 +        sfree(inp[i].value);
 +    }
 +    sfree(inp);
 +
 +    /* Process options if necessary */
 +    for (m = 0; m < 2; m++)
 +    {
 +        for (i = 0; i < 2*DIM; i++)
 +        {
 +            dumdub[m][i] = 0.0;
 +        }
 +        if (ir->epc)
 +        {
 +            switch (ir->epct)
 +            {
 +                case epctISOTROPIC:
 +                    if (sscanf(dumstr[m], "%lf", &(dumdub[m][XX])) != 1)
 +                    {
 +                        warning_error(wi, "Pressure coupling not enough values (I need 1)");
 +                    }
 +                    dumdub[m][YY] = dumdub[m][ZZ] = dumdub[m][XX];
 +                    break;
 +                case epctSEMIISOTROPIC:
 +                case epctSURFACETENSION:
 +                    if (sscanf(dumstr[m], "%lf%lf",
 +                               &(dumdub[m][XX]), &(dumdub[m][ZZ])) != 2)
 +                    {
 +                        warning_error(wi, "Pressure coupling not enough values (I need 2)");
 +                    }
 +                    dumdub[m][YY] = dumdub[m][XX];
 +                    break;
 +                case epctANISOTROPIC:
 +                    if (sscanf(dumstr[m], "%lf%lf%lf%lf%lf%lf",
 +                               &(dumdub[m][XX]), &(dumdub[m][YY]), &(dumdub[m][ZZ]),
 +                               &(dumdub[m][3]), &(dumdub[m][4]), &(dumdub[m][5])) != 6)
 +                    {
 +                        warning_error(wi, "Pressure coupling not enough values (I need 6)");
 +                    }
 +                    break;
 +                default:
 +                    gmx_fatal(FARGS, "Pressure coupling type %s not implemented yet",
 +                              epcoupltype_names[ir->epct]);
 +            }
 +        }
 +    }
 +    clear_mat(ir->ref_p);
 +    clear_mat(ir->compress);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        ir->ref_p[i][i]    = dumdub[1][i];
 +        ir->compress[i][i] = dumdub[0][i];
 +    }
 +    if (ir->epct == epctANISOTROPIC)
 +    {
 +        ir->ref_p[XX][YY] = dumdub[1][3];
 +        ir->ref_p[XX][ZZ] = dumdub[1][4];
 +        ir->ref_p[YY][ZZ] = dumdub[1][5];
 +        if (ir->ref_p[XX][YY] != 0 && ir->ref_p[XX][ZZ] != 0 && ir->ref_p[YY][ZZ] != 0)
 +        {
 +            warning(wi, "All off-diagonal reference pressures are non-zero. Are you sure you want to apply a threefold shear stress?\n");
 +        }
 +        ir->compress[XX][YY] = dumdub[0][3];
 +        ir->compress[XX][ZZ] = dumdub[0][4];
 +        ir->compress[YY][ZZ] = dumdub[0][5];
 +        for (i = 0; i < DIM; i++)
 +        {
 +            for (m = 0; m < i; m++)
 +            {
 +                ir->ref_p[i][m]    = ir->ref_p[m][i];
 +                ir->compress[i][m] = ir->compress[m][i];
 +            }
 +        }
 +    }
 +
 +    if (ir->comm_mode == ecmNO)
 +    {
 +        ir->nstcomm = 0;
 +    }
 +
 +    opts->couple_moltype = NULL;
 +    if (strlen(couple_moltype) > 0)
 +    {
 +        if (ir->efep != efepNO)
 +        {
 +            opts->couple_moltype = strdup(couple_moltype);
 +            if (opts->couple_lam0 == opts->couple_lam1)
 +            {
 +                warning(wi, "The lambda=0 and lambda=1 states for coupling are identical");
 +            }
 +            if (ir->eI == eiMD && (opts->couple_lam0 == ecouplamNONE ||
 +                                   opts->couple_lam1 == ecouplamNONE))
 +            {
 +                warning(wi, "For proper sampling of the (nearly) decoupled state, stochastic dynamics should be used");
 +            }
 +        }
 +        else
 +        {
 +            warning(wi, "Can not couple a molecule with free_energy = no");
 +        }
 +    }
 +    /* FREE ENERGY AND EXPANDED ENSEMBLE OPTIONS */
 +    if (ir->efep != efepNO)
 +    {
 +        if (fep->delta_lambda > 0)
 +        {
 +            ir->efep = efepSLOWGROWTH;
 +        }
 +    }
 +
 +    if (ir->bSimTemp)
 +    {
 +        fep->bPrintEnergy = TRUE;
 +        /* always print out the energy to dhdl if we are doing expanded ensemble, since we need the total energy
 +           if the temperature is changing. */
 +    }
 +
 +    if ((ir->efep != efepNO) || ir->bSimTemp)
 +    {
 +        ir->bExpanded = FALSE;
 +        if ((ir->efep == efepEXPANDED) || ir->bSimTemp)
 +        {
 +            ir->bExpanded = TRUE;
 +        }
 +        do_fep_params(ir, fep_lambda, lambda_weights);
 +        if (ir->bSimTemp) /* done after fep params */
 +        {
 +            do_simtemp_params(ir);
 +        }
 +    }
 +    else
 +    {
 +        ir->fepvals->n_lambda = 0;
 +    }
 +
 +    /* WALL PARAMETERS */
 +
 +    do_wall_params(ir, wall_atomtype, wall_density, opts);
 +
 +    /* ORIENTATION RESTRAINT PARAMETERS */
 +
 +    if (opts->bOrire && str_nelem(orirefitgrp, MAXPTR, NULL) != 1)
 +    {
 +        warning_error(wi, "ERROR: Need one orientation restraint fit group\n");
 +    }
 +
 +    /* DEFORMATION PARAMETERS */
 +
 +    clear_mat(ir->deform);
 +    for (i = 0; i < 6; i++)
 +    {
 +        dumdub[0][i] = 0;
 +    }
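 +    /* The deform string holds up to six values in the order
 +     * xx yy zz yx zx zy: the diagonal first, then the lower
 +     * triangle of the box matrix.
 +     */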
 +    m = sscanf(deform, "%lf %lf %lf %lf %lf %lf",
 +               &(dumdub[0][0]), &(dumdub[0][1]), &(dumdub[0][2]),
 +               &(dumdub[0][3]), &(dumdub[0][4]), &(dumdub[0][5]));
 +    for (i = 0; i < 3; i++)
 +    {
 +        ir->deform[i][i] = dumdub[0][i];
 +    }
 +    ir->deform[YY][XX] = dumdub[0][3];
 +    ir->deform[ZZ][XX] = dumdub[0][4];
 +    ir->deform[ZZ][YY] = dumdub[0][5];
 +    if (ir->epc != epcNO)
 +    {
 +        for (i = 0; i < 3; i++)
 +        {
 +            for (j = 0; j <= i; j++)
 +            {
 +                if (ir->deform[i][j] != 0 && ir->compress[i][j] != 0)
 +                {
 +                    warning_error(wi, "A box element has deform set and compressibility > 0");
 +                }
 +            }
 +        }
 +        for (i = 0; i < 3; i++)
 +        {
 +            for (j = 0; j < i; j++)
 +            {
 +                if (ir->deform[i][j] != 0)
 +                {
 +                    for (m = j; m < DIM; m++)
 +                    {
 +                        if (ir->compress[m][j] != 0)
 +                        {
 +                            sprintf(warn_buf, "An off-diagonal box element has deform set while compressibility > 0 for the same component of another box vector; this might lead to spurious periodicity effects.");
 +                            warning(wi, warn_buf);
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    sfree(dumstr[0]);
 +    sfree(dumstr[1]);
 +}
 +
 +static int search_QMstring(char *s, int ng, const char *gn[])
 +{
 +    /* same as normal search_string, but this one searches QM strings */
 +    int i;
 +
 +    for (i = 0; (i < ng); i++)
 +    {
 +        if (gmx_strcasecmp(s, gn[i]) == 0)
 +        {
 +            return i;
 +        }
 +    }
 +
 +    gmx_fatal(FARGS, "This QM method or basis set (%s) is not implemented!\n", s);
 +
 +    return -1;
 +
 +} /* search_QMstring */
 +
 +
 +int search_string(char *s, int ng, char *gn[])
 +{
 +    int i;
 +
 +    for (i = 0; (i < ng); i++)
 +    {
 +        if (gmx_strcasecmp(s, gn[i]) == 0)
 +        {
 +            return i;
 +        }
 +    }
 +
 +    gmx_fatal(FARGS,
 +              "Group %s referenced in the .mdp file was not found in the index file.\n"
 +              "Group names must match either [moleculetype] names or custom index group\n"
 +              "names, in which case you must supply an index file to the '-n' option\n"
 +              "of grompp.",
 +              s);
 +
 +    return -1;
 +}
 +
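 +/* Assign each of the natoms atoms to one of the ng groups named in ptrs[],
 + * looked up in the index structure block/gnames. Atoms not matched by any
 + * group are collected in a "rest" group (name index restnm) where the group
 + * type allows it; for egrptpALL full coverage is required. Returns TRUE when
 + * a rest group had to be generated for an egrptpPART grouping, so the caller
 + * can warn about it.
 + */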
 +static gmx_bool do_numbering(int natoms, gmx_groups_t *groups, int ng, char *ptrs[],
 +                             t_blocka *block, char *gnames[],
 +                             int gtype, int restnm,
 +                             int grptp, gmx_bool bVerbose,
 +                             warninp_t wi)
 +{
 +    unsigned short *cbuf;
 +    t_grps         *grps = &(groups->grps[gtype]);
 +    int             i, j, gid, aj, ognr, ntot = 0;
 +    const char     *title;
 +    gmx_bool        bRest;
 +    char            warn_buf[STRLEN];
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Starting numbering %d groups of type %d\n", ng, gtype);
 +    }
 +
 +    title = gtypes[gtype];
 +
 +    snew(cbuf, natoms);
 +    /* Mark all id's as not set */
 +    for (i = 0; (i < natoms); i++)
 +    {
 +        cbuf[i] = NOGID;
 +    }
 +
 +    snew(grps->nm_ind, ng+1); /* +1 for possible rest group */
 +    for (i = 0; (i < ng); i++)
 +    {
 +        /* Lookup the group name in the block structure */
 +        gid = search_string(ptrs[i], block->nr, gnames);
 +        if ((grptp != egrptpONE) || (i == 0))
 +        {
 +            grps->nm_ind[grps->nr++] = gid;
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Found gid %d for group %s\n", gid, ptrs[i]);
 +        }
 +
 +        /* Now go over the atoms in the group */
 +        for (j = block->index[gid]; (j < block->index[gid+1]); j++)
 +        {
 +
 +            aj = block->a[j];
 +
 +            /* Range checking */
 +            if ((aj < 0) || (aj >= natoms))
 +            {
 +                gmx_fatal(FARGS, "Invalid atom number %d in index file", aj);
 +            }
 +            /* Lookup up the old group number */
 +            ognr = cbuf[aj];
 +            if (ognr != NOGID)
 +            {
 +                gmx_fatal(FARGS, "Atom %d in multiple %s groups (%d and %d)",
 +                          aj+1, title, ognr+1, i+1);
 +            }
 +            else
 +            {
 +                /* Store the group number in buffer */
 +                if (grptp == egrptpONE)
 +                {
 +                    cbuf[aj] = 0;
 +                }
 +                else
 +                {
 +                    cbuf[aj] = i;
 +                }
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    /* Now check whether we have done all atoms */
 +    bRest = FALSE;
 +    if (ntot != natoms)
 +    {
 +        if (grptp == egrptpALL)
 +        {
 +            gmx_fatal(FARGS, "%d atoms are not part of any of the %s groups",
 +                      natoms-ntot, title);
 +        }
 +        else if (grptp == egrptpPART)
 +        {
 +            sprintf(warn_buf, "%d atoms are not part of any of the %s groups",
 +                    natoms-ntot, title);
 +            warning_note(wi, warn_buf);
 +        }
 +        /* Assign all atoms currently unassigned to a rest group */
 +        for (j = 0; (j < natoms); j++)
 +        {
 +            if (cbuf[j] == NOGID)
 +            {
 +                cbuf[j] = grps->nr;
 +                bRest   = TRUE;
 +            }
 +        }
 +        if (grptp != egrptpPART)
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,
 +                        "Making dummy/rest group for %s containing %d elements\n",
 +                        title, natoms-ntot);
 +            }
 +            /* Add group name "rest" */
 +            grps->nm_ind[grps->nr] = restnm;
 +
 +            /* Assign the rest name to all atoms not currently assigned to a group */
 +            for (j = 0; (j < natoms); j++)
 +            {
 +                if (cbuf[j] == NOGID)
 +                {
 +                    cbuf[j] = grps->nr;
 +                }
 +            }
 +            grps->nr++;
 +        }
 +    }
 +
 +    if (grps->nr == 1 && (ntot == 0 || ntot == natoms))
 +    {
 +        /* All atoms are part of one (or no) group, no index required */
 +        groups->ngrpnr[gtype] = 0;
 +        groups->grpnr[gtype]  = NULL;
 +    }
 +    else
 +    {
 +        groups->ngrpnr[gtype] = natoms;
 +        snew(groups->grpnr[gtype], natoms);
 +        for (j = 0; (j < natoms); j++)
 +        {
 +            groups->grpnr[gtype][j] = cbuf[j];
 +        }
 +    }
 +
 +    sfree(cbuf);
 +
 +    return (bRest && grptp == egrptpPART);
 +}
 +
 +static void calc_nrdf(gmx_mtop_t *mtop, t_inputrec *ir, char **gnames)
 +{
 +    t_grpopts              *opts;
 +    gmx_groups_t           *groups;
 +    t_pull                 *pull;
 +    int                     natoms, ai, aj, i, j, d, g, imin, jmin, nc;
 +    t_iatom                *ia;
 +    int                    *nrdf2, *na_vcm, na_tot;
 +    double                 *nrdf_tc, *nrdf_vcm, nrdf_uc, n_sub = 0;
 +    gmx_mtop_atomloop_all_t aloop;
 +    t_atom                 *atom;
 +    int                     mb, mol, ftype, as;
 +    gmx_molblock_t         *molb;
 +    gmx_moltype_t          *molt;
 +
 +    /* Calculate nrdf.
 +     * First calc 3xnr-atoms for each group
 +     * then subtract half a degree of freedom for each constraint
 +     *
 +     * Only atoms and nuclei contribute to the degrees of freedom...
 +     */
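 +    /* As a concrete example: a rigid 3-site water with 3 SETTLE
 +     * constraints starts with 3*3 = 9 degrees of freedom and ends up
 +     * with 9 - 3 = 6. Since a constraint is shared by two atoms, the
 +     * bookkeeping below works in half degrees of freedom (hence the
 +     * doubled counts in nrdf2 and the factors of 0.5).
 +     */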
 +
 +    opts = &ir->opts;
 +
 +    groups = &mtop->groups;
 +    natoms = mtop->natoms;
 +
 +    /* Allocate one more for a possible rest group */
 +    /* We need to sum degrees of freedom into doubles,
 +     * since floats give too low nrdf's above 3 million atoms.
 +     */
 +    snew(nrdf_tc, groups->grps[egcTC].nr+1);
 +    snew(nrdf_vcm, groups->grps[egcVCM].nr+1);
 +    snew(na_vcm, groups->grps[egcVCM].nr+1);
 +
 +    for (i = 0; i < groups->grps[egcTC].nr; i++)
 +    {
 +        nrdf_tc[i] = 0;
 +    }
 +    for (i = 0; i < groups->grps[egcVCM].nr+1; i++)
 +    {
 +        nrdf_vcm[i] = 0;
 +    }
 +
 +    snew(nrdf2, natoms);
 +    aloop = gmx_mtop_atomloop_all_init(mtop);
 +    while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
 +    {
 +        nrdf2[i] = 0;
 +        if (atom->ptype == eptAtom || atom->ptype == eptNucleus)
 +        {
 +            g = ggrpnr(groups, egcFREEZE, i);
 +            /* Double count nrdf for particle i */
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if (opts->nFreeze[g][d] == 0)
 +                {
 +                    nrdf2[i] += 2;
 +                }
 +            }
 +            nrdf_tc [ggrpnr(groups, egcTC, i)]  += 0.5*nrdf2[i];
 +            nrdf_vcm[ggrpnr(groups, egcVCM, i)] += 0.5*nrdf2[i];
 +        }
 +    }
 +
 +    as = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        atom = molt->atoms.atom;
 +        for (mol = 0; mol < molb->nmol; mol++)
 +        {
 +            for (ftype = F_CONSTR; ftype <= F_CONSTRNC; ftype++)
 +            {
 +                ia = molt->ilist[ftype].iatoms;
 +                for (i = 0; i < molt->ilist[ftype].nr; )
 +                {
 +                    /* Subtract degrees of freedom for the constraints,
 +                     * if the particles still have degrees of freedom left.
 +                     * If one of the particles is a vsite or a shell, then all
 +                     * constraint motion will go there, but since they do not
 +                     * contribute to the constraints the degrees of freedom do not
 +                     * change.
 +                     */
 +                    ai = as + ia[1];
 +                    aj = as + ia[2];
 +                    if (((atom[ia[1]].ptype == eptNucleus) ||
 +                         (atom[ia[1]].ptype == eptAtom)) &&
 +                        ((atom[ia[2]].ptype == eptNucleus) ||
 +                         (atom[ia[2]].ptype == eptAtom)))
 +                    {
 +                        if (nrdf2[ai] > 0)
 +                        {
 +                            jmin = 1;
 +                        }
 +                        else
 +                        {
 +                            jmin = 2;
 +                        }
 +                        if (nrdf2[aj] > 0)
 +                        {
 +                            imin = 1;
 +                        }
 +                        else
 +                        {
 +                            imin = 2;
 +                        }
 +                        imin       = min(imin, nrdf2[ai]);
 +                        jmin       = min(jmin, nrdf2[aj]);
 +                        nrdf2[ai] -= imin;
 +                        nrdf2[aj] -= jmin;
 +                        nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5*imin;
 +                        nrdf_tc [ggrpnr(groups, egcTC, aj)]  -= 0.5*jmin;
 +                        nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5*imin;
 +                        nrdf_vcm[ggrpnr(groups, egcVCM, aj)] -= 0.5*jmin;
 +                    }
 +                    ia += interaction_function[ftype].nratoms+1;
 +                    i  += interaction_function[ftype].nratoms+1;
 +                }
 +            }
 +            ia = molt->ilist[F_SETTLE].iatoms;
 +            for (i = 0; i < molt->ilist[F_SETTLE].nr; )
 +            {
 +                /* Subtract 1 dof from every atom in the SETTLE */
 +                for (j = 0; j < 3; j++)
 +                {
 +                    ai         = as + ia[1+j];
 +                    imin       = min(2, nrdf2[ai]);
 +                    nrdf2[ai] -= imin;
 +                    nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5*imin;
 +                    nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5*imin;
 +                }
 +                ia += 4;
 +                i  += 4;
 +            }
 +            as += molt->atoms.nr;
 +        }
 +    }
 +
 +    if (ir->ePull == epullCONSTRAINT)
 +    {
 +        /* Correct nrdf for the COM constraints.
 +         * We correct using the TC and VCM group of the first atom
 +         * in the reference and pull group. If atoms in one pull group
 +         * belong to different TC or VCM groups it is anyhow difficult
 +         * to determine the optimal nrdf assignment.
 +         */
 +        pull = ir->pull;
 +        if (pull->eGeom == epullgPOS)
 +        {
 +            nc = 0;
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (pull->dim[i])
 +                {
 +                    nc++;
 +                }
 +            }
 +        }
 +        else
 +        {
 +            nc = 1;
 +        }
 +        for (i = 0; i < pull->ngrp; i++)
 +        {
 +            imin = 2*nc;
 +            if (pull->grp[0].nat > 0)
 +            {
 +                /* Subtract 1/2 dof from the reference group */
 +                ai = pull->grp[0].ind[0];
 +                if (nrdf_tc[ggrpnr(groups, egcTC, ai)] > 1)
 +                {
 +                    nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5;
 +                    nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5;
 +                    imin--;
 +                }
 +            }
 +            /* Subtract the remaining constraint dofs from the pulled group */
 +            ai = pull->grp[1+i].ind[0];
 +            nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5*imin;
 +            nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5*imin;
 +            if (nrdf_tc[ggrpnr(groups, egcTC, ai)] < 0)
 +            {
 +                gmx_fatal(FARGS, "Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative", gnames[groups->grps[egcTC].nm_ind[ggrpnr(groups, egcTC, ai)]]);
 +            }
 +        }
 +    }
 +
 +    if (ir->nstcomm != 0)
 +    {
 +        /* Subtract 3 from the number of degrees of freedom in each vcm group
 +         * when com translation is removed and 6 when rotation is removed
 +         * as well.
 +         */
 +        switch (ir->comm_mode)
 +        {
 +            case ecmLINEAR:
 +                n_sub = ndof_com(ir);
 +                break;
 +            case ecmANGULAR:
 +                n_sub = 6;
 +                break;
 +            default:
 +                n_sub = 0;
 +                gmx_incons("Checking comm_mode");
 +        }
 +
 +        for (i = 0; i < groups->grps[egcTC].nr; i++)
 +        {
 +            /* Count the number of atoms of TC group i for every VCM group */
 +            for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
 +            {
 +                na_vcm[j] = 0;
 +            }
 +            na_tot = 0;
 +            for (ai = 0; ai < natoms; ai++)
 +            {
 +                if (ggrpnr(groups, egcTC, ai) == i)
 +                {
 +                    na_vcm[ggrpnr(groups, egcVCM, ai)]++;
 +                    na_tot++;
 +                }
 +            }
 +            /* Correct for VCM removal according to the fraction of each VCM
 +             * group present in this TC group.
 +             */
 +            nrdf_uc = nrdf_tc[i];
 +            if (debug)
 +            {
 +                fprintf(debug, "T-group[%d] nrdf_uc = %g, n_sub = %g\n",
 +                        i, nrdf_uc, n_sub);
 +            }
 +            nrdf_tc[i] = 0;
 +            for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
 +            {
 +                if (nrdf_vcm[j] > n_sub)
 +                {
 +                    nrdf_tc[i] += nrdf_uc*((double)na_vcm[j]/(double)na_tot)*
 +                        (nrdf_vcm[j] - n_sub)/nrdf_vcm[j];
 +                }
 +                if (debug)
 +                {
 +                    fprintf(debug, "  nrdf_vcm[%d] = %g, nrdf = %g\n",
 +                            j, nrdf_vcm[j], nrdf_tc[i]);
 +                }
 +            }
 +        }
 +    }
 +    for (i = 0; (i < groups->grps[egcTC].nr); i++)
 +    {
 +        opts->nrdf[i] = nrdf_tc[i];
 +        if (opts->nrdf[i] < 0)
 +        {
 +            opts->nrdf[i] = 0;
 +        }
 +        fprintf(stderr,
 +                "Number of degrees of freedom in T-Coupling group %s is %.2f\n",
 +                gnames[groups->grps[egcTC].nm_ind[i]], opts->nrdf[i]);
 +    }
 +
 +    sfree(nrdf2);
 +    sfree(nrdf_tc);
 +    sfree(nrdf_vcm);
 +    sfree(na_vcm);
 +}
 +
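 +/* Parse a string of the form "n a1 phi1 ... an phin" into n cosines,
 + * e.g. "1 0.05 0" gives a single component with amplitude 0.05 and
 + * phase 0. An empty string or n <= 0 yields no components.
 + */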
 +static void decode_cos(char *s, t_cosines *cosine, gmx_bool bTime)
 +{
 +    char   *t;
 +    char    format[STRLEN], f1[STRLEN];
 +    double  a, phi;
 +    int     i;
 +
 +    t = strdup(s);
 +    trim(t);
 +
 +    cosine->n   = 0;
 +    cosine->a   = NULL;
 +    cosine->phi = NULL;
 +    if (strlen(t))
 +    {
 +        sscanf(t, "%d", &(cosine->n));
 +        if (cosine->n <= 0)
 +        {
 +            cosine->n = 0;
 +        }
 +        else
 +        {
 +            snew(cosine->a, cosine->n);
 +            snew(cosine->phi, cosine->n);
 +
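 +            /* Incrementally build a scanf format that skips the fields
 +             * already parsed: "%*d" skips the leading count, and each
 +             * completed (a, phi) pair appends "%*lf%*lf", so the next
 +             * sscanf picks up the following pair. */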
 +            sprintf(format, "%%*d");
 +            for (i = 0; (i < cosine->n); i++)
 +            {
 +                strcpy(f1, format);
 +                strcat(f1, "%lf%lf");
 +                if (sscanf(t, f1, &a, &phi) < 2)
 +                {
 +                    gmx_fatal(FARGS, "Invalid input for electric field shift: '%s'", t);
 +                }
 +                cosine->a[i]   = a;
 +                cosine->phi[i] = phi;
 +                strcat(format, "%*lf%*lf");
 +            }
 +        }
 +    }
 +    sfree(t);
 +}
 +
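 +/* Set 'flag' in the energy-group-pair matrix for every pair of group
 + * names listed in 'val'; e.g. "energygrp-excl = SOL SOL Protein SOL"
 + * flags the SOL-SOL and Protein-SOL pairs (symmetrically in both
 + * directions). Returns whether any pair was set.
 + */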
 +static gmx_bool do_egp_flag(t_inputrec *ir, gmx_groups_t *groups,
 +                            const char *option, const char *val, int flag)
 +{
 +    /* The maximum number of energy group pairs would be MAXPTR*(MAXPTR+1)/2.
 +     * But since this is much larger than STRLEN, such a line can not be parsed.
 +     * The real maximum is the number of names that fit in a string: STRLEN/2.
 +     */
 +#define EGP_MAX (STRLEN/2)
 +    int      nelem, i, j, k, nr;
 +    char    *names[EGP_MAX];
 +    char  ***gnames;
 +    gmx_bool bSet;
 +
 +    gnames = groups->grpname;
 +
 +    nelem = str_nelem(val, EGP_MAX, names);
 +    if (nelem % 2 != 0)
 +    {
 +        gmx_fatal(FARGS, "The number of groups for %s is odd", option);
 +    }
 +    nr   = groups->grps[egcENER].nr;
 +    bSet = FALSE;
 +    for (i = 0; i < nelem/2; i++)
 +    {
 +        j = 0;
 +        while ((j < nr) &&
 +               gmx_strcasecmp(names[2*i], *(gnames[groups->grps[egcENER].nm_ind[j]])))
 +        {
 +            j++;
 +        }
 +        if (j == nr)
 +        {
 +            gmx_fatal(FARGS, "%s in %s is not an energy group\n",
 +                      names[2*i], option);
 +        }
 +        k = 0;
 +        while ((k < nr) &&
 +               gmx_strcasecmp(names[2*i+1], *(gnames[groups->grps[egcENER].nm_ind[k]])))
 +        {
 +            k++;
 +        }
 +        if (k == nr)
 +        {
 +            gmx_fatal(FARGS, "%s in %s is not an energy group\n",
 +                      names[2*i+1], option);
 +        }
 +        if ((j < nr) && (k < nr))
 +        {
 +            ir->opts.egp_flags[nr*j+k] |= flag;
 +            ir->opts.egp_flags[nr*k+j] |= flag;
 +            bSet = TRUE;
 +        }
 +    }
 +
 +    return bSet;
 +}
 +
 +void do_index(const char *mdparin, const char *ndx,
 +              gmx_mtop_t *mtop,
 +              gmx_bool bVerbose,
 +              t_inputrec *ir, rvec *v,
 +              warninp_t wi)
 +{
 +    t_blocka     *grps;
 +    gmx_groups_t *groups;
 +    int           natoms;
 +    t_symtab     *symtab;
 +    t_atoms       atoms_all;
 +    char        **gnames;
 +    int           nr, ntcg, ntau_t, nref_t, nacc, nofg, nSA, nSA_points, nSA_time, nSA_temp;
 +    real          tau_min;
 +    int           nstcmin;
 +    int           nacg, nfreeze, nfrdim, nenergy, nvcm, nuser;
 +    char         *ptr1[MAXPTR], *ptr2[MAXPTR], *ptr3[MAXPTR];
 +    int           i, j, k, restnm;
 +    real          SAtime;
 +    gmx_bool      bExcl, bTable, bSetTCpar, bAnneal, bRest;
 +    int           nQMmethod, nQMbasis, nQMcharge, nQMmult, nbSH, nCASorb, nCASelec,
 +                  nSAon, nSAoff, nSAsteps, nQMg, nbOPT, nbTS;
 +    char          warn_buf[STRLEN];
 +
 +    if (bVerbose)
 +    {
 +        fprintf(stderr, "processing index file...\n");
 +    }
 +    debug_gmx();
 +    if (ndx == NULL)
 +    {
 +        snew(grps, 1);
 +        snew(grps->index, 1);
 +        snew(gnames, 1);
 +        atoms_all = gmx_mtop_global_atoms(mtop);
 +        analyse(&atoms_all, grps, &gnames, FALSE, TRUE);
 +        free_t_atoms(&atoms_all, FALSE);
 +    }
 +    else
 +    {
 +        grps = init_index(ndx, &gnames);
 +    }
 +
 +    groups = &mtop->groups;
 +    natoms = mtop->natoms;
 +    symtab = &mtop->symtab;
 +
 +    snew(groups->grpname, grps->nr+1);
 +
 +    for (i = 0; (i < grps->nr); i++)
 +    {
 +        groups->grpname[i] = put_symtab(symtab, gnames[i]);
 +    }
 +    groups->grpname[i] = put_symtab(symtab, "rest");
 +    restnm             = i;
 +    srenew(gnames, grps->nr+1);
 +    gnames[restnm]   = *(groups->grpname[i]);
 +    groups->ngrpname = grps->nr+1;
 +
 +    set_warning_line(wi, mdparin, -1);
 +
 +    ntau_t = str_nelem(tau_t, MAXPTR, ptr1);
 +    nref_t = str_nelem(ref_t, MAXPTR, ptr2);
 +    ntcg   = str_nelem(tcgrps, MAXPTR, ptr3);
 +    if ((ntau_t != ntcg) || (nref_t != ntcg))
 +    {
 +        gmx_fatal(FARGS, "Invalid T coupling input: %d groups, %d ref-t values and "
 +                  "%d tau-t values", ntcg, nref_t, ntau_t);
 +    }
 +
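 +    /* tau-t and ref-t must also be read when the integrator itself acts
 +     * as a thermostat (SD, BD) or needs a reference temperature (TPI),
 +     * even when no separate temperature-coupling scheme is selected. */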
 +    bSetTCpar = (ir->etc || EI_SD(ir->eI) || ir->eI == eiBD || EI_TPI(ir->eI));
 +    do_numbering(natoms, groups, ntcg, ptr3, grps, gnames, egcTC,
 +                 restnm, bSetTCpar ? egrptpALL : egrptpALL_GENREST, bVerbose, wi);
 +    nr            = groups->grps[egcTC].nr;
 +    ir->opts.ngtc = nr;
 +    snew(ir->opts.nrdf, nr);
 +    snew(ir->opts.tau_t, nr);
 +    snew(ir->opts.ref_t, nr);
 +    if (ir->eI == eiBD && ir->bd_fric == 0)
 +    {
 +        fprintf(stderr, "bd-fric=0, so tau-t will be used as the inverse friction constant(s)\n");
 +    }
 +
 +    if (bSetTCpar)
 +    {
 +        if (nr != nref_t)
 +        {
 +            gmx_fatal(FARGS, "Not enough ref-t and tau-t values!");
 +        }
 +
 +        tau_min = 1e20;
 +        for (i = 0; (i < nr); i++)
 +        {
 +            ir->opts.tau_t[i] = strtod(ptr1[i], NULL);
 +            if ((ir->eI == eiBD || ir->eI == eiSD2) && ir->opts.tau_t[i] <= 0)
 +            {
 +                sprintf(warn_buf, "With integrator %s tau-t should be larger than 0", ei_names[ir->eI]);
 +                warning_error(wi, warn_buf);
 +            }
 +
 +            if (ir->etc != etcVRESCALE && ir->opts.tau_t[i] == 0)
 +            {
 +                warning_note(wi, "tau-t = -1 is the value to signal that a group should not have temperature coupling. Treating your use of tau-t = 0 as if you used -1.");
 +            }
 +
 +            if (ir->opts.tau_t[i] >= 0)
 +            {
 +                tau_min = min(tau_min, ir->opts.tau_t[i]);
 +            }
 +        }
 +        if (ir->etc != etcNO && ir->nsttcouple == -1)
 +        {
 +            ir->nsttcouple = ir_optimal_nsttcouple(ir);
 +        }
 +
 +        if (EI_VV(ir->eI))
 +        {
 +            if ((ir->etc == etcNOSEHOOVER) && (ir->epc == epcBERENDSEN))
 +            {
 +                gmx_fatal(FARGS, "Cannot do Nose-Hoover temperature with Berendsen pressure control with md-vv; use either v-rescale temperature with Berendsen pressure or Nose-Hoover temperature with MTTK pressure");
 +            }
 +            if ((ir->epc == epcMTTK) && (ir->etc > etcNO))
 +            {
 +                if (ir->nstpcouple != ir->nsttcouple)
 +                {
 +                    int mincouple = min(ir->nstpcouple, ir->nsttcouple);
 +                    ir->nstpcouple = ir->nsttcouple = mincouple;
 +                    sprintf(warn_buf, "For current Trotter decomposition methods with md-vv, nsttcouple and nstpcouple must be equal. Both have been reset to min(nsttcouple,nstpcouple) = %d", mincouple);
 +                    warning_note(wi, warn_buf);
 +                }
 +            }
 +        }
 +        /* velocity verlet with averaged kinetic energy KE = 0.5*(v(t+1/2) - v(t-1/2)) is implemented
 +           primarily for testing purposes, and does not work with temperature coupling other than 1 */
 +
 +        if (ETC_ANDERSEN(ir->etc))
 +        {
 +            if (ir->nsttcouple != 1)
 +            {
 +                ir->nsttcouple = 1;
 +                sprintf(warn_buf, "Andersen temperature control methods assume nsttcouple = 1; there is no need for nsttcouple > 1, since no global parameters are computed. nsttcouple has been reset to 1");
 +                warning_note(wi, warn_buf);
 +            }
 +        }
 +        nstcmin = tcouple_min_integration_steps(ir->etc);
 +        if (nstcmin > 1)
 +        {
 +            if (tau_min/(ir->delta_t*ir->nsttcouple) < nstcmin)
 +            {
 +                sprintf(warn_buf, "For proper integration of the %s thermostat, tau-t (%g) should be at least %d times larger than nsttcouple*dt (%g)",
 +                        ETCOUPLTYPE(ir->etc),
 +                        tau_min, nstcmin,
 +                        ir->nsttcouple*ir->delta_t);
 +                warning(wi, warn_buf);
 +            }
 +        }
 +        for (i = 0; (i < nr); i++)
 +        {
 +            ir->opts.ref_t[i] = strtod(ptr2[i], NULL);
 +            if (ir->opts.ref_t[i] < 0)
 +            {
 +                gmx_fatal(FARGS, "ref-t for group %d is negative", i);
 +            }
 +        }
 +        /* set the lambda mc temperature to the md integrator temperature (which should be defined
 +           if we are in this conditional) if mc_temp is negative */
 +        if (ir->expandedvals->mc_temp < 0)
 +        {
 +            ir->expandedvals->mc_temp = ir->opts.ref_t[0]; /*for now, set to the first reft */
 +        }
 +    }
 +
 +    /* Simulated annealing for each group. There are nr groups */
 +    nSA = str_nelem(anneal, MAXPTR, ptr1);
 +    if (nSA == 1 && (ptr1[0][0] == 'n' || ptr1[0][0] == 'N'))
 +    {
 +        nSA = 0;
 +    }
 +    if (nSA > 0 && nSA != nr)
 +    {
 +        gmx_fatal(FARGS, "Not enough annealing values: %d (for %d groups)\n", nSA, nr);
 +    }
 +    else
 +    {
 +        snew(ir->opts.annealing, nr);
 +        snew(ir->opts.anneal_npoints, nr);
 +        snew(ir->opts.anneal_time, nr);
 +        snew(ir->opts.anneal_temp, nr);
 +        for (i = 0; i < nr; i++)
 +        {
 +            ir->opts.annealing[i]      = eannNO;
 +            ir->opts.anneal_npoints[i] = 0;
 +            ir->opts.anneal_time[i]    = NULL;
 +            ir->opts.anneal_temp[i]    = NULL;
 +        }
 +        if (nSA > 0)
 +        {
 +            bAnneal = FALSE;
 +            for (i = 0; i < nr; i++)
 +            {
 +                if (ptr1[i][0] == 'n' || ptr1[i][0] == 'N')
 +                {
 +                    ir->opts.annealing[i] = eannNO;
 +                }
 +                else if (ptr1[i][0] == 's' || ptr1[i][0] == 'S')
 +                {
 +                    ir->opts.annealing[i] = eannSINGLE;
 +                    bAnneal               = TRUE;
 +                }
 +                else if (ptr1[i][0] == 'p' || ptr1[i][0] == 'P')
 +                {
 +                    ir->opts.annealing[i] = eannPERIODIC;
 +                    bAnneal               = TRUE;
 +                }
 +            }
 +            if (bAnneal)
 +            {
 +                /* Read the other fields too */
 +                nSA_points = str_nelem(anneal_npoints, MAXPTR, ptr1);
 +                if (nSA_points != nSA)
 +                {
 +                    gmx_fatal(FARGS, "Found %d annealing-npoints values for %d groups\n", nSA_points, nSA);
 +                }
 +                for (k = 0, i = 0; i < nr; i++)
 +                {
 +                    ir->opts.anneal_npoints[i] = strtol(ptr1[i], NULL, 10);
 +                    if (ir->opts.anneal_npoints[i] == 1)
 +                    {
 +                        gmx_fatal(FARGS, "Please specify at least a start and an end point for annealing\n");
 +                    }
 +                    snew(ir->opts.anneal_time[i], ir->opts.anneal_npoints[i]);
 +                    snew(ir->opts.anneal_temp[i], ir->opts.anneal_npoints[i]);
 +                    k += ir->opts.anneal_npoints[i];
 +                }
 +
 +                nSA_time = str_nelem(anneal_time, MAXPTR, ptr1);
 +                if (nSA_time != k)
 +                {
 +                    gmx_fatal(FARGS, "Found %d annealing-time values, wanted %d\n", nSA_time, k);
 +                }
 +                nSA_temp = str_nelem(anneal_temp, MAXPTR, ptr2);
 +                if (nSA_temp != k)
 +                {
 +                    gmx_fatal(FARGS, "Found %d annealing-temp values, wanted %d\n", nSA_temp, k);
 +                }
 +
 +                for (i = 0, k = 0; i < nr; i++)
 +                {
 +
 +                    for (j = 0; j < ir->opts.anneal_npoints[i]; j++)
 +                    {
 +                        ir->opts.anneal_time[i][j] = strtod(ptr1[k], NULL);
 +                        ir->opts.anneal_temp[i][j] = strtod(ptr2[k], NULL);
 +                        if (j == 0)
 +                        {
 +                            if (ir->opts.anneal_time[i][0] > (ir->init_t+GMX_REAL_EPS))
 +                            {
 +                                gmx_fatal(FARGS, "First time point for annealing > init_t.\n");
 +                            }
 +                        }
 +                        else
 +                        {
 +                            /* j>0 */
 +                            if (ir->opts.anneal_time[i][j] < ir->opts.anneal_time[i][j-1])
 +                            {
 +                                gmx_fatal(FARGS, "Annealing timepoints out of order: t=%f comes after t=%f\n",
 +                                          ir->opts.anneal_time[i][j], ir->opts.anneal_time[i][j-1]);
 +                            }
 +                        }
 +                        if (ir->opts.anneal_temp[i][j] < 0)
 +                        {
 +                            gmx_fatal(FARGS, "Found negative temperature in annealing: %f\n", ir->opts.anneal_temp[i][j]);
 +                        }
 +                        k++;
 +                    }
 +                }
 +                /* Print out some summary information, to make sure we got it right */
 +                for (i = 0, k = 0; i < nr; i++)
 +                {
 +                    if (ir->opts.annealing[i] != eannNO)
 +                    {
 +                        j = groups->grps[egcTC].nm_ind[i];
 +                        fprintf(stderr, "Simulated annealing for group %s: %s, %d timepoints\n",
 +                                *(groups->grpname[j]), eann_names[ir->opts.annealing[i]],
 +                                ir->opts.anneal_npoints[i]);
 +                        fprintf(stderr, "Time (ps)   Temperature (K)\n");
 +                        /* All terms except the last one */
 +                        for (j = 0; j < (ir->opts.anneal_npoints[i]-1); j++)
 +                        {
 +                            fprintf(stderr, "%9.1f      %5.1f\n", ir->opts.anneal_time[i][j], ir->opts.anneal_temp[i][j]);
 +                        }
 +
 +                        /* Finally the last one */
 +                        j = ir->opts.anneal_npoints[i]-1;
 +                        if (ir->opts.annealing[i] == eannSINGLE)
 +                        {
 +                            fprintf(stderr, "%9.1f-     %5.1f\n", ir->opts.anneal_time[i][j], ir->opts.anneal_temp[i][j]);
 +                        }
 +                        else
 +                        {
 +                            fprintf(stderr, "%9.1f      %5.1f\n", ir->opts.anneal_time[i][j], ir->opts.anneal_temp[i][j]);
 +                            if (fabs(ir->opts.anneal_temp[i][j]-ir->opts.anneal_temp[i][0]) > GMX_REAL_EPS)
 +                            {
 +                                warning_note(wi, "There is a temperature jump when your annealing loops back.\n");
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        make_pull_groups(ir->pull, pull_grp, grps, gnames);
 +    }
 +
 +    if (ir->bRot)
 +    {
 +        make_rotation_groups(ir->rot, rot_grp, grps, gnames);
 +    }
 +
 +    nacc = str_nelem(acc, MAXPTR, ptr1);
 +    nacg = str_nelem(accgrps, MAXPTR, ptr2);
 +    if (nacg*DIM != nacc)
 +    {
 +        gmx_fatal(FARGS, "Invalid Acceleration input: %d groups and %d acc. values",
 +                  nacg, nacc);
 +    }
 +    do_numbering(natoms, groups, nacg, ptr2, grps, gnames, egcACC,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nr = groups->grps[egcACC].nr;
 +    snew(ir->opts.acc, nr);
 +    ir->opts.ngacc = nr;
 +
 +    for (i = k = 0; (i < nacg); i++)
 +    {
 +        for (j = 0; (j < DIM); j++, k++)
 +        {
 +            ir->opts.acc[i][j] = strtod(ptr1[k], NULL);
 +        }
 +    }
 +    for (; (i < nr); i++)
 +    {
 +        for (j = 0; (j < DIM); j++)
 +        {
 +            ir->opts.acc[i][j] = 0;
 +        }
 +    }
 +
 +    nfrdim  = str_nelem(frdim, MAXPTR, ptr1);
 +    nfreeze = str_nelem(freeze, MAXPTR, ptr2);
 +    if (nfrdim != DIM*nfreeze)
 +    {
 +        gmx_fatal(FARGS, "Invalid Freezing input: %d groups and %d freeze values",
 +                  nfreeze, nfrdim);
 +    }
 +    do_numbering(natoms, groups, nfreeze, ptr2, grps, gnames, egcFREEZE,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nr             = groups->grps[egcFREEZE].nr;
 +    ir->opts.ngfrz = nr;
 +    snew(ir->opts.nFreeze, nr);
 +    for (i = k = 0; (i < nfreeze); i++)
 +    {
 +        for (j = 0; (j < DIM); j++, k++)
 +        {
 +            ir->opts.nFreeze[i][j] = (gmx_strncasecmp(ptr1[k], "Y", 1) == 0);
 +            if (!ir->opts.nFreeze[i][j])
 +            {
 +                if (gmx_strncasecmp(ptr1[k], "N", 1) != 0)
 +                {
 +                    sprintf(warn_buf, "Please use Y(ES) or N(O) for freezedim only "
 +                            "(not %s)", ptr1[k]);
 +                    warning(wi, warn_buf);
 +                }
 +            }
 +        }
 +    }
 +    for (; (i < nr); i++)
 +    {
 +        for (j = 0; (j < DIM); j++)
 +        {
 +            ir->opts.nFreeze[i][j] = 0;
 +        }
 +    }
 +
 +    nenergy = str_nelem(energy, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nenergy, ptr1, grps, gnames, egcENER,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    add_wall_energrps(groups, ir->nwall, symtab);
 +    ir->opts.ngener = groups->grps[egcENER].nr;
 +    nvcm            = str_nelem(vcm, MAXPTR, ptr1);
 +    bRest           =
 +        do_numbering(natoms, groups, nvcm, ptr1, grps, gnames, egcVCM,
 +                     restnm, nvcm == 0 ? egrptpALL_GENREST : egrptpPART, bVerbose, wi);
 +    if (bRest)
 +    {
 +        warning(wi, "Some atoms are not part of any center of mass motion removal group.\n"
 +                "This may lead to artifacts.\n"
 +                "In most cases one should use one group for the whole system.");
 +    }
 +
 +    /* Now we have filled the freeze struct, so we can calculate NRDF */
 +    calc_nrdf(mtop, ir, gnames);
 +
 +    /* Note that "v && NULL" is always FALSE, so the velocity rescaling
 +     * below is currently disabled. */
 +    if (v && NULL)
 +    {
 +        real fac, ntot = 0;
 +
 +        /* Must check per group! */
 +        for (i = 0; (i < ir->opts.ngtc); i++)
 +        {
 +            ntot += ir->opts.nrdf[i];
 +        }
 +        if (ntot != (DIM*natoms))
 +        {
 +            fac = sqrt(ntot/(DIM*natoms));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "Scaling velocities by a factor of %.3f to account for constraints\n"
 +                        "and removal of center of mass motion\n", fac);
 +            }
 +            for (i = 0; (i < natoms); i++)
 +            {
 +                svmul(fac, v[i], v[i]);
 +            }
 +        }
 +    }
 +
 +    nuser = str_nelem(user1, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nuser, ptr1, grps, gnames, egcUser1,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nuser = str_nelem(user2, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nuser, ptr1, grps, gnames, egcUser2,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nuser = str_nelem(xtc_grps, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nuser, ptr1, grps, gnames, egcXTC,
 +                 restnm, egrptpONE, bVerbose, wi);
 +    nofg = str_nelem(orirefitgrp, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nofg, ptr1, grps, gnames, egcORFIT,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +
 +    /* QMMM input processing */
 +    nQMg          = str_nelem(QMMM, MAXPTR, ptr1);
 +    nQMmethod     = str_nelem(QMmethod, MAXPTR, ptr2);
 +    nQMbasis      = str_nelem(QMbasis, MAXPTR, ptr3);
 +    if ((nQMmethod != nQMg) || (nQMbasis != nQMg))
 +    {
 +        gmx_fatal(FARGS, "Invalid QMMM input: %d groups, %d basis sets"
 +                  " and %d methods\n", nQMg, nQMbasis, nQMmethod);
 +    }
 +    /* group rest, if any, is always MM! */
 +    do_numbering(natoms, groups, nQMg, ptr1, grps, gnames, egcQMMM,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nr            = nQMg; /*atoms->grps[egcQMMM].nr;*/
 +    ir->opts.ngQM = nQMg;
 +    snew(ir->opts.QMmethod, nr);
 +    snew(ir->opts.QMbasis, nr);
 +    for (i = 0; i < nr; i++)
 +    {
 +        /* input consists of strings: RHF CASSCF PM3 .. These need to be
 +         * converted to the corresponding enum in names.c
 +         */
 +        ir->opts.QMmethod[i] = search_QMstring(ptr2[i], eQMmethodNR,
 +                                               eQMmethod_names);
 +        ir->opts.QMbasis[i]  = search_QMstring(ptr3[i], eQMbasisNR,
 +                                               eQMbasis_names);
 +
 +    }
 +    nQMmult   = str_nelem(QMmult, MAXPTR, ptr1);
 +    nQMcharge = str_nelem(QMcharge, MAXPTR, ptr2);
 +    nbSH      = str_nelem(bSH, MAXPTR, ptr3);
 +    snew(ir->opts.QMmult, nr);
 +    snew(ir->opts.QMcharge, nr);
 +    snew(ir->opts.bSH, nr);
 +
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.QMmult[i]   = strtol(ptr1[i], NULL, 10);
 +        ir->opts.QMcharge[i] = strtol(ptr2[i], NULL, 10);
 +        ir->opts.bSH[i]      = (gmx_strncasecmp(ptr3[i], "Y", 1) == 0);
 +    }
 +
 +    nCASelec  = str_nelem(CASelectrons, MAXPTR, ptr1);
 +    nCASorb   = str_nelem(CASorbitals, MAXPTR, ptr2);
 +    snew(ir->opts.CASelectrons, nr);
 +    snew(ir->opts.CASorbitals, nr);
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.CASelectrons[i] = strtol(ptr1[i], NULL, 10);
 +        ir->opts.CASorbitals[i]  = strtol(ptr2[i], NULL, 10);
 +    }
 +    /* special optimization options */
 +
 +    nbOPT = str_nelem(bOPT, MAXPTR, ptr1);
 +    nbTS  = str_nelem(bTS, MAXPTR, ptr2);
 +    snew(ir->opts.bOPT, nr);
 +    snew(ir->opts.bTS, nr);
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.bOPT[i] = (gmx_strncasecmp(ptr1[i], "Y", 1) == 0);
 +        ir->opts.bTS[i]  = (gmx_strncasecmp(ptr2[i], "Y", 1) == 0);
 +    }
 +    nSAon     = str_nelem(SAon, MAXPTR, ptr1);
 +    nSAoff    = str_nelem(SAoff, MAXPTR, ptr2);
 +    nSAsteps  = str_nelem(SAsteps, MAXPTR, ptr3);
 +    snew(ir->opts.SAon, nr);
 +    snew(ir->opts.SAoff, nr);
 +    snew(ir->opts.SAsteps, nr);
 +
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.SAon[i]    = strtod(ptr1[i], NULL);
 +        ir->opts.SAoff[i]   = strtod(ptr2[i], NULL);
 +        ir->opts.SAsteps[i] = strtol(ptr3[i], NULL, 10);
 +    }
 +    /* end of QMMM input */
 +
 +    if (bVerbose)
 +    {
 +        for (i = 0; (i < egcNR); i++)
 +        {
 +            fprintf(stderr, "%-16s has %d element(s):", gtypes[i], groups->grps[i].nr);
 +            for (j = 0; (j < groups->grps[i].nr); j++)
 +            {
 +                fprintf(stderr, " %s", *(groups->grpname[groups->grps[i].nm_ind[j]]));
 +            }
 +            fprintf(stderr, "\n");
 +        }
 +    }
 +
 +    nr = groups->grps[egcENER].nr;
 +    snew(ir->opts.egp_flags, nr*nr);
 +
 +    bExcl = do_egp_flag(ir, groups, "energygrp-excl", egpexcl, EGP_EXCL);
 +    if (bExcl && ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        warning_error(wi, "Energy group exclusions are not (yet) implemented for the Verlet scheme");
 +    }
 +    if (bExcl && EEL_FULL(ir->coulombtype))
 +    {
 +        warning(wi, "Can not exclude the lattice Coulomb energy between energy groups");
 +    }
 +
 +    bTable = do_egp_flag(ir, groups, "energygrp-table", egptable, EGP_TABLE);
 +    if (bTable && !(ir->vdwtype == evdwUSER) &&
 +        !(ir->coulombtype == eelUSER) && !(ir->coulombtype == eelPMEUSER) &&
 +        !(ir->coulombtype == eelPMEUSERSWITCH))
 +    {
 +        gmx_fatal(FARGS, "Can only have energy group pair tables in combination with user tables for VdW and/or Coulomb");
 +    }
 +
 +    decode_cos(efield_x, &(ir->ex[XX]), FALSE);
 +    decode_cos(efield_xt, &(ir->et[XX]), TRUE);
 +    decode_cos(efield_y, &(ir->ex[YY]), FALSE);
 +    decode_cos(efield_yt, &(ir->et[YY]), TRUE);
 +    decode_cos(efield_z, &(ir->ex[ZZ]), FALSE);
 +    decode_cos(efield_zt, &(ir->et[ZZ]), TRUE);
 +
 +    if (ir->bAdress)
 +    {
 +        do_adress_index(ir->adress, groups, gnames, &(ir->opts), wi);
 +    }
 +
 +    for (i = 0; (i < grps->nr); i++)
 +    {
 +        sfree(gnames[i]);
 +    }
 +    sfree(gnames);
 +    done_blocka(grps);
 +    sfree(grps);
 +
 +}
 +
 +
 +
 +static void check_disre(gmx_mtop_t *mtop)
 +{
 +    gmx_ffparams_t *ffparams;
 +    t_functype     *functype;
 +    t_iparams      *ip;
 +    int             i, ndouble, ftype;
 +    int             label, old_label;
 +
 +    if (gmx_mtop_ftype_count(mtop, F_DISRES) > 0)
 +    {
 +        ffparams  = &mtop->ffparams;
 +        functype  = ffparams->functype;
 +        ip        = ffparams->iparams;
 +        ndouble   = 0;
 +        old_label = -1;
 +        for (i = 0; i < ffparams->ntypes; i++)
 +        {
 +            ftype = functype[i];
 +            if (ftype == F_DISRES)
 +            {
 +                label = ip[i].disres.label;
 +                if (label == old_label)
 +                {
 +                    fprintf(stderr, "Distance restraint index %d occurs twice\n", label);
 +                    ndouble++;
 +                }
 +                old_label = label;
 +            }
 +        }
 +        if (ndouble > 0)
 +        {
 +            gmx_fatal(FARGS, "Found %d duplicate distance restraint indices,\n"
 +                      "probably the parameters for multiple pairs in one restraint "
 +                      "are not identical\n", ndouble);
 +        }
 +    }
 +}
 +
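 +/* Determine for each dimension whether the system has an absolute
 + * reference point: a dimension in which the center of mass cannot move
 + * freely, frozen dimensions, or (flat-bottom) position restraints.
 + * With posres_only, only the restraints are considered. Returns TRUE
 + * only when all three dimensions are absolutely referenced.
 + */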
 +static gmx_bool absolute_reference(t_inputrec *ir, gmx_mtop_t *sys,
 +                                   gmx_bool posres_only,
 +                                   ivec AbsRef)
 +{
 +    int                  d, g, i;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist             *ilist;
 +    int                  nmol;
 +    t_iparams           *pr;
 +
 +    clear_ivec(AbsRef);
 +
 +    if (!posres_only)
 +    {
 +        /* Check the COM */
 +        for (d = 0; d < DIM; d++)
 +        {
 +            AbsRef[d] = (d < ndof_com(ir) ? 0 : 1);
 +        }
 +        /* Check for freeze groups */
 +        for (g = 0; g < ir->opts.ngfrz; g++)
 +        {
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if (ir->opts.nFreeze[g][d] != 0)
 +                {
 +                    AbsRef[d] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Check for position restraints */
 +    iloop = gmx_mtop_ilistloop_init(sys);
 +    while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
 +    {
 +        if (nmol > 0 &&
 +            (AbsRef[XX] == 0 || AbsRef[YY] == 0 || AbsRef[ZZ] == 0))
 +        {
 +            for (i = 0; i < ilist[F_POSRES].nr; i += 2)
 +            {
 +                pr = &sys->ffparams.iparams[ilist[F_POSRES].iatoms[i]];
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    if (pr->posres.fcA[d] != 0)
 +                    {
 +                        AbsRef[d] = 1;
 +                    }
 +                }
 +            }
 +            for (i = 0; i < ilist[F_FBPOSRES].nr; i += 2)
 +            {
 +                /* Check for flat-bottom posres */
 +                pr = &sys->ffparams.iparams[ilist[F_FBPOSRES].iatoms[i]];
 +                if (pr->fbposres.k != 0)
 +                {
 +                    switch (pr->fbposres.geom)
 +                    {
 +                        case efbposresSPHERE:
 +                            AbsRef[XX] = AbsRef[YY] = AbsRef[ZZ] = 1;
 +                            break;
 +                        case efbposresCYLINDER:
 +                            AbsRef[XX] = AbsRef[YY] = 1;
 +                            break;
 +                        case efbposresX: /* d=XX */
 +                        case efbposresY: /* d=YY */
 +                        case efbposresZ: /* d=ZZ */
 +                            d         = pr->fbposres.geom - efbposresX;
 +                            AbsRef[d] = 1;
 +                            break;
 +                        default:
 +                            gmx_fatal(FARGS, "Invalid geometry for flat-bottom position restraint.\n"
 +                                      "Expected nr between 1 and %d. Found %d\n", efbposresNR-1,
 +                                      pr->fbposres.geom);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return (AbsRef[XX] != 0 && AbsRef[YY] != 0 && AbsRef[ZZ] != 0);
 +}
 +
 +void triple_check(const char *mdparin, t_inputrec *ir, gmx_mtop_t *sys,
 +                  warninp_t wi)
 +{
 +    char                      err_buf[256];
 +    int                       i, m, g, nmol, npct;
 +    gmx_bool                  bCharge, bAcc;
 +    real                      gdt_max, *mgrp, mt;
 +    rvec                      acc;
 +    gmx_mtop_atomloop_block_t aloopb;
 +    gmx_mtop_atomloop_all_t   aloop;
 +    t_atom                   *atom;
 +    ivec                      AbsRef;
 +    char                      warn_buf[STRLEN];
 +
 +    set_warning_line(wi, mdparin, -1);
 +
 +    if (EI_DYNAMICS(ir->eI) && !EI_SD(ir->eI) && ir->eI != eiBD &&
 +        ir->comm_mode == ecmNO &&
 +        !(absolute_reference(ir, sys, FALSE, AbsRef) || ir->nsteps <= 10))
 +    {
 +        warning(wi, "You are not using center of mass motion removal (mdp option comm-mode), numerical rounding errors can lead to build up of kinetic energy of the center of mass");
 +    }
 +
 +    /* Check for pressure coupling with absolute position restraints */
 +    if (ir->epc != epcNO && ir->refcoord_scaling == erscNO)
 +    {
 +        absolute_reference(ir, sys, TRUE, AbsRef);
 +        for (m = 0; m < DIM; m++)
 +        {
 +            if (AbsRef[m] && norm2(ir->compress[m]) > 0)
 +            {
 +                warning(wi, "You are using pressure coupling with absolute position restraints; this will give artifacts. Use the refcoord_scaling option.");
 +                break;
 +            }
 +        }
 +    }
 +
 +    bCharge = FALSE;
 +    aloopb  = gmx_mtop_atomloop_block_init(sys);
 +    while (gmx_mtop_atomloop_block_next(aloopb, &atom, &nmol))
 +    {
 +        if (atom->q != 0 || atom->qB != 0)
 +        {
 +            bCharge = TRUE;
 +        }
 +    }
 +
 +    if (!bCharge)
 +    {
 +        if (EEL_FULL(ir->coulombtype))
 +        {
 +            sprintf(err_buf,
 +                    "You are using full electrostatics treatment %s for a system without charges.\n"
 +                    "This costs a lot of performance for just processing zeros; consider using %s instead.\n",
 +                    EELTYPE(ir->coulombtype), EELTYPE(eelCUT));
 +            warning(wi, err_buf);
 +        }
 +    }
 +    else
 +    {
 +        if (ir->coulombtype == eelCUT && ir->rcoulomb > 0 && !ir->implicit_solvent)
 +        {
 +            sprintf(err_buf,
 +                    "You are using a plain Coulomb cut-off, which might produce artifacts.\n"
 +                    "You might want to consider using %s electrostatics.\n",
 +                    EELTYPE(eelPME));
 +            warning_note(wi, err_buf);
 +        }
 +    }
 +
 +    /* Generalized reaction field */
 +    if (ir->opts.ngtc == 0)
 +    {
 +        sprintf(err_buf, "No temperature coupling while using coulombtype %s",
 +                eel_names[eelGRF]);
 +        CHECK(ir->coulombtype == eelGRF);
 +    }
 +    else
 +    {
 +        sprintf(err_buf, "When using coulombtype = %s"
 +                " ref-t for temperature coupling should be > 0",
 +                eel_names[eelGRF]);
 +        CHECK((ir->coulombtype == eelGRF) && (ir->opts.ref_t[0] <= 0));
 +    }
 +
 +    if (ir->eI == eiSD1 &&
 +        (gmx_mtop_ftype_count(sys, F_CONSTR) > 0 ||
 +         gmx_mtop_ftype_count(sys, F_SETTLE) > 0))
 +    {
 +        sprintf(warn_buf, "With constraints, integrator %s is less accurate; consider using %s instead", ei_names[ir->eI], ei_names[eiSD2]);
 +        warning_note(wi, warn_buf);
 +    }
 +
 +    bAcc = FALSE;
 +    for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
 +    {
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            if (fabs(ir->opts.acc[i][m]) > 1e-6)
 +            {
 +                bAcc = TRUE;
 +            }
 +        }
 +    }
 +    if (bAcc)
 +    {
 +        clear_rvec(acc);
 +        snew(mgrp, sys->groups.grps[egcACC].nr);
 +        aloop = gmx_mtop_atomloop_all_init(sys);
 +        while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
 +        {
 +            mgrp[ggrpnr(&sys->groups, egcACC, i)] += atom->m;
 +        }
 +        mt = 0.0;
 +        for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
 +        {
 +            for (m = 0; (m < DIM); m++)
 +            {
 +                acc[m] += ir->opts.acc[i][m]*mgrp[i];
 +            }
 +            mt += mgrp[i];
 +        }
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            if (fabs(acc[m]) > 1e-6)
 +            {
 +                const char *dim[DIM] = { "X", "Y", "Z" };
 +                fprintf(stderr,
 +                        "Net acceleration in %s direction, will %s be corrected\n",
 +                        dim[m], ir->nstcomm != 0 ? "" : "not");
 +                if (ir->nstcomm != 0 && m < ndof_com(ir))
 +                {
 +                    acc[m] /= mt;
 +                    for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
 +                    {
 +                        ir->opts.acc[i][m] -= acc[m];
 +                    }
 +                }
 +            }
 +        }
 +        sfree(mgrp);
 +    }
 +
 +    if (ir->efep != efepNO && ir->fepvals->sc_alpha != 0 &&
 +        !gmx_within_tol(sys->ffparams.reppow, 12.0, 10*GMX_DOUBLE_EPS))
 +    {
 +        gmx_fatal(FARGS, "Soft-core interactions are only supported with VdW repulsion power 12");
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        if (ir->pull->grp[0].nat == 0)
 +        {
 +            absolute_reference(ir, sys, FALSE, AbsRef);
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (ir->pull->dim[m] && !AbsRef[m])
 +                {
 +                    warning(wi, "You are using an absolute reference for pulling, but the rest of the system does not have an absolute reference. This will lead to artifacts.");
 +                    break;
 +                }
 +            }
 +        }
 +
 +        if (ir->pull->eGeom == epullgDIRPBC)
 +        {
 +            for (i = 0; i < 3; i++)
 +            {
 +                for (m = 0; m <= i; m++)
 +                {
 +                    if ((ir->epc != epcNO && ir->compress[i][m] != 0) ||
 +                        ir->deform[i][m] != 0)
 +                    {
 +                        for (g = 1; g < ir->pull->ngrp; g++)
 +                        {
 +                            if (ir->pull->grp[g].vec[m] != 0)
 +                            {
 +                                gmx_fatal(FARGS, "Can not have dynamic box while using pull geometry '%s' (dim %c)", EPULLGEOM(ir->pull->eGeom), 'x'+m);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    check_disre(sys);
 +}
 +
 +void double_check(t_inputrec *ir, matrix box, gmx_bool bConstr, warninp_t wi)
 +{
 +    real        min_size;
 +    gmx_bool    bTWIN;
 +    char        warn_buf[STRLEN];
 +    const char *ptr;
 +
 +    ptr = check_box(ir->ePBC, box);
 +    if (ptr)
 +    {
 +        warning_error(wi, ptr);
 +    }
 +
 +    if (bConstr && ir->eConstrAlg == econtSHAKE)
 +    {
 +        if (ir->shake_tol <= 0.0)
 +        {
 +            sprintf(warn_buf, "ERROR: shake-tol must be > 0 instead of %g\n",
 +                    ir->shake_tol);
 +            warning_error(wi, warn_buf);
 +        }
 +
 +        if (IR_TWINRANGE(*ir) && ir->nstlist > 1)
 +        {
 +            sprintf(warn_buf, "With twin-range cut-offs and SHAKE the virial and the pressure are incorrect.");
 +            if (ir->epc == epcNO)
 +            {
 +                warning(wi, warn_buf);
 +            }
 +            else
 +            {
 +                warning_error(wi, warn_buf);
 +            }
 +        }
 +    }
 +
 +    if ((ir->eConstrAlg == econtLINCS) && bConstr)
 +    {
 +        /* If we have Lincs constraints: */
 +        if (ir->eI == eiMD && ir->etc == etcNO &&
 +            ir->eConstrAlg == econtLINCS && ir->nLincsIter == 1)
 +        {
 +            sprintf(warn_buf, "For energy conservation with LINCS, lincs-iter should be 2 or larger.\n");
 +            warning_note(wi, warn_buf);
 +        }
 +
 +        if ((ir->eI == eiCG || ir->eI == eiLBFGS) && (ir->nProjOrder < 8))
 +        {
 +            sprintf(warn_buf, "For accurate %s with LINCS constraints, lincs-order should be 8 or more.", ei_names[ir->eI]);
 +            warning_note(wi, warn_buf);
 +        }
 +        if (ir->epc == epcMTTK)
 +        {
 +            warning_error(wi, "MTTK is not compatible with LINCS; use SHAKE instead.");
 +        }
 +    }
 +
 +    if (ir->LincsWarnAngle > 90.0)
 +    {
 +        sprintf(warn_buf, "lincs-warnangle can not be larger than 90 degrees, setting it to 90.\n");
 +        warning(wi, warn_buf);
 +        ir->LincsWarnAngle = 90.0;
 +    }
 +
 +    if (ir->ePBC != epbcNONE)
 +    {
 +        if (ir->nstlist == 0)
 +        {
 +            warning(wi, "With nstlist=0 atoms are only put into the box at step 0, therefore drifting atoms might cause the simulation to crash.");
 +        }
 +        bTWIN = (ir->rlistlong > ir->rlist);
 +        if (ir->ns_type == ensGRID)
 +        {
 +            if (sqr(ir->rlistlong) >= max_cutoff2(ir->ePBC, box))
 +            {
 +                sprintf(warn_buf, "ERROR: The cut-off length is longer than half the shortest box vector or longer than the smallest box diagonal element. Increase the box size or decrease %s.\n",
 +                        bTWIN ? (ir->rcoulomb == ir->rlistlong ? "rcoulomb" : "rvdw") : "rlist");
 +                warning_error(wi, warn_buf);
 +            }
 +        }
 +        else
 +        {
 +            min_size = min(box[XX][XX], min(box[YY][YY], box[ZZ][ZZ]));
 +            if (2*ir->rlistlong >= min_size)
 +            {
 +                sprintf(warn_buf, "ERROR: One of the box lengths is smaller than twice the cut-off length. Increase the box size or decrease rlist.");
 +                warning_error(wi, warn_buf);
 +                if (TRICLINIC(box))
 +                {
 +                    fprintf(stderr, "Grid search might allow larger cut-off's than simple search with triclinic boxes.");
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void check_chargegroup_radii(const gmx_mtop_t *mtop, const t_inputrec *ir,
 +                             rvec *x,
 +                             warninp_t wi)
 +{
 +    real rvdw1, rvdw2, rcoul1, rcoul2;
 +    char warn_buf[STRLEN];
 +
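 +    /* Determine the two largest charge-group radii in the system,
 +     * separately for groups with VdW interactions and groups with charge.
 +     */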
 +    calc_chargegroup_radii(mtop, x, &rvdw1, &rvdw2, &rcoul1, &rcoul2);
 +
 +    if (rvdw1 > 0)
 +    {
 +        printf("Largest charge group radii for Van der Waals: %5.3f, %5.3f nm\n",
 +               rvdw1, rvdw2);
 +    }
 +    if (rcoul1 > 0)
 +    {
 +        printf("Largest charge group radii for Coulomb:       %5.3f, %5.3f nm\n",
 +               rcoul1, rcoul2);
 +    }
 +
 +    if (ir->rlist > 0)
 +    {
 +        if (rvdw1  + rvdw2  > ir->rlist ||
 +            rcoul1 + rcoul2 > ir->rlist)
 +        {
 +            sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than rlist (%f)\n", max(rvdw1+rvdw2, rcoul1+rcoul2), ir->rlist);
 +            warning(wi, warn_buf);
 +        }
 +        else
 +        {
 +            /* Here we do not use the zero at cut-off macro,
 +             * since user defined interactions might purposely
 +             * not be zero at the cut-off.
 +             */
 +            if (EVDW_IS_ZERO_AT_CUTOFF(ir->vdwtype) &&
 +                rvdw1 + rvdw2 > ir->rlist - ir->rvdw)
 +            {
 +                sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than rlist (%f) - rvdw (%f)\n",
 +                        rvdw1+rvdw2,
 +                        ir->rlist, ir->rvdw);
 +                if (ir_NVE(ir))
 +                {
 +                    warning(wi, warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi, warn_buf);
 +                }
 +            }
 +            if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) &&
 +                rcoul1 + rcoul2 > ir->rlistlong - ir->rcoulomb)
 +            {
 +                sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than %s (%f) - rcoulomb (%f)\n",
 +                        rcoul1+rcoul2,
 +                        ir->rlistlong > ir->rlist ? "rlistlong" : "rlist",
 +                        ir->rlistlong, ir->rcoulomb);
 +                if (ir_NVE(ir))
 +                {
 +                    warning(wi, warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi, warn_buf);
 +                }
 +            }
 +        }
 +    }
 +}
index d2a3ce575d622bf456466e65d5fccfd04efbc6df,0000000000000000000000000000000000000000..945d4a3b0ec74f840345f2eb7de4c62f62951f87
mode 100644,000000..100644
--- /dev/null
@@@ -1,135 -1,0 +1,137 @@@
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +#ifndef _types_nrnb_h
 +#define _types_nrnb_h
 +
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +} /* fixes auto-indentation problems */
 +#endif
 +
 +
 +#define eNR_NBKERNEL_NONE -1
 +
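 +/* Naming convention for the kernel entries below (a reading aid): kernels
 + * with a _VF suffix compute both potential (energy) and force, _F force
 + * only; W3/W4 denote optimized 3- and 4-atom water loops (e.g. SPC/TIP3P
 + * and TIP4P), and W3W3/W4W4 the corresponding water-water loops.
 + */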
 +enum
 +{
 +    eNR_NBKERNEL_VDW_VF,
 +    eNR_NBKERNEL_VDW_F,
 +    eNR_NBKERNEL_ELEC_VF,
 +    eNR_NBKERNEL_ELEC_F,
 +    eNR_NBKERNEL_ELEC_W3_VF,
 +    eNR_NBKERNEL_ELEC_W3_F,
 +    eNR_NBKERNEL_ELEC_W3W3_VF,
 +    eNR_NBKERNEL_ELEC_W3W3_F,
 +    eNR_NBKERNEL_ELEC_W4_VF,
 +    eNR_NBKERNEL_ELEC_W4_F,
 +    eNR_NBKERNEL_ELEC_W4W4_VF,
 +    eNR_NBKERNEL_ELEC_W4W4_F,
 +    eNR_NBKERNEL_ELEC_VDW_VF,
 +    eNR_NBKERNEL_ELEC_VDW_F,
 +    eNR_NBKERNEL_ELEC_VDW_W3_VF,
 +    eNR_NBKERNEL_ELEC_VDW_W3_F,
 +    eNR_NBKERNEL_ELEC_VDW_W3W3_VF,
 +    eNR_NBKERNEL_ELEC_VDW_W3W3_F,
 +    eNR_NBKERNEL_ELEC_VDW_W4_VF,
 +    eNR_NBKERNEL_ELEC_VDW_W4_F,
 +    eNR_NBKERNEL_ELEC_VDW_W4W4_VF,
 +    eNR_NBKERNEL_ELEC_VDW_W4W4_F,
 +
 +    eNR_NBKERNEL_NR,                        /* Total number of interaction-specific kernel entries */
 +
 +    eNR_NBKERNEL_GENERIC = eNR_NBKERNEL_NR, /* Reuse the number; eNR_NBKERNEL_NR is not an entry itself */
++    eNR_NBKERNEL_GENERIC_CG,
++    eNR_NBKERNEL_GENERIC_ADRESS,
 +    eNR_NBKERNEL_FREE_ENERGY,               /* Add other generic kernels _before_ the free energy one */
 +
 +    eNR_NBKERNEL_ALLVSALL,
 +    eNR_NBKERNEL_ALLVSALLGB,
 +
 +    eNR_NBNXN_DIST2,
 +    eNR_NBNXN_LJ_RF,    eNR_NBNXN_LJ_RF_E,
 +    eNR_NBNXN_LJ_TAB,   eNR_NBNXN_LJ_TAB_E,
 +    eNR_NBNXN_LJ_EWALD, eNR_NBNXN_LJ_EWALD_E,
 +    eNR_NBNXN_LJ,       eNR_NBNXN_LJ_E,
 +    eNR_NBNXN_RF,       eNR_NBNXN_RF_E,
 +    eNR_NBNXN_TAB,      eNR_NBNXN_TAB_E,
 +    eNR_NBNXN_EWALD,    eNR_NBNXN_EWALD_E,
 +    eNR_NB14,
 +    eNR_BORN_RADII_STILL,     eNR_BORN_RADII_HCT_OBC,
 +    eNR_BORN_CHAINRULE,
 +    eNR_BORN_AVA_RADII_STILL, eNR_BORN_AVA_RADII_HCT_OBC,
 +    eNR_BORN_AVA_CHAINRULE,
 +    eNR_WEIGHTS,              eNR_SPREADQ,              eNR_SPREADQBSP,
 +    eNR_GATHERF,              eNR_GATHERFBSP,           eNR_FFT,
 +    eNR_CONV,                 eNR_SOLVEPME, eNR_NS,      eNR_RESETX,
 +    eNR_SHIFTX,               eNR_CGCM,                 eNR_FSUM,
 +    eNR_BONDS,                eNR_G96BONDS,             eNR_FENEBONDS,
 +    eNR_TABBONDS,             eNR_RESTRBONDS,           eNR_LINEAR_ANGLES,
 +    eNR_ANGLES,               eNR_G96ANGLES,            eNR_QANGLES,
 +    eNR_TABANGLES,            eNR_PROPER,               eNR_IMPROPER,
 +    eNR_RB,                   eNR_FOURDIH,              eNR_TABDIHS,
 +    eNR_DISRES,               eNR_ORIRES,               eNR_DIHRES,
 +    eNR_POSRES,               eNR_FBPOSRES,
 +    eNR_ANGRES,               eNR_ANGRESZ,
 +    eNR_MORSE,                eNR_CUBICBONDS,           eNR_WALLS,
 +    eNR_POLARIZE,             eNR_ANHARM_POL,
 +    eNR_WPOL,                 eNR_THOLE,                eNR_VIRIAL,
 +    eNR_UPDATE,               eNR_EXTUPDATE,            eNR_STOPCM,
 +    eNR_PCOUPL,               eNR_EKIN,                 eNR_LINCS,
 +    eNR_LINCSMAT,             eNR_SHAKE,                eNR_CONSTR_V,
 +    eNR_SHAKE_RIJ,            eNR_CONSTR_VIR,           eNR_SETTLE,
 +    eNR_VSITE2,               eNR_VSITE3,               eNR_VSITE3FD,
 +    eNR_VSITE3FAD,            eNR_VSITE3OUT,            eNR_VSITE4FD,
 +    eNR_VSITE4FDN,            eNR_VSITEN,               eNR_GB,
 +    eNR_CMAP,
 +    eNRNB
 +};
 +
 +
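 +/* Flop-accounting record: n[] accumulates an operation count for each
 + * eNR_* entry above; counters are typically bumped via the inc_nrnb()
 + * macro.
 + */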
 +typedef struct
 +{
 +    double n[eNRNB];
 +}
 +t_nrnb;
 +
 +
 +typedef struct gmx_wallcycle *gmx_wallcycle_t;
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 909006c56f3198c35bfe67b8f53931d321118e15,0000000000000000000000000000000000000000..fec67cfaf63364d7de310adc3adc0145a0f3ff9f
mode 100644,000000..100644
--- /dev/null
@@@ -1,2942 -1,0 +1,2954 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "tables.h"
 +#include "nonbonded.h"
 +#include "invblock.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "copyrite.h"
 +#include "mtop_util.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_consts.h"
 +#include "statutil.h"
 +#include "gmx_omp_nthreads.h"
++#include "gmx_detect_hardware.h"
 +
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +#endif
 +
 +#include "types/nbnxn_cuda_types_ext.h"
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +
 +t_forcerec *mk_forcerec(void)
 +{
 +    t_forcerec *fr;
 +
 +    snew(fr, 1);
 +
 +    return fr;
 +}
 +
 +#ifdef DEBUG
 +static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
 +{
 +    int i, j;
 +
 +    for (i = 0; (i < atnr); i++)
 +    {
 +        for (j = 0; (j < atnr); j++)
 +        {
 +            fprintf(fp, "%2d - %2d", i, j);
 +            if (bBHAM)
 +            {
 +                fprintf(fp, "  a=%10g, b=%10g, c=%10g\n", BHAMA(nbfp, atnr, i, j),
 +                        BHAMB(nbfp, atnr, i, j), BHAMC(nbfp, atnr, i, j)/6.0);
 +            }
 +            else
 +            {
 +                fprintf(fp, "  c6=%10g, c12=%10g\n", C6(nbfp, atnr, i, j)/6.0,
 +                        C12(nbfp, atnr, i, j)/12.0);
 +            }
 +        }
 +    }
 +}
 +#endif
 +
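 +/* Build the flattened atnr x atnr nonbonded parameter matrix from the
 + * force-field pair parameters; entries are accessed through the C6/C12
 + * (LJ) or BHAMA/BHAMB/BHAMC (Buckingham) macros and include the
 + * derivative prefactors noted in the comments below.
 + */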
 +static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
 +{
 +    real *nbfp;
 +    int   i, j, k, atnr;
 +
 +    atnr = idef->atnr;
 +    if (bBHAM)
 +    {
 +        snew(nbfp, 3*atnr*atnr);
 +        for (i = k = 0; (i < atnr); i++)
 +        {
 +            for (j = 0; (j < atnr); j++, k++)
 +            {
 +                BHAMA(nbfp, atnr, i, j) = idef->iparams[k].bham.a;
 +                BHAMB(nbfp, atnr, i, j) = idef->iparams[k].bham.b;
 +                /* nbfp now includes the 6.0 derivative prefactor */
 +                BHAMC(nbfp, atnr, i, j) = idef->iparams[k].bham.c*6.0;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        snew(nbfp, 2*atnr*atnr);
 +        for (i = k = 0; (i < atnr); i++)
 +        {
 +            for (j = 0; (j < atnr); j++, k++)
 +            {
 +                /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                C6(nbfp, atnr, i, j)   = idef->iparams[k].lj.c6*6.0;
 +                C12(nbfp, atnr, i, j)  = idef->iparams[k].lj.c12*12.0;
 +            }
 +        }
 +    }
 +
 +    return nbfp;
 +}
 +
 +/* This routine sets fr->solvent_opt to the most common solvent in the
 + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group
 + * with the correct solvent type (or esolNO) via the cginfo SOLOPT field.
 + *
 + * Charge groups that fulfill the conditions but are not identical to the
 + * most common one will be marked as esolNO.
 + *
 + * TIP3P is identical to SPC for these purposes, so we call it
 + * SPC in the arrays (apologies to Bill Jorgensen ;-)
 + *
 + * NOTE: QM particles should not become an optimized solvent,
 + * not even if there is only one charge group in the QM system.
 + */
 +
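 +/* One candidate solvent model found while scanning the charge groups:
 + * the model (e.g. esolSPC or esolTIP4P), how many molecules matched it,
 + * and the per-atom VdW types and charges used for matching (max 4 atoms).
 + */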
 +typedef struct
 +{
 +    int    model;
 +    int    count;
 +    int    vdwtype[4];
 +    real   charge[4];
 +} solvent_parameters_t;
 +
 +static void
 +check_solvent_cg(const gmx_moltype_t    *molt,
 +                 int                     cg0,
 +                 int                     nmol,
 +                 const unsigned char    *qm_grpnr,
 +                 const t_grps           *qm_grps,
 +                 t_forcerec   *          fr,
 +                 int                    *n_solvent_parameters,
 +                 solvent_parameters_t  **solvent_parameters_p,
 +                 int                     cginfo,
 +                 int                    *cg_sp)
 +{
 +    const t_blocka     *  excl;
 +    t_atom               *atom;
 +    int                   j, k;
 +    int                   j0, j1, nj;
 +    gmx_bool              perturbed;
 +    gmx_bool              has_vdw[4];
 +    gmx_bool              match;
 +    real                  tmp_charge[4];
 +    int                   tmp_vdwtype[4];
 +    int                   tjA;
 +    gmx_bool              qm;
 +    solvent_parameters_t *solvent_parameters;
 +
 +    /* We use a list with parameters for each solvent type.
 +     * Every time we discover a new molecule that fulfills the basic
 +     * conditions for a solvent we compare with the previous entries
 +     * in these lists. If the parameters are the same we just increment
 +     * the counter for that type, and otherwise we create a new type
 +     * based on the current molecule.
 +     *
 +     * Once we've finished going through all molecules we check which
 +     * solvent is most common, and mark all those molecules while we
 +     * clear the flag on all others.
 +     */
 +
 +    solvent_parameters = *solvent_parameters_p;
 +
 +    /* First mark the charge group as non-optimized */
 +    *cg_sp = -1;
 +
 +    /* Check if this cg has no exclusions with atoms in other charge groups
 +     * and all atoms inside the charge group excluded.
 +     * We only have 3 or 4 atom solvent loops.
 +     */
 +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
 +        !GET_CGINFO_EXCL_INTRA(cginfo))
 +    {
 +        return;
 +    }
 +
 +    /* Get the atom index range of this charge group */
 +    j0     = molt->cgs.index[cg0];
 +    j1     = molt->cgs.index[cg0+1];
 +
 +    /* Number of atoms in this charge group */
 +    nj     = j1 - j0;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Moltype '%s': there are %d atoms in this charge group\n",
 +                *molt->name, nj);
 +    }
 +
 +    /* Check if it could be an SPC (3 atoms) or TIP4P (4) water,
 +     * otherwise skip it.
 +     */
 +    if (nj < 3 || nj > 4)
 +    {
 +        return;
 +    }
 +
 +    /* Check if we are doing QM on this group */
 +    qm = FALSE;
 +    if (qm_grpnr != NULL)
 +    {
 +        for (j = j0; j < j1 && !qm; j++)
 +        {
 +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
 +        }
 +    }
 +    /* Cannot use solvent optimization with QM */
 +    if (qm)
 +    {
 +        return;
 +    }
 +
 +    atom = molt->atoms.atom;
 +
 +    /* Still looks like a solvent, time to check parameters */
 +
 +    /* If it is perturbed (free energy) we can't use the solvent loops,
 +     * so then we just skip to the next molecule.
 +     */
 +    perturbed = FALSE;
 +
 +    for (j = j0; j < j1 && !perturbed; j++)
 +    {
 +        perturbed = PERTURBED(atom[j]);
 +    }
 +
 +    if (perturbed)
 +    {
 +        return;
 +    }
 +
 +    /* Now it's only a question if the VdW and charge parameters
 +     * are OK. Before doing the check we compare and see if they are
 +     * identical to a possible previous solvent type.
 +     * First we assign the current types and charges.
 +     */
 +    for (j = 0; j < nj; j++)
 +    {
 +        tmp_vdwtype[j] = atom[j0+j].type;
 +        tmp_charge[j]  = atom[j0+j].q;
 +    }
 +
 +    /* Does it match any previous solvent type? */
 +    for (k = 0; k < *n_solvent_parameters; k++)
 +    {
 +        match = TRUE;
 +
 +
 +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
 +        if ( (solvent_parameters[k].model == esolSPC   && nj != 3)  ||
 +             (solvent_parameters[k].model == esolTIP4P && nj != 4) )
 +        {
 +            match = FALSE;
 +        }
 +
 +        /* Check that types & charges match for all atoms in molecule */
 +        for (j = 0; j < nj && match == TRUE; j++)
 +        {
 +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
 +            {
 +                match = FALSE;
 +            }
 +            if (tmp_charge[j] != solvent_parameters[k].charge[j])
 +            {
 +                match = FALSE;
 +            }
 +        }
 +        if (match == TRUE)
 +        {
 +            /* Congratulations! We have a matched solvent.
 +             * Flag it with this type for later processing.
 +             */
 +            *cg_sp = k;
 +            solvent_parameters[k].count += nmol;
 +
 +            /* We are done with this charge group */
 +            return;
 +        }
 +    }
 +
 +    /* If we get here, we have a tentative new solvent type.
 +     * Before we add it we must check that it fulfills the requirements
 +     * of the solvent optimized loops. First determine which atoms have
 +     * VdW interactions.
 +     */
 +    for (j = 0; j < nj; j++)
 +    {
 +        has_vdw[j] = FALSE;
 +        tjA        = tmp_vdwtype[j];
 +
 +        /* Go through all other types and see if any have non-zero
 +         * VdW parameters when combined with this one.
 +         */
 +        for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
 +        {
 +            /* We already checked that the atoms weren't perturbed,
 +             * so we only need to check state A now.
 +             */
 +            if (fr->bBHAM)
 +            {
 +                has_vdw[j] = (has_vdw[j] ||
 +                              (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
 +                              (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
 +                              (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
 +            }
 +            else
 +            {
 +                /* Standard LJ */
 +                has_vdw[j] = (has_vdw[j] ||
 +                              (C6(fr->nbfp, fr->ntype, tjA, k)  != 0.0) ||
 +                              (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
 +            }
 +        }
 +    }
 +
 +    /* Now we know all we need to make the final check and assignment. */
 +    if (nj == 3)
 +    {
 +        /* So, is it an SPC?
 +         * For this we require that all atoms have charge,
 +         * that the charges on atoms 2 & 3 are the same, and that only
 +         * atom 1 might have VdW.
 +         */
 +        if (has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            tmp_charge[0]  != 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1])
 +        {
 +            srenew(solvent_parameters, *n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for (k = 0; k < 3; k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +    else if (nj == 4)
 +    {
 +        /* Or could it be a TIP4P?
 +         * For this we require that atoms 2, 3 and 4 have charge, but not atom 1.
 +         * Only atom 1 might have VdW.
 +         */
 +        if (has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            has_vdw[3] == FALSE &&
 +            tmp_charge[0]  == 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1] &&
 +            tmp_charge[3]  != 0)
 +        {
 +            srenew(solvent_parameters, *n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for (k = 0; k < 4; k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +
 +    *solvent_parameters_p = solvent_parameters;
 +}
 +
 +static void
 +check_solvent(FILE  *                fp,
 +              const gmx_mtop_t  *    mtop,
 +              t_forcerec  *          fr,
 +              cginfo_mb_t           *cginfo_mb)
 +{
 +    const t_block     *   cgs;
 +    const t_block     *   mols;
 +    const gmx_moltype_t  *molt;
 +    int                   mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
 +    int                   n_solvent_parameters;
 +    solvent_parameters_t *solvent_parameters;
 +    int                 **cg_sp;
 +    int                   bestsp, bestsol;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Going to determine what solvent types we have.\n");
 +    }
 +
 +    mols = &mtop->mols;
 +
 +    n_solvent_parameters = 0;
 +    solvent_parameters   = NULL;
 +    /* Allocate temporary array for solvent type */
 +    snew(cg_sp, mtop->nmolblock);
 +
 +    cg_offset = 0;
 +    at_offset = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +        cgs  = &molt->cgs;
 +        /* Here we have to loop over all individual molecules
 +         * because we need to check for QMMM particles.
 +         */
 +        snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
 +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
 +        nmol    = mtop->molblock[mb].nmol/nmol_ch;
 +        for (mol = 0; mol < nmol_ch; mol++)
 +        {
 +            cgm = mol*cgs->nr;
 +            am  = mol*cgs->index[cgs->nr];
 +            for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
 +            {
 +                check_solvent_cg(molt, cg_mol, nmol,
 +                                 mtop->groups.grpnr[egcQMMM] ?
 +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
 +                                 &mtop->groups.grps[egcQMMM],
 +                                 fr,
 +                                 &n_solvent_parameters, &solvent_parameters,
 +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
 +                                 &cg_sp[mb][cgm+cg_mol]);
 +            }
 +        }
 +        cg_offset += cgs->nr;
 +        at_offset += cgs->index[cgs->nr];
 +    }
 +
 +    /* Puh! We finished going through all charge groups.
 +     * Now find the most common solvent model.
 +     */
 +
 +    /* Most common solvent so far */
 +    bestsp = -2;
 +    for (i = 0; i < n_solvent_parameters; i++)
 +    {
 +        if (bestsp == -2 ||
 +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
 +        {
 +            bestsp = i;
 +        }
 +    }
 +
 +    if (bestsp >= 0)
 +    {
 +        bestsol = solvent_parameters[bestsp].model;
 +    }
 +    else
 +    {
 +        bestsol = esolNO;
 +    }
 +
 +#ifdef DISABLE_WATER_NLIST
 +    bestsol = esolNO;
 +#endif
 +
 +    fr->nWatMol = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        cgs  = &mtop->moltype[mtop->molblock[mb].type].cgs;
 +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
 +        for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
 +        {
 +            if (cg_sp[mb][i] == bestsp)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
 +                fr->nWatMol += nmol;
 +            }
 +            else
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
 +            }
 +        }
 +        sfree(cg_sp[mb]);
 +    }
 +    sfree(cg_sp);
 +
 +    if (bestsol != esolNO && fp != NULL)
 +    {
 +        fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
 +                esol_names[bestsol],
 +                solvent_parameters[bestsp].count);
 +    }
 +
 +    sfree(solvent_parameters);
 +    fr->solvent_opt = bestsol;
 +}
 +
 +enum {
 +    acNONE = 0, acCONSTRAINT, acSETTLE
 +};
 +
 +static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
 +                                   t_forcerec *fr, gmx_bool bNoSolvOpt,
 +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
 +{
 +    const t_block        *cgs;
 +    const t_blocka       *excl;
 +    const gmx_moltype_t  *molt;
 +    const gmx_molblock_t *molb;
 +    cginfo_mb_t          *cginfo_mb;
 +    gmx_bool             *type_VDW;
 +    int                  *cginfo;
 +    int                   cg_offset, a_offset, cgm, am;
 +    int                   mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
 +    int                  *a_con;
 +    int                   ftype;
 +    int                   ia;
 +    gmx_bool              bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
 +
 +    ncg_tot = ncg_mtop(mtop);
 +    snew(cginfo_mb, mtop->nmolblock);
 +
 +    snew(type_VDW, fr->ntype);
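 +    /* type_VDW[ai] records whether atom type ai has a non-zero VdW
 +     * interaction with any other type; with Buckingham all types count.
 +     */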
 +    for (ai = 0; ai < fr->ntype; ai++)
 +    {
 +        type_VDW[ai] = FALSE;
 +        for (j = 0; j < fr->ntype; j++)
 +        {
 +            type_VDW[ai] = type_VDW[ai] ||
 +                fr->bBHAM ||
 +                C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
 +                C12(fr->nbfp, fr->ntype, ai, j) != 0;
 +        }
 +    }
 +
 +    *bExcl_IntraCGAll_InterCGNone = TRUE;
 +
 +    excl_nalloc = 10;
 +    snew(bExcl, excl_nalloc);
 +    cg_offset = 0;
 +    a_offset  = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        cgs  = &molt->cgs;
 +        excl = &molt->excls;
 +
 +        /* Check if the cginfo is identical for all molecules in this block.
 +         * If so, we only need an array of the size of one molecule.
 +         * Otherwise we make an array of #mol times #cgs per molecule.
 +         */
 +        bId = TRUE;
 +        am  = 0;
 +        for (m = 0; m < molb->nmol; m++)
 +        {
 +            am = m*cgs->index[cgs->nr];
 +            for (cg = 0; cg < cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
 +                    ggrpnr(&mtop->groups, egcENER, a_offset   +a0))
 +                {
 +                    bId = FALSE;
 +                }
 +                if (mtop->groups.grpnr[egcQMMM] != NULL)
 +                {
 +                    for (ai = a0; ai < a1; ai++)
 +                    {
 +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
 +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
 +                        {
 +                            bId = FALSE;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        cginfo_mb[mb].cg_start = cg_offset;
 +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
 +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
 +        snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
 +        cginfo = cginfo_mb[mb].cginfo;
 +
 +        /* Set constraints flags for constrained atoms */
 +        snew(a_con, molt->atoms.nr);
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
 +            {
 +                int nral;
 +
 +                nral = NRAL(ftype);
 +                for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
 +                {
 +                    int a;
 +
 +                    for (a = 0; a < nral; a++)
 +                    {
 +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
 +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
 +                    }
 +                }
 +            }
 +        }
 +
 +        for (m = 0; m < (bId ? 1 : molb->nmol); m++)
 +        {
 +            cgm = m*cgs->nr;
 +            am  = m*cgs->index[cgs->nr];
 +            for (cg = 0; cg < cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +
 +                /* Store the energy group in cginfo */
 +                gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
 +                SET_CGINFO_GID(cginfo[cgm+cg], gid);
 +
 +                /* Check the intra/inter charge group exclusions */
 +                if (a1-a0 > excl_nalloc)
 +                {
 +                    excl_nalloc = a1 - a0;
 +                    srenew(bExcl, excl_nalloc);
 +                }
 +                /* bExclIntraAll: all intra cg interactions excluded
 +                 * bExclInter:    any inter cg interactions excluded
 +                 */
 +                bExclIntraAll = TRUE;
 +                bExclInter    = FALSE;
 +                bHaveVDW      = FALSE;
 +                bHaveQ        = FALSE;
 +                for (ai = a0; ai < a1; ai++)
 +                {
 +                    /* Check VDW and electrostatic interactions */
 +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
 +                                            type_VDW[molt->atoms.atom[ai].typeB]);
 +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
 +                                            molt->atoms.atom[ai].qB != 0);
 +
 +                    /* Clear the exclusion list for atom ai */
 +                    for (aj = a0; aj < a1; aj++)
 +                    {
 +                        bExcl[aj-a0] = FALSE;
 +                    }
 +                    /* Loop over all the exclusions of atom ai */
 +                    for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
 +                    {
 +                        aj = excl->a[j];
 +                        if (aj < a0 || aj >= a1)
 +                        {
 +                            bExclInter = TRUE;
 +                        }
 +                        else
 +                        {
 +                            bExcl[aj-a0] = TRUE;
 +                        }
 +                    }
 +                    /* Check if ai excludes a0 to a1 */
 +                    for (aj = a0; aj < a1; aj++)
 +                    {
 +                        if (!bExcl[aj-a0])
 +                        {
 +                            bExclIntraAll = FALSE;
 +                        }
 +                    }
 +
 +                    switch (a_con[ai])
 +                    {
 +                        case acCONSTRAINT:
 +                            SET_CGINFO_CONSTR(cginfo[cgm+cg]);
 +                            break;
 +                        case acSETTLE:
 +                            SET_CGINFO_SETTLE(cginfo[cgm+cg]);
 +                            break;
 +                        default:
 +                            break;
 +                    }
 +                }
 +                if (bExclIntraAll)
 +                {
 +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
 +                }
 +                if (bExclInter)
 +                {
 +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
 +                }
 +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
 +                {
 +                    /* The size in cginfo is currently only read with DD */
 +                    gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
 +                }
 +                if (bHaveVDW)
 +                {
 +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
 +                }
 +                if (bHaveQ)
 +                {
 +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
 +                }
 +                /* Store the charge group size */
 +                SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
 +
 +                if (!bExclIntraAll || bExclInter)
 +                {
 +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
 +                }
 +            }
 +        }
 +
 +        sfree(a_con);
 +
 +        cg_offset += molb->nmol*cgs->nr;
 +        a_offset  += molb->nmol*cgs->index[cgs->nr];
 +    }
 +    sfree(bExcl);
 +
 +    /* The solvent optimizer is called after QM is initialized,
 +     * because we don't want the QM subsystem to become an
 +     * optimized solvent.
 +     */
 +
 +    check_solvent(fplog, mtop, fr, cginfo_mb);
 +
 +    if (getenv("GMX_NO_SOLV_OPT"))
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
 +                    "Disabling all solvent optimization\n");
 +        }
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (bNoSolvOpt)
 +    {
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (!fr->solvent_opt)
 +    {
 +        for (mb = 0; mb < mtop->nmolblock; mb++)
 +        {
 +            for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
 +            }
 +        }
 +    }
 +
 +    return cginfo_mb;
 +}
 +
 +static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
 +{
 +    int  ncg, mb, cg;
 +    int *cginfo;
 +
 +    ncg = cgi_mb[nmb-1].cg_end;
 +    snew(cginfo, ncg);
 +    mb = 0;
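 +    /* cginfo is stored compactly per molecule block: when all molecules
 +     * in a block are identical, cg_mod covers a single molecule and the
 +     * modulo below replicates that info across the whole block.
 +     */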
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        while (cg >= cgi_mb[mb].cg_end)
 +        {
 +            mb++;
 +        }
 +        cginfo[cg] =
 +            cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
 +    }
 +
 +    return cginfo;
 +}
 +
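 +/* Compute the system's total charge and sum of squared charges, for both
 + * free-energy end states (state B simply copies state A when the system
 + * is not perturbed).
 + */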
 +static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
 +{
 +    double         qsum, q2sum, q;
 +    int            mb, nmol, i;
 +    const t_atoms *atoms;
 +
 +    qsum  = 0;
 +    q2sum = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        nmol  = mtop->molblock[mb].nmol;
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +        for (i = 0; i < atoms->nr; i++)
 +        {
 +            q      = atoms->atom[i].q;
 +            qsum  += nmol*q;
 +            q2sum += nmol*q*q;
 +        }
 +    }
 +    fr->qsum[0]  = qsum;
 +    fr->q2sum[0] = q2sum;
 +    if (fr->efep != efepNO)
 +    {
 +        qsum  = 0;
 +        q2sum = 0;
 +        for (mb = 0; mb < mtop->nmolblock; mb++)
 +        {
 +            nmol  = mtop->molblock[mb].nmol;
 +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +            for (i = 0; i < atoms->nr; i++)
 +            {
 +                q      = atoms->atom[i].qB;
 +                qsum  += nmol*q;
 +                q2sum += nmol*q*q;
 +            }
 +            fr->qsum[1]  = qsum;
 +            fr->q2sum[1] = q2sum;
 +        }
 +    }
 +    else
 +    {
 +        fr->qsum[1]  = fr->qsum[0];
 +        fr->q2sum[1] = fr->q2sum[0];
 +    }
 +    if (log)
 +    {
 +        if (fr->efep == efepNO)
 +        {
 +            fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
 +        }
 +        else
 +        {
 +            fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
 +                    fr->qsum[0], fr->qsum[1]);
 +        }
 +    }
 +}
 +
 +void update_forcerec(FILE *log, t_forcerec *fr, matrix box)
 +{
 +    if (fr->eeltype == eelGRF)
 +    {
 +        calc_rffac(NULL, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
 +                   fr->rcoulomb, fr->temp, fr->zsquare, box,
 +                   &fr->kappa, &fr->k_rf, &fr->c_rf);
 +    }
 +}
 +
 +void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
 +{
 +    const t_atoms  *atoms, *atoms_tpi;
 +    const t_blocka *excl;
 +    int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
 +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
 +    long long int   npair, npair_ij, tmpi, tmpj;
 +#else
 +    double          npair, npair_ij, tmpi, tmpj;
 +#endif
 +    double          csix, ctwelve;
 +    int             ntp, *typecount;
 +    gmx_bool        bBHAM;
 +    real           *nbfp;
 +
 +    ntp   = fr->ntype;
 +    bBHAM = fr->bBHAM;
 +    nbfp  = fr->nbfp;
 +
 +    for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
 +    {
 +        csix    = 0;
 +        ctwelve = 0;
 +        npair   = 0;
 +        nexcl   = 0;
 +        if (!fr->n_tpi)
 +        {
 +            /* Count the types so we avoid natoms^2 operations */
 +            snew(typecount, ntp);
 +            for (mb = 0; mb < mtop->nmolblock; mb++)
 +            {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for (i = 0; i < atoms->nr; i++)
 +                {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    typecount[tpi] += nmol;
 +                }
 +            }
 +            for (tpi = 0; tpi < ntp; tpi++)
 +            {
 +                for (tpj = tpi; tpj < ntp; tpj++)
 +                {
 +                    tmpi = typecount[tpi];
 +                    tmpj = typecount[tpj];
 +                    if (tpi != tpj)
 +                    {
 +                        npair_ij = tmpi*tmpj;
 +                    }
 +                    else
 +                    {
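 +                        /* Number of unique unordered pairs among tmpi
 +                         * identical particles: tmpi*(tmpi - 1)/2.
 +                         */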
 +                        npair_ij = tmpi*(tmpi - 1)/2;
 +                    }
 +                    if (bBHAM)
 +                    {
 +                        /* nbfp now includes the 6.0 derivative prefactor */
 +                        csix    += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
 +                    }
 +                    else
 +                    {
 +                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                        csix    += npair_ij*   C6(nbfp, ntp, tpi, tpj)/6.0;
 +                        ctwelve += npair_ij*  C12(nbfp, ntp, tpi, tpj)/12.0;
 +                    }
 +                    npair += npair_ij;
 +                }
 +            }
 +            sfree(typecount);
 +            /* Subtract the excluded pairs.
 +             * The main reason for subtracting exclusions is that in some cases
 +             * some combinations might never occur and the parameters could have
 +             * any value. These unused values should not influence the dispersion
 +             * correction.
 +             */
 +            for (mb = 0; mb < mtop->nmolblock; mb++)
 +            {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
 +                for (i = 0; (i < atoms->nr); i++)
 +                {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    j1  = excl->index[i];
 +                    j2  = excl->index[i+1];
 +                    for (j = j1; j < j2; j++)
 +                    {
 +                        k = excl->a[j];
 +                        if (k > i)
 +                        {
 +                            if (q == 0)
 +                            {
 +                                tpj = atoms->atom[k].type;
 +                            }
 +                            else
 +                            {
 +                                tpj = atoms->atom[k].typeB;
 +                            }
 +                            if (bBHAM)
 +                            {
 +                                /* nbfp now includes the 6.0 derivative prefactor */
 +                                csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
 +                            }
 +                            else
 +                            {
 +                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                                csix    -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
 +                                ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
 +                            }
 +                            nexcl += nmol;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Only correct for the interaction of the test particle
 +             * with the rest of the system.
 +             */
 +            atoms_tpi =
 +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
 +
 +            npair = 0;
 +            for (mb = 0; mb < mtop->nmolblock; mb++)
 +            {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for (j = 0; j < atoms->nr; j++)
 +                {
 +                    nmolc = nmol;
 +                    /* Remove the interaction of the test charge group
 +                     * with itself.
 +                     */
 +                    if (mb == mtop->nmolblock-1)
 +                    {
 +                        nmolc--;
 +
 +                        if (mb == 0 && nmol == 1)
 +                        {
 +                            gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
 +                        }
 +                    }
 +                    if (q == 0)
 +                    {
 +                        tpj = atoms->atom[j].type;
 +                    }
 +                    else
 +                    {
 +                        tpj = atoms->atom[j].typeB;
 +                    }
 +                    for (i = 0; i < fr->n_tpi; i++)
 +                    {
 +                        if (q == 0)
 +                        {
 +                            tpi = atoms_tpi->atom[i].type;
 +                        }
 +                        else
 +                        {
 +                            tpi = atoms_tpi->atom[i].typeB;
 +                        }
 +                        if (bBHAM)
 +                        {
 +                            /* nbfp now includes the 6.0 derivative prefactor */
 +                            csix    += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
 +                        }
 +                        else
 +                        {
 +                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                            csix    += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
 +                            ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
 +                        }
 +                        npair += nmolc;
 +                    }
 +                }
 +            }
 +        }
 +        if (npair - nexcl <= 0 && fplog)
 +        {
 +            fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
 +            csix     = 0;
 +            ctwelve  = 0;
 +        }
 +        else
 +        {
 +            csix    /= npair - nexcl;
 +            ctwelve /= npair - nexcl;
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Counted %d exclusions\n", nexcl);
 +            fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
 +            fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
 +        }
 +        fr->avcsix[q]    = csix;
 +        fr->avctwelve[q] = ctwelve;
 +    }
 +    if (fplog != NULL)
 +    {
 +        if (fr->eDispCorr == edispcAllEner ||
 +            fr->eDispCorr == edispcAllEnerPres)
 +        {
 +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                    fr->avcsix[0], fr->avctwelve[0]);
 +        }
 +        else
 +        {
 +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
 +        }
 +    }
 +}
 +
 +
 +static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
 +                           const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *at1, *at2;
 +    int            mt1, mt2, i, j, tpi, tpj, ntypes;
 +    real           b, bmin;
 +    real          *nbfp;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
 +    }
 +    nbfp   = fr->nbfp;
 +    ntypes = fr->ntype;
 +
 +    bmin           = -1;
 +    fr->bham_b_max = 0;
 +    for (mt1 = 0; mt1 < mtop->nmoltype; mt1++)
 +    {
 +        at1 = &mtop->moltype[mt1].atoms;
 +        for (i = 0; (i < at1->nr); i++)
 +        {
 +            tpi = at1->atom[i].type;
 +            if (tpi >= ntypes)
 +            {
 +                gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", i, tpi, ntypes);
 +            }
 +
 +            for (mt2 = mt1; mt2 < mtop->nmoltype; mt2++)
 +            {
 +                at2 = &mtop->moltype[mt2].atoms;
 +                for (j = 0; (j < at2->nr); j++)
 +                {
 +                    tpj = at2->atom[j].type;
 +                    if (tpj >= ntypes)
 +                    {
 +                        gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", j, tpj, ntypes);
 +                    }
 +                    b = BHAMB(nbfp, ntypes, tpi, tpj);
 +                    if (b > fr->bham_b_max)
 +                    {
 +                        fr->bham_b_max = b;
 +                    }
 +                    if ((b < bmin) || (bmin == -1))
 +                    {
 +                        bmin = b;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
 +                bmin, fr->bham_b_max);
 +    }
 +}
 +
 +static void make_nbf_tables(FILE *fp, const output_env_t oenv,
 +                            t_forcerec *fr, real rtab,
 +                            const t_commrec *cr,
 +                            const char *tabfn, char *eg1, char *eg2,
 +                            t_nblists *nbl)
 +{
 +    char buf[STRLEN];
 +    int  i, j;
 +
 +    if (tabfn == NULL)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
 +        }
 +        return;
 +    }
 +
 +    sprintf(buf, "%s", tabfn);
 +    if (eg1 && eg2)
 +    {
 +        /* Append the two energy group names */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
 +                eg1, eg2, ftp2ext(efXVG));
 +    }
 +    nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
 +    /* Copy the contents of the table to separate coulomb and LJ tables too,
 +     * to improve cache performance.
 +     */
 +    /* For performance reasons we want
 +     * the table data to be aligned to 16-byte. The pointers could be freed
 +     * but currently aren't.
 +     */
 +    nbl->table_elec.interaction   = GMX_TABLE_INTERACTION_ELEC;
 +    nbl->table_elec.format        = nbl->table_elec_vdw.format;
 +    nbl->table_elec.r             = nbl->table_elec_vdw.r;
 +    nbl->table_elec.n             = nbl->table_elec_vdw.n;
 +    nbl->table_elec.scale         = nbl->table_elec_vdw.scale;
 +    nbl->table_elec.scale_exp     = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_elec.formatsize    = nbl->table_elec_vdw.formatsize;
 +    nbl->table_elec.ninteractions = 1;
 +    nbl->table_elec.stride        = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
 +    snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
 +
 +    nbl->table_vdw.interaction   = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
 +    nbl->table_vdw.format        = nbl->table_elec_vdw.format;
 +    nbl->table_vdw.r             = nbl->table_elec_vdw.r;
 +    nbl->table_vdw.n             = nbl->table_elec_vdw.n;
 +    nbl->table_vdw.scale         = nbl->table_elec_vdw.scale;
 +    nbl->table_vdw.scale_exp     = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_vdw.formatsize    = nbl->table_elec_vdw.formatsize;
 +    nbl->table_vdw.ninteractions = 2;
 +    nbl->table_vdw.stride        = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
 +    snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
 +
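 +    /* The combined table stores 12 values per point (4 cubic-spline
 +     * entries each for Coulomb, dispersion and repulsion); the first 4 go
 +     * into the electrostatics table, the remaining 8 into the VdW one.
 +     */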
 +    for (i = 0; i <= nbl->table_elec_vdw.n; i++)
 +    {
 +        for (j = 0; j < 4; j++)
 +        {
 +            nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
 +        }
 +        for (j = 0; j < 8; j++)
 +        {
 +            nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
 +        }
 +    }
 +}
 +
 +static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
 +                         int *ncount, int **count)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_ilist       *il;
 +    int                  mt, ftype, stride, i, j, tabnr;
 +
 +    for (mt = 0; mt < mtop->nmoltype; mt++)
 +    {
 +        molt = &mtop->moltype[mt];
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if (ftype == ftype1 || ftype == ftype2)
 +            {
 +                il     = &molt->ilist[ftype];
 +                stride = 1 + NRAL(ftype);
 +                for (i = 0; i < il->nr; i += stride)
 +                {
 +                    tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
 +                    if (tabnr < 0)
 +                    {
 +                        gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", tabnr);
 +                    }
 +                    if (tabnr >= *ncount)
 +                    {
 +                        srenew(*count, tabnr+1);
 +                        for (j = *ncount; j < tabnr+1; j++)
 +                        {
 +                            (*count)[j] = 0;
 +                        }
 +                        *ncount = tabnr+1;
 +                    }
 +                    (*count)[tabnr]++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
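 +/* Read the bonded interaction tables actually referenced by the topology:
 + * the table numbers in use are counted first, then each used table is read
 + * from a file named like the base name with the extension replaced by
 + * "_<tabext><n>.xvg".
 + */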
 +static bondedtable_t *make_bonded_tables(FILE *fplog,
 +                                         int ftype1, int ftype2,
 +                                         const gmx_mtop_t *mtop,
 +                                         const char *basefn, const char *tabext)
 +{
 +    int            i, ncount, *count;
 +    char           tabfn[STRLEN];
 +    bondedtable_t *tab;
 +
 +    tab = NULL;
 +
 +    ncount = 0;
 +    count  = NULL;
 +    count_tables(ftype1, ftype2, mtop, &ncount, &count);
 +
 +    if (ncount > 0)
 +    {
 +        snew(tab, ncount);
 +        for (i = 0; i < ncount; i++)
 +        {
 +            if (count[i] > 0)
 +            {
 +                sprintf(tabfn, "%s", basefn);
 +                sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
 +                        tabext, i, ftp2ext(efXVG));
 +                tab[i] = make_bonded_table(fplog, tabfn, NRAL(ftype1)-2);
 +            }
 +        }
 +        sfree(count);
 +    }
 +
 +    return tab;
 +}
 +
 +void forcerec_set_ranges(t_forcerec *fr,
 +                         int ncg_home, int ncg_force,
 +                         int natoms_force,
 +                         int natoms_force_constr, int natoms_f_novirsum)
 +{
 +    fr->cg0 = 0;
 +    fr->hcg = ncg_home;
 +
 +    /* fr->ncg_force is unused in the standard code,
 +     * but it can be useful for modified code dealing with charge groups.
 +     */
 +    fr->ncg_force           = ncg_force;
 +    fr->natoms_force        = natoms_force;
 +    fr->natoms_force_constr = natoms_force_constr;
 +
 +    if (fr->natoms_force_constr > fr->nalloc_force)
 +    {
 +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
 +
 +        if (fr->bTwinRange)
 +        {
 +            srenew(fr->f_twin, fr->nalloc_force);
 +        }
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        fr->f_novirsum_n = natoms_f_novirsum;
 +        if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
 +        {
 +            fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
 +            srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
 +        }
 +    }
 +    else
 +    {
 +        fr->f_novirsum_n = 0;
 +    }
 +}
 +
 +static real cutoff_inf(real cutoff)
 +{
 +    if (cutoff == 0)
 +    {
 +        cutoff = GMX_CUTOFF_INF;
 +    }
 +
 +    return cutoff;
 +}
 +
 +static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
 +                                  t_forcerec *fr, const t_inputrec *ir,
 +                                  const char *tabfn, const gmx_mtop_t *mtop,
 +                                  matrix     box)
 +{
 +    char buf[STRLEN];
 +    int  i, j;
 +
 +    if (tabfn == NULL)
 +    {
 +        gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
 +        return;
 +    }
 +
 +    snew(fr->atf_tabs, ir->adress->n_tf_grps);
 +
 +    /* Start from the base table file name; a per-group suffix replaces the extension below */
 +    sprintf(buf, "%s", tabfn);
 +    for (i = 0; i < ir->adress->n_tf_grps; i++)
 +    {
 +        j = ir->adress->tf_table_index[i]; /* get energy group index */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
 +                *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]), ftp2ext(efXVG));
 +        printf("loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[j], buf);
 +        fr->atf_tabs[i] = make_atf_table(fp, oenv, fr, buf, box);
 +    }
 +
 +}
 +
 +gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
 +                          gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
 +{
 +    gmx_bool bAllvsAll;
 +
 +    bAllvsAll =
 +        (
 +            ir->rlist == 0            &&
 +            ir->rcoulomb == 0         &&
 +            ir->rvdw == 0             &&
 +            ir->ePBC == epbcNONE      &&
 +            ir->vdwtype == evdwCUT    &&
 +            ir->coulombtype == eelCUT &&
 +            ir->efep == efepNO        &&
 +            (ir->implicit_solvent == eisNO ||
 +             (ir->implicit_solvent == eisGBSA && (ir->gb_algorithm == egbSTILL ||
 +                                                  ir->gb_algorithm == egbHCT   ||
 +                                                  ir->gb_algorithm == egbOBC))) &&
 +            getenv("GMX_NO_ALLVSALL") == NULL
 +        );
 +
 +    if (bAllvsAll && ir->opts.ngener > 1)
 +    {
 +        const char *note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
 +
 +        if (bPrintNote)
 +        {
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr, "\n%s\n", note);
 +            }
 +            if (fp != NULL)
 +            {
 +                fprintf(fp, "\n%s\n", note);
 +            }
 +        }
 +        bAllvsAll = FALSE;
 +    }
 +
 +    if (bAllvsAll && fp && MASTER(cr))
 +    {
 +        fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
 +    }
 +
 +    return bAllvsAll;
 +}
 +
 +
 +static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
 +{
 +    int t, i;
 +
 +    /* These thread local data structures are used for bondeds only */
 +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
 +
 +    if (fr->nthreads > 1)
 +    {
 +        snew(fr->f_t, fr->nthreads);
 +        /* Thread 0 uses the global force and energy arrays */
 +        for (t = 1; t < fr->nthreads; t++)
 +        {
 +            fr->f_t[t].f        = NULL;
 +            fr->f_t[t].f_nalloc = 0;
 +            snew(fr->f_t[t].fshift, SHIFTS);
 +            fr->f_t[t].grpp.nener = nenergrp*nenergrp;
 +            for (i = 0; i < egNR; i++)
 +            {
 +                snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
 +            }
 +        }
 +    }
 +}
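
Each thread beyond the first gets private force, shift-force and energy accumulators so bonded contributions can be reduced without locking; the energy scratch is one nenergrp x nenergrp matrix per energy-term kind. A minimal allocation sketch under those assumptions (egNR and the struct are stand-ins for the real GROMACS types):

    #include <stdlib.h>

    #define egNR 10                     /* assumed number of energy-term kinds */

    typedef struct {
        double *ener[egNR];             /* one nener-sized array per term */
        int     nener;
    } grpp_sketch_t;

    static void alloc_grpp(grpp_sketch_t *g, int nenergrp)
    {
        int i;

        g->nener = nenergrp*nenergrp;   /* matrix over energy-group pairs */
        for (i = 0; i < egNR; i++)
        {
            g->ener[i] = calloc(g->nener, sizeof(*g->ener[i]));
        }
    }

    int main(void)
    {
        grpp_sketch_t g;

        alloc_grpp(&g, 3);              /* 3 energy groups -> 9 pair entries */
        return 0;
    }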
 +
 +
 +static void pick_nbnxn_kernel_cpu(FILE             *fp,
 +                                  const t_commrec  *cr,
 +                                  const gmx_cpuid_t cpuid_info,
 +                                  const t_inputrec *ir,
 +                                  int              *kernel_type,
 +                                  int              *ewald_excl)
 +{
 +    *kernel_type = nbnxnk4x4_PlainC;
 +    *ewald_excl  = ewaldexclTable;
 +
 +#ifdef GMX_NBNXN_SIMD
 +    {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +        *kernel_type = nbnxnk4xN_SIMD_4xN;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +        /* We expect the 2xNN kernels to be faster in most cases */
 +        *kernel_type = nbnxnk4xN_SIMD_2xNN;
 +#endif
 +
 +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
 +        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
 +        {
 +            /* The raw pair rate of the 4x8 kernel is higher than that of
 +             * 2x(4+4), by 10% with HT and 50% without HT, but the extra
 +             * zero interactions can compensate. As we currently don't
 +             * detect actual HT use, we switch to 4x8 to avoid a potential
 +             * performance hit.
 +             */
 +            *kernel_type = nbnxnk4xN_SIMD_4xN;
 +        }
 +#endif
 +        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
 +        {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +            *kernel_type = nbnxnk4xN_SIMD_4xN;
 +#else
 +            gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
 +#endif
 +        }
 +        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
 +        {
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +            *kernel_type = nbnxnk4xN_SIMD_2xNN;
 +#else
 +            gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
 +#endif
 +        }
 +
 +        /* Analytical Ewald exclusion correction is only an option in the
 +         * x86 SIMD kernel. This is faster in single precision
 +         * on Bulldozer and slightly faster on Sandy Bridge.
 +         */
 +#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
 +        *ewald_excl = ewaldexclAnalytical;
 +#endif
 +        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
 +        {
 +            *ewald_excl = ewaldexclTable;
 +        }
 +        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
 +        {
 +            *ewald_excl = ewaldexclAnalytical;
 +        }
 +
 +    }
 +#endif /* GMX_NBNXN_SIMD */
 +}
 +
 +
 +const char *lookup_nbnxn_kernel_name(int kernel_type)
 +{
 +    const char *returnvalue = NULL;
 +    switch (kernel_type)
 +    {
 +        case nbnxnkNotSet: returnvalue     = "not set"; break;
 +        case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
 +#ifndef GMX_NBNXN_SIMD
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "not available"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
 +#else
 +#ifdef GMX_X86_SSE2
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +            /* x86 SIMD intrinsics can be compiled to either SSE or AVX
 +             * depending on compiler flags. As we use nearly identical
 +             * intrinsics, compiling with an AVX flag but without the AVX
 +             * macro effectively results in AVX kernels. For gcc we check
 +             * for __AVX__; a corresponding check for icc should be added
 +             * (if such a macro exists).
 +             */
 +#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
 +#ifndef GMX_X86_SSE4_1
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE2"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
 +#else
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE4.1"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
 +#endif
 +#else
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-128"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
 +#endif
 +#endif
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-256"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
 +#endif
 +#else   /* not GMX_X86_SSE2 */
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SIMD"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
 +#endif
 +#endif
 +        case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
 +        case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
 +
 +        case nbnxnkNR:
 +        default:
 +            gmx_fatal(FARGS, "Illegal kernel type selected");
 +            returnvalue = NULL;
 +            break;
 +    }
 +    return returnvalue;
 +}
 +
 +static void pick_nbnxn_kernel(FILE                *fp,
 +                              const t_commrec     *cr,
 +                              const gmx_hw_info_t *hwinfo,
 +                              gmx_bool             use_cpu_acceleration,
 +                              gmx_bool             bUseGPU,
 +                              gmx_bool             bEmulateGPU,
 +                              const t_inputrec    *ir,
 +                              int                 *kernel_type,
 +                              int                 *ewald_excl,
 +                              gmx_bool             bDoNonbonded)
 +{
 +    assert(kernel_type);
 +
 +    *kernel_type = nbnxnkNotSet;
 +    *ewald_excl  = ewaldexclTable;
 +
 +    if (bEmulateGPU)
 +    {
 +        *kernel_type = nbnxnk8x8x8_PlainC;
 +
 +        if (bDoNonbonded)
 +        {
 +            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
 +        }
 +    }
 +    else if (bUseGPU)
 +    {
 +        *kernel_type = nbnxnk8x8x8_CUDA;
 +    }
 +
 +    if (*kernel_type == nbnxnkNotSet)
 +    {
 +        if (use_cpu_acceleration)
 +        {
 +            pick_nbnxn_kernel_cpu(fp, cr, hwinfo->cpuid_info, ir,
 +                                  kernel_type, ewald_excl);
 +        }
 +        else
 +        {
 +            *kernel_type = nbnxnk4x4_PlainC;
 +        }
 +    }
 +
 +    if (bDoNonbonded && fp != NULL)
 +    {
 +        fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
 +                lookup_nbnxn_kernel_name(*kernel_type),
 +                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
 +                nbnxn_kernel_to_cj_size(*kernel_type));
 +    }
 +}
 +
 +static void pick_nbnxn_resources(FILE                *fp,
 +                                 const t_commrec     *cr,
 +                                 const gmx_hw_info_t *hwinfo,
 +                                 gmx_bool             bDoNonbonded,
 +                                 gmx_bool            *bUseGPU,
 +                                 gmx_bool            *bEmulateGPU)
 +{
 +    gmx_bool bEmulateGPUEnvVarSet;
 +    char     gpu_err_str[STRLEN];
 +
 +    *bUseGPU = FALSE;
 +
 +    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
 +     * GPUs (currently) only handle non-bonded calculations, we will
 +     * automatically switch to emulation if non-bonded calculations are
 +     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
 +     * way to turn off GPU initialization, data movement, and cleanup.
 +     *
 +     * GPU emulation can be useful to assess the performance one can expect by
 +     * adding GPU(s) to the machine. The conditional below allows this even
 +     * if mdrun is compiled without GPU acceleration support.
 +     * Note that you should freeze the system, as otherwise it will explode.
 +     */
 +    *bEmulateGPU = (bEmulateGPUEnvVarSet ||
 +                    (!bDoNonbonded && hwinfo->bCanUseGPU));
 +
 +    /* Enable GPU mode when GPUs are available and GPU emulation is not
 +     * requested.
 +     */
 +    if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
 +    {
 +        /* Each PP node will use the intra-node id-th device from the
 +         * list of detected/selected GPUs. */
 +        if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
 +        {
 +            /* At this point the init should never fail as we made sure that
 +             * we have all the GPUs we need. If it still does, we'll bail. */
 +            gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
 +                      cr->nodeid,
 +                      get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
 +                      gpu_err_str);
 +        }
 +
 +        /* Here we actually turn on hardware GPU acceleration */
 +        *bUseGPU = TRUE;
 +    }
 +}
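
The emulation decision above boils down to a small truth table over two inputs: whether GMX_EMULATE_GPU is set, and whether non-bonded work is enabled at all (it is disabled by GMX_NO_NONBONDED). A stand-alone sketch of just that logic, with gmx_bool replaced by stdbool and the hardware capability passed in as a plain flag:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Mirrors pick_nbnxn_resources: emulate when explicitly asked to, or when
     * non-bondeds are off but a usable GPU is present (skipping GPU init,
     * transfers and cleanup); use the real GPU only when not emulating. */
    static void pick_resources_sketch(bool bDoNonbonded, bool bCanUseGPU,
                                      bool *bUseGPU, bool *bEmulateGPU)
    {
        bool envSet = (getenv("GMX_EMULATE_GPU") != NULL);

        *bEmulateGPU = envSet || (!bDoNonbonded && bCanUseGPU);
        *bUseGPU     = bCanUseGPU && !(*bEmulateGPU);
    }

    int main(void)
    {
        bool use, emu;

        pick_resources_sketch(true, true, &use, &emu);
        printf("nonbonded on,  GPU present: use=%d emulate=%d\n", use, emu);
        pick_resources_sketch(false, true, &use, &emu);
        printf("nonbonded off, GPU present: use=%d emulate=%d\n", use, emu);
        return 0;
    }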
 +
 +gmx_bool uses_simple_tables(int                 cutoff_scheme,
 +                            nonbonded_verlet_t *nbv,
 +                            int                 group)
 +{
 +    gmx_bool bUsesSimpleTables = TRUE;
 +    int      grp_index;
 +
 +    switch (cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            bUsesSimpleTables = TRUE;
 +            break;
 +        case ecutsVERLET:
 +            assert(NULL != nbv && NULL != nbv->grp);
 +            grp_index         = (group < 0) ? 0 : (nbv->ngrp - 1);
 +            bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +    }
 +    return bUsesSimpleTables;
 +}
 +
 +static void init_ewald_f_table(interaction_const_t *ic,
 +                               gmx_bool             bUsesSimpleTables,
 +                               real                 rtab)
 +{
 +    real maxr;
 +
 +    if (bUsesSimpleTables)
 +    {
 +        /* With a spacing of 0.0005 we are at the force summation accuracy
 +         * for the SSE kernels for "normal" atomistic simulations.
 +         */
 +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
 +                                                   ic->rcoulomb);
 +
 +        maxr           = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
 +        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
 +    }
 +    else
 +    {
 +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +        /* Subtract 2 instead of 1 to avoid an out-of-range access due to rounding */
 +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
 +    }
 +
 +    sfree_aligned(ic->tabq_coul_FDV0);
 +    sfree_aligned(ic->tabq_coul_F);
 +    sfree_aligned(ic->tabq_coul_V);
 +
 +    /* Create the original table data in FDV0 */
 +    snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
 +    snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
 +    snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
 +    table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
 +                                ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
 +}
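
The two sizing branches above differ in which quantity is held fixed: for the simple (CPU) tables the spacing is set by the required force accuracy and the size follows from the largest radius, while for the GPU table the size is a fixed constant and the spacing follows from rcoulomb. A worked sketch of that arithmetic with assumed numbers (a scale of 2000 points/nm standing in for ewald_spline3_table_scale, and 1536 for the GPU table size):

    #include <stdio.h>

    int main(void)
    {
        double rcoulomb = 1.0, rtab = 1.2;          /* nm */

        /* CPU path: spacing fixed, size derived from max(rtab, rcoulomb) */
        double tabq_scale = 2000.0;                 /* points per nm (assumed) */
        double maxr       = (rtab > rcoulomb) ? rtab : rcoulomb;
        int    tabq_size  = (int)(maxr*tabq_scale) + 2;

        printf("CPU table: scale=%g size=%d\n", tabq_scale, tabq_size);

        /* GPU path: size fixed, spacing derived; subtracting 2 rather than 1
         * guards against an out-of-range access after rounding at r=rcoulomb */
        int    gpu_size  = 1536;                    /* assumed constant */
        double gpu_scale = (gpu_size - 2)/rcoulomb;

        printf("GPU table: scale=%g size=%d\n", gpu_scale, gpu_size);
        return 0;
    }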
 +
 +void init_interaction_const_tables(FILE                *fp,
 +                                   interaction_const_t *ic,
 +                                   gmx_bool             bUsesSimpleTables,
 +                                   real                 rtab)
 +{
 +    real spacing;
 +
 +    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
 +    {
 +        init_ewald_f_table(ic, bUsesSimpleTables, rtab);
 +
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
 +                    1/ic->tabq_scale, ic->tabq_size);
 +        }
 +    }
 +}
 +
 +void init_interaction_const(FILE                 *fp,
 +                            interaction_const_t **interaction_const,
 +                            const t_forcerec     *fr,
 +                            real                  rtab)
 +{
 +    interaction_const_t *ic;
 +    gmx_bool             bUsesSimpleTables = TRUE;
 +
 +    snew(ic, 1);
 +
 +    /* Just allocate something so we can free it */
 +    snew_aligned(ic->tabq_coul_FDV0, 16, 32);
 +    snew_aligned(ic->tabq_coul_F, 16, 32);
 +    snew_aligned(ic->tabq_coul_V, 16, 32);
 +
 +    ic->rlist       = fr->rlist;
 +    ic->rlistlong   = fr->rlistlong;
 +
 +    /* Lennard-Jones */
 +    ic->rvdw        = fr->rvdw;
 +    if (fr->vdw_modifier == eintmodPOTSHIFT)
 +    {
 +        ic->sh_invrc6 = pow(ic->rvdw, -6.0);
 +    }
 +    else
 +    {
 +        ic->sh_invrc6 = 0;
 +    }
 +
 +    /* Electrostatics */
 +    ic->eeltype     = fr->eeltype;
 +    ic->rcoulomb    = fr->rcoulomb;
 +    ic->epsilon_r   = fr->epsilon_r;
 +    ic->epsfac      = fr->epsfac;
 +
 +    /* Ewald */
 +    ic->ewaldcoeff  = fr->ewaldcoeff;
 +    if (fr->coulomb_modifier == eintmodPOTSHIFT)
 +    {
 +        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
 +    }
 +    else
 +    {
 +        ic->sh_ewald = 0;
 +    }
 +
 +    /* Reaction-field */
 +    if (EEL_RF(ic->eeltype))
 +    {
 +        ic->epsilon_rf = fr->epsilon_rf;
 +        ic->k_rf       = fr->k_rf;
 +        ic->c_rf       = fr->c_rf;
 +    }
 +    else
 +    {
 +        /* For plain cut-off we might use the reaction-field kernels */
 +        ic->epsilon_rf = ic->epsilon_r;
 +        ic->k_rf       = 0;
 +        if (fr->coulomb_modifier == eintmodPOTSHIFT)
 +        {
 +            ic->c_rf   = 1/ic->rcoulomb;
 +        }
 +        else
 +        {
 +            ic->c_rf   = 0;
 +        }
 +    }
 +
 +    if (fp != NULL)
 +    {
 +        fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
 +                sqr(ic->sh_invrc6), ic->sh_invrc6);
 +        if (ic->eeltype == eelCUT)
 +        {
 +            fprintf(fp, ", Coulomb %.3f", ic->c_rf);
 +        }
 +        else if (EEL_PME(ic->eeltype))
 +        {
 +            fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
 +        }
 +        fprintf(fp, "\n");
 +    }
 +
 +    *interaction_const = ic;
 +
 +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv);
 +    }
 +
 +    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
 +    init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
 +}
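
Two of the shift constants set above are easy to sanity-check numerically: with potential-shift LJ, sh_invrc6 = rvdw^-6 (and its square) remove the r^-6 and r^-12 terms at the cut-off, and for plain cut-off Coulomb run through the reaction-field kernels, k_rf = 0 with c_rf = 1/rc makes the shifted 1/r - c_rf potential vanish exactly at r = rc. A small sketch of that check:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double rc = 1.0;                 /* cut-off in nm */

        /* LJ potential-shift constants */
        double sh_invrc6 = pow(rc, -6.0);
        printf("LJ shift: r^-12 term %.3f, r^-6 term %.3f\n",
               sh_invrc6*sh_invrc6, sh_invrc6);

        /* plain cut-off Coulomb via the RF kernel: zero at the cut-off */
        double c_rf = 1/rc;
        printf("shifted Coulomb at cut-off: %.3f\n", 1/rc - c_rf);
        return 0;
    }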
 +
 +static void init_nb_verlet(FILE                *fp,
 +                           nonbonded_verlet_t **nb_verlet,
 +                           const t_inputrec    *ir,
 +                           const t_forcerec    *fr,
 +                           const t_commrec     *cr,
 +                           const char          *nbpu_opt)
 +{
 +    nonbonded_verlet_t *nbv;
 +    int                 i;
 +    char               *env;
 +    gmx_bool            bEmulateGPU, bHybridGPURun = FALSE;
 +
 +    nbnxn_alloc_t      *nb_alloc;
 +    nbnxn_free_t       *nb_free;
 +
 +    snew(nbv, 1);
 +
 +    pick_nbnxn_resources(fp, cr, fr->hwinfo,
 +                         fr->bNonbonded,
 +                         &nbv->bUseGPU,
 +                         &bEmulateGPU);
 +
 +    nbv->nbs = NULL;
 +
 +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
 +    for (i = 0; i < nbv->ngrp; i++)
 +    {
 +        nbv->grp[i].nbl_lists.nnbl = 0;
 +        nbv->grp[i].nbat           = NULL;
 +        nbv->grp[i].kernel_type    = nbnxnkNotSet;
 +
 +        if (i == 0) /* local */
 +        {
 +            pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                              nbv->bUseGPU, bEmulateGPU,
 +                              ir,
 +                              &nbv->grp[i].kernel_type,
 +                              &nbv->grp[i].ewald_excl,
 +                              fr->bNonbonded);
 +        }
 +        else /* non-local */
 +        {
 +            if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
 +            {
 +                /* Use GPU for local, select a CPU kernel for non-local */
 +                pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                                  FALSE, FALSE,
 +                                  ir,
 +                                  &nbv->grp[i].kernel_type,
 +                                  &nbv->grp[i].ewald_excl,
 +                                  fr->bNonbonded);
 +
 +                bHybridGPURun = TRUE;
 +            }
 +            else
 +            {
 +                /* Use the same kernel for local and non-local interactions */
 +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
 +                nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
 +            }
 +        }
 +    }
 +
 +    if (nbv->bUseGPU)
 +    {
 +        /* init the NxN GPU data; the last argument tells whether we'll have
 +         * both local and non-local NB calculation on GPU */
 +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
 +                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
 +                        (nbv->ngrp > 1) && !bHybridGPURun);
 +
 +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
 +        {
 +            char *end;
 +
 +            nbv->min_ci_balanced = strtol(env, &end, 10);
 +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
 +            {
 +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
 +            }
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +        else
 +        {
 +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nbv->min_ci_balanced = 0;
 +    }
 +
 +    *nb_verlet = nbv;
 +
 +    nbnxn_init_search(&nbv->nbs,
 +                      DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
 +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
 +                      gmx_omp_nthreads_get(emntNonbonded));
 +
 +    for (i = 0; i < nbv->ngrp; i++)
 +    {
 +        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 +        {
 +            nb_alloc = &pmalloc;
 +            nb_free  = &pfree;
 +        }
 +        else
 +        {
 +            nb_alloc = NULL;
 +            nb_free  = NULL;
 +        }
 +
 +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                /* 8x8x8 "non-simple" lists are at the moment always combined */
 +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                nb_alloc, nb_free);
 +
 +        if (i == 0 ||
 +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
 +        {
 +            snew(nbv->grp[i].nbat, 1);
 +            nbnxn_atomdata_init(fp,
 +                                nbv->grp[i].nbat,
 +                                nbv->grp[i].kernel_type,
 +                                fr->ntype, fr->nbfp,
 +                                ir->opts.ngener,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
 +                                nb_alloc, nb_free);
 +        }
 +        else
 +        {
 +            nbv->grp[i].nbat = nbv->grp[0].nbat;
 +        }
 +    }
 +}
 +
 +void init_forcerec(FILE              *fp,
 +                   const output_env_t oenv,
 +                   t_forcerec        *fr,
 +                   t_fcdata          *fcd,
 +                   const t_inputrec  *ir,
 +                   const gmx_mtop_t  *mtop,
 +                   const t_commrec   *cr,
 +                   matrix             box,
 +                   gmx_bool           bMolEpot,
 +                   const char        *tabfn,
 +                   const char        *tabafn,
 +                   const char        *tabpfn,
 +                   const char        *tabbfn,
 +                   const char        *nbpu_opt,
 +                   gmx_bool           bNoSolvOpt,
 +                   real               print_force)
 +{
 +    int            i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
 +    real           rtab;
 +    char          *env;
 +    double         dbl;
 +    rvec           box_size;
 +    const t_block *cgs;
 +    gmx_bool       bGenericKernelOnly;
 +    gmx_bool       bTab, bSep14tab, bNormalnblists;
 +    t_nblists     *nbl;
 +    int           *nm_ind, egp_flags;
 +
++    if (fr->hwinfo == NULL)
++    {
++        /* Detect hardware, gather information.
++         * In mdrun, hwinfo has already been set before calling init_forcerec.
++         * Here we ignore GPUs, as tools will not use them anyhow.
++         */
++        snew(fr->hwinfo, 1);
++        gmx_detect_hardware(fp, fr->hwinfo, cr,
++                            FALSE, FALSE, NULL);
++    }
++
 +    /* By default we turn acceleration on, but it might be turned off further down... */
 +    fr->use_cpu_acceleration = TRUE;
 +
 +    fr->bDomDec = DOMAINDECOMP(cr);
 +
 +    natoms = mtop->natoms;
 +
 +    if (check_box(ir->ePBC, box))
 +    {
 +        /* Pass the message through "%s" so it is not treated as a format string */
 +        gmx_fatal(FARGS, "%s", check_box(ir->ePBC, box));
 +    }
 +
 +    /* Test particle insertion ? */
 +    if (EI_TPI(ir->eI))
 +    {
 +        /* Set to the size of the molecule to be inserted (the last one) */
 +        /* Because of old style topologies, we have to use the last cg
 +         * instead of the last molecule type.
 +         */
 +        cgs       = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
 +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
 +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
 +        {
 +            gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
 +        }
 +    }
 +    else
 +    {
 +        fr->n_tpi = 0;
 +    }
 +
 +    /* Copy AdResS parameters */
 +    if (ir->bAdress)
 +    {
 +        fr->adress_type           = ir->adress->type;
 +        fr->adress_const_wf       = ir->adress->const_wf;
 +        fr->adress_ex_width       = ir->adress->ex_width;
 +        fr->adress_hy_width       = ir->adress->hy_width;
 +        fr->adress_icor           = ir->adress->icor;
 +        fr->adress_site           = ir->adress->site;
 +        fr->adress_ex_forcecap    = ir->adress->ex_forcecap;
 +        fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
 +
 +
 +        snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
 +        for (i = 0; i < ir->adress->n_energy_grps; i++)
 +        {
 +            fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
 +        }
 +
 +        fr->n_adress_tf_grps = ir->adress->n_tf_grps;
 +        snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
 +        for (i = 0; i < fr->n_adress_tf_grps; i++)
 +        {
 +            fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
 +        }
 +        copy_rvec(ir->adress->refs, fr->adress_refs);
 +    }
 +    else
 +    {
 +        fr->adress_type           = eAdressOff;
 +        fr->adress_do_hybridpairs = FALSE;
 +    }
 +
 +    /* Copy the user determined parameters */
 +    fr->userint1  = ir->userint1;
 +    fr->userint2  = ir->userint2;
 +    fr->userint3  = ir->userint3;
 +    fr->userint4  = ir->userint4;
 +    fr->userreal1 = ir->userreal1;
 +    fr->userreal2 = ir->userreal2;
 +    fr->userreal3 = ir->userreal3;
 +    fr->userreal4 = ir->userreal4;
 +
 +    /* Shell stuff */
 +    fr->fc_stepsize = ir->fc_stepsize;
 +
 +    /* Free energy */
 +    fr->efep        = ir->efep;
 +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
 +    if (ir->fepvals->bScCoul)
 +    {
 +        fr->sc_alphacoul  = ir->fepvals->sc_alpha;
 +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
 +    }
 +    else
 +    {
 +        fr->sc_alphacoul  = 0;
 +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
 +    }
 +    fr->sc_power      = ir->fepvals->sc_power;
 +    fr->sc_r_power    = ir->fepvals->sc_r_power;
 +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
 +
 +    env = getenv("GMX_SCSIGMA_MIN");
 +    if (env != NULL)
 +    {
 +        dbl = 0;
 +        sscanf(env, "%lf", &dbl);
 +        fr->sc_sigma6_min = pow(dbl, 6);
 +        if (fp)
 +        {
 +            fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
 +        }
 +    }
 +
 +    fr->bNonbonded = TRUE;
 +    if (getenv("GMX_NO_NONBONDED") != NULL)
 +    {
 +        /* turn off non-bonded calculations */
 +        fr->bNonbonded = FALSE;
 +        md_print_warn(cr, fp,
 +                      "Found environment variable GMX_NO_NONBONDED.\n"
 +                      "Disabling nonbonded calculations.\n");
 +    }
 +
 +    bGenericKernelOnly = FALSE;
 +
 +    /* We now check in the NS code whether a particular combination of interactions
 +     * can be used with water optimization, and disable it if that is not the case.
 +     */
 +
 +    if (getenv("GMX_NB_GENERIC") != NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "Found environment variable GMX_NB_GENERIC.\n"
 +                    "Disabling all interaction-specific nonbonded kernels, will only\n"
 +                    "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
 +        }
 +        bGenericKernelOnly = TRUE;
 +    }
 +
 +    if (bGenericKernelOnly == TRUE)
 +    {
 +        bNoSolvOpt         = TRUE;
 +    }
 +
 +    if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
 +    {
 +        fr->use_cpu_acceleration = FALSE;
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
 +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
 +        }
 +    }
 +
 +    fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +
 +    /* Check if we can/should do all-vs-all kernels */
 +    fr->bAllvsAll       = can_use_allvsall(ir, mtop, FALSE, NULL, NULL);
 +    fr->AllvsAll_work   = NULL;
 +    fr->AllvsAll_workgb = NULL;
 +
 +
 +    /* Neighbour searching stuff */
 +    fr->cutoff_scheme = ir->cutoff_scheme;
 +    fr->bGrid         = (ir->ns_type == ensGRID);
 +    fr->ePBC          = ir->ePBC;
 +
 +    /* Determine if we will do PBC for distances in bonded interactions */
 +    if (fr->ePBC == epbcNONE)
 +    {
 +        fr->bMolPBC = FALSE;
 +    }
 +    else
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            /* The group cut-off scheme and SHAKE assume charge groups
 +             * are whole, but not using molpbc is faster in most cases.
 +             */
 +            if (fr->cutoff_scheme == ecutsGROUP ||
 +                (ir->eConstrAlg == econtSHAKE &&
 +                 (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
 +                  gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
 +            {
 +                fr->bMolPBC = ir->bPeriodicMols;
 +            }
 +            else
 +            {
 +                fr->bMolPBC = TRUE;
 +                if (getenv("GMX_USE_GRAPH") != NULL)
 +                {
 +                    fr->bMolPBC = FALSE;
 +                    if (fp)
 +                    {
 +                        fprintf(fp, "\nGMX_USE_GRAPH is set, using the graph for bonded interactions\n\n");
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
 +        }
 +    }
 +    fr->bGB = (ir->implicit_solvent == eisGBSA);
 +
 +    fr->rc_scaling = ir->refcoord_scaling;
 +    copy_rvec(ir->posres_com, fr->posres_com);
 +    copy_rvec(ir->posres_comB, fr->posres_comB);
 +    fr->rlist      = cutoff_inf(ir->rlist);
 +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
 +    fr->eeltype    = ir->coulombtype;
 +    fr->vdwtype    = ir->vdwtype;
 +
 +    fr->coulomb_modifier = ir->coulomb_modifier;
 +    fr->vdw_modifier     = ir->vdw_modifier;
 +
 +    /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
 +    switch (fr->eeltype)
 +    {
 +        case eelCUT:
 +            fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
 +            break;
 +
 +        case eelRF:
 +        case eelGRF:
 +        case eelRF_NEC:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            break;
 +
 +        case eelRF_ZERO:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            fr->coulomb_modifier          = eintmodEXACTCUTOFF;
 +            break;
 +
 +        case eelSWITCH:
 +        case eelSHIFT:
 +        case eelUSER:
 +        case eelENCADSHIFT:
 +        case eelPMESWITCH:
 +        case eelPMEUSER:
 +        case eelPMEUSERSWITCH:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            break;
 +
 +        case eelPME:
 +        case eelEWALD:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
 +            break;
 +    }
 +
 +    /* Vdw: Translate from mdp settings to kernel format */
 +    switch (fr->vdwtype)
 +    {
 +        case evdwCUT:
 +            if (fr->bBHAM)
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
 +            }
 +            else
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
 +            }
 +            break;
 +
 +        case evdwSWITCH:
 +        case evdwSHIFT:
 +        case evdwUSER:
 +        case evdwENCADSHIFT:
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
 +            break;
 +    }
 +
 +    /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
 +    fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
 +    fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
 +
 +    fr->bTwinRange = fr->rlistlong > fr->rlist;
 +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
 +
 +    fr->reppow     = mtop->ffparams.reppow;
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
 +                          !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
 +        /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
 +        fr->bcoultab   = !(fr->eeltype == eelCUT ||
 +                           fr->eeltype == eelEWALD ||
 +                           fr->eeltype == eelPME ||
 +                           fr->eeltype == eelRF ||
 +                           fr->eeltype == eelRF_ZERO);
 +
 +        /* If the user absolutely wants different switch/shift settings for
 +         * coul/vdw, it is likely going to be faster to tabulate the
 +         * interaction than to call the generic kernel.
 +         */
 +        if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
 +        {
 +            if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +        else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
 +                 ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
 +                   fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
 +                   (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
 +        {
 +            if (fr->rcoulomb != fr->rvdw)
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +
 +        if (getenv("GMX_REQUIRE_TABLES"))
 +        {
 +            fr->bvdwtab  = TRUE;
 +            fr->bcoultab = TRUE;
 +        }
 +
 +        if (fp)
 +        {
 +            fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
 +            fprintf(fp, "Table routines are used for vdw:     %s\n", bool_names[fr->bvdwtab ]);
 +        }
 +
 +        if (fr->bvdwtab == TRUE)
 +        {
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            fr->nbkernel_vdw_modifier    = eintmodNONE;
 +        }
 +        if (fr->bcoultab == TRUE)
 +        {
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            fr->nbkernel_elec_modifier    = eintmodNONE;
 +        }
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
 +        {
 +            gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
 +        }
 +        fr->bvdwtab  = FALSE;
 +        fr->bcoultab = FALSE;
 +    }
 +
 +    /* Tables are used for direct ewald sum */
 +    if (fr->bEwald)
 +    {
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (fp)
 +            {
 +                fprintf(fp, "Will do PME sum in reciprocal space.\n");
 +            }
 +            if (ir->coulombtype == eelP3M_AD)
 +            {
 +                please_cite(fp, "Hockney1988");
 +                please_cite(fp, "Ballenegger2012");
 +            }
 +            else
 +            {
 +                please_cite(fp, "Essmann95a");
 +            }
 +
 +            if (ir->ewald_geometry == eewg3DC)
 +            {
 +                if (fp)
 +                {
 +                    fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
 +                }
 +                please_cite(fp, "In-Chul99a");
 +            }
 +        }
 +        fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
 +        init_ewald_tab(&(fr->ewald_table), cr, ir, fp);
 +        if (fp)
 +        {
 +            fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
 +                    1/fr->ewaldcoeff);
 +        }
 +    }
 +
 +    /* Electrostatics */
 +    fr->epsilon_r       = ir->epsilon_r;
 +    fr->epsilon_rf      = ir->epsilon_rf;
 +    fr->fudgeQQ         = mtop->ffparams.fudgeQQ;
 +    fr->rcoulomb_switch = ir->rcoulomb_switch;
 +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
 +
 +    /* Parameters for generalized RF */
 +    fr->zsquare = 0.0;
 +    fr->temp    = 0.0;
 +
 +    if (fr->eeltype == eelGRF)
 +    {
 +        init_generalized_rf(fp, mtop, ir, fr);
 +    }
 +    else if (fr->eeltype == eelSHIFT)
 +    {
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            box_size[m] = box[m][m];
 +        }
 +
 +        if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
 +        {
 +            set_shift_consts(fp, fr->rcoulomb_switch, fr->rcoulomb, box_size, fr);
 +        }
 +    }
 +
 +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
 +                       gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
 +                       gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
 +                       IR_ELEC_FIELD(*ir) ||
 +                       (fr->adress_icor != eAdressICOff)
 +                       );
 +
 +    if (fr->cutoff_scheme == ecutsGROUP &&
 +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
 +    {
 +        /* Count the total number of charge groups */
 +        fr->cg_nalloc = ncg_mtop(mtop);
 +        srenew(fr->cg_cm, fr->cg_nalloc);
 +    }
 +    if (fr->shift_vec == NULL)
 +    {
 +        snew(fr->shift_vec, SHIFTS);
 +    }
 +
 +    if (fr->fshift == NULL)
 +    {
 +        snew(fr->fshift, SHIFTS);
 +    }
 +
 +    if (fr->nbfp == NULL)
 +    {
 +        fr->ntype = mtop->ffparams.atnr;
 +        fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
 +    }
 +
 +    /* Copy the energy group exclusions */
 +    fr->egp_flags = ir->opts.egp_flags;
 +
 +    /* Van der Waals stuff */
 +    fr->rvdw        = cutoff_inf(ir->rvdw);
 +    fr->rvdw_switch = ir->rvdw_switch;
 +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
 +    {
 +        if (fr->rvdw_switch >= fr->rvdw)
 +        {
 +            gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
 +                      fr->rvdw_switch, fr->rvdw);
 +        }
 +        if (fp)
 +        {
 +            fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
 +                    (fr->vdwtype == evdwSWITCH) ? "switched" : "shifted",
 +                    fr->rvdw_switch, fr->rvdw);
 +        }
 +    }
 +
 +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
 +    {
 +        gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
 +    }
 +
 +    if (fp)
 +    {
 +        fprintf(fp, "Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
 +                fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
 +    }
 +
 +    fr->eDispCorr = ir->eDispCorr;
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        set_avcsixtwelve(fp, fr, mtop);
 +    }
 +
 +    if (fr->bBHAM)
 +    {
 +        set_bham_b_max(fp, fr, mtop);
 +    }
 +
 +    fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 +
 +    /* Copy the GBSA data (radius, volume and surftens for each
 +     * atomtype) from the topology atomtype section to forcerec.
 +     */
 +    snew(fr->atype_radius, fr->ntype);
 +    snew(fr->atype_vol, fr->ntype);
 +    snew(fr->atype_surftens, fr->ntype);
 +    snew(fr->atype_gb_radius, fr->ntype);
 +    snew(fr->atype_S_hct, fr->ntype);
 +
 +    if (mtop->atomtypes.nr > 0)
 +    {
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_radius[i] = mtop->atomtypes.radius[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
 +        }
 +    }
 +
 +    /* Generate the GB table if needed */
 +    if (fr->bGB)
 +    {
 +#ifdef GMX_DOUBLE
 +        fr->gbtabscale = 2000;
 +#else
 +        fr->gbtabscale = 500;
 +#endif
 +
 +        fr->gbtabr = 100;
 +        fr->gbtab  = make_gb_table(fp, oenv, fr, tabpfn, fr->gbtabscale);
 +
 +        init_gb(&fr->born, cr, fr, ir, mtop, ir->rgbradii, ir->gb_algorithm);
 +
 +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            make_local_gb(cr, fr->born, ir->gb_algorithm);
 +        }
 +    }
 +
 +    /* Set the charge scaling */
 +    if (fr->epsilon_r != 0)
 +    {
 +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
 +    }
 +    else
 +    {
 +        /* eps = 0 is an infinite dielectric: no Coulomb interactions */
 +        fr->epsfac = 0;
 +    }
 +
 +    /* Reaction field constants */
 +    if (EEL_RF(fr->eeltype))
 +    {
 +        calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
 +                   fr->rcoulomb, fr->temp, fr->zsquare, box,
 +                   &fr->kappa, &fr->k_rf, &fr->c_rf);
 +    }
 +
 +    set_chargesum(fp, fr, mtop);
 +
 +    /* if we are using LR electrostatics, and they are tabulated,
 +     * the tables will contain modified coulomb interactions.
 +     * Since we want to use the non-shifted ones for 1-4
 +     * coulombic interactions, we must have an extra set of tables.
 +     */
 +
 +    /* Construct tables.
 +     * A little unnecessary to make both vdw and coul tables sometimes,
 +     * but what the heck... */
 +
 +    bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
 +
 +    bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
 +                  fr->bBHAM || fr->bEwald) &&
 +                 (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
 +                  gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
 +                  gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
 +
 +    negp_pp   = ir->opts.ngener - ir->nwall;
 +    negptable = 0;
 +    if (!bTab)
 +    {
 +        bNormalnblists = TRUE;
 +        fr->nnblists   = 1;
 +    }
 +    else
 +    {
 +        bNormalnblists = (ir->eDispCorr != edispcNO);
 +        for (egi = 0; egi < negp_pp; egi++)
 +        {
 +            for (egj = egi; egj < negp_pp; egj++)
 +            {
 +                egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
 +                if (!(egp_flags & EGP_EXCL))
 +                {
 +                    if (egp_flags & EGP_TABLE)
 +                    {
 +                        negptable++;
 +                    }
 +                    else
 +                    {
 +                        bNormalnblists = TRUE;
 +                    }
 +                }
 +            }
 +        }
 +        if (bNormalnblists)
 +        {
 +            fr->nnblists = negptable + 1;
 +        }
 +        else
 +        {
 +            fr->nnblists = negptable;
 +        }
 +        if (fr->nnblists > 1)
 +        {
 +            snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
 +        }
 +    }
 +
 +    if (ir->adress)
 +    {
 +        fr->nnblists *= 2;
 +    }
 +
 +    snew(fr->nblists, fr->nnblists);
 +
 +    /* When all cut-offs are infinite, this automatically gives a table
 +     * length of tabext; in that case grompp should already have checked
 +     * that we do not need normal tables, and we generate tables for the
 +     * 1-4 interactions only.
 +     */
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (bTab)
 +    {
 +        /* make tables for ordinary interactions */
 +        if (bNormalnblists)
 +        {
 +            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
 +            if (ir->adress)
 +            {
 +                make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
 +            }
 +            if (!bSep14tab)
 +            {
 +                fr->tab14 = fr->nblists[0].table_elec_vdw;
 +            }
 +            m = 1;
 +        }
 +        else
 +        {
 +            m = 0;
 +        }
 +        if (negptable > 0)
 +        {
 +            /* Read the special tables for certain energy group pairs */
 +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
 +            for (egi = 0; egi < negp_pp; egi++)
 +            {
 +                for (egj = egi; egj < negp_pp; egj++)
 +                {
 +                    egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
 +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
 +                    {
 +                        nbl = &(fr->nblists[m]);
 +                        if (fr->nnblists > 1)
 +                        {
 +                            fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
 +                        }
 +                        /* Read the table file with the two energy groups names appended */
 +                        make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[m]);
 +                        if (ir->adress)
 +                        {
 +                            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
 +                                            *mtop->groups.grpname[nm_ind[egi]],
 +                                            *mtop->groups.grpname[nm_ind[egj]],
 +                                            &fr->nblists[fr->nnblists/2+m]);
 +                        }
 +                        m++;
 +                    }
 +                    else if (fr->nnblists > 1)
 +                    {
 +                        fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (bSep14tab)
 +    {
 +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
 +        fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
 +                                GMX_MAKETABLES_14ONLY);
 +    }
 +
 +    /* Read AdResS Thermo Force table if needed */
 +    if (fr->adress_icor == eAdressICThermoForce)
 +    {
 +        /* old todo replace */
 +
 +        if (ir->adress->n_tf_grps > 0)
 +        {
 +            make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
 +
 +        }
 +        else
 +        {
 +            /* load the default table */
 +            snew(fr->atf_tabs, 1);
 +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
 +        }
 +    }
 +
 +    /* Wall stuff */
 +    fr->nwall = ir->nwall;
 +    if (ir->nwall && ir->wall_type == ewtTABLE)
 +    {
 +        make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
 +    }
 +
 +    if (fcd && tabbfn)
 +    {
 +        fcd->bondtab  = make_bonded_tables(fp,
 +                                           F_TABBONDS, F_TABBONDSNC,
 +                                           mtop, tabbfn, "b");
 +        fcd->angletab = make_bonded_tables(fp,
 +                                           F_TABANGLES, -1,
 +                                           mtop, tabbfn, "a");
 +        fcd->dihtab   = make_bonded_tables(fp,
 +                                           F_TABDIHS, -1,
 +                                           mtop, tabbfn, "d");
 +    }
 +    else
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
 +        }
 +    }
 +
 +    /* QM/MM initialization if requested
 +     */
 +    if (ir->bQMMM)
 +    {
 +        fprintf(stderr, "QM/MM calculation requested.\n");
 +    }
 +
 +    fr->bQMMM      = ir->bQMMM;
 +    fr->qr         = mk_QMMMrec();
 +
 +    /* Set all the static charge group info */
 +    fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
 +                                   &fr->bExcl_IntraCGAll_InterCGNone);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        fr->cginfo = NULL;
 +    }
 +    else
 +    {
 +        fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
 +    }
 +
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* When using particle decomposition, the effect of the second argument,
 +         * which sets fr->hcg, is corrected later in do_md and init_em.
 +         */
 +        forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
 +                            mtop->natoms, mtop->natoms, mtop->natoms);
 +    }
 +
 +    fr->print_force = print_force;
 +
 +
 +    /* coarse load balancing vars */
 +    fr->t_fnbf    = 0.;
 +    fr->t_wait    = 0.;
 +    fr->timesteps = 0;
 +
 +    /* Initialize neighbor search */
 +    init_ns(fp, cr, &fr->ns, fr, mtop, box);
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        gmx_nonbonded_setup(fp, fr, bGenericKernelOnly);
 +        /*
 +           if (ir->bAdress)
 +            {
 +                gmx_setup_adress_kernels(fp,bGenericKernelOnly);
 +            }
 +         */
 +    }
 +
 +    /* Initialize the thread working data for bonded interactions */
 +    init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
 +
 +    snew(fr->excl_load, fr->nthreads+1);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
 +        }
 +
 +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
 +    }
 +
 +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
 +    init_interaction_const(fp, &fr->ic, fr, rtab);
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        calc_enervirdiff(fp, ir->eDispCorr, fr);
 +    }
 +}
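
The energy-group table logic in init_forcerec above scans only the upper triangle of the group-pair matrix and counts one extra neighbour list per non-excluded pair flagged as tabulated. A toy recount of that bookkeeping (the GID macro and the flag bits are stand-ins mirroring the assumed symmetric ngener*ngener layout):

    #include <stdio.h>

    /* pair index into the flattened flag matrix (assumed layout) */
    #define GID(e0, e1, n) ((e0) < (e1) ? (e0)*(n)+(e1) : (e1)*(n)+(e0))

    #define EGP_EXCL  (1<<0)             /* assumed flag bits, illustration only */
    #define EGP_TABLE (1<<1)

    int main(void)
    {
        int ngener = 3, egi, egj, negptable = 0;
        int flags[3*3] = { 0 };

        flags[GID(0, 1, ngener)] = EGP_TABLE;  /* pair (0,1) has its own table */

        for (egi = 0; egi < ngener; egi++)
        {
            for (egj = egi; egj < ngener; egj++)
            {
                int f = flags[GID(egi, egj, ngener)];

                if (!(f & EGP_EXCL) && (f & EGP_TABLE))
                {
                    negptable++;
                }
            }
        }
        printf("extra tables needed: %d\n", negptable);  /* prints: 1 */
        return 0;
    }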
 +
 +#define pr_real(fp, r) fprintf(fp, "%s: %e\n",#r, r)
 +#define pr_int(fp, i)  fprintf((fp), "%s: %d\n",#i, i)
 +#define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[b])
 +
 +void pr_forcerec(FILE *fp, t_forcerec *fr, t_commrec *cr)
 +{
 +    int i;
 +
 +    pr_real(fp, fr->rlist);
 +    pr_real(fp, fr->rcoulomb);
 +    pr_real(fp, fr->fudgeQQ);
 +    pr_bool(fp, fr->bGrid);
 +    pr_bool(fp, fr->bTwinRange);
 +    /*pr_int(fp,fr->cg0);
 +       pr_int(fp,fr->hcg);*/
 +    for (i = 0; i < fr->nnblists; i++)
 +    {
 +        pr_int(fp, fr->nblists[i].table_elec_vdw.n);
 +    }
 +    pr_real(fp, fr->rcoulomb_switch);
 +    pr_real(fp, fr->rcoulomb);
 +
 +    fflush(fp);
 +}
 +
 +void forcerec_set_excl_load(t_forcerec *fr,
 +                            const gmx_localtop_t *top, const t_commrec *cr)
 +{
 +    const int *ind, *a;
 +    int        t, i, j, ntot, n, ntarget;
 +
 +    if (cr != NULL && PARTDECOMP(cr))
 +    {
 +        /* No OpenMP with particle decomposition */
 +        pd_at_range(cr,
 +                    &fr->excl_load[0],
 +                    &fr->excl_load[1]);
 +
 +        return;
 +    }
 +
 +    ind = top->excls.index;
 +    a   = top->excls.a;
 +
 +    ntot = 0;
 +    for (i = 0; i < top->excls.nr; i++)
 +    {
 +        for (j = ind[i]; j < ind[i+1]; j++)
 +        {
 +            if (a[j] > i)
 +            {
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    fr->excl_load[0] = 0;
 +    n                = 0;
 +    i                = 0;
 +    for (t = 1; t <= fr->nthreads; t++)
 +    {
 +        ntarget = (ntot*t)/fr->nthreads;
 +        while (i < top->excls.nr && n < ntarget)
 +        {
 +            for (j = ind[i]; j < ind[i+1]; j++)
 +            {
 +                if (a[j] > i)
 +                {
 +                    n++;
 +                }
 +            }
 +            i++;
 +        }
 +        fr->excl_load[t] = i;
 +    }
 +}
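
forcerec_set_excl_load is a greedy prefix partition: it first counts the total "upper-triangle" exclusion work, then walks the atoms once, cutting whenever the running count reaches t/nthreads of the total, so each thread gets a contiguous, roughly equal slice. The same idea on a toy per-atom count array (the real code derives the counts from top->excls on the fly):

    #include <stdio.h>

    /* load[t] becomes the first atom index owned by thread t */
    static void set_load_sketch(const int *count, int natoms,
                                int nthreads, int *load)
    {
        int ntot = 0, i, t, n;

        for (i = 0; i < natoms; i++)
        {
            ntot += count[i];
        }

        load[0] = 0;
        n       = 0;
        i       = 0;
        for (t = 1; t <= nthreads; t++)
        {
            int ntarget = (ntot*t)/nthreads;

            while (i < natoms && n < ntarget)
            {
                n += count[i++];
            }
            load[t] = i;
        }
    }

    int main(void)
    {
        int count[8] = { 4, 1, 1, 6, 2, 2, 0, 4 };   /* 20 units of work */
        int load[3];

        set_load_sketch(count, 8, 2, load);
        printf("thread 0: atoms [%d,%d)  thread 1: atoms [%d,%d)\n",
               load[0], load[1], load[1], load[2]);  /* [0,4) and [4,8) */
        return 0;
    }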
index 27ac6045c005580d195faf174e665b795d2abdf8,0000000000000000000000000000000000000000..1788eee771ab5c50af3e8dd2333b63b1174a31fc
mode 100644,000000..100644
--- /dev/null
@@@ -1,673 -1,0 +1,675 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdlib.h>
 +#include <assert.h>
 +
 +#if defined(_MSC_VER)
 +#include <limits>
 +#endif
 +
++#include <cuda.h>
++
 +#include "types/simple.h" 
 +#include "types/nbnxn_pairlist.h"
 +#include "types/nb_verlet.h"
 +#include "types/ishift.h"
 +#include "types/force_flags.h"
 +#include "../nbnxn_consts.h"
 +
 +#ifdef TMPI_ATOMICS
 +#include "thread_mpi/atomic.h"
 +#endif
 +
 +#include "nbnxn_cuda_types.h"
 +#include "../../gmxlib/cuda_tools/cudautils.cuh"
 +#include "nbnxn_cuda.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +
 +/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
 +texture<float, 1, cudaReadModeElementType> tex_nbfp;
 +
 +/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
 +texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
 +
 +/* Convenience defines */
 +#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
 +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
 +
 +/***** The kernels come here *****/
 +#include "nbnxn_cuda_kernel_utils.cuh"
 +
 +/* Generate all combinations of kernels through multiple inclusion:
 +   F, F + E, F + prune, F + E + prune. */
 +/** Force only **/
 +#include "nbnxn_cuda_kernels.cuh"
 +/** Force & energy **/
 +#define CALC_ENERGIES
 +#include "nbnxn_cuda_kernels.cuh"
 +#undef CALC_ENERGIES
 +
 +/*** Pair-list pruning kernels ***/
 +/** Force only **/
 +#define PRUNE_NBL
 +#include "nbnxn_cuda_kernels.cuh"
 +/** Force & energy **/
 +#define CALC_ENERGIES
 +#include "nbnxn_cuda_kernels.cuh"
 +#undef CALC_ENERGIES
 +#undef PRUNE_NBL
 +
 +/*! Nonbonded kernel function pointer type */
 +typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 +                                     const cu_nbparam_t,
 +                                     const cu_plist_t,
 +                                     bool);
 +
 +/*********************************/
 +
 +/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 +static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 +static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 +static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 +
 +
 +/* Bit-pattern used for polling-based GPU synchronization. Interpreted as a
 + * float it has the exponent field set to the bias (127 -- single precision)
 + * and the mantissa set to 0, i.e. it is the bit pattern of 1.0f.
 + */
 +static unsigned int poll_wait_pattern = (0x7FU << 23);
 +
 +/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 +static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 +{
 +    int max_grid_x_size;
 +
 +    assert(dinfo);
 +
 +    max_grid_x_size = dinfo->prop.maxGridSize[0];
 +
 +    /* do we exceed the grid x dimension limit? */
 +    if (nwork_units > max_grid_x_size)
 +    {
 +        gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
 +                  "The number of nonbonded work units (=number of super-clusters) exceeds the"
 +                  "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 +    }
 +
 +    return nwork_units;
 +}
 +
 +
 +/* Constant arrays listing all kernel function pointers and enabling selection
 +   of a kernel in an elegant manner. */
 +
 +static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 +static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 +
 +/* Default kernels */
 +static const nbnxn_cu_kfunc_ptr_t
 +nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 +{
 +    { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
 +      { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
 +    { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
 +      { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
 +    { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
 +      { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
 +    { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
 +      { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
 +};
 +
 +/* Legacy kernels */
 +static const nbnxn_cu_kfunc_ptr_t
 +nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 +{
 +    { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
 +      { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
 +    { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
 +      { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
 +    { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
 +      { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
 +    { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
 +      { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
 +};
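 +
 +/* Example lookup (assuming eelCuEWALD == 0, as the row ordering above
 +   suggests): nb_default_kfunc_ptr[eelCuEWALD][1][0] is k_nbnxn_ewald_ener,
 +   i.e. the plain Ewald flavor with energy evaluation but without pair-list
 +   pruning; see select_nbnxn_kernel() below for the actual indexing. */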
 +
 +/*! Return a pointer to the kernel version to be executed at the current step. */
 +static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 +                                                       bool bDoEne, bool bDoPrune)
 +{
 +    assert(kver < eNbnxnCuKNR);
 +    assert(eeltype < eelCuNR);
 +
 +    if (NBNXN_KVER_LEGACY(kver))
 +    {
 +        return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 +    }
 +    else
 +    {
 +        return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 +    }
 +}
 +
 +/*! Calculates the amount of shared memory required for the kernel version in use. */
 +static inline int calc_shmem_required(int kver)
 +{
 +    int shmem;
 +
 +    /* size of shmem (force-buffers/xq/atom type preloading) */
 +    if (NBNXN_KVER_LEGACY(kver))
 +    {
 +        /* i-atom x+q in shared memory */
 +        shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 +        /* force reduction buffers in shared memory */
 +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 +    }
 +    else
 +    {
 +        /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 +        /* i-atom x+q in shared memory */
 +        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 +        /* cj in shared memory, for both warps separately */
 +        shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 +#ifdef IATYPE_SHMEM
 +        /* i-atom types in shared memory */
 +        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 +#endif
 +#if __CUDA_ARCH__ < 300
 +        /* force reduction buffers in shared memory */
 +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 +#endif
 +    }
 +
 +    return shmem;
 +}
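 +
 +/* Worked example (assuming the typical CL_SIZE == 8 and NCL_PER_SUPERCL == 8):
 +   the legacy kernel needs 8*8*sizeof(float4) + 8*8*3*sizeof(float)
 +   = 1024 + 768 = 1792 bytes of shared memory per thread block. */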
 +
 +/*! As we execute nonbonded workload in separate streams, before launching
 +   the kernel we need to make sure that the following operations have completed:
 +   - atomdata allocation and related H2D transfers (every nstlist step);
 +   - pair list H2D transfer (every nstlist step);
 +   - shift vector H2D transfer (every nstlist step);
 +   - force (+shift force and energy) output clearing (every step).
 +
 +   These operations are issued in the local stream at the beginning of the step
 +   and therefore always complete before the local kernel launch. The non-local
 +   kernel is launched after the local on the same device/context, so this is
 +   inherently scheduled after the operations in the local stream (including the
 +   above "misc_ops").
 +   However, for the sake of having a future-proof implementation, we use the
 +   misc_ops_done event to record the point in time when the above operations
 +   are finished and synchronize with this event in the non-local stream.
 +*/
 +void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 +                              const nbnxn_atomdata_t *nbatom,
 +                              int flags,
 +                              int iloc)
 +{
 +    cudaError_t stat;
 +    int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 +    /* CUDA kernel launch-related stuff */
 +    int  shmem, nblock;
 +    dim3 dim_block, dim_grid;
 +    nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 +
 +    cu_atomdata_t   *adat   = cu_nb->atdat;
 +    cu_nbparam_t    *nbp    = cu_nb->nbparam;
 +    cu_plist_t      *plist  = cu_nb->plist[iloc];
 +    cu_timers_t     *t      = cu_nb->timers;
 +    cudaStream_t    stream  = cu_nb->stream[iloc];
 +
 +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +    bool bDoTime     = cu_nb->bDoTime;
 +
 +    /* turn energy calculation always on/off (for debugging/testing only) */
 +    bCalcEner = (bCalcEner || always_ener) && !never_ener;
 +
 +    /* don't launch the kernel if there is no work to do */
 +    if (plist->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_I(iloc))
 +    {
 +        adat_begin  = 0;
 +        adat_len    = adat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_begin  = adat->natoms_local;
 +        adat_len    = adat->natoms - adat->natoms_local;
 +    }
 +
 +    /* When we get here all misc operations issued in the local stream are done,
 +       so we record that in the local stream and wait for it in the nonlocal one. */
 +    if (cu_nb->bUseTwoStreams)
 +    {
 +        if (iloc == eintLocal)
 +        {
 +            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 +            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 +        }
 +        else
 +        {
 +            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 +            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 +        }
 +    }
 +
 +    /* beginning of timed HtoD section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* HtoD x, q */
 +    cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 +                      adat_len * sizeof(*adat->xq), stream); 
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* beginning of timed nonbonded calculation section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* get the pointer to the kernel flavor we need to use */
 +    nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 +                                    plist->bDoPrune || always_prune);
 +
 +    /* kernel launch config */
 +    nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 +    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 +    dim_grid  = dim3(nblock, 1, 1);
 +    shmem     = calc_shmem_required(cu_nb->kernel_ver);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 +                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 +                dim_block.x, dim_block.y, dim_block.z,
 +                dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 +                NCL_PER_SUPERCL, plist->na_c);
 +    }
 +
 +    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 +    CU_LAUNCH_ERR("k_calc_nb");
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 +                               const nbnxn_atomdata_t *nbatom,
 +                               int flags,
 +                               int aloc)
 +{
 +    cudaError_t stat;
 +    int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 +    int iloc = -1;
 +
 +    /* determine interaction locality from atom locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        iloc = eintLocal;
 +    }
 +    else if (NONLOCAL_A(aloc))
 +    {
 +        iloc = eintNonlocal;
 +    }
 +    else
 +    {
 +        char stmp[STRLEN];
 +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 +        gmx_incons(stmp);
 +    }
 +
 +    cu_atomdata_t   *adat   = cu_nb->atdat;
 +    cu_timers_t     *t      = cu_nb->timers;
 +    bool            bDoTime = cu_nb->bDoTime;
 +    cudaStream_t    stream  = cu_nb->stream[iloc];
 +
 +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +
 +    /* don't launch copy-back if there was no work to do */
 +    if (cu_nb->plist[iloc]->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        adat_begin  = 0;
 +        adat_len    = adat->natoms_local;
 +        adat_end    = cu_nb->atdat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_begin  = adat->natoms_local;
 +        adat_len    = adat->natoms - adat->natoms_local;
 +        adat_end    = cu_nb->atdat->natoms;
 +    }
 +
 +    /* beginning of timed D2H section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    if (!cu_nb->bUseStreamSync)
 +    {
 +        /* For safety reasons set a few (5%) forces to NaN. This way even if the
 +           polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 +        for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 +        {
 +#ifdef NAN
 +            nbatom->out[0].f[i] = NAN;
 +#else
 +#  ifdef _MSC_VER
 +            if (std::numeric_limits<float>::has_quiet_NaN)
 +            {
 +                nbatom->out[0].f[i] = std::numeric_limits<float>::quiet_NaN();
 +            }
 +            else
 +#  endif
 +            {
 +                nbatom->out[0].f[i] = GMX_REAL_MAX;
 +            }
 +#endif
 +        }
 +
 +        /* Set the last four bytes of the force array to a bit pattern
 +           which can't be the result of the force calculation:
 +           max exponent (127) and zero mantissa. */
 +        *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 +    }
 +
 +    /* With DD the local D2H transfer can only start after the non-local 
 +       has been launched. */
 +    if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 +    {
 +        stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 +        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 +    }
 +
 +    /* DtoH f */
 +    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, 
 +                      (adat_len)*sizeof(*adat->f), stream);
 +
 +    /* After the non-local D2H is launched the nonlocal_done event can be
 +       recorded which signals that the local D2H can proceed. This event is not
 +       placed after the non-local kernel because we also need the non-local
 +       data back first. */
 +    if (iloc == eintNonlocal)
 +    {
 +        stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 +        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 +    }
 +
 +    /* only transfer energies in the local stream */
 +    if (LOCAL_I(iloc))
 +    {
 +        /* DtoH fshift */
 +        if (bCalcFshift)
 +        {
 +            cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 +                              SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 +        }
 +
 +        /* DtoH energies */
 +        if (bCalcEner)
 +        {
 +            cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 +                              sizeof(*cu_nb->nbst.e_lj), stream);
 +            cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 +                              sizeof(*cu_nb->nbst.e_el), stream);
 +        }
 +    }
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +/* Atomic compare-exchange operation on unsigned values. It is used in
 + * polling wait for the GPU.
 + */
 +static inline bool atomic_cas(volatile unsigned int *ptr,
 +                              unsigned int oldval,
 +                              unsigned int newval)
 +{
 +    assert(ptr);
 +
 +#ifdef TMPI_ATOMICS
 +    return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 +#else
 +    gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 +    return true;
 +#endif
 +}
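 +
 +/* Usage note: nbnxn_cuda_wait_gpu() below spins on
 +       while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 +   The CAS keeps succeeding (returning true) as long as *poll_word still holds
 +   the sentinel pattern; once the D2H force transfer overwrites that word, the
 +   CAS fails and the busy-wait exits. */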
 +
 +void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 +                         const nbnxn_atomdata_t *nbatom,
 +                         int flags, int aloc,
 +                         float *e_lj, float *e_el, rvec *fshift)
 +{
 +    cudaError_t stat;
 +    int i, adat_end, iloc = -1;
 +    volatile unsigned int *poll_word;
 +
 +    /* determine interaction locality from atom locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        iloc = eintLocal;
 +    }
 +    else if (NONLOCAL_A(aloc))
 +    {
 +        iloc = eintNonlocal;
 +    }
 +    else
 +    {
 +        char stmp[STRLEN];
 +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 +        gmx_incons(stmp);
 +    }
 +
 +    cu_plist_t      *plist   = cu_nb->plist[iloc];
 +    cu_timers_t     *timers  = cu_nb->timers;
 +    wallclock_gpu_t *timings = cu_nb->timings;
 +    nb_staging      nbst     = cu_nb->nbst;
 +
 +    bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +
 +    /* turn energy calculation always on/off (for debugging/testing only) */
 +    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
 +
 +    /* don't launch wait/update timers & counters if there was no work to do
 +
 +       NOTE: if timing with multiple GPUs (streams) becomes possible, the
 +       counters could end up being inconsistent due to not being incremented
 +       on some of the nodes! */
 +    if (cu_nb->plist[iloc]->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        adat_end = cu_nb->atdat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_end = cu_nb->atdat->natoms;
 +    }
 +
 +    if (cu_nb->bUseStreamSync)
 +    {
 +        stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 +        CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 +    }
 +    else 
 +    {
 +        /* Busy-wait until we get the signal pattern set in the last word
 +         * of the l/nl float vector. This pattern corresponds to a floating
 +         * point number which can't be the result of the force calculation
 +         * (maximum exponent of 127 and zero mantissa).
 +         * The polling uses atomic compare-exchange.
 +         */
 +        poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 +        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 +    }
 +
 +    /* timing data accumulation */
 +    if (cu_nb->bDoTime)
 +    {
 +        /* only increase counter once (at local F wait) */
 +        if (LOCAL_I(iloc))
 +        {
 +            timings->nb_c++;
 +            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 +        }
 +
 +        /* kernel timings */
 +        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 +            cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 +
 +        /* X/q H2D and F D2H timings */
 +        timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 +                                                 timers->stop_nb_h2d[iloc]);
 +        timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 +                                                 timers->stop_nb_d2h[iloc]);
 +
 +        /* only count atdat and pair-list H2D at pair-search step */
 +        if (plist->bDoPrune)
 +        {
 +            /* atdat transfer timing (add only once, at local F wait) */
 +            if (LOCAL_A(aloc))
 +            {
 +                timings->pl_h2d_c++;
 +                timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 +                                                         timers->stop_atdat);
 +            }
 +
 +            timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 +                                                     timers->stop_pl_h2d[iloc]);
 +        }
 +    }
 +
 +    /* add up energies and shift forces (only once at local F wait) */
 +    if (LOCAL_I(iloc))
 +    {
 +        if (bCalcEner)
 +        {
 +            *e_lj += *nbst.e_lj;
 +            *e_el += *nbst.e_el;
 +        }
 +
 +        if (bCalcFshift)
 +        {
 +            for (i = 0; i < SHIFTS; i++)
 +            {
 +                fshift[i][0] += nbst.fshift[i].x;
 +                fshift[i][1] += nbst.fshift[i].y;
 +                fshift[i][2] += nbst.fshift[i].z;
 +            }
 +        }
 +    }
 +
 +    /* turn off pruning (doesn't matter if this is pair-search step or not) */
 +    plist->bDoPrune = false;
 +}
 +
 +/*! Return the reference to the nbfp texture. */
 +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 +{
 +    return tex_nbfp;
 +}
 +
 +/*! Return the reference to the coulomb_tab texture. */
 +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 +{
 +    return tex_coulomb_tab;
 +}
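 +
 +/* Note: these accessor functions are needed because CUDA texture references
 +   have file (translation unit) scope; nbnxn_cuda_data_mgmt.cu declares them
 +   extern and binds/unbinds the device arrays through the references returned
 +   here. */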
 +
 +/*! Set up the cache configuration for the non-bonded kernels.
 + */
 +void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 +{
 +    cudaError_t stat;
 +
 +    for (int i = 0; i < eelCuNR; i++)
 +        for (int j = 0; j < nEnergyKernelTypes; j++)
 +            for (int k = 0; k < nPruneKernelTypes; k++)
 +            {
 +                /* Legacy kernel 16/48 kB Shared/L1 */
 +                stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 +                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 +
 +                if (devinfo->prop.major >= 3)
 +                {
 +                    /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 +                }
 +                else
 +                {
 +                    /* On Fermi prefer L1 gives 2% higher performance */
 +                    /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 +                }
 +                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 +            }
 +}
index bea220b4c5d90a75ae170b40a6142ade326e6647,0000000000000000000000000000000000000000..3940ba87ade22e868805146468df9308a409aed7
mode 100644,000000..100644
--- /dev/null
@@@ -1,895 -1,0 +1,908 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdlib.h>
 +#include <stdio.h>
 +#include <assert.h>
 +
++#include <cuda.h>
++
 +#include "gmx_fatal.h"
 +#include "smalloc.h"
 +#include "tables.h"
 +#include "typedefs.h"
 +#include "types/nb_verlet.h"
 +#include "types/interaction_const.h"
 +#include "types/force_flags.h"
 +#include "../nbnxn_consts.h"
 +
 +#include "nbnxn_cuda_types.h"
 +#include "../../gmxlib/cuda_tools/cudautils.cuh"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +#include "gpu_utils.h"
 +
 +static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
 +
 +/* Heuristically determined parameter for the Fermi architecture: the minimum
 + * size of the ci lists is obtained by multiplying this constant with the
 + * number of multiprocessors on the current device.
 + */
 +static unsigned int gpu_min_ci_balanced_factor = 40;
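 +
 +/* For example, with this factor of 40 a hypothetical device with 14
 +   multiprocessors would make nbnxn_cuda_min_ci_balanced() below request a
 +   minimum of 40*14 = 560 ci lists. */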
 +
 +/* Functions from nbnxn_cuda.cu */
 +extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
 +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
 +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
 +
 +/* We should actually be using md_print_warn in md_logging.c,
 + * but we can't include mpi.h in CUDA code.
 + */
 +static void md_print_warn(FILE *fplog, const char *buf)
 +{
 +    if (fplog != NULL)
 +    {
 +        /* We should only print to stderr on the master node;
 +         * in most cases fplog is only set on the master node, so this works.
 +         */
 +        fprintf(stderr, "\n%s\n", buf);
 +        fprintf(fplog,  "\n%s\n", buf);
 +    }
 +}
 +
 +/* Fw. decl. */
 +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
 +
 +
 +/*! Tabulates the Ewald Coulomb force and initializes the size/scale
 +    and the table GPU array. If called with an already allocated table,
 +    it just re-uploads the table.
 + */
 +static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
 +{
 +    float       *ftmp, *coul_tab;
 +    int         tabsize;
 +    double      tabscale;
 +    cudaError_t stat;
 +
 +    tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +    /* Subtract 2 instead of 1 to avoid out-of-range access due to rounding */
 +    tabscale    = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
 +
 +    pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
 +
 +    table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
 +                                1/tabscale, nbp->ewald_beta);
 +
 +    /* If the table pointer == NULL the table is generated the first time =>
 +       the array pointer will be saved to nbparam and the texture is bound.
 +     */
 +    coul_tab = nbp->coulomb_tab;
 +    if (coul_tab == NULL)
 +    {
 +        stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
 +        CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 +
 +        nbp->coulomb_tab = coul_tab;
 +
 +        cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 +        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
 +                               coul_tab, &cd, tabsize*sizeof(*coul_tab));
 +        CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed");
 +    }
 +
 +    cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
 +
 +    nbp->coulomb_tab_size     = tabsize;
 +    nbp->coulomb_tab_scale    = tabscale;
 +
 +    pfree(ftmp);
 +}
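 +
 +/* Sketch of how the table is consumed (assumed from the scale set above): for
 +   a distance r within the cut-off the kernels compute the fractional index
 +   r*coulomb_tab_scale into tex_coulomb_tab and interpolate the tabulated
 +   Ewald force; using (tabsize - 2) in the scale keeps the interpolation
 +   neighbor in range even at r == rcoulomb. */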
 +
 +
 +/*! Initializes the atomdata structure the first time; it only gets filled at
 +    pair-search. */
 +static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
 +{
 +    cudaError_t stat;
 +
 +    ad->ntypes  = ntypes;
 +    stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
 +    ad->bShiftVecUploaded = false;
 +
 +    stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
 +
 +    stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
 +    stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 +
 +    /* initialize to NULL pointers to data that is not allocated here and will
 +       need reallocation in nbnxn_cuda_init_atomdata */
 +    ad->xq = NULL;
 +    ad->f  = NULL;
 +
 +    /* size -1 indicates that the respective array hasn't been initialized yet */
 +    ad->natoms = -1;
 +    ad->nalloc = -1;
 +}
 +
 +/*! Initializes the nonbonded parameter data structure. */
 +static void init_nbparam(cu_nbparam_t *nbp,
 +                         const interaction_const_t *ic,
 +                         const nonbonded_verlet_t *nbv)
 +{
 +    cudaError_t stat;
 +    int         ntypes, nnbfp;
 +
 +    ntypes  = nbv->grp[0].nbat->ntype;
 +
 +    nbp->ewald_beta = ic->ewaldcoeff;
 +    nbp->sh_ewald   = ic->sh_ewald;
 +    nbp->epsfac     = ic->epsfac;
 +    nbp->two_k_rf   = 2.0 * ic->k_rf;
 +    nbp->c_rf       = ic->c_rf;
 +    nbp->rvdw_sq    = ic->rvdw * ic->rvdw;
 +    nbp->rcoulomb_sq= ic->rcoulomb * ic->rcoulomb;
 +    nbp->rlist_sq   = ic->rlist * ic->rlist;
 +    nbp->sh_invrc6  = ic->sh_invrc6;
 +
 +    if (ic->eeltype == eelCUT)
 +    {
 +        nbp->eeltype = eelCuCUT;
 +    }
 +    else if (EEL_RF(ic->eeltype))
 +    {
 +        nbp->eeltype = eelCuRF;
 +    }
 +    else if (EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD)
 +    {
 +        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off, unless
 +           forced by the env. var. (used only for benchmarking). */
 +        if (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
 +        {
 +            nbp->eeltype = eelCuEWALD;
 +        }
 +        else
 +        {
 +            nbp->eeltype = eelCuEWALD_TWIN;
 +        }
 +    }
 +    else
 +    {
 +        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 +        gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
 +    }
 +
 +    /* generate table for PME */
 +    if (nbp->eeltype == eelCuEWALD)
 +    {
 +        nbp->coulomb_tab = NULL;
 +        init_ewald_coulomb_force_table(nbp);
 +    }
 +
 +    nnbfp = 2*ntypes*ntypes;
 +    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
 +    CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
 +    cu_copy_H2D(nbp->nbfp, nbv->grp[0].nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 +
 +    cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 +    stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
 +                           nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
 +    CU_RET_ERR(stat, "cudaBindTexture on nbfp failed");
 +}
 +
 +/*! Re-generates the GPU Ewald force table, resets rlist, and updates the
 + *  electrostatics type, switching to twin cut-off (or back) if needed. */
 +void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
 +                                         const interaction_const_t *ic)
 +{
 +    cu_nbparam_t *nbp = cu_nb->nbparam;
 +
 +    nbp->rlist_sq       = ic->rlist * ic->rlist;
 +    nbp->rcoulomb_sq    = ic->rcoulomb * ic->rcoulomb;
 +    nbp->ewald_beta     = ic->ewaldcoeff;
 +
 +    /* When switching to/from twin cut-off, the electrostatics type needs updating.
 +       (The env. var. that forces twin cut-off is for benchmarking only!) */
 +    if (ic->rcoulomb == ic->rvdw &&
 +        getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
 +    {
 +        nbp->eeltype = eelCuEWALD;
 +    }
 +    else
 +    {
 +        nbp->eeltype = eelCuEWALD_TWIN;
 +    }
 +
 +    init_ewald_coulomb_force_table(cu_nb->nbparam);
 +}
 +
 +/*! Initializes the pair list data structure. */
 +static void init_plist(cu_plist_t *pl)
 +{
 +    /* initialize to NULL pointers to data that is not allocated here and will
 +       need reallocation in nbnxn_cuda_init_pairlist */
 +    pl->sci     = NULL;
 +    pl->cj4     = NULL;
 +    pl->excl    = NULL;
 +
 +    /* size -1 indicates that the respective array hasn't been initialized yet */
 +    pl->na_c        = -1;
 +    pl->nsci        = -1;
 +    pl->sci_nalloc  = -1;
 +    pl->ncj4        = -1;
 +    pl->cj4_nalloc  = -1;
 +    pl->nexcl       = -1;
 +    pl->excl_nalloc = -1;
 +    pl->bDoPrune    = false;
 +}
 +
 +/*! Initializes the timer data structure. */
 +static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 +{
 +    cudaError_t stat;
 +    int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync: cudaEventDefault );
 +
 +    stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
 +    CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
 +    stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
 +    CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
 +
 +    /* The non-local counters/stream (second in the array) are needed only with DD. */
 +    for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 +    {
 +        stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
 +
 +
 +        stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
 +
 +        stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
 +
 +        stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
 +    }
 +}
 +
 +/*! Initializes the timings data structure. */
 +static void init_timings(wallclock_gpu_t *t)
 +{
 +    int i, j;
 +
 +    t->nb_h2d_t = 0.0;
 +    t->nb_d2h_t = 0.0;
 +    t->nb_c    = 0;
 +    t->pl_h2d_t = 0.0;
 +    t->pl_h2d_c = 0;
 +    for (i = 0; i < 2; i++)
 +    {
 +        for (j = 0; j < 2; j++)
 +        {
 +            t->ktime[i][j].t = 0.0;
 +            t->ktime[i][j].c = 0;
 +        }
 +    }
 +}
 +
 +/* Decide which kernel version to use (default or legacy) based on:
 + *  - CUDA version
 + *  - non-bonded kernel selector environment variables
 + *  - GPU SM version TODO ???
 + */
 +static int pick_nbnxn_kernel_version()
 +{
 +    bool bLegacyKernel, bDefaultKernel, bCUDA40, bCUDA32;
 +    char sbuf[STRLEN];
 +    int  kver;
 +
 +    /* legacy kernel (former k2), kept for now for backward compatibility,
 +       faster than the default with CUDA 3.2/4.0 (TODO: on Kepler?). */
 +    bLegacyKernel  = (getenv("GMX_CUDA_NB_LEGACY") != NULL);
 +    /* default kernel (former k3). */
 +    bDefaultKernel = (getenv("GMX_CUDA_NB_DEFAULT") != NULL);
 +
 +    if ((unsigned)(bLegacyKernel + bDefaultKernel) > 1)
 +    {
 +        gmx_fatal(FARGS, "Multiple CUDA non-bonded kernels requested; to manually pick a kernel set only one \n"
 +                  "of the following environment variables: \n"
 +                  "GMX_CUDA_NB_DEFAULT, GMX_CUDA_NB_LEGACY");
 +    }
 +
 +    bCUDA32 = bCUDA40 = false;
 +#if CUDA_VERSION == 3020
 +    bCUDA32 = true;
 +    sprintf(sbuf, "3.2");
 +#elif CUDA_VERSION == 4000
 +    bCUDA40 = true;
 +    sprintf(sbuf, "4.0");
 +#endif
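 +
 +    /* Note: CUDA_VERSION encodes the toolkit version as major*1000 + minor*10,
 +       so the values 3020 and 4000 tested above correspond to CUDA 3.2 and 4.0. */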
 +
 +    /* default is default ;) */
 +    kver = eNbnxnCuKDefault;
 +
 +    if (bCUDA32 || bCUDA40)
 +    {
 +        /* use legacy kernel unless something else is forced by an env. var */
 +        if (bDefaultKernel)
 +        {
 +            fprintf(stderr,
 +                    "\nNOTE: CUDA %s compilation detected; with this compiler version the legacy\n"
 +                    "      non-bonded kernels perform best. However, the default kernels were\n"
 +                    "      selected by the GMX_CUDA_NB_DEFAULT environment variable.\n"
 +                    "      For best performance upgrade your CUDA toolkit.",
 +                    sbuf);
 +        }
 +        else
 +        {
 +            kver = eNbnxnCuKLegacy;
 +        }
 +    }
 +    else
 +    {
 +        /* issue a note if the non-default kernel is forced by an env. var */
 +        if (bLegacyKernel)
 +        {
 +            fprintf(stderr,
 +                    "\nNOTE: Legacy non-bonded CUDA kernels were selected by the GMX_CUDA_NB_LEGACY\n"
 +                    "      env. var. Consider using using the default kernels which should be faster!\n");
 +
 +            kver = eNbnxnCuKLegacy;
 +        }
 +    }
 +
 +    return kver;
 +}
 +
 +void nbnxn_cuda_init(FILE *fplog,
 +                     nbnxn_cuda_ptr_t *p_cu_nb,
 +                     gmx_gpu_info_t *gpu_info, int my_gpu_index,
 +                     gmx_bool bLocalAndNonlocal)
 +{
 +    cudaError_t stat;
 +    nbnxn_cuda_ptr_t  nb;
 +    char sbuf[STRLEN];
 +    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
 +    int cuda_drv_ver;
 +
 +    assert(gpu_info);
 +
 +    if (p_cu_nb == NULL) return;
 +
 +    snew(nb, 1);
 +    snew(nb->atdat, 1);
 +    snew(nb->nbparam, 1);
 +    snew(nb->plist[eintLocal], 1);
 +    if (bLocalAndNonlocal)
 +    {
 +        snew(nb->plist[eintNonlocal], 1);
 +    }
 +
 +    nb->bUseTwoStreams = bLocalAndNonlocal;
 +
 +    snew(nb->timers, 1);
 +    snew(nb->timings, 1);
 +
 +    /* init nbst */
 +    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 +    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 +    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 +
 +    init_plist(nb->plist[eintLocal]);
 +
 +    /* local/non-local GPU streams */
 +    stat = cudaStreamCreate(&nb->stream[eintLocal]);
 +    CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
 +    if (nb->bUseTwoStreams)
 +    {
 +        init_plist(nb->plist[eintNonlocal]);
 +        stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
 +        CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
 +    }
 +
 +    /* init events for synchronization (timing disabled for performance reasons!) */
 +    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
 +    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
 +    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
 +    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
 +
 +    /* set device info, just point it to the right GPU among the detected ones */
 +    nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, my_gpu_index)];
 +
 +    /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
 +     * (which increases with shorter time/step) caused by a known CUDA driver bug.
 +     * To work around the issue we'll use an (admittedly fragile) memory polling
 +     * waiting to preserve performance. This requires support for atomic
 +     * operations and only works on x86/x86_64.
 +     * With polling wait event-timing also needs to be disabled.
 +     *
 +     * The overhead is greatly reduced in API v5.0 drivers and the improvement
 +     * is independent of runtime version. Hence, with API v5.0 drivers and later
 +     * we won't switch to polling.
 +     *
 +     * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI
 +     * ranks, so we will also disable it in that case.
 +     */
 +
 +    bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
 +    bNoStreamSync  = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
 +
 +#ifdef TMPI_ATOMICS
 +    bTMPIAtomics = true;
 +#else
 +    bTMPIAtomics = false;
 +#endif
 +
 +#if defined(i386) || defined(__x86_64__)
 +    bX86 = true;
 +#else
 +    bX86 = false;
 +#endif
 +
 +    if (bStreamSync && bNoStreamSync)
 +    {
 +        gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
 +    }
 +
 +    stat = cudaDriverGetVersion(&cuda_drv_ver);
 +    CU_RET_ERR(stat, "cudaDriverGetVersion failed");
 +    bOldDriver = (cuda_drv_ver < 5000);
 +
 +    if (nb->dev_info->prop.ECCEnabled == 1)
 +    {
 +        if (bStreamSync)
 +        {
 +            nb->bUseStreamSync = true;
 +
 +            /* only warn if polling should be used */
 +            if (bOldDriver && !gpu_info->bDevShare)
 +            {
 +                md_print_warn(fplog,
 +                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
 +                              "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
 +            }
 +        }
 +        else
 +        {
 +            /* Can/should turn off the cudaStreamSynchronize wait only if
 +             *   - we're on x86/x86_64
 +             *   - atomics are available
 +             *   - GPUs are not being shared
 +             *   - and driver is old. */
 +            nb->bUseStreamSync =
 +                (bX86 && bTMPIAtomics && !gpu_info->bDevShare && bOldDriver) ?
 +                false : true;
 +
 +            if (!nb->bUseStreamSync)
 +            {
 +                md_print_warn(fplog,
 +                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
 +                              "      cause performance loss. Switching to the alternative polling GPU waiting.\n"
 +                              "      If you encounter issues, switch back to standard GPU waiting by setting\n"
 +                              "      the GMX_CUDA_STREAMSYNC environment variable.\n");
 +            }
 +            else if (bOldDriver)
 +            {
 +                /* Tell the user that the ECC+old driver combination can be bad */
 +                sprintf(sbuf,
 +                        "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0. A bug in this\n"
 +                        "      driver can cause performance loss.\n"
 +                        "      However, the polling waiting workaround can not be used because\n%s\n"
 +                        "      Consider updating the driver or turning ECC off.",
 +                        (!bX86 || !bTMPIAtomics) ?
 +                           "         atomic operations are not supported by the platform/CPU+compiler." :
 +                           "         GPU(s) are being oversubscribed.");
 +                md_print_warn(fplog, sbuf);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        if (bNoStreamSync)
 +        {
 +            nb->bUseStreamSync = false;
 +
 +            md_print_warn(fplog,
 +                          "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
 +        }
 +        else
 +        {
 +            /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
 +            nb->bUseStreamSync = true;
 +        }
 +    }
 +
 +    /* CUDA timing disabled as event timers don't work:
 +       - with multiple streams, i.e. with domain decomposition;
 +       - with the polling waiting hack (without cudaStreamSynchronize);
 +       - when turned off by GMX_DISABLE_CUDA_TIMING.
 +     */
 +    nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
 +                   (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
 +
 +    if (nb->bDoTime)
 +    {
 +        init_timers(nb->timers, nb->bUseTwoStreams);
 +        init_timings(nb->timings);
 +    }
 +
 +    /* set the kernel type for the current GPU */
 +    nb->kernel_ver = pick_nbnxn_kernel_version();
 +    /* pick L1 cache configuration */
 +    nbnxn_cuda_set_cacheconfig(nb->dev_info);
 +
 +    *p_cu_nb = nb;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Initialized CUDA data structures.\n");
 +    }
 +}
 +
 +void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t cu_nb,
 +                           const interaction_const_t *ic,
 +                           const nonbonded_verlet_t *nbv)
 +{
 +    init_atomdata_first(cu_nb->atdat, nbv->grp[0].nbat->ntype);
 +    init_nbparam(cu_nb->nbparam, ic, nbv);
 +
 +    /* clear energy and shift force outputs */
 +    nbnxn_cuda_clear_e_fshift(cu_nb);
 +}
 +
 +void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
 +                              const nbnxn_pairlist_t *h_plist,
 +                              int iloc)
 +{
 +    char         sbuf[STRLEN];
 +    cudaError_t  stat;
 +    bool         bDoTime    = cu_nb->bDoTime;
 +    cudaStream_t stream     = cu_nb->stream[iloc];
 +    cu_plist_t   *d_plist   = cu_nb->plist[iloc];
 +
 +    if (d_plist->na_c < 0)
 +    {
 +        d_plist->na_c = h_plist->na_ci;
 +    }
 +    else
 +    {
 +        if (d_plist->na_c != h_plist->na_ci)
 +        {
 +            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 +                    d_plist->na_c, h_plist->na_ci);
 +            gmx_incons(sbuf);
 +        }
 +    }
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
 +                         &d_plist->nsci, &d_plist->sci_nalloc,
 +                         h_plist->nsci,
 +                         stream, true);
 +
 +    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
 +                         &d_plist->ncj4, &d_plist->cj4_nalloc,
 +                         h_plist->ncj4,
 +                         stream, true);
 +
 +    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
 +                         &d_plist->nexcl, &d_plist->excl_nalloc,
 +                         h_plist->nexcl,
 +                         stream, true);
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* need to prune the pair list during the next step */
 +    d_plist->bDoPrune = true;
 +}
 +
 +void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
 +                                const nbnxn_atomdata_t *nbatom)
 +{
 +    cu_atomdata_t *adat = cu_nb->atdat;
 +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
 +
 +    /* only if we have a dynamic box */
 +    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 +    {
 +        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 
 +                          SHIFTS * sizeof(*adat->shift_vec), ls);
 +        adat->bShiftVecUploaded = true;
 +    }
 +}
 +
 +/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 +static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
 +{
 +    cudaError_t   stat;
 +    cu_atomdata_t *adat = cu_nb->atdat;
 +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
 +
 +    stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
 +}
 +
 +/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    cudaError_t   stat;
 +    cu_atomdata_t *adat = cu_nb->atdat;
 +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
 +
 +    stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed");
 +    stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on e_lj failed");
 +    stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on e_el failed");
 +}
 +
 +void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
 +{
 +    nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
 +    /* clear shift force array and energies if the outputs were 
 +       used in the current step */
 +    if (flags & GMX_FORCE_VIRIAL)
 +    {
 +        nbnxn_cuda_clear_e_fshift(cu_nb);
 +    }
 +}
 +
 +void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
 +                              const nbnxn_atomdata_t *nbat)
 +{
 +    cudaError_t   stat;
 +    int           nalloc, natoms;
 +    bool          realloced;
 +    bool          bDoTime   = cu_nb->bDoTime;
 +    cu_timers_t   *timers   = cu_nb->timers;
 +    cu_atomdata_t *d_atdat  = cu_nb->atdat;
 +    cudaStream_t  ls        = cu_nb->stream[eintLocal];
 +
 +    natoms = nbat->natoms;
 +    realloced = false;
 +
 +    if (bDoTime)
 +    {
 +        /* time async copy */
 +        stat = cudaEventRecord(timers->start_atdat, ls);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* need to reallocate if we have to copy more atoms than the amount of space
 +       available, and only allocate if we haven't initialized yet, i.e. d_atdat->nalloc == -1 */
 +    if (natoms > d_atdat->nalloc)
 +    {
 +        nalloc = over_alloc_small(natoms);
 +
 +        /* free up first if the arrays have already been initialized */
 +        if (d_atdat->nalloc != -1)
 +        {
 +            cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 +            cu_free_buffered(d_atdat->xq);
 +            cu_free_buffered(d_atdat->atom_types);
 +        }
 +
 +        stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
 +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
 +        stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
 +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
 +
 +        stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
 +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 +
 +        d_atdat->nalloc = nalloc;
 +        realloced = true;
 +    }
 +
 +    d_atdat->natoms = natoms;
 +    d_atdat->natoms_local = nbat->natoms_local;
 +
 +    /* need to clear GPU f output if realloc happened */
 +    if (realloced)
 +    {
 +        nbnxn_cuda_clear_f(cu_nb, nalloc);
 +    }
 +
 +    cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
 +                      natoms*sizeof(*d_atdat->atom_types), ls);
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(timers->stop_atdat, ls);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
 +{
 +    cudaError_t     stat;
 +    cu_atomdata_t   *atdat;
 +    cu_nbparam_t    *nbparam;
 +    cu_plist_t      *plist, *plist_nl;
 +    cu_timers_t     *timers;
 +
 +    if (cu_nb == NULL) return;
 +
 +    atdat       = cu_nb->atdat;
 +    nbparam     = cu_nb->nbparam;
 +    plist       = cu_nb->plist[eintLocal];
 +    plist_nl    = cu_nb->plist[eintNonlocal];
 +    timers      = cu_nb->timers;
 +
 +    if (nbparam->eeltype == eelCuEWALD || nbparam->eeltype == eelCuEWALD_TWIN)
 +    {
 +        stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
 +        CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
 +        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
 +    }
 +
 +    stat = cudaEventDestroy(cu_nb->nonlocal_done);
 +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
 +    stat = cudaEventDestroy(cu_nb->misc_ops_done);
 +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
 +
 +    if (cu_nb->bDoTime)
 +    {
 +        stat = cudaEventDestroy(timers->start_atdat);
 +        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
 +        stat = cudaEventDestroy(timers->stop_atdat);
 +        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
 +
 +        /* The non-local counters/stream (second in the array) are needed only with DD. */
 +        for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++)
 +        {
 +            stat = cudaEventDestroy(timers->start_nb_k[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
 +            stat = cudaEventDestroy(timers->stop_nb_k[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
 +
 +            stat = cudaEventDestroy(timers->start_pl_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
 +            stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
 +
 +            stat = cudaStreamDestroy(cu_nb->stream[i]);
 +            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
 +
 +            stat = cudaEventDestroy(timers->start_nb_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
 +            stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
 +
 +            stat = cudaEventDestroy(timers->start_nb_d2h[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
 +            stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
 +        }
 +    }
 +
 +    stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
 +    CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
 +    cu_free_buffered(nbparam->nbfp);
 +
 +    stat = cudaFree(atdat->shift_vec);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
 +    stat = cudaFree(atdat->fshift);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
 +
 +    stat = cudaFree(atdat->e_lj);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
 +    stat = cudaFree(atdat->e_el);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
 +
 +    cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
 +    cu_free_buffered(atdat->xq);
 +    cu_free_buffered(atdat->atom_types, &atdat->ntypes);
 +
 +    cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
 +    cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
 +    cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
 +    if (cu_nb->bUseTwoStreams)
 +    {
 +        cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
 +        cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
 +        cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist_nl->excl_nalloc);
 +    }
 +
++    sfree(atdat);
++    sfree(nbparam);
++    sfree(plist);
++    if (cu_nb->bUseTwoStreams)
++    {
++        sfree(plist_nl);
++    }
++    sfree(timers);
++    sfree(cu_nb->timings);
++    sfree(cu_nb);
++
 +    if (debug)
 +    {
 +        fprintf(debug, "Cleaned up CUDA data structures.\n");
 +    }
 +}
 +
 +void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
 +{
 +    cudaError_t stat;
 +    cudaStream_t stream = cu_nb->stream[iloc];
 +
 +    stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
 +    CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
 +}
 +
 +wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
 +}
 +
 +void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    if (cu_nb->bDoTime)
 +    {
 +        init_timings(cu_nb->timings);
 +    }
 +}
 +
 +int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    return cu_nb != NULL ?
 +        gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
 +}
index 0a7a187438d31271949ce6b7301cd8caeac05bd0,0000000000000000000000000000000000000000..f88d0ba43121da7bff96150ebeec98af6561bbee
mode 100644,000000..100644
--- /dev/null
@@@ -1,256 -1,0 +1,262 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nbnxn_internal_h
 +#define _nbnxn_internal_h
 +
 +#include "typedefs.h"
 +#include "domdec.h"
 +#include "gmx_cyclecounter.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +#ifdef GMX_X86_SSE2
 +/* Always use 4-way SIMD for single precision bounding box calculations */
 +#define NBNXN_SEARCH_BB_SSE
 +#endif
 +
 +
 +#ifdef GMX_NBNXN_SIMD
 +/* Memory alignment in bytes as required by SIMD aligned loads/stores */
 +#define NBNXN_MEM_ALIGN  (GMX_NBNXN_SIMD_BITWIDTH/8)
 +#else
 +/* No alignment required, but set it so we can call the same routines */
 +#define NBNXN_MEM_ALIGN  32
 +#endif
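 +
 +/* E.g. for GMX_NBNXN_SIMD_BITWIDTH == 256 (AVX) the SIMD branch above gives
 +   32-byte alignment, the width of one 8-float (or 4-double) SIMD register. */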
 +
 +
 +/* A pair-search grid struct for one domain decomposition zone */
 +typedef struct {
 +    rvec     c0;               /* The lower corner of the (local) grid        */
 +    rvec     c1;               /* The upper corner of the (local) grid        */
 +    real     atom_density;     /* The atom number density for the local grid  */
 +
 +    gmx_bool bSimple;          /* Is this grid simple or super/sub            */
 +    int      na_c;             /* Number of atoms per cluster                 */
 +    int      na_cj;            /* Number of atoms for list j-clusters         */
 +    int      na_sc;            /* Number of atoms per super-cluster           */
 +    int      na_c_2log;        /* 2log of na_c                                */
 +
 +    int      ncx;              /* Number of (super-)cells along x             */
 +    int      ncy;              /* Number of (super-)cells along y             */
 +    int      nc;               /* Total number of (super-)cells               */
 +
 +    real     sx;               /* x-size of a (super-)cell                    */
 +    real     sy;               /* y-size of a (super-)cell                    */
 +    real     inv_sx;           /* 1/sx                                        */
 +    real     inv_sy;           /* 1/sy                                        */
 +
 +    int      cell0;            /* Index in nbs->cell corresponding to cell 0  */
 +
 +    int     *cxy_na;           /* The number of atoms for each column in x,y  */
 +    int     *cxy_ind;          /* Grid (super)cell index, offset from cell0   */
 +    int      cxy_nalloc;       /* Allocation size for cxy_na and cxy_ind      */
 +
 +    int     *nsubc;            /* The number of sub cells for each super cell */
 +    float   *bbcz;             /* Bounding boxes in z for the super cells     */
 +    float   *bb;               /* 3D bounding boxes for the sub cells         */
 +    float   *bbj;              /* 3D j-b.boxes for SSE-double or AVX-single   */
 +    int     *flags;            /* Flag for the super cells                    */
 +    int      nc_nalloc;        /* Allocation size for the pointers above      */
 +
 +    float   *bbcz_simple;      /* bbcz for simple grid converted from super   */
 +    float   *bb_simple;        /* bb for simple grid converted from super     */
 +    int     *flags_simple;     /* flags for simple grid converted from super  */
 +    int      nc_nalloc_simple; /* Allocation size for the pointers above   */
 +
 +    int      nsubc_tot;        /* Total number of subcells, used for printing */
 +} nbnxn_grid_t;
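 +/* The grid consists of ncx*ncy columns in x,y; column cxy = cx*ncy + cy
 + * holds the cells cxy_ind[cxy] up to cxy_ind[cxy+1], stacked along z.
 + */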
 +
 +#ifdef GMX_NBNXN_SIMD
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#define GMX_MM128_HERE
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#define GMX_MM256_HERE
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 +#endif
 +#endif
 +#include "gmx_simd_macros.h"
 +
 +typedef struct nbnxn_x_ci_simd_4xn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
 +    gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
 +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
 +    gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
 +} nbnxn_x_ci_simd_4xn_t;
 +
 +typedef struct nbnxn_x_ci_simd_2xnn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
 +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
 +} nbnxn_x_ci_simd_2xnn_t;
 +
 +#endif
 +
 +/* Working data for the actual i-supercell during pair search */
 +typedef struct nbnxn_list_work {
 +    gmx_cache_protect_t     cp0;   /* Protect cache between threads               */
 +
 +    float                  *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
 +    real                   *x_ci;  /* The coordinates, pbc shifted, for each atom       */
 +#ifdef GMX_NBNXN_SIMD
 +    nbnxn_x_ci_simd_4xn_t  *x_ci_simd_4xn;
 +    nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
 +#endif
 +    int                     cj_ind;    /* The current cj_ind index for the current list     */
 +    int                     cj4_init;  /* The first uninitialized cj4 block                 */
 +
 +    float                  *d2;        /* Bounding box distance work array                  */
 +
 +    nbnxn_cj_t             *cj;        /* The j-cell list                                   */
 +    int                     cj_nalloc; /* Allocation size of cj                             */
 +
 +    int                     ncj_noq;   /* Nr. of cluster pairs without Coul for flop count  */
 +    int                     ncj_hlj;   /* Nr. of cluster pairs with 1/2 LJ for flop count   */
 +
++    int                    *sort;            /* Sort index                    */
++    int                     sort_nalloc;     /* Allocation size of sort       */
++
++    nbnxn_sci_t            *sci_sort;        /* Second sci array, for sorting */
++    int                     sci_sort_nalloc; /* Allocation size of sci_sort   */
++
 +    gmx_cache_protect_t     cp1;       /* Protect cache between threads               */
 +} nbnxn_list_work_t;
 +
 +/* Function type for setting the i-atom coordinate working data */
 +typedef void
 +    gmx_icell_set_x_t (int ci,
 +                       real shx, real shy, real shz,
 +                       int na_c,
 +                       int stride, const real *x,
 +                       nbnxn_list_work_t *work);
 +
 +static gmx_icell_set_x_t icell_set_x_simple;
 +#ifdef GMX_NBNXN_SIMD
 +static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
 +static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
 +#endif
 +static gmx_icell_set_x_t icell_set_x_supersub;
 +#ifdef NBNXN_SEARCH_SSE
 +static gmx_icell_set_x_t icell_set_x_supersub_sse8;
 +#endif
 +
 +#undef GMX_MM128_HERE
 +#undef GMX_MM256_HERE
 +
 +/* Local cycle count struct for profiling */
 +typedef struct {
 +    int          count;
 +    gmx_cycles_t c;
 +    gmx_cycles_t start;
 +} nbnxn_cycle_t;
 +
 +/* Local cycle count enum for profiling */
 +enum {
 +    enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr
 +};
 +
 +/* Thread-local work struct, contains part of nbnxn_grid_t */
 +typedef struct {
 +    gmx_cache_protect_t  cp0;
 +
 +    int                 *cxy_na;
 +    int                  cxy_na_nalloc;
 +
 +    int                 *sort_work;
 +    int                  sort_work_nalloc;
 +
 +    nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
 +
 +    int                  ndistc;       /* Number of distance checks for flop counting */
 +
 +    nbnxn_cycle_t        cc[enbsCCnr];
 +
 +    gmx_cache_protect_t  cp1;
 +} nbnxn_search_work_t;
 +
 +/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
 +typedef struct nbnxn_search {
 +    int                 ePBC;            /* PBC type enum                              */
 +    matrix              box;             /* The periodic unit-cell                     */
 +
 +    gmx_bool            DomDec;          /* Are we doing domain decomposition?         */
 +    ivec                dd_dim;          /* Are we doing DD in x,y,z?                  */
 +    gmx_domdec_zones_t *zones;           /* The domain decomposition zones        */
 +
 +    int                 ngrid;           /* The number of grids, equal to #DD-zones    */
 +    nbnxn_grid_t       *grid;            /* Array of grids, size ngrid                 */
 +    int                *cell;            /* Actual allocated cell array for all grids  */
 +    int                 cell_nalloc;     /* Allocation size of cell                    */
 +    int                *a;               /* Atom index for grid, the inverse of cell   */
 +    int                 a_nalloc;        /* Allocation size of a                       */
 +
 +    int                 natoms_local;    /* The local atoms run from 0 to natoms_local */
 +    int                 natoms_nonlocal; /* The non-local atoms run from natoms_local
 +                                          * to natoms_nonlocal */
 +
 +    gmx_bool             print_cycles;
 +    int                  search_count;
 +    nbnxn_cycle_t        cc[enbsCCnr];
 +
 +    gmx_icell_set_x_t   *icell_set_x; /* Function for setting i-coords    */
 +
 +    int                  nthread_max; /* Maximum number of threads for pair-search  */
 +    nbnxn_search_work_t *work;        /* Work array, size nthread_max          */
 +} nbnxn_search_t_t;
 +
 +
 +static void nbs_cycle_start(nbnxn_cycle_t *cc)
 +{
 +    cc->start = gmx_cycles_read();
 +}
 +
 +static void nbs_cycle_stop(nbnxn_cycle_t *cc)
 +{
 +    cc->c += gmx_cycles_read() - cc->start;
 +    cc->count++;
 +}
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index e4a833cb5a267c45b5bffd41c2ca6a2dce6c6506,0000000000000000000000000000000000000000..ef8cd2d08a524fac028baaddbbd47d61261187d0
mode 100644,000000..100644
--- /dev/null
@@@ -1,5040 -1,0 +1,5125 @@@
- /* Print the full pair list, used for debug output */
- static void print_supersub_nsp(const char             *fn,
-                                const nbnxn_pairlist_t *nbl,
-                                int                     iloc)
- {
-     char  buf[STRLEN];
-     FILE *fp;
-     int   i, nsp, j4, p;
-     sprintf(buf, "%s_%s.xvg", fn, NONLOCAL_I(iloc) ? "nl" : "l");
-     fp = ffopen(buf, "w");
-     for (i = 0; i < nbl->nci; i++)
-     {
-         nsp = 0;
-         for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
-         {
-             for (p = 0; p < NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
-             {
-                 nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
-             }
-         }
-         fprintf(fp, "%4d %3d %3d\n",
-                 i,
-                 nsp,
-                 nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
-     }
-     fclose(fp);
- }
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "gmx_cyclecounter.h"
 +#include "gmxfio.h"
 +#include "gmx_omp_nthreads.h"
 +#include "nrnb.h"
 +
 +
 +/* Pair search box lower and upper corner in x,y,z.
 + * Store this in 4 instead of 3 reals, which is useful with SSE.
 + * To avoid complicating the code we also use 4 without SSE.
 + */
 +#define NNBSBB_C         4
 +#define NNBSBB_B         (2*NNBSBB_C)
 +/* Pair search box lower and upper bound in z only. */
 +#define NNBSBB_D         2
 +/* Pair search box lower and upper corner x,y,z indices */
 +#define BBL_X  0
 +#define BBL_Y  1
 +#define BBL_Z  2
 +#define BBU_X  4
 +#define BBU_Y  5
 +#define BBU_Z  6
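 +/* A single bounding box thus occupies NNBSBB_B = 8 floats:
 + * {xl, yl, zl, pad, xh, yh, zh, pad}, 16-byte aligned for SSE loads.
 + */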
 +
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* We use SSE or AVX-128bit for bounding box calculations */
 +
 +#ifndef GMX_DOUBLE
 +/* Single precision BBs + coordinates, we can also load coordinates using SSE */
 +#define NBNXN_SEARCH_SSE_SINGLE
 +#endif
 +
 +/* Include basic SSE2 stuff */
 +#include <emmintrin.h>
 +
 +#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
 +/* Store bounding boxes with x, y and z coordinates in packs of 4 */
 +#define NBNXN_PBB_SSE
 +#endif
 +
 +/* The stride of packed bounding boxes, i.e. the SSE/AVX-128 width in single
 + * precision, used for the bounding boxes of the GPU (super/sub) grid layout.
 + * Here AVX-256 turns out to be slightly slower than AVX-128.
 + */
 +#define STRIDE_PBB        4
 +#define STRIDE_PBB_2LOG   2
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef GMX_NBNXN_SIMD
 +
 +/* The functions below are macros as they are performance sensitive */
 +
 +/* 4x4 list, pack=4: no complex conversion required */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J4(ci)   (ci)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
 +
 +/* 4x2 list, pack=4: j-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J2(ci)  ((ci)<<1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
 +
 +/* 4x8 list, pack=8: i-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J8(ci)  ((ci)>>1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
 +#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
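 +/* Example: with the 4x2 layout i-cluster 3 starts at j-cluster
 + * CI_TO_CJ_J2(3) = 6, while with the 4x8 layout i-clusters 2 and 3
 + * both fall inside j-cluster CI_TO_CJ_J8(3) = 1.
 + */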
 +
 +/* The j-cluster size is matched to the SIMD width */
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#ifdef GMX_DOUBLE
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
 +#else
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
 +#endif
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#ifdef GMX_DOUBLE
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
 +#else
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
 +/* 2xNN kernels: the j-cluster size is half the SIMD width */
 +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
 +#endif
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_WIDTH"
 +#endif
 +#endif
 +
 +#endif /* GMX_NBNXN_SIMD */
 +
 +
 +/* Interaction masks for 4xN atom interactions.
 + * Bit i*CJ_SIZE + j tells if atom i and j interact.
 + */
 +/* The mask with all interactions set is the same for all kernels */
 +#define NBNXN_INT_MASK_ALL        0xffffffff
 +/* 4x4 kernel diagonal mask */
 +#define NBNXN_INT_MASK_DIAG       0x08ce
 +/* 4x2 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
 +#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
 +/* 4x8 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
 +#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
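 +/* E.g. for the 4x4 kernel the diagonal mask keeps only pairs with j > i:
 + * bits i*4+j give 0xe (i=0), 0xc0 (i=1) and 0x800 (i=2),
 + * which combine to 0x08ce.
 + */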
 +
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
 +#define NBNXN_BBXXXX
 +/* Size of bounding box corners quadruplet */
 +#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
 +#endif
 +
 +/* We shift the i-particles backward for PBC.
 + * This leads to more conditionals than shifting forward.
 + * We do this to get more balanced pair lists.
 + */
 +#define NBNXN_SHIFT_BACKWARD
 +
 +
 +/* This define is a lazy way to avoid interdependence of the grid
 + * and searching data structures.
 + */
 +#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
 +
 +
 +static void nbs_cycle_clear(nbnxn_cycle_t *cc)
 +{
 +    int i;
 +
 +    for (i = 0; i < enbsCCnr; i++)
 +    {
 +        cc[i].count = 0;
 +        cc[i].c     = 0;
 +    }
 +}
 +
 +static double Mcyc_av(const nbnxn_cycle_t *cc)
 +{
 +    return (double)cc->c*1e-6/cc->count;
 +}
 +
 +static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
 +{
 +    int n;
 +    int t;
 +
 +    fprintf(fp, "\n");
 +    fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
 +            nbs->cc[enbsCCgrid].count,
 +            Mcyc_av(&nbs->cc[enbsCCgrid]),
 +            Mcyc_av(&nbs->cc[enbsCCsearch]),
 +            Mcyc_av(&nbs->cc[enbsCCreducef]));
 +
 +    if (nbs->nthread_max > 1)
 +    {
 +        if (nbs->cc[enbsCCcombine].count > 0)
 +        {
 +            fprintf(fp, " comb %5.2f",
 +                    Mcyc_av(&nbs->cc[enbsCCcombine]));
 +        }
 +        fprintf(fp, " s. th");
 +        for (t = 0; t < nbs->nthread_max; t++)
 +        {
 +            fprintf(fp, " %4.1f",
 +                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
 +        }
 +    }
 +    fprintf(fp, "\n");
 +}
 +
 +static void nbnxn_grid_init(nbnxn_grid_t * grid)
 +{
 +    grid->cxy_na      = NULL;
 +    grid->cxy_ind     = NULL;
 +    grid->cxy_nalloc  = 0;
 +    grid->bb          = NULL;
 +    grid->bbj         = NULL;
 +    grid->nc_nalloc   = 0;
 +}
 +
 +static int get_2log(int n)
 +{
 +    int log2;
 +
 +    log2 = 0;
 +    while ((1<<log2) < n)
 +    {
 +        log2++;
 +    }
 +    if ((1<<log2) != n)
 +    {
 +        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
 +    }
 +
 +    return log2;
 +}
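 +/* E.g. get_2log(4) = 2; cluster sizes must be powers of 2, since
 + * cluster indexing uses bit shifts such as a0 >> grid->na_c_2log.
 + */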
 +
 +static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 +{
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk4x4_PlainC:
 +        case nbnxnk4xN_SIMD_4xN:
 +        case nbnxnk4xN_SIMD_2xNN:
 +            return NBNXN_CPU_CLUSTER_I_SIZE;
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            /* The cluster size for super/sub lists is only set here.
 +             * Any value should work for the pair-search and atomdata code.
 +             * The kernels, of course, might require a particular value.
 +             */
 +            return NBNXN_GPU_CLUSTER_SIZE;
 +        default:
 +            gmx_incons("unknown kernel type");
 +    }
 +
 +    return 0;
 +}
 +
 +int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 +{
 +    int nbnxn_simd_width = 0;
 +    int cj_size          = 0;
 +
 +#ifdef GMX_NBNXN_SIMD
 +    nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
 +#endif
 +
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk4x4_PlainC:
 +            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
 +            break;
 +        case nbnxnk4xN_SIMD_4xN:
 +            cj_size = nbnxn_simd_width;
 +            break;
 +        case nbnxnk4xN_SIMD_2xNN:
 +            cj_size = nbnxn_simd_width/2;
 +            break;
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +            break;
 +        default:
 +            gmx_incons("unknown kernel type");
 +    }
 +
 +    return cj_size;
 +}
 +
 +static int ci_to_cj(int na_cj_2log, int ci)
 +{
 +    switch (na_cj_2log)
 +    {
 +        case 2: return ci;     break;
 +        case 1: return (ci<<1); break;
 +        case 3: return (ci>>1); break;
 +    }
 +
 +    return 0;
 +}
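 +/* The cases correspond to j-cluster sizes 4, 2 and 8 (na_cj_2log = 2, 1, 3),
 + * i.e. the CI_TO_CJ_J4/J2/J8 conversions defined above.
 + */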
 +
 +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 +{
 +    if (nb_kernel_type == nbnxnkNotSet)
 +    {
 +        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
 +    }
 +
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            return FALSE;
 +
 +        case nbnxnk4x4_PlainC:
 +        case nbnxnk4xN_SIMD_4xN:
 +        case nbnxnk4xN_SIMD_2xNN:
 +            return TRUE;
 +
 +        default:
 +            gmx_incons("Invalid nonbonded kernel type passed!");
 +            return FALSE;
 +    }
 +}
 +
 +void nbnxn_init_search(nbnxn_search_t    * nbs_ptr,
 +                       ivec               *n_dd_cells,
 +                       gmx_domdec_zones_t *zones,
 +                       int                 nthread_max)
 +{
 +    nbnxn_search_t nbs;
 +    int            d, g, t;
 +
 +    snew(nbs, 1);
 +    *nbs_ptr = nbs;
 +
 +    nbs->DomDec = (n_dd_cells != NULL);
 +
 +    clear_ivec(nbs->dd_dim);
 +    nbs->ngrid = 1;
 +    if (nbs->DomDec)
 +    {
 +        nbs->zones = zones;
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if ((*n_dd_cells)[d] > 1)
 +            {
 +                nbs->dd_dim[d] = 1;
 +                /* Each grid matches a DD zone */
 +                nbs->ngrid *= 2;
 +            }
 +        }
 +    }
 +
 +    snew(nbs->grid, nbs->ngrid);
 +    for (g = 0; g < nbs->ngrid; g++)
 +    {
 +        nbnxn_grid_init(&nbs->grid[g]);
 +    }
 +    nbs->cell        = NULL;
 +    nbs->cell_nalloc = 0;
 +    nbs->a           = NULL;
 +    nbs->a_nalloc    = 0;
 +
 +    nbs->nthread_max = nthread_max;
 +
 +    /* Initialize the work data structures for each thread */
 +    snew(nbs->work, nbs->nthread_max);
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        nbs->work[t].cxy_na           = NULL;
 +        nbs->work[t].cxy_na_nalloc    = 0;
 +        nbs->work[t].sort_work        = NULL;
 +        nbs->work[t].sort_work_nalloc = 0;
 +    }
 +
 +    /* Initialize detailed nbsearch cycle counting */
 +    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
 +    nbs->search_count = 0;
 +    nbs_cycle_clear(nbs->cc);
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        nbs_cycle_clear(nbs->work[t].cc);
 +    }
 +}
 +
 +static real grid_atom_density(int n, rvec corner0, rvec corner1)
 +{
 +    rvec size;
 +
 +    rvec_sub(corner1, corner0, size);
 +
 +    return n/(size[XX]*size[YY]*size[ZZ]);
 +}
 +
 +static int set_grid_size_xy(const nbnxn_search_t nbs,
 +                            nbnxn_grid_t *grid,
 +                            int dd_zone,
 +                            int n, rvec corner0, rvec corner1,
 +                            real atom_density,
 +                            int XFormat)
 +{
 +    rvec size;
 +    int  na_c;
 +    real adens, tlen, tlen_x, tlen_y, nc_max;
 +    int  t;
 +
 +    rvec_sub(corner1, corner0, size);
 +
 +    if (n > grid->na_sc)
 +    {
 +        /* target cell length */
 +        if (grid->bSimple)
 +        {
 +            /* To minimize the number of zero interactions, we should make
 +             * the largest of the i/j cells cubic.
 +             */
 +            na_c = max(grid->na_c, grid->na_cj);
 +
 +            /* Approximately cubic cells */
 +            tlen   = pow(na_c/atom_density, 1.0/3.0);
 +            tlen_x = tlen;
 +            tlen_y = tlen;
 +        }
 +        else
 +        {
 +            /* Approximately cubic sub cells */
 +            tlen   = pow(grid->na_c/atom_density, 1.0/3.0);
 +            tlen_x = tlen*GPU_NSUBCELL_X;
 +            tlen_y = tlen*GPU_NSUBCELL_Y;
 +        }
 +        /* We round ncx and ncy down, because we get fewer cell pairs
 +         * in the pair list when the fixed cell dimensions (x,y) are
 +         * larger than the variable one (z), rather than the other way around.
 +         */
 +        grid->ncx = max(1, (int)(size[XX]/tlen_x));
 +        grid->ncy = max(1, (int)(size[YY]/tlen_y));
 +    }
 +    else
 +    {
 +        grid->ncx = 1;
 +        grid->ncy = 1;
 +    }
 +
 +    grid->sx     = size[XX]/grid->ncx;
 +    grid->sy     = size[YY]/grid->ncy;
 +    grid->inv_sx = 1/grid->sx;
 +    grid->inv_sy = 1/grid->sy;
 +
 +    if (dd_zone > 0)
 +    {
 +        /* This is a non-home zone, add an extra row of cells
 +         * for particles communicated for bonded interactions.
 +         * These can be beyond the cut-off. It doesn't matter where
 +         * they end up on the grid, but for performance it's better
 +         * if they don't end up in cells that can be within cut-off range.
 +         */
 +        grid->ncx++;
 +        grid->ncy++;
 +    }
 +
 +    /* We need one additional cell entry for particles moved by DD */
 +    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
 +    {
 +        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +        srenew(grid->cxy_na, grid->cxy_nalloc);
 +        srenew(grid->cxy_ind, grid->cxy_nalloc+1);
 +    }
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
 +        {
 +            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
 +        }
 +    }
 +
 +    /* Worst case scenario of 1 atom in each last cell */
 +    if (grid->na_cj <= grid->na_c)
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
 +    }
 +    else
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
 +    }
 +
 +    if (nc_max > grid->nc_nalloc)
 +    {
 +        int bb_nalloc;
 +
 +        grid->nc_nalloc = over_alloc_large(nc_max);
 +        srenew(grid->nsubc, grid->nc_nalloc);
 +        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
 +#ifdef NBNXN_PBB_SSE
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
 +#else
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
 +#endif
 +        sfree_aligned(grid->bb);
 +        /* This snew also zeros the contents, which avoids possible
 +         * floating-point exceptions in SSE with the unused bb elements.
 +         */
 +        snew_aligned(grid->bb, bb_nalloc, 16);
 +
 +        if (grid->bSimple)
 +        {
 +            if (grid->na_cj == grid->na_c)
 +            {
 +                grid->bbj = grid->bb;
 +            }
 +            else
 +            {
 +                sfree_aligned(grid->bbj);
 +                snew_aligned(grid->bbj, bb_nalloc*grid->na_c/grid->na_cj, 16);
 +            }
 +        }
 +
 +        srenew(grid->flags, grid->nc_nalloc);
 +    }
 +
 +    copy_rvec(corner0, grid->c0);
 +    copy_rvec(corner1, grid->c1);
 +
 +    return nc_max;
 +}
 +
 +/* We need to sort particles in grid columns on z-coordinate.
 + * As particles are very often distributed homogeneously, we use a sorting
 + * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 + * by a factor, cast to an int and try to store in that hole. If the hole
 + * is full, we move this or another particle. A second pass is needed to make
 + * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 + * 4 is the optimal value for homogeneous particle distribution and allows
 + * for an O(#particles) sort up to distributions where all particles are
 + * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 + * as it can be expensive to detect inhomogeneous particle distributions.
 + * SGSF is the maximum ratio of holes used, in the worst case all particles
 + * end up in the last hole and we need #particles extra holes at the end.
 + */
 +#define SORT_GRID_OVERSIZE 4
 +#define SGSF (SORT_GRID_OVERSIZE + 1)
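 +/* Example: sorting n atoms uses about SORT_GRID_OVERSIZE*n = 4*n holes,
 + * while the sort work array is sized SGSF*n = 5*n, the extra n entries
 + * covering the worst case where atoms overflow past the last regular hole.
 + */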
 +
 +/* Sort particle index a on coordinates x along dim.
 + * Backwards tells if we want decreasing instead of increasing coordinates.
 + * h0 is the minimum of the coordinate range.
 + * invh is the inverse hole spacing.
 + * nsort, the theoretical hole limit, is only used for debugging.
 + * sort is the sorting work array.
 + */
 +static void sort_atoms(int dim, gmx_bool Backwards,
 +                       int *a, int n, rvec *x,
 +                       real h0, real invh, int nsort, int *sort)
 +{
 +    int i, c;
 +    int zi, zim, zi_min, zi_max;
 +    int cp, tmp;
 +
 +    if (n <= 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
 +    /* Determine the index range used, so we can limit it for the second pass */
 +    zi_min = INT_MAX;
 +    zi_max = -1;
 +
 +    /* Sort the particles using a simple index sort */
 +    for (i = 0; i < n; i++)
 +    {
 +        /* The cast takes care of floating-point rounding effects below zero.
 +         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 +         * times the box height out of the box.
 +         */
 +        zi = (int)((x[a[i]][dim] - h0)*invh);
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +        if (zi < 0 || zi >= nsort)
 +        {
 +            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
 +                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi, nsort);
 +        }
 +#endif
 +
 +        /* Ideally this particle should go in sort cell zi,
 +         * but that might already be in use,
 +         * in that case find the first empty cell higher up
 +         */
 +        if (sort[zi] < 0)
 +        {
 +            sort[zi] = a[i];
 +            zi_min   = min(zi_min, zi);
 +            zi_max   = max(zi_max, zi);
 +        }
 +        else
 +        {
 +            /* We have multiple atoms in the same sorting slot.
 +             * Sort on real z for minimal bounding box size.
 +             * There is an extra check for identical z to ensure
 +             * well-defined output order, independent of input order
 +             * to ensure binary reproducibility after restarts.
 +             */
 +            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 +                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
 +                                       a[i] > sort[zi])))
 +            {
 +                zi++;
 +            }
 +
 +            if (sort[zi] >= 0)
 +            {
 +                /* Shift all elements by one slot until we find an empty slot */
 +                cp  = sort[zi];
 +                zim = zi + 1;
 +                while (sort[zim] >= 0)
 +                {
 +                    tmp       = sort[zim];
 +                    sort[zim] = cp;
 +                    cp        = tmp;
 +                    zim++;
 +                }
 +                sort[zim] = cp;
 +                zi_max    = max(zi_max, zim);
 +            }
 +            sort[zi] = a[i];
 +            zi_max   = max(zi_max, zi);
 +        }
 +    }
 +
 +    c = 0;
 +    if (!Backwards)
 +    {
 +        for (zi = 0; zi < nsort; zi++)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++]   = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (zi = zi_max; zi >= zi_min; zi--)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++]   = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    if (c < n)
 +    {
 +        gmx_incons("Lost particles while sorting");
 +    }
 +}
 +
 +#ifdef GMX_DOUBLE
 +#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 +#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 +#else
 +#define R2F_D(x) (x)
 +#define R2F_U(x) (x)
 +#endif
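 +/* In double precision the lower corners are rounded down and the upper
 + * corners up by one float epsilon, so the float bounding box is guaranteed
 + * to contain the double precision coordinates.
 + */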
 +
 +/* Coordinate order x,y,z, bb order xyz0 */
 +static void calc_bounding_box(int na, int stride, const real *x, float *bb)
 +{
 +    int  i, j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    i  = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[i+XX]);
 +        xh = max(xh, x[i+XX]);
 +        yl = min(yl, x[i+YY]);
 +        yh = max(yh, x[i+YY]);
 +        zl = min(zl, x[i+ZZ]);
 +        zh = max(zh, x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4(int na, const real *x, float *bb)
 +{
 +    int  j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    xl = x[XX*PACK_X4];
 +    xh = x[XX*PACK_X4];
 +    yl = x[YY*PACK_X4];
 +    yh = x[YY*PACK_X4];
 +    zl = x[ZZ*PACK_X4];
 +    zh = x[ZZ*PACK_X4];
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[j+XX*PACK_X4]);
 +        xh = max(xh, x[j+XX*PACK_X4]);
 +        yl = min(yl, x[j+YY*PACK_X4]);
 +        yh = max(yh, x[j+YY*PACK_X4]);
 +        zl = min(zl, x[j+ZZ*PACK_X4]);
 +        zh = max(zh, x[j+ZZ*PACK_X4]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x8(int na, const real *x, float *bb)
 +{
 +    int  j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    xl = x[XX*PACK_X8];
 +    xh = x[XX*PACK_X8];
 +    yl = x[YY*PACK_X8];
 +    yh = x[YY*PACK_X8];
 +    zl = x[ZZ*PACK_X8];
 +    zh = x[ZZ*PACK_X8];
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[j+XX*PACK_X8]);
 +        xh = max(xh, x[j+XX*PACK_X8]);
 +        yl = min(yl, x[j+YY*PACK_X8]);
 +        yh = max(yh, x[j+YY*PACK_X8]);
 +        zl = min(zl, x[j+ZZ*PACK_X8]);
 +        zh = max(zh, x[j+ZZ*PACK_X8]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4_halves(int na, const real *x,
 +                                          float *bb, float *bbj)
 +{
 +    calc_bounding_box_x_x4(min(na, 2), x, bbj);
 +
 +    if (na > 2)
 +    {
 +        calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+NNBSBB_B);
 +    }
 +    else
 +    {
 +        /* Set the "empty" bounding box to the same as the first one,
 +         * so we don't need to treat special cases in the rest of the code.
 +         */
 +        _mm_store_ps(bbj+NNBSBB_B, _mm_load_ps(bbj));
 +        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C, _mm_load_ps(bbj+NNBSBB_C));
 +    }
 +
 +    _mm_store_ps(bb, _mm_min_ps(_mm_load_ps(bbj),
 +                                _mm_load_ps(bbj+NNBSBB_B)));
 +    _mm_store_ps(bb+NNBSBB_C, _mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
 +                                         _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
 +}
 +
 +/* Coordinate order xyz, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 +{
 +    int  i, j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    i  = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[i+XX]);
 +        xh = max(xh, x[i+XX]);
 +        yl = min(yl, x[i+YY]);
 +        yh = max(yh, x[i+YY]);
 +        zl = min(zl, x[i+ZZ]);
 +        zh = max(zh, x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[0*STRIDE_PBB] = R2F_D(xl);
 +    bb[1*STRIDE_PBB] = R2F_D(yl);
 +    bb[2*STRIDE_PBB] = R2F_D(zl);
 +    bb[3*STRIDE_PBB] = R2F_U(xh);
 +    bb[4*STRIDE_PBB] = R2F_U(yh);
 +    bb[5*STRIDE_PBB] = R2F_U(zh);
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +
 +/* Coordinate order xyz?, bb order xyz0 */
 +static void calc_bounding_box_sse(int na, const float *x, float *bb)
 +{
 +    __m128 bb_0_SSE, bb_1_SSE;
 +    __m128 x_SSE;
 +
 +    int    i;
 +
 +    bb_0_SSE = _mm_load_ps(x);
 +    bb_1_SSE = bb_0_SSE;
 +
 +    for (i = 1; i < na; i++)
 +    {
 +        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
 +        bb_0_SSE = _mm_min_ps(bb_0_SSE, x_SSE);
 +        bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE);
 +    }
 +
 +    _mm_store_ps(bb, bb_0_SSE);
 +    _mm_store_ps(bb+4, bb_1_SSE);
 +}
 +
 +/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx_sse(int na, const float *x,
 +                                       float *bb_work,
 +                                       real *bb)
 +{
 +    calc_bounding_box_sse(na, x, bb_work);
 +
 +    bb[0*STRIDE_PBB] = bb_work[BBL_X];
 +    bb[1*STRIDE_PBB] = bb_work[BBL_Y];
 +    bb[2*STRIDE_PBB] = bb_work[BBL_Z];
 +    bb[3*STRIDE_PBB] = bb_work[BBU_X];
 +    bb[4*STRIDE_PBB] = bb_work[BBU_Y];
 +    bb[5*STRIDE_PBB] = bb_work[BBU_Z];
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE_SINGLE */
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* Combines pairs of consecutive bounding boxes */
 +static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const float *bb)
 +{
 +    int    i, j, sc2, nc2, c2;
 +    __m128 min_SSE, max_SSE;
 +
 +    for (i = 0; i < grid->ncx*grid->ncy; i++)
 +    {
 +        /* Starting bb in a column is expected to be 2-aligned */
 +        sc2 = grid->cxy_ind[i]>>1;
 +        /* For odd numbers skip the last bb here */
 +        nc2 = (grid->cxy_na[i]+3)>>(2+1);
 +        for (c2 = sc2; c2 < sc2+nc2; c2++)
 +        {
 +            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
 +            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
 +            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C, min_SSE);
 +            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C, max_SSE);
 +        }
 +        if (((grid->cxy_na[i]+3)>>2) & 1)
 +        {
 +            /* Copy the last bb for odd bb count in this column */
 +            for (j = 0; j < NNBSBB_C; j++)
 +            {
 +                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
 +                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
 +            }
 +        }
 +    }
 +}
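 +/* Each combined bbj box spans two consecutive i-cluster boxes:
 + * the element-wise min of the lower corners and max of the upper corners.
 + */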
 +
 +#endif
 +
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_simple(FILE                *fp,
 +                                 const nbnxn_search_t nbs,
 +                                 const nbnxn_grid_t  *grid)
 +{
 +    int  c, d;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    for (c = 0; c < grid->nc; c++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
 +        }
 +    }
 +    dsvmul(1.0/grid->nc, ba, ba);
 +
 +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/grid->ncx,
 +            nbs->box[YY][YY]/grid->ncy,
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
 +            ba[XX], ba[YY], ba[ZZ],
 +            ba[XX]*grid->ncx/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_supersub(FILE                *fp,
 +                                   const nbnxn_search_t nbs,
 +                                   const nbnxn_grid_t  *grid)
 +{
 +    int  ns, c, s;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    ns = 0;
 +    for (c = 0; c < grid->nc; c++)
 +    {
 +#ifdef NBNXN_BBXXXX
 +        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
 +        {
 +            int cs_w, i, d;
 +
 +            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
 +            for (i = 0; i < STRIDE_PBB; i++)
 +            {
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    ba[d] +=
 +                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
 +                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
 +                }
 +            }
 +        }
 +#else
 +        for (s = 0; s < grid->nsubc[c]; s++)
 +        {
 +            int cs, d;
 +
 +            cs = c*GPU_NSUBCELL + s;
 +            for (d = 0; d < DIM; d++)
 +            {
 +                ba[d] +=
 +                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
 +                    grid->bb[cs*NNBSBB_B         +d];
 +            }
 +        }
 +#endif
 +        ns += grid->nsubc[c];
 +    }
 +    dsvmul(1.0/ns, ba, ba);
 +
 +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
 +            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
 +            ba[XX], ba[YY], ba[ZZ],
 +            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 + * Also sets interaction flags.
 + */
 +void sort_on_lj(nbnxn_atomdata_t *nbat, int na_c,
 +                int a0, int a1, const int *atinfo,
 +                int *order,
 +                int *flags)
 +{
 +    int      subc, s, a, n1, n2, a_lj_max, i, j;
 +    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    gmx_bool haveQ;
 +
 +    *flags = 0;
 +
 +    subc = 0;
 +    for (s = a0; s < a1; s += na_c)
 +    {
 +        /* Make lists for this (sub-)cell on atoms with and without LJ */
 +        n1       = 0;
 +        n2       = 0;
 +        haveQ    = FALSE;
 +        a_lj_max = -1;
 +        for (a = s; a < min(s+na_c, a1); a++)
 +        {
 +            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 +
 +            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 +            {
 +                sort1[n1++] = order[a];
 +                a_lj_max    = a;
 +            }
 +            else
 +            {
 +                sort2[n2++] = order[a];
 +            }
 +        }
 +
 +        /* If we don't have atoms with LJ, there's nothing to sort */
 +        if (n1 > 0)
 +        {
 +            *flags |= NBNXN_CI_DO_LJ(subc);
 +
 +            if (2*n1 <= na_c)
 +            {
 +                /* Only sort when strictly necessary. Ordering particles
 +                 * can lead to less accurate summation due to rounding,
 +                 * both for LJ and Coulomb interactions.
 +                 */
 +                if (2*(a_lj_max - s) >= na_c)
 +                {
 +                    for (i = 0; i < n1; i++)
 +                    {
 +                        order[a0+i] = sort1[i];
 +                    }
 +                    for (j = 0; j < n2; j++)
 +                    {
 +                        order[a0+n1+j] = sort2[j];
 +                    }
 +                }
 +
 +                *flags |= NBNXN_CI_HALF_LJ(subc);
 +            }
 +        }
 +        if (haveQ)
 +        {
 +            *flags |= NBNXN_CI_DO_COUL(subc);
 +        }
 +        subc++;
 +    }
 +}
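 +/* With NBNXN_CI_HALF_LJ set, only atoms in the first half of the (sub-)cell
 + * have LJ parameters, so kernels can skip LJ work for the second half;
 + * NBNXN_CI_DO_LJ/DO_COUL likewise allow skipping whole interaction types.
 + */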
 +
 +/* Fill a pair search cell with atoms.
 + * Potentially sorts atoms and sets the interaction flags.
 + */
 +void fill_cell(const nbnxn_search_t nbs,
 +               nbnxn_grid_t *grid,
 +               nbnxn_atomdata_t *nbat,
 +               int a0, int a1,
 +               const int *atinfo,
 +               rvec *x,
 +               int sx, int sy, int sz,
 +               float *bb_work)
 +{
 +    int     na, a;
 +    size_t  offset;
 +    float  *bb_ptr;
 +
 +    na = a1 - a0;
 +
 +    if (grid->bSimple)
 +    {
 +        sort_on_lj(nbat, grid->na_c, a0, a1, atinfo, nbs->a,
 +                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
 +    }
 +
 +    /* Now we have sorted the atoms, set the cell indices */
 +    for (a = a0; a < a1; a++)
 +    {
 +        nbs->cell[nbs->a[a]] = a;
 +    }
 +
 +    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
 +                           nbat->XFormat, nbat->x, a0,
 +                           sx, sy, sz);
 +
 +    if (nbat->XFormat == nbatX4)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
 +        if (2*grid->na_cj == grid->na_c)
 +        {
 +            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
 +                                          grid->bbj+offset*2);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
 +        }
 +    }
 +    else if (nbat->XFormat == nbatX8)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
 +    }
 +#ifdef NBNXN_BBXXXX
 +    else if (!grid->bSimple)
 +    {
 +        /* Store the bounding boxes in a format convenient
 +         * for SSE calculations: xxxxyyyyzzzz...
 +         */
 +        bb_ptr =
 +            grid->bb +
 +            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 +            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +        if (nbat->XFormat == nbatXYZQ)
 +        {
 +            calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
 +                                       bb_work, bb_ptr);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
 +                                   bb_ptr);
 +        }
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx, sy, sz,
 +                    bb_ptr[0*STRIDE_PBB], bb_ptr[3*STRIDE_PBB],
 +                    bb_ptr[1*STRIDE_PBB], bb_ptr[4*STRIDE_PBB],
 +                    bb_ptr[2*STRIDE_PBB], bb_ptr[5*STRIDE_PBB]);
 +        }
 +    }
 +#endif
 +    else
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +
 +        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
 +                          bb_ptr);
 +
 +        if (gmx_debug_at)
 +        {
 +            int bbo;
 +            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
 +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx, sy, sz,
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column */
 +static void sort_columns_simple(const nbnxn_search_t nbs,
 +                                int dd_zone,
 +                                nbnxn_grid_t *grid,
 +                                int a0, int a1,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                nbnxn_atomdata_t *nbat,
 +                                int cxy_start, int cxy_end,
 +                                int *sort_work)
 +{
 +    int  cxy;
 +    int  cx, cy, cz, ncz, cfilled, c;
 +    int  na, ash, ind, a;
 +    int  na_c, ash_c;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0, cxy_start, cxy_end, a0, a1);
 +    }
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ, FALSE,
 +                   nbs->a+ash, na, x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF, sort_work);
 +
 +        /* Fill the ncz cells in this column */
 +        cfilled = grid->cxy_ind[cxy];
 +        for (cz = 0; cz < ncz; cz++)
 +        {
 +            c  = grid->cxy_ind[cxy] + cz;
 +
 +            ash_c = ash + cz*grid->na_sc;
 +            na_c  = min(grid->na_sc, na-(ash_c-ash));
 +
 +            fill_cell(nbs, grid, nbat,
 +                      ash_c, ash_c+na_c, atinfo, x,
 +                      grid->na_sc*cx + (dd_zone >> 2),
 +                      grid->na_sc*cy + (dd_zone & 3),
 +                      grid->na_sc*cz,
 +                      NULL);
 +
 +            /* This copy to bbcz is not really necessary.
 +             * But it allows us to use the same grid search code
 +             * for the simple and supersub cell setups.
 +             */
 +            if (na_c > 0)
 +            {
 +                cfilled = c;
 +            }
 +            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
 +            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for (ind = na; ind < ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column */
 +static void sort_columns_supersub(const nbnxn_search_t nbs,
 +                                  int dd_zone,
 +                                  nbnxn_grid_t *grid,
 +                                  int a0, int a1,
 +                                  const int *atinfo,
 +                                  rvec *x,
 +                                  nbnxn_atomdata_t *nbat,
 +                                  int cxy_start, int cxy_end,
 +                                  int *sort_work)
 +{
 +    int  cxy;
 +    int  cx, cy, cz = -1, c = -1, ncz;
 +    int  na, ash, na_c, ind, a;
 +    int  subdiv_z, sub_z, na_z, ash_z;
 +    int  subdiv_y, sub_y, na_y, ash_y;
 +    int  subdiv_x, sub_x, na_x, ash_x;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    float bb_work_array[NNBSBB_B+3], *bb_work_align;
 +
 +    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0, cxy_start, cxy_end, a0, a1);
 +    }
 +
 +    subdiv_x = grid->na_c;
 +    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
 +    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ, FALSE,
 +                   nbs->a+ash, na, x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF, sort_work);
 +
 +        /* This loop goes over the supercells and subcells along z at once */
 +        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
 +        {
 +            ash_z = ash + sub_z*subdiv_z;
 +            na_z  = min(subdiv_z, na-(ash_z-ash));
 +
 +            /* We have already sorted on z */
 +
 +            if (sub_z % GPU_NSUBCELL_Z == 0)
 +            {
 +                cz = sub_z/GPU_NSUBCELL_Z;
 +                c  = grid->cxy_ind[cxy] + cz;
 +
 +                /* The number of atoms in this supercell */
 +                na_c = min(grid->na_sc, na-(ash_z-ash));
 +
 +                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
 +
 +                /* Store the z-boundaries of the super cell */
 +                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
 +                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
 +            }
 +
 +#if GPU_NSUBCELL_Y > 1
 +            /* Sort the atoms along y */
 +            sort_atoms(YY, (sub_z & 1),
 +                       nbs->a+ash_z, na_z, x,
 +                       grid->c0[YY]+cy*grid->sy, grid->inv_sy,
 +                       subdiv_y*SGSF, sort_work);
 +#endif
 +
 +            for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
 +            {
 +                ash_y = ash_z + sub_y*subdiv_y;
 +                na_y  = min(subdiv_y, na-(ash_y-ash));
 +
 +#if GPU_NSUBCELL_X > 1
 +                /* Sort the atoms along x */
 +                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
 +                           nbs->a+ash_y, na_y, x,
 +                           grid->c0[XX]+cx*grid->sx, grid->inv_sx,
 +                           subdiv_x*SGSF, sort_work);
 +#endif
 +
 +                for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
 +                {
 +                    ash_x = ash_y + sub_x*subdiv_x;
 +                    na_x  = min(subdiv_x, na-(ash_x-ash));
 +
 +                    fill_cell(nbs, grid, nbat,
 +                              ash_x, ash_x+na_x, atinfo, x,
 +                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
 +                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
 +                              grid->na_c*sub_z,
 +                              bb_work_align);
 +                }
 +            }
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for (ind = na; ind < ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid column atoms should go */
 +static void calc_column_indices(nbnxn_grid_t *grid,
 +                                int a0, int a1,
 +                                rvec *x,
 +                                int dd_zone, const int *move,
 +                                int thread, int nthread,
 +                                int *cell,
 +                                int *cxy_na)
 +{
 +    int  n0, n1, i;
 +    int  cx, cy;
 +
 +    /* We add one extra cell for particles which moved during DD */
 +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
 +    {
 +        cxy_na[i] = 0;
 +    }
 +
 +    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
 +    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
 +    if (dd_zone == 0)
 +    {
 +        /* Home zone */
 +        for (i = n0; i < n1; i++)
 +        {
 +            if (move == NULL || move[i] >= 0)
 +            {
 +                /* We need to be careful with rounding,
 +                 * particles might be a few bits outside the local zone.
 +                 * The int cast takes care of the lower bound,
 +                 * we will explicitly take care of the upper bound.
 +                 */
 +                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +                if (cx < 0 || cx >= grid->ncx ||
 +                    cy < 0 || cy >= grid->ncy)
 +                {
 +                    gmx_fatal(FARGS,
 +                              "grid cell cx %d cy %d out of range (max %d %d)\n"
 +                              "atom %f %f %f, grid->c0 %f %f",
 +                              cx, cy, grid->ncx, grid->ncy,
 +                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
 +                }
 +#endif
 +                /* Take care of potential rounding issues */
 +                cx = min(cx, grid->ncx - 1);
 +                cy = min(cy, grid->ncy - 1);
 +
 +                /* For the moment, cell will contain only the grid-local
 +                 * x and y indices, not z.
 +                 */
 +                cell[i] = cx*grid->ncy + cy;
 +            }
 +            else
 +            {
 +                /* Put this moved particle after the end of the grid,
 +                 * so we can process it later without using conditionals.
 +                 */
 +                cell[i] = grid->ncx*grid->ncy;
 +            }
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +    else
 +    {
 +        /* Non-home zone */
 +        for (i = n0; i < n1; i++)
 +        {
 +            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +
 +            /* For non-home zones there could be particles outside
 +             * the non-bonded cut-off range, which have been communicated
 +             * for bonded interactions only. For the result it doesn't
 +             * matter where these end up on the grid. For performance
 +             * we put them in an extra row at the border.
 +             */
 +            cx = max(cx, 0);
 +            cx = min(cx, grid->ncx - 1);
 +            cy = max(cy, 0);
 +            cy = min(cy, grid->ncy - 1);
 +
 +            /* For the moment, cell will contain only the grid-local
 +             * x and y indices, not z.
 +             */
 +            cell[i] = cx*grid->ncy + cy;
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid cells the atoms should go */
 +static void calc_cell_indices(const nbnxn_search_t nbs,
 +                              int dd_zone,
 +                              nbnxn_grid_t *grid,
 +                              int a0, int a1,
 +                              const int *atinfo,
 +                              rvec *x,
 +                              const int *move,
 +                              nbnxn_atomdata_t *nbat)
 +{
 +    int   n0, n1, i;
 +    int   cx, cy, cxy, ncz_max, ncz;
 +    int   nthread, thread;
 +    int  *cxy_na, cxy_na_i;
 +
 +    nthread = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
 +                            nbs->cell, nbs->work[thread].cxy_na);
 +    }
 +
 +    /* Make the cell index as a function of x and y */
 +    ncz_max          = 0;
 +    ncz              = 0;
 +    grid->cxy_ind[0] = 0;
 +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
 +    {
 +        /* We set ncz_max at the beginning of the loop instead of at the end
 +         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
 +         * that do not need to be ordered on the grid.
 +         */
 +        if (ncz > ncz_max)
 +        {
 +            ncz_max = ncz;
 +        }
 +        cxy_na_i = nbs->work[0].cxy_na[i];
 +        for (thread = 1; thread < nthread; thread++)
 +        {
 +            cxy_na_i += nbs->work[thread].cxy_na[i];
 +        }
 +        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
 +        if (nbat->XFormat == nbatX8)
 +        {
 +            /* Make the number of cells a multiple of 2 */
 +            ncz = (ncz + 1) & ~1;
 +        }
 +        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
 +        /* Clear cxy_na, so we can reuse the array below */
 +        grid->cxy_na[i] = 0;
 +    }
 +    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
 +
 +    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
 +                grid->na_sc, grid->na_c, grid->nc,
 +                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
 +                ncz_max);
 +        if (gmx_debug_at)
 +        {
 +            i = 0;
 +            for (cy = 0; cy < grid->ncy; cy++)
 +            {
 +                for (cx = 0; cx < grid->ncx; cx++)
 +                {
 +                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
 +                    i++;
 +                }
 +                fprintf(debug, "\n");
 +            }
 +        }
 +    }
 +
 +    /* Make sure the work array for sorting is large enough */
 +    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
 +    {
 +        for (thread = 0; thread < nbs->nthread_max; thread++)
 +        {
 +            nbs->work[thread].sort_work_nalloc =
 +                over_alloc_large(ncz_max*grid->na_sc*SGSF);
 +            srenew(nbs->work[thread].sort_work,
 +                   nbs->work[thread].sort_work_nalloc);
 +            /* When not in use, all elements should be -1 */
 +            for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
 +            {
 +                nbs->work[thread].sort_work[i] = -1;
 +            }
 +        }
 +    }
 +
 +    /* Now that we know the dimensions, we can fill the grid.
 +     * This is the first, unsorted fill. We sort the columns after this.
 +     */
 +    for (i = a0; i < a1; i++)
 +    {
 +        /* At this point nbs->cell contains the local grid x,y indices */
 +        cxy = nbs->cell[i];
 +        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
 +    }
 +
 +    if (dd_zone == 0)
 +    {
 +        /* Set the cell indices for the moved particles */
 +        n0 = grid->nc*grid->na_sc;
 +        n1 = grid->nc*grid->na_sc + grid->cxy_na[grid->ncx*grid->ncy];
 +        for (i = n0; i < n1; i++)
 +        {
 +            nbs->cell[nbs->a[i]] = i;
 +        }
 +    }
 +
 +    /* Sort the super-cell columns along z into the sub-cells. */
 +#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
 +    for (thread = 0; thread < nbs->nthread_max; thread++)
 +    {
 +        if (grid->bSimple)
 +        {
 +            sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
 +                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                nbs->work[thread].sort_work);
 +        }
 +        else
 +        {
 +            sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
 +                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                  nbs->work[thread].sort_work);
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid, grid->bb);
 +    }
 +#endif
 +
 +    if (!grid->bSimple)
 +    {
 +        grid->nsubc_tot = 0;
 +        for (i = 0; i < grid->nc; i++)
 +        {
 +            grid->nsubc_tot += grid->nsubc[i];
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (grid->bSimple)
 +        {
 +            print_bbsizes_simple(debug, nbs, grid);
 +        }
 +        else
 +        {
 +            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
 +                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
 +
 +            print_bbsizes_supersub(debug, nbs, grid);
 +        }
 +    }
 +}
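The column loop above converts the summed per-column atom counts into cell offsets with a running prefix sum; each count is rounded up to whole cells of na_sc atoms and, for the X8 layout, to an even cell count. A minimal sketch of that rounding, assuming na_sc = 4 and the nbatX8 format:

    /* Sketch: cells needed for one column, assuming na_sc = 4 atoms per
     * cell and the nbatX8 layout. column_cells(9) = ((9+3)/4 + 1) & ~1 = 4.
     */
    static int column_cells(int cxy_na)
    {
        int ncz = (cxy_na + 4 - 1)/4; /* ceiling division: ceil(cxy_na/na_sc) */
        return (ncz + 1) & ~1;        /* round up to a multiple of 2 */
    }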
 +
 +static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
 +                              int                   natoms)
 +{
 +    int b;
 +
 +    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
 +    if (flags->nflag > flags->flag_nalloc)
 +    {
 +        flags->flag_nalloc = over_alloc_large(flags->nflag);
 +        srenew(flags->flag, flags->flag_nalloc);
 +    }
 +    for (b = 0; b < flags->nflag; b++)
 +    {
 +        flags->flag[b] = 0;
 +    }
 +}
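The flag count here is the same ceiling division: assuming for illustration NBNXN_BUFFERFLAG_SIZE = 16, natoms = 100 gives (100 + 15)/16 = 7 flag blocks, the last of which covers only the 4 final atoms.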
 +
 +/* Sets up a grid and puts the atoms on the grid.
 + * This function only operates on one domain of the domain decomposition.
 + * Note that without domain decomposition there is only one domain.
 + */
 +void nbnxn_put_on_grid(nbnxn_search_t nbs,
 +                       int ePBC, matrix box,
 +                       int dd_zone,
 +                       rvec corner0, rvec corner1,
 +                       int a0, int a1,
 +                       real atom_density,
 +                       const int *atinfo,
 +                       rvec *x,
 +                       int nmoved, int *move,
 +                       int nb_kernel_type,
 +                       nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    int           n;
 +    int           nc_max_grid, nc_max;
 +
 +    grid = &nbs->grid[dd_zone];
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCgrid]);
 +
 +    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
 +    grid->na_c_2log = get_2log(grid->na_c);
 +
 +    nbat->na_c = grid->na_c;
 +
 +    if (dd_zone == 0)
 +    {
 +        grid->cell0 = 0;
 +    }
 +    else
 +    {
 +        grid->cell0 =
 +            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
 +            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
 +    }
 +
 +    n = a1 - a0;
 +
 +    if (dd_zone == 0)
 +    {
 +        nbs->ePBC = ePBC;
 +        copy_mat(box, nbs->box);
 +
 +        if (atom_density >= 0)
 +        {
 +            grid->atom_density = atom_density;
 +        }
 +        else
 +        {
 +            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
 +        }
 +
 +        grid->cell0 = 0;
 +
 +        nbs->natoms_local    = a1 - nmoved;
 +        /* We assume that nbnxn_put_on_grid is called first
 +         * for the local atoms (dd_zone=0).
 +         */
 +        nbs->natoms_nonlocal = a1 - nmoved;
 +    }
 +    else
 +    {
 +        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);
 +    }
 +
 +    nc_max_grid = set_grid_size_xy(nbs, grid,
 +                                   dd_zone, n-nmoved, corner0, corner1,
 +                                   nbs->grid[0].atom_density,
 +                                   nbat->XFormat);
 +
 +    nc_max = grid->cell0 + nc_max_grid;
 +
 +    if (a1 > nbs->cell_nalloc)
 +    {
 +        nbs->cell_nalloc = over_alloc_large(a1);
 +        srenew(nbs->cell, nbs->cell_nalloc);
 +    }
 +
 +    /* To avoid conditionals we store the moved particles at the end of a;
 +     * make sure we have enough space for them.
 +     */
 +    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
 +    {
 +        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
 +        srenew(nbs->a, nbs->a_nalloc);
 +    }
 +
 +    /* We need padding up to a multiple of the buffer flag size: simply add */
 +    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
 +    {
 +        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
 +    }
 +
 +    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
 +
 +    if (dd_zone == 0)
 +    {
 +        nbat->natoms_local = nbat->natoms;
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
 +}
 +
 +/* Calls nbnxn_put_on_grid for all non-local domains */
 +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t            nbs,
 +                                const gmx_domdec_zones_t *zones,
 +                                const int                *atinfo,
 +                                rvec                     *x,
 +                                int                       nb_kernel_type,
 +                                nbnxn_atomdata_t         *nbat)
 +{
 +    int  zone, d;
 +    rvec c0, c1;
 +
 +    for (zone = 1; zone < zones->n; zone++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            c0[d] = zones->size[zone].bb_x0[d];
 +            c1[d] = zones->size[zone].bb_x1[d];
 +        }
 +
 +        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
 +                          zone, c0, c1,
 +                          zones->cg_range[zone],
 +                          zones->cg_range[zone+1],
 +                          -1,
 +                          atinfo,
 +                          x,
 +                          0, NULL,
 +                          nb_kernel_type,
 +                          nbat);
 +    }
 +}
 +
 +/* Add simple grid type information to the local super/sub grid */
 +void nbnxn_grid_add_simple(nbnxn_search_t    nbs,
 +                           nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    float        *bbcz, *bb;
 +    int           ncd, sc;
 +
 +    grid = &nbs->grid[0];
 +
 +    if (grid->bSimple)
 +    {
 +        gmx_incons("nbnxn_grid_add_simple called with a simple grid");
 +    }
 +
 +    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    if (grid->nc*ncd > grid->nc_nalloc_simple)
 +    {
 +        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
 +        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
 +        srenew(grid->bb_simple, grid->nc_nalloc_simple*NNBSBB_B);
 +        srenew(grid->flags_simple, grid->nc_nalloc_simple);
 +        if (nbat->XFormat)
 +        {
 +            sfree_aligned(grid->bbj);
 +            snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
 +        }
 +    }
 +
 +    bbcz = grid->bbcz_simple;
 +    bb   = grid->bb_simple;
 +
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for (sc = 0; sc < grid->nc; sc++)
 +    {
 +        int c, tx, na;
 +
 +        for (c = 0; c < ncd; c++)
 +        {
 +            tx = sc*ncd + c;
 +
 +            na = NBNXN_CPU_CLUSTER_I_SIZE;
 +            while (na > 0 &&
 +                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
 +            {
 +                na--;
 +            }
 +
 +            if (na > 0)
 +            {
 +                switch (nbat->XFormat)
 +                {
 +                    case nbatX4:
 +                        /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
 +                        calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
 +                                               bb+tx*NNBSBB_B);
 +                        break;
 +                    case nbatX8:
 +                        /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
 +                        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
 +                                               bb+tx*NNBSBB_B);
 +                        break;
 +                    default:
 +                        calc_bounding_box(na, nbat->xstride,
 +                                          nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
 +                                          bb+tx*NNBSBB_B);
 +                        break;
 +                }
 +                bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
 +                bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
 +
 +                /* No interaction optimization yet here */
 +                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
 +            }
 +            else
 +            {
 +                grid->flags_simple[tx] = 0;
 +            }
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid, grid->bb_simple);
 +    }
 +#endif
 +}
 +
 +void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
 +{
 +    *ncx = nbs->grid[0].ncx;
 +    *ncy = nbs->grid[0].ncy;
 +}
 +
 +void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
 +{
 +    const nbnxn_grid_t *grid;
 +
 +    grid = &nbs->grid[0];
 +
 +    /* Return the atom order for the home cell (index 0) */
 +    *a  = nbs->a;
 +
 +    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
 +}
 +
 +void nbnxn_set_atomorder(nbnxn_search_t nbs)
 +{
 +    nbnxn_grid_t *grid;
 +    int           ao, cx, cy, cxy, cz, j;
 +
 +    /* Set the atom order for the home cell (index 0) */
 +    grid = &nbs->grid[0];
 +
 +    ao = 0;
 +    for (cx = 0; cx < grid->ncx; cx++)
 +    {
 +        for (cy = 0; cy < grid->ncy; cy++)
 +        {
 +            cxy = cx*grid->ncy + cy;
 +            j   = grid->cxy_ind[cxy]*grid->na_sc;
 +            for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
 +            {
 +                nbs->a[j]     = ao;
 +                nbs->cell[ao] = j;
 +                ao++;
 +                j++;
 +            }
 +        }
 +    }
 +}
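nbnxn_set_atomorder fills a and cell as mutually inverse maps over the home-zone grid slots; a minimal sketch of checking that invariant, under the assumption that non-negative a[] entries denote real atoms and padding slots are negative:

    #include <assert.h>

    /* Sketch: verify that cell[] and a[] are inverse permutations for
     * real atoms (padding slots with a[j] < 0 are skipped).
     */
    static void check_atomorder(const int *a, const int *cell, int nslots)
    {
        int j;

        for (j = 0; j < nslots; j++)
        {
            if (a[j] >= 0)
            {
                assert(cell[a[j]] == j); /* grid slot j holds atom a[j] */
            }
        }
    }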
 +
 +/* Determines the cell range along one dimension that
 + * the bounding box b0 - b1 sees.
 + */
 +static void get_cell_range(real b0, real b1,
 +                           int nc, real c0, real s, real invs,
 +                           real d2, real r2, int *cf, int *cl)
 +{
 +    *cf = max((int)((b0 - c0)*invs), 0);
 +
 +    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
 +    {
 +        (*cf)--;
 +    }
 +
 +    *cl = min((int)((b1 - c0)*invs), nc-1);
 +    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
 +    {
 +        (*cl)++;
 +    }
 +}
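The two loops widen the truncated range for as long as the squared distance to the next cell boundary, added to d2, stays below r2; a worked example with assumed numbers:

    /* Worked example (assumed values): c0 = 0, s = 0.5, invs = 2, nc = 10,
     * d2 = 0, r2 = 0.09, and a bounding box with b0 = 1.1, b1 = 1.3.
     * Truncation gives *cf = 2 and *cl = 2; the loops then widen this:
     *   lower: (1.1 - 2*0.5)^2 = 0.01 < 0.09 -> *cf = 1; (1.1 - 0.5)^2 = 0.36 stops
     *   upper: (3*0.5 - 1.3)^2 = 0.04 < 0.09 -> *cl = 3; (2.0 - 1.3)^2 = 0.49 stops
     * so cells 1..3 have to be considered.
     */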
 +
 +/* Reference code calculating the distance^2 between two bounding boxes */
 +static float box_dist2(float bx0, float bx1, float by0,
 +                       float by1, float bz0, float bz1,
 +                       const float *bb)
 +{
 +    float d2;
 +    float dl, dh, dm, dm0;
 +
 +    d2 = 0;
 +
 +    dl  = bx0 - bb[BBU_X];
 +    dh  = bb[BBL_X] - bx1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = by0 - bb[BBU_Y];
 +    dh  = bb[BBL_Y] - by1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bz0 - bb[BBU_Z];
 +    dh  = bb[BBL_Z] - bz1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +/* Plain C code calculating the distance^2 between two bounding boxes */
 +static float subc_bb_dist2(int si, const float *bb_i_ci,
 +                           int csj, const float *bb_j_all)
 +{
 +    const float *bb_i, *bb_j;
 +    float        d2;
 +    float        dl, dh, dm, dm0;
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    d2 = 0;
 +
 +    dl  = bb_i[BBL_X] - bb_j[BBU_X];
 +    dh  = bb_j[BBL_X] - bb_i[BBU_X];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
 +    dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
 +    dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
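Per dimension both routines compute the clamped gap max(0, max(lo_i - hi_j, lo_j - hi_i)) and accumulate its square, so overlapping boxes contribute zero; a one-dimensional worked example with assumed bounds:

    /* Worked example (assumed bounds): i-box [1.0, 2.0], j-box [2.5, 3.0].
     * dl = 1.0 - 3.0 = -2.0, dh = 2.5 - 2.0 = 0.5,
     * dm = max(dl, dh) = 0.5, dm0 = max(dm, 0) = 0.5, d2 += 0.25.
     * If the boxes overlap, dl and dh are both negative and d2 += 0.
     */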
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* SSE code for bb distance for bb format xyz0 */
 +static float subc_bb_dist2_sse(int na_c,
 +                               int si, const float *bb_i_ci,
 +                               int csj, const float *bb_j_all)
 +{
 +    const float *bb_i, *bb_j;
 +
 +    __m128       bb_i_SSE0, bb_i_SSE1;
 +    __m128       bb_j_SSE0, bb_j_SSE1;
 +    __m128       dl_SSE;
 +    __m128       dh_SSE;
 +    __m128       dm_SSE;
 +    __m128       dm0_SSE;
 +    __m128       d2_SSE;
 +#ifndef GMX_X86_SSE4_1
 +    float        d2_array[7], *d2_align;
 +
 +    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
 +#else
 +    float d2;
 +#endif
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    bb_i_SSE0 = _mm_load_ps(bb_i);
 +    bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
 +    bb_j_SSE0 = _mm_load_ps(bb_j);
 +    bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
 +
 +    dl_SSE    = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
 +    dh_SSE    = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
 +
 +    dm_SSE    = _mm_max_ps(dl_SSE, dh_SSE);
 +    dm0_SSE   = _mm_max_ps(dm_SSE, _mm_setzero_ps());
 +#ifndef GMX_X86_SSE4_1
 +    d2_SSE    = _mm_mul_ps(dm0_SSE, dm0_SSE);
 +
 +    _mm_store_ps(d2_align, d2_SSE);
 +
 +    return d2_align[0] + d2_align[1] + d2_align[2];
 +#else
 +    /* SSE4.1 dot product of components 0,1,2 */
 +    d2_SSE    = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71);
 +
 +    _mm_store_ss(&d2, d2_SSE);
 +
 +    return d2;
 +#endif
 +}
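The SSE4.1 branch relies on the immediate mask of _mm_dp_ps: the high nibble selects which lanes enter the dot product, the low nibble which output lanes receive the sum. A short annotation of the constant used above:

    /* Sketch: _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71), mask 0x71 = 0111 0001b:
     *   high nibble 0x7 -> multiply lanes 0,1,2 (x,y,z) and skip lane 3,
     *                      the unused padding component of the xyz0 format;
     *   low nibble  0x1 -> write the sum into lane 0 only,
     * so _mm_store_ss() extracts the complete 3-component distance^2.
     */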
 +
 +/* Calculate the bounding-box distances of bb_i[si,...,si+3] and store them in d2 */
 +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \
 +    {                                                \
 +        int    shi;                                  \
 +                                                 \
 +        __m128 dx_0, dy_0, dz_0;                       \
 +        __m128 dx_1, dy_1, dz_1;                       \
 +                                                 \
 +        __m128 mx, my, mz;                             \
 +        __m128 m0x, m0y, m0z;                          \
 +                                                 \
 +        __m128 d2x, d2y, d2z;                          \
 +        __m128 d2s, d2t;                              \
 +                                                 \
 +        shi = si*NNBSBB_D*DIM;                       \
 +                                                 \
 +        xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
 +        yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
 +        zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
 +        xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
 +        yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
 +        zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
 +                                                 \
 +        dx_0 = _mm_sub_ps(xi_l, xj_h);                \
 +        dy_0 = _mm_sub_ps(yi_l, yj_h);                \
 +        dz_0 = _mm_sub_ps(zi_l, zj_h);                \
 +                                                 \
 +        dx_1 = _mm_sub_ps(xj_l, xi_h);                \
 +        dy_1 = _mm_sub_ps(yj_l, yi_h);                \
 +        dz_1 = _mm_sub_ps(zj_l, zi_h);                \
 +                                                 \
 +        mx   = _mm_max_ps(dx_0, dx_1);                \
 +        my   = _mm_max_ps(dy_0, dy_1);                \
 +        mz   = _mm_max_ps(dz_0, dz_1);                \
 +                                                 \
 +        m0x  = _mm_max_ps(mx, zero);                  \
 +        m0y  = _mm_max_ps(my, zero);                  \
 +        m0z  = _mm_max_ps(mz, zero);                  \
 +                                                 \
 +        d2x  = _mm_mul_ps(m0x, m0x);                  \
 +        d2y  = _mm_mul_ps(m0y, m0y);                  \
 +        d2z  = _mm_mul_ps(m0z, m0z);                  \
 +                                                 \
 +        d2s  = _mm_add_ps(d2x, d2y);                  \
 +        d2t  = _mm_add_ps(d2s, d2z);                  \
 +                                                 \
 +        _mm_store_ps(d2+si, d2t);                     \
 +    }
 +
 +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
 +static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 +                                   int nsi, const float *bb_i,
 +                                   float *d2)
 +{
 +    __m128 xj_l, yj_l, zj_l;
 +    __m128 xj_h, yj_h, zj_h;
 +    __m128 xi_l, yi_l, zi_l;
 +    __m128 xi_h, yi_h, zi_h;
 +
 +    __m128 zero;
 +
 +    zero = _mm_setzero_ps();
 +
 +    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
 +    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
 +    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
 +    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
 +    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
 +    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
 +
 +    /* Here we "loop" over si from 0 to nsi with step STRIDE_PBB,
 +     * i.e. si takes the values 0 and STRIDE_PBB.
 +     * As we know the number of iterations is 1 or 2, we unroll manually.
 +     */
 +    SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2);
 +    if (STRIDE_PBB < nsi)
 +    {
 +        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2);
 +    }
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +/* Plain C function which determines if any atom pair between two cells
 + * is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_x(int na_c,
 +                                int si, const real *x_i,
 +                                int csj, int stride, const real *x_j,
 +                                real rl2)
 +{
 +    int  i, j, i0, j0;
 +    real d2;
 +
 +    for (i = 0; i < na_c; i++)
 +    {
 +        i0 = (si*na_c + i)*DIM;
 +        for (j = 0; j < na_c; j++)
 +        {
 +            j0 = (csj*na_c + j)*stride;
 +
 +            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
 +                sqr(x_i[i0+1] - x_j[j0+1]) +
 +                sqr(x_i[i0+2] - x_j[j0+2]);
 +
 +            if (d2 < rl2)
 +            {
 +                return TRUE;
 +            }
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +/* SSE function which determines if any atom pair between two cells,
 + * both with 8 atoms, is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_sse8(int na_c,
 +                                   int si, const real *x_i,
 +                                   int csj, int stride, const real *x_j,
 +                                   real rl2)
 +{
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +    __m128 ix_SSE0, iy_SSE0, iz_SSE0;
 +    __m128 ix_SSE1, iy_SSE1, iz_SSE1;
 +
 +    __m128 rc2_SSE;
 +
 +    int    na_c_sse;
 +    int    j0, j1;
 +
 +    rc2_SSE   = _mm_set1_ps(rl2);
 +
 +    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
 +    ix_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
 +    iy_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
 +    iz_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
 +    ix_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
 +    iy_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
 +    iz_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
 +
 +    /* We loop from the outer to the inner particles to maximize
 +     * the chance that we find a pair in range quickly and return.
 +     */
 +    j0 = csj*na_c;
 +    j1 = j0 + na_c - 1;
 +    while (j0 < j1)
 +    {
 +        __m128 jx0_SSE, jy0_SSE, jz0_SSE;
 +        __m128 jx1_SSE, jy1_SSE, jz1_SSE;
 +
 +        __m128 dx_SSE0, dy_SSE0, dz_SSE0;
 +        __m128 dx_SSE1, dy_SSE1, dz_SSE1;
 +        __m128 dx_SSE2, dy_SSE2, dz_SSE2;
 +        __m128 dx_SSE3, dy_SSE3, dz_SSE3;
 +
 +        __m128 rsq_SSE0;
 +        __m128 rsq_SSE1;
 +        __m128 rsq_SSE2;
 +        __m128 rsq_SSE3;
 +
 +        __m128 wco_SSE0;
 +        __m128 wco_SSE1;
 +        __m128 wco_SSE2;
 +        __m128 wco_SSE3;
 +        __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
 +
 +        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
 +        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
 +        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
 +
 +        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
 +        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
 +        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
 +
 +        /* Calculate distance */
 +        dx_SSE0            = _mm_sub_ps(ix_SSE0, jx0_SSE);
 +        dy_SSE0            = _mm_sub_ps(iy_SSE0, jy0_SSE);
 +        dz_SSE0            = _mm_sub_ps(iz_SSE0, jz0_SSE);
 +        dx_SSE1            = _mm_sub_ps(ix_SSE1, jx0_SSE);
 +        dy_SSE1            = _mm_sub_ps(iy_SSE1, jy0_SSE);
 +        dz_SSE1            = _mm_sub_ps(iz_SSE1, jz0_SSE);
 +        dx_SSE2            = _mm_sub_ps(ix_SSE0, jx1_SSE);
 +        dy_SSE2            = _mm_sub_ps(iy_SSE0, jy1_SSE);
 +        dz_SSE2            = _mm_sub_ps(iz_SSE0, jz1_SSE);
 +        dx_SSE3            = _mm_sub_ps(ix_SSE1, jx1_SSE);
 +        dy_SSE3            = _mm_sub_ps(iy_SSE1, jy1_SSE);
 +        dz_SSE3            = _mm_sub_ps(iz_SSE1, jz1_SSE);
 +
 +        /* rsq = dx*dx+dy*dy+dz*dz */
 +        rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
 +        rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
 +        rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
 +        rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
 +
 +        wco_SSE0           = _mm_cmplt_ps(rsq_SSE0, rc2_SSE);
 +        wco_SSE1           = _mm_cmplt_ps(rsq_SSE1, rc2_SSE);
 +        wco_SSE2           = _mm_cmplt_ps(rsq_SSE2, rc2_SSE);
 +        wco_SSE3           = _mm_cmplt_ps(rsq_SSE3, rc2_SSE);
 +
 +        wco_any_SSE01      = _mm_or_ps(wco_SSE0, wco_SSE1);
 +        wco_any_SSE23      = _mm_or_ps(wco_SSE2, wco_SSE3);
 +        wco_any_SSE        = _mm_or_ps(wco_any_SSE01, wco_any_SSE23);
 +
 +        if (_mm_movemask_ps(wco_any_SSE))
 +        {
 +            return TRUE;
 +        }
 +
 +        j0++;
 +        j1--;
 +    }
 +    return FALSE;
 +
 +#else
 +    /* No SSE */
 +    gmx_incons("SSE function called without SSE support");
 +
 +    return TRUE;
 +#endif
 +}
 +
 +/* Returns the j sub-cell for index cj_ind */
 +static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
 +}
 +
 +/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
 +}
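With NBNXN_GPU_JGROUP_SIZE = 4 (so NBNXN_GPU_JGROUP_SIZE_2LOG = 2, as the cj4 name suggests), the shift and mask above are simply a div/mod by the group size; a short sketch of the index split:

    /* Sketch: locate j-cluster index cj_ind = 13, assuming
     * NBNXN_GPU_JGROUP_SIZE = 4:
     *   13 >> 2 = 3  -> fourth nbnxn_cj4_t struct,
     *   13 & 3  = 1  -> second cj slot within it,
     * i.e. nbl->cj4[3].cj[1].
     */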
 +
 +/* Ensures there is enough space for extra exclusion masks */
 +static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
 +{
 +    if (nbl->nexcl+extra > nbl->excl_nalloc)
 +    {
 +        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
 +        nbnxn_realloc_void((void **)&nbl->excl,
 +                           nbl->nexcl*sizeof(*nbl->excl),
 +                           nbl->excl_nalloc*sizeof(*nbl->excl),
 +                           nbl->alloc, nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-cells in the list */
 +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
 +                                            int               ncell)
 +{
 +    int cj_max;
 +
 +    cj_max = nbl->ncj + ncell;
 +
 +    if (cj_max > nbl->cj_nalloc)
 +    {
 +        nbl->cj_nalloc = over_alloc_small(cj_max);
 +        nbnxn_realloc_void((void **)&nbl->cj,
 +                           nbl->ncj*sizeof(*nbl->cj),
 +                           nbl->cj_nalloc*sizeof(*nbl->cj),
 +                           nbl->alloc, nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-subcells in the list */
 +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
 +                                              int               nsupercell)
 +{
 +    int ncj4_max, j4, j, w, t;
 +
 +#define NWARP       2
 +#define WARP_SIZE  32
 +
 +    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
 +    /* We can store 4 j-subcell - i-supercell pairs in one struct.
 +     * Since we round down, we need one extra entry.
 +     */
 +    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +
 +    if (ncj4_max > nbl->cj4_nalloc)
 +    {
 +        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
 +        nbnxn_realloc_void((void **)&nbl->cj4,
 +                           nbl->work->cj4_init*sizeof(*nbl->cj4),
 +                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
 +                           nbl->alloc, nbl->free);
 +    }
 +
 +    if (ncj4_max > nbl->work->cj4_init)
 +    {
 +        for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
 +        {
 +            /* No i-subcells and no excl's in the list initially */
 +            for (w = 0; w < NWARP; w++)
 +            {
 +                nbl->cj4[j4].imei[w].imask    = 0U;
 +                nbl->cj4[j4].imei[w].excl_ind = 0;
 +
 +            }
 +        }
 +        nbl->work->cj4_init = ncj4_max;
 +    }
 +}
 +
 +/* Set all excl masks for one GPU warp to no exclusions */
 +static void set_no_excls(nbnxn_excl_t *excl)
 +{
 +    int t;
 +
 +    for (t = 0; t < WARP_SIZE; t++)
 +    {
 +        /* Turn all interaction bits on */
 +        excl->pair[t] = NBNXN_INT_MASK_ALL;
 +    }
 +}
 +
 +/* Initializes a single nbnxn_pairlist_t data structure */
 +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 +                                gmx_bool          bSimple,
 +                                nbnxn_alloc_t    *alloc,
 +                                nbnxn_free_t     *free)
 +{
 +    if (alloc == NULL)
 +    {
 +        nbl->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbl->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbl->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbl->free = free;
 +    }
 +
 +    nbl->bSimple     = bSimple;
 +    nbl->na_sc       = 0;
 +    nbl->na_ci       = 0;
 +    nbl->na_cj       = 0;
 +    nbl->nci         = 0;
 +    nbl->ci          = NULL;
 +    nbl->ci_nalloc   = 0;
 +    nbl->ncj         = 0;
 +    nbl->cj          = NULL;
 +    nbl->cj_nalloc   = 0;
 +    nbl->ncj4        = 0;
 +    /* We need one element extra in sj, so alloc initially with 1 */
 +    nbl->cj4_nalloc  = 0;
 +    nbl->cj4         = NULL;
 +    nbl->nci_tot     = 0;
 +
 +    if (!nbl->bSimple)
 +    {
 +        nbl->excl        = NULL;
 +        nbl->excl_nalloc = 0;
 +        nbl->nexcl       = 0;
 +        check_excl_space(nbl, 1);
 +        nbl->nexcl       = 1;
 +        set_no_excls(&nbl->excl[0]);
 +    }
 +
 +    snew(nbl->work, 1);
 +#ifdef NBNXN_BBXXXX
 +    snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
 +#else
 +    snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL*NNBSBB_B, NBNXN_MEM_ALIGN);
 +#endif
 +    snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
 +#ifdef GMX_NBNXN_SIMD
 +    snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
 +    snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
 +#endif
 +    snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
++
++    nbl->work->sort            = NULL;
++    nbl->work->sort_nalloc     = 0;
++    nbl->work->sci_sort        = NULL;
++    nbl->work->sci_sort_nalloc = 0;
 +}
 +
 +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 +                             gmx_bool bSimple, gmx_bool bCombined,
 +                             nbnxn_alloc_t *alloc,
 +                             nbnxn_free_t  *free)
 +{
 +    int i;
 +
 +    nbl_list->bSimple   = bSimple;
 +    nbl_list->bCombined = bCombined;
 +
 +    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (!nbl_list->bCombined &&
 +        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 +    {
 +        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
 +                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
 +    }
 +
 +    snew(nbl_list->nbl, nbl_list->nnbl);
 +    /* Execute in order to avoid memory interleaving between threads */
 +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 +    for (i = 0; i < nbl_list->nnbl; i++)
 +    {
 +        /* Allocate the nblist data structure locally on each thread
 +         * to optimize memory access for NUMA architectures.
 +         */
 +        snew(nbl_list->nbl[i], 1);
 +
 +        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
 +        if (i == 0)
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
 +        }
 +        else
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair list, used for debug output */
 +static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
 +                                           const nbnxn_search_t nbs, real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int                 cs[SHIFTS];
 +    int                 s, i, j;
 +    int                 npexcl;
 +
 +    /* This code only produces correct statistics without domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp, "nbl nci %d ncj %d\n",
 +            nbl->nci, nbl->ncj);
 +    fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
 +
 +    fprintf(fp, "nbl average j cell list length %.1f\n",
 +            0.25*nbl->ncj/(double)nbl->nci);
 +
 +    for (s = 0; s < SHIFTS; s++)
 +    {
 +        cs[s] = 0;
 +    }
 +    npexcl = 0;
 +    for (i = 0; i < nbl->nci; i++)
 +    {
 +        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
 +            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
 +
 +        j = nbl->ci[i].cj_ind_start;
 +        while (j < nbl->ci[i].cj_ind_end &&
 +               nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            npexcl++;
 +            j++;
 +        }
 +    }
 +    fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
 +            nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
 +    for (s = 0; s < SHIFTS; s++)
 +    {
 +        if (cs[s] > 0)
 +        {
 +            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair list, used for debug output */
 +static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
 +                                             const nbnxn_search_t nbs, real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int                 i, j4, j, si, b;
 +    int                 c[GPU_NSUBCELL+1];
 +
 +    /* This code only produces correct statistics without domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
 +            nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
 +    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
 +
 +    fprintf(fp, "nbl average j super cell list length %.1f\n",
 +            0.25*nbl->ncj4/(double)nbl->nsci);
 +    fprintf(fp, "nbl average i sub cell list length %.1f\n",
 +            nbl->nci_tot/((double)nbl->ncj4));
 +
 +    for (si = 0; si <= GPU_NSUBCELL; si++)
 +    {
 +        c[si] = 0;
 +    }
 +    for (i = 0; i < nbl->nsci; i++)
 +    {
 +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                b = 0;
 +                for (si = 0; si < GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        b++;
 +                    }
 +                }
 +                c[b]++;
 +            }
 +        }
 +    }
 +    for (b = 0; b <= GPU_NSUBCELL; b++)
 +    {
 +        fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
 +                b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
 +    }
 +}
 +
-                 (d2 < rl2 && subc_in_range_x(na_c, ci, x_ci, cj_gl, stride, x, rl2)))
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
 +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
 +                                   int warp, nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* No exclusions set, make a new list entry */
 +        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
 +        nbl->nexcl++;
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +        set_no_excls(*excl);
 +    }
 +    else
 +    {
 +        /* We already have some exclusions, new ones can be added to the list */
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +    }
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
 +                                 int warp, nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* We need to make a new list entry, check if we have space */
 +        check_excl_space(nbl, 1);
 +    }
 +    low_get_nbl_exclusions(nbl, cj4, warp, excl);
 +}
 +
 +/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
 +                                 nbnxn_excl_t **excl_w0,
 +                                 nbnxn_excl_t **excl_w1)
 +{
 +    /* Check for space we might need */
 +    check_excl_space(nbl, 2);
 +
 +    low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
 +    low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
 +}
 +
 +/* Sets the self exclusions i=j and pair exclusions i>j */
 +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
 +                                               int cj4_ind, int sj_offset,
 +                                               int si)
 +{
 +    nbnxn_excl_t *excl[2];
 +    int           ei, ej, w;
 +
 +    /* Here we only set the self and double pair exclusions */
 +
 +    get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
 +
 +    /* Only minor < major bits set */
 +    for (ej = 0; ej < nbl->na_ci; ej++)
 +    {
 +        w = (ej>>2);
 +        for (ei = ej; ei < nbl->na_ci; ei++)
 +        {
 +            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
 +                ~(1U << (sj_offset*GPU_NSUBCELL + si));
 +        }
 +    }
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
 +static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
 +{
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
 +static unsigned int get_imask_x86_simd128(gmx_bool rdiag, int ci, int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#else              /* cj-size = 2 */
 +    return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
 +            (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
 +             NBNXN_INT_MASK_ALL));
 +#endif
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
 +static unsigned int get_imask_x86_simd256(gmx_bool rdiag, int ci, int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 8 */
 +    return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
 +            (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
 +             NBNXN_INT_MASK_ALL));
 +#else              /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#endif
 +}
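The three variants differ only in how the i- and j-cluster sizes line up: when the j-cluster is twice the i-cluster, an i-cluster overlaps one half of j-cluster ci/2, so two half-diagonal masks are needed. A sketch of the 256-bit single-precision case above:

    /* Sketch: 4-atom i-clusters vs 8-atom j-clusters (256-bit, single
     * precision): i-cluster ci lies in j-cluster ci/2.
     *   ci even (ci == cj*2)   -> lower half, mask NBNXN_INT_MASK_DIAG_J8_0
     *   ci odd  (ci == cj*2+1) -> upper half, mask NBNXN_INT_MASK_DIAG_J8_1
     * Equal cluster sizes need only the single NBNXN_INT_MASK_DIAG.
     */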
 +
 +#ifdef GMX_NBNXN_SIMD
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#define get_imask_x86_simd_4xn  get_imask_x86_simd128
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#define get_imask_x86_simd_4xn  get_imask_x86_simd256
 +#define get_imask_x86_simd_2xnn get_imask_x86_simd128
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 +#endif
 +#endif
 +#endif
 +
 +/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
 +                                     nbnxn_pairlist_t *nbl,
 +                                     int ci, int cjf, int cjl,
 +                                     gmx_bool remove_sub_diag,
 +                                     const real *x_j,
 +                                     real rl2, float rbb2,
 +                                     int *ndistc)
 +{
 +    const nbnxn_list_work_t *work;
 +
 +    const float             *bb_ci;
 +    const real              *x_ci;
 +
 +    gmx_bool                 InRange;
 +    real                     d2;
 +    int                      cjf_gl, cjl_gl, cj;
 +
 +    work = nbl->work;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +        d2       = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Accept if the distance is below rbb, where the bounding-box
 +         * distance alone is conclusive, or if it is below the cut-off
 +         * and at least one atom pair is within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i, j;
 +
 +            cjf_gl = gridj->cell0 + cjf;
 +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return;
 +    }
 +
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +        d2       = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Accept if the distance is below rbb, where the bounding-box
 +         * distance alone is conclusive, or if it is below the cut-off
 +         * and at least one atom pair is within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i, j;
 +
 +            cjl_gl = gridj->cell0 + cjl;
 +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for (cj = cjf; cj <= cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask */
 +            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
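The routine shrinks the candidate interval from both ends until the first and last in-range j-cells are found, then stores the whole surviving interval; interior cells may still be out of range, which the kernels tolerate. A compact sketch of the trimming idea, with a hypothetical in_range() predicate standing in for the bounding-box and atom-pair tests:

    /* Sketch: two-sided trimming of a j-cell interval. in_range() is a
     * hypothetical stand-in for the d2 < rbb2 / atom-pair checks above.
     */
    static void trim_range(int *cjf, int *cjl, gmx_bool (*in_range)(int))
    {
        while (*cjf <= *cjl && !in_range(*cjf))
        {
            (*cjf)++; /* drop leading out-of-range j-cells */
        }
        while (*cjl > *cjf && !in_range(*cjl))
        {
            (*cjl)--; /* drop trailing out-of-range j-cells */
        }
        /* all j-cells in [*cjf, *cjl] are now stored in one go */
    }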
 +
 +#ifdef GMX_NBNXN_SIMD_4XN
 +#include "nbnxn_search_simd_4xn.h"
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +#include "nbnxn_search_simd_2xnn.h"
 +#endif
 +
 +/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 +                                       const nbnxn_grid_t *gridi,
 +                                       const nbnxn_grid_t *gridj,
 +                                       nbnxn_pairlist_t *nbl,
 +                                       int sci, int scj,
 +                                       gmx_bool sci_equals_scj,
 +                                       int stride, const real *x,
 +                                       real rl2, float rbb2,
 +                                       int *ndistc)
 +{
 +    int          na_c;
 +    int          npair;
 +    int          cjo, ci1, ci, cj, cj_gl;
 +    int          cj4_ind, cj_offset;
 +    unsigned     imask;
 +    nbnxn_cj4_t *cj4;
 +    const float *bb_ci;
 +    const real  *x_ci;
 +    float       *d2l, d2;
 +    int          w;
 +#define PRUNE_LIST_CPU_ONE
 +#ifdef PRUNE_LIST_CPU_ONE
 +    int  ci_last = -1;
 +#endif
 +
 +    d2l = nbl->work->d2;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    na_c = gridj->na_c;
 +
 +    for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
 +    {
 +        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
 +        cj4       = &nbl->cj4[cj4_ind];
 +
 +        cj = scj*GPU_NSUBCELL + cjo;
 +
 +        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
 +
 +        /* Initialize this j-subcell i-subcell list */
 +        cj4->cj[cj_offset] = cj_gl;
 +        imask              = 0;
 +
 +        if (sci_equals_scj)
 +        {
 +            ci1 = cjo + 1;
 +        }
 +        else
 +        {
 +            ci1 = gridi->nsubc[sci];
 +        }
 +
 +#ifdef NBNXN_BBXXXX
 +        /* Determine all ci1 bb distances in one call with SSE */
 +        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
 +                               ci1, bb_ci, d2l);
 +        *ndistc += na_c*2;
 +#endif
 +
 +        npair = 0;
 +        /* We use a fixed upper bound instead of ci1 to help optimization */
 +        for (ci = 0; ci < GPU_NSUBCELL; ci++)
 +        {
 +            if (ci == ci1)
 +            {
 +                break;
 +            }
 +
 +#ifndef NBNXN_BBXXXX
 +            /* Determine the bb distance between ci and cj */
 +            d2l[ci]  = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
 +            *ndistc += 2;
 +#endif
 +            d2 = d2l[ci];
 +
 +#ifdef PRUNE_LIST_CPU_ALL
 +            /* Accept if the distance is below rbb, where the bounding-box
 +             * distance alone is conclusive, or if it is below the cut-off
 +             * and at least one atom pair is within the cut-off.
 +             * This check is very costly.
 +             */
 +            *ndistc += na_c*na_c;
 +            if (d2 < rbb2 ||
-  * As we only now the current count on our own thread,
++                (d2 < rl2 &&
++#ifdef NBNXN_PBB_SSE
++                subc_in_range_sse8
++#else
++                subc_in_range_x
++#endif
++                    (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
 +#else
 +            /* Check if the distance between the two bounding boxes
 +             * is within the pair-list cut-off.
 +             */
 +            if (d2 < rl2)
 +#endif
 +            {
 +                /* Flag this i-subcell to be taken into account */
 +                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +                ci_last = ci;
 +#endif
 +
 +                npair++;
 +            }
 +        }
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +        /* If we only found 1 pair, check if any atoms are actually
 +         * within the cut-off, so we could get rid of it.
 +         */
 +        if (npair == 1 && d2l[ci_last] >= rbb2)
 +        {
 +            /* Avoid using function pointers here, as it's slower */
 +            if (
 +#ifdef NBNXN_PBB_SSE
 +                !subc_in_range_sse8
 +#else
 +                !subc_in_range_x
 +#endif
 +                    (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
 +            {
 +                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
 +                npair--;
 +            }
 +        }
 +#endif
 +
 +        if (npair > 0)
 +        {
 +            /* We have a useful sj entry, close it now */
 +
 +            /* Set the exclusions for the ci == sj entry.
 +             * Here we don't bother to check if this entry is actually flagged,
 +             * as it will nearly always be in the list.
 +             */
 +            if (sci_equals_scj)
 +            {
 +                set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
 +            }
 +
 +            /* Copy the cluster interaction mask to the list */
 +            for (w = 0; w < NWARP; w++)
 +            {
 +                cj4->imei[w].imask |= imask;
 +            }
 +
 +            nbl->work->cj_ind++;
 +
 +            /* Keep the count */
 +            nbl->nci_tot += npair;
 +
 +            /* Increase the closing index in i super-cell list */
 +            nbl->sci[nbl->nsci].cj4_ind_end =
 +                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        }
 +    }
 +}
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for simple list i-entry nbl_ci
 + */
 +static void set_ci_top_excls(const nbnxn_search_t nbs,
 +                             nbnxn_pairlist_t    *nbl,
 +                             gmx_bool             diagRemoved,
 +                             int                  na_ci_2log,
 +                             int                  na_cj_2log,
 +                             const nbnxn_ci_t    *nbl_ci,
 +                             const t_blocka      *excl)
 +{
 +    const int    *cell;
 +    int           ci;
 +    int           cj_ind_first, cj_ind_last;
 +    int           cj_first, cj_last;
 +    int           ndirect;
 +    int           i, ai, aj, si, eind, ge, se;
 +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
 +    int           cj_m;
 +    gmx_bool      Found_si;
 +    int           si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int           inner_i, inner_e;
 +
 +    cell = nbs->cell;
 +
 +    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    ci = nbl_ci->ci;
 +
 +    cj_ind_first = nbl_ci->cj_ind_start;
 +    cj_ind_last  = nbl->ncj - 1;
 +
 +    cj_first = nbl->cj[cj_ind_first].cj;
 +    cj_last  = nbl->cj[cj_ind_last].cj;
 +
 +    /* Determine how many contiguous j-cells we have starting
 +     * from the first i-cell. This number can be used to directly
 +     * calculate j-cell indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    if (na_ci_2log == na_cj_2log)
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    else
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#endif
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for (i = 0; i < nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[ci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_ci_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set, so save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= ci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = (ge >> na_cj_2log);
 +
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found    = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl->cj[cj_ind_m].cj;
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - (si << na_ci_2log);
 +                        inner_e = ge - (se << na_cj_2log);
 +
 +                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
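Excluded j-clusters are located either directly, while the list is still contiguous with the i-cell, or by bisection over the sorted cj entries; the inner-index arithmetic then clears a single bit of the interaction mask. A worked example of the bit position, assuming 4-atom clusters (both 2logs equal to 2):

    /* Worked example (assumed na_ci_2log = na_cj_2log = 2):
     * i-atom i = 6 in the i-cell  -> si = 6>>2 = 1, inner_i = 6 - (1<<2) = 2;
     * excluded atom at grid index ge = 25 -> se = 25>>2 = 6,
     * inner_e = 25 - (6<<2) = 1;
     * cleared bit: (inner_i<<2) + inner_e = 9 of nbl->cj[found].excl.
     */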
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for i-super-cell entry nbl_sci
 + */
 +static void set_sci_top_excls(const nbnxn_search_t nbs,
 +                              nbnxn_pairlist_t    *nbl,
 +                              gmx_bool             diagRemoved,
 +                              int                  na_c_2log,
 +                              const nbnxn_sci_t   *nbl_sci,
 +                              const t_blocka      *excl)
 +{
 +    const int    *cell;
 +    int           na_c;
 +    int           sci;
 +    int           cj_ind_first, cj_ind_last;
 +    int           cj_first, cj_last;
 +    int           ndirect;
 +    int           i, ai, aj, si, eind, ge, se;
 +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
 +    int           cj_m;
 +    gmx_bool      Found_si;
 +    int           si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int           inner_i, inner_e, w;
 +
 +    cell = nbs->cell;
 +
 +    na_c = nbl->na_ci;
 +
 +    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    sci = nbl_sci->sci;
 +
 +    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
 +    cj_ind_last  = nbl->work->cj_ind - 1;
 +
 +    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
 +    cj_last  = nbl_cj(nbl, cj_ind_last);
 +
 +    /* Determine how many contiguous j-clusters we have starting
 +     * from the first i-cluster. This number can be used to directly
 +     * calculate j-cluster indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    while (cj_ind_first + ndirect <= cj_ind_last &&
 +           nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
 +    {
 +        ndirect++;
 +    }
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for (i = 0; i < nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[sci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_c_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set, saving some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= sci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = ge>>na_c_2log;
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found    = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl_cj(nbl, cj_ind_m);
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - si*na_c;
 +                        inner_e = ge - se*na_c;
 +
 +/* Macro for getting the index of a cj entry within a cj4 group */
 +#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
 +/* Macro for converting a cj index to a cj4 group index */
 +#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
 +/* Macro for getting the atom index within a warp half of a cluster */
 +#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
 +
 +                        if (nbl_imask0(nbl, found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
 +                        {
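 +                            /* The exclusion masks are stored per warp, each covering half of the j-cluster atoms */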
 +                            w       = (inner_e >> 2);
 +
 +                            get_nbl_exclusions_1(nbl, A2CJ4(found), w, &nbl_excl);
 +
 +                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
 +                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
 +                        }
 +
 +#undef AMODCJ4
 +#undef A2CJ4
 +#undef AMODWI
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Reallocate the simple ci list for at least n entries */
 +static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
 +{
 +    nbl->ci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->ci,
 +                       nbl->nci*sizeof(*nbl->ci),
 +                       nbl->ci_nalloc*sizeof(*nbl->ci),
 +                       nbl->alloc, nbl->free);
 +}
 +
 +/* Reallocate the super-cell sci list for at least n entries */
 +static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
 +{
 +    nbl->sci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->sci,
 +                       nbl->nsci*sizeof(*nbl->sci),
 +                       nbl->sci_nalloc*sizeof(*nbl->sci),
 +                       nbl->alloc, nbl->free);
 +}
 +
 +/* Make a new ci entry at index nbl->nci */
 +static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags,
 +                         nbnxn_list_work_t *work)
 +{
 +    if (nbl->nci + 1 > nbl->ci_nalloc)
 +    {
 +        nb_realloc_ci(nbl, nbl->nci+1);
 +    }
 +    nbl->ci[nbl->nci].ci            = ci;
 +    nbl->ci[nbl->nci].shift         = shift;
 +    /* Store the interaction flags along with the shift */
 +    nbl->ci[nbl->nci].shift        |= flags;
 +    nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
 +    nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
 +}
 +
 +/* Make a new sci entry at index nbl->nsci */
 +static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift, int flags,
 +                          nbnxn_list_work_t *work)
 +{
 +    if (nbl->nsci + 1 > nbl->sci_nalloc)
 +    {
 +        nb_realloc_sci(nbl, nbl->nsci+1);
 +    }
 +    nbl->sci[nbl->nsci].sci           = sci;
 +    nbl->sci[nbl->nsci].shift         = shift;
 +    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
 +    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
 +}
 +
 +/* Sort the simple j-list cj on exclusions.
 + * Entries with exclusions will all be sorted to the beginning of the list.
 + */
 +static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
 +                         nbnxn_list_work_t *work)
 +{
 +    int jnew, j;
 +
 +    if (ncj > work->cj_nalloc)
 +    {
 +        work->cj_nalloc = over_alloc_large(ncj);
 +        srenew(work->cj, work->cj_nalloc);
 +    }
 +
 +    /* Make a list of the j-cells involving exclusions */
 +    jnew = 0;
 +    for (j = 0; j < ncj; j++)
 +    {
 +        if (cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            work->cj[jnew++] = cj[j];
 +        }
 +    }
 +    /* Reorder only when there are exclusions and they are not already confined to the first entry */
 +    if (!((jnew == 0) ||
 +          (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
 +    {
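 +        /* Append the entries without exclusions after those with exclusions */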
 +        for (j = 0; j < ncj; j++)
 +        {
 +            if (cj[j].excl == NBNXN_INT_MASK_ALL)
 +            {
 +                work->cj[jnew++] = cj[j];
 +            }
 +        }
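 +        /* Copy the reordered list back over cj */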
 +        for (j = 0; j < ncj; j++)
 +        {
 +            cj[j] = work->cj[j];
 +        }
 +    }
 +}
 +
 +/* Close this simple list i entry */
 +static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
 +{
 +    int jlen;
 +
 +    /* All content of the new ci entry has already been filled correctly,
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
 +    if (jlen > 0)
 +    {
 +        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);
 +
 +        /* The counts below are used for non-bonded pair/flop counts
 +         * and should therefore match the available kernel setups.
 +         */
 +        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
 +        {
 +            nbl->work->ncj_noq += jlen;
 +        }
 +        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
 +                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
 +        {
 +            nbl->work->ncj_hlj += jlen;
 +        }
 +
 +        nbl->nci++;
 +    }
 +}
 +
 +/* Split sci entry for load balancing on the GPU.
-  * both on nthread and our own thread index thread.
++ * Splitting ensures we have enough lists to fully utilize the whole GPU.
++ * With progBal we generate progressively smaller lists, which improves
++ * load balancing. As we only know the current count on our own thread,
 + * we will need to estimate the current total amount of i-entries.
 + * As the lists get concatenated later, this estimate depends
-     /* Estimate the total numbers of ci's of the nblist combined
-      * over all threads using the target number of ci's.
-      */
-     nsci_est = nc_bal*thread/nthread + nbl->nsci;
++ * both on nthread and our own thread index.
 + */
 +static void split_sci_entry(nbnxn_pairlist_t *nbl,
 +                            int nsp_max_av, gmx_bool progBal, int nc_bal,
 +                            int thread, int nthread)
 +{
 +    int nsci_est;
 +    int nsp_max;
 +    int cj4_start, cj4_end, j4len, cj4;
 +    int sci;
 +    int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
 +    int p;
 +
-         cj4        = cj4_start;
 +    if (progBal)
 +    {
++        /* Estimate the total number of ci's of the nblist combined
++         * over all threads, using the target number of ci's.
++         */
++        nsci_est = nc_bal*thread/nthread + nbl->nsci;
++
 +        /* The first ci blocks should be larger, to avoid overhead.
 +         * The last ci blocks should be smaller, to improve load balancing.
 +         */
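 +        /* With this formula nsp_max starts at about 1.5*nsp_max_av
 +         * for the first entries and decreases to about 0.75*nsp_max_av
 +         * for the last ones.
 +         */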
 +        nsp_max = max(1,
 +                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
 +    }
 +    else
 +    {
 +        nsp_max = nsp_max_av;
 +    }
 +
 +    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
 +    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
 +    j4len     = cj4_end - cj4_start;
 +
 +    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
 +    {
 +        /* Remove the last sci entry and process its cj4's again */
 +        nbl->nsci -= 1;
 +
 +        sci        = nbl->nsci;
-         while (cj4 < cj4_end)
 +        nsp        = 0;
 +        nsp_sci    = 0;
 +        nsp_cj4_e  = 0;
 +        nsp_cj4    = 0;
-             nsp += nsp_cj4;
++        for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
 +        {
 +            nsp_cj4_p = nsp_cj4;
++            /* Count the number of cluster pairs in this cj4 group */
 +            nsp_cj4   = 0;
 +            for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
 +            {
 +                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
 +            }
-             if (nsp > nsp_max && nsp > nsp_cj4)
 +
-                 nsp_sci                     = nsp - nsp_cj4;
++            if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
 +            {
++                /* Split the list at cj4 */
 +                nbl->sci[sci].cj4_ind_end = cj4;
++                /* Create a new sci entry */
 +                sci++;
 +                nbl->nsci++;
 +                if (nbl->nsci+1 > nbl->sci_nalloc)
 +                {
 +                    nb_realloc_sci(nbl, nbl->nsci+1);
 +                }
 +                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
 +                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
 +                nbl->sci[sci].cj4_ind_start = cj4;
-                 nsp                         = nsp_cj4;
++                nsp_sci                     = nsp;
 +                nsp_cj4_e                   = nsp_cj4_p;
-             cj4++;
++                nsp                         = 0;
 +            }
-         /* Put the remaining cj4's in a new ci entry */
++            nsp += nsp_cj4;
 +        }
 +
-         /* Possibly balance out the last two ci's
-          * by moving the last cj4 of the second last ci.
++        /* Put the remaining cj4's in the last sci entry */
 +        nbl->sci[sci].cj4_ind_end = cj4_end;
 +
-         sci++;
++        /* Possibly balance out the last two sci's
++         * by moving the last cj4 of the second last sci.
 +         */
 +        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
 +        {
 +            nbl->sci[sci-1].cj4_ind_end--;
 +            nbl->sci[sci].cj4_ind_start--;
 +        }
 +
-         /* Since the target value is a maximum (this avoid high outliers,
-          * which lead to load imbalance), not average, we get more lists
-          * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
-          * But more importantly, the optimal GPU performance moves
-          * to lower number of block for very small blocks.
-          * To compensate we add the maximum pair count per cj4.
 +        nbl->nsci++;
 +    }
 +}
 +
 +/* Close this super/sub list i entry */
 +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
 +                                    int nsp_max_av,
 +                                    gmx_bool progBal, int nc_bal,
 +                                    int thread, int nthread)
 +{
 +    int j4len, tlen;
 +    int nb, b;
 +
 +    /* All content of the new sci entry has already been filled correctly,
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
 +    if (j4len > 0)
 +    {
 +        /* We can only have complete blocks of 4 j-entries in a list,
 +         * so round the count up before closing.
 +         */
 +        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +
 +        nbl->nsci++;
 +
 +        if (nsp_max_av > 0)
 +        {
++            /* Measure the size of the new entry and potentially split it */
 +            split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
 +        }
 +    }
 +}
 +
 +/* Syncs the working array before adding another grid pair to the list */
 +static void sync_work(nbnxn_pairlist_t *nbl)
 +{
 +    if (!nbl->bSimple)
 +    {
 +        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +        nbl->work->cj4_init = nbl->ncj4;
 +    }
 +}
 +
 +/* Clears an nbnxn_pairlist_t data structure */
 +static void clear_pairlist(nbnxn_pairlist_t *nbl)
 +{
 +    nbl->nci           = 0;
 +    nbl->nsci          = 0;
 +    nbl->ncj           = 0;
 +    nbl->ncj4          = 0;
 +    nbl->nci_tot       = 0;
 +    nbl->nexcl         = 1;
 +
 +    nbl->work->ncj_noq = 0;
 +    nbl->work->ncj_hlj = 0;
 +}
 +
 +/* Sets a simple list i-cell bounding box, including PBC shift */
 +static void set_icell_bb_simple(const float *bb, int ci,
 +                                real shx, real shy, real shz,
 +                                float *bb_ci)
 +{
 +    int ia;
 +
 +    ia           = ci*NNBSBB_B;
 +    bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
 +    bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
 +    bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
 +    bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
 +    bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
 +    bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
 +}
 +
 +/* Sets a super-cell and its sub-cell bounding boxes, including PBC shift */
 +static void set_icell_bb_supersub(const float *bb, int ci,
 +                                  real shx, real shy, real shz,
 +                                  float *bb_ci)
 +{
 +    int ia, m, i;
 +
 +#ifdef NBNXN_BBXXXX
 +    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
 +    for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
 +    {
 +        for (i = 0; i < STRIDE_PBB; i++)
 +        {
 +            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
 +            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
 +            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
 +            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
 +            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
 +            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
 +        }
 +    }
 +#else
 +    ia = ci*GPU_NSUBCELL*NNBSBB_B;
 +    for (i = 0; i < GPU_NSUBCELL*NNBSBB_B; i += NNBSBB_B)
 +    {
 +        bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
 +        bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
 +        bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
 +        bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
 +        bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
 +        bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
 +    }
 +#endif
 +}
 +
 +/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_simple(int ci,
 +                               real shx, real shy, real shz,
 +                               int na_c,
 +                               int stride, const real *x,
 +                               nbnxn_list_work_t *work)
 +{
 +    int  ia, i;
 +
 +    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
 +    {
 +        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
 +        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
 +        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
 +    }
 +}
 +
 +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_supersub(int ci,
 +                                 real shx, real shy, real shz,
 +                                 int na_c,
 +                                 int stride, const real *x,
 +                                 nbnxn_list_work_t *work)
 +{
 +    int  ia, i;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    ia = ci*GPU_NSUBCELL*na_c;
 +    for (i = 0; i < GPU_NSUBCELL*na_c; i++)
 +    {
 +        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
 +        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
 +        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
 +    }
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Copies PBC shifted super-cell packed atom coordinates to working array */
 +static void icell_set_x_supersub_sse8(int ci,
 +                                      real shx, real shy, real shz,
 +                                      int na_c,
 +                                      int stride, const real *x,
 +                                      nbnxn_list_work_t *work)
 +{
 +    int  si, io, ia, i, j;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
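 +    /* Coordinates are stored packed: per group of STRIDE_PBB atoms,
 +     * STRIDE_PBB x-values, then y-values, then z-values.
 +     */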
 +    for (si = 0; si < GPU_NSUBCELL; si++)
 +    {
 +        for (i = 0; i < na_c; i += STRIDE_PBB)
 +        {
 +            io = si*na_c + i;
 +            ia = ci*GPU_NSUBCELL*na_c + io;
 +            for (j = 0; j < STRIDE_PBB; j++)
 +            {
 +                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
 +                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
 +                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +static real nbnxn_rlist_inc_nonloc_fac = 0.6;
 +
 +/* Due to the cluster size, the effective pair-list cut-off is longer than
 + * that of a simple atom pair-list. This function returns the extra distance.
 + */
 +real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density)
 +{
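 +    /* (cluster_size/atom_density)^(1/3) is the linear size of the
 +     * volume occupied by one cluster; the squared factor
 +     * ((cluster_size-1)/cluster_size)^2 makes the increment vanish
 +     * for single-atom clusters, which need no extra distance.
 +     */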
 +    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density), 1.0/3.0));
 +}
 +
 +/* Estimates the interaction volume^2 for non-local interactions */
 +static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
 +{
 +    int  z, d;
 +    real cl, ca, za;
 +    real vold_est;
 +    real vol2_est_tot;
 +
 +    vol2_est_tot = 0;
 +
 +    /* Here we simply add up the interaction volume^2 of the non-home
 +     * zones shifted along exactly one dimension (1, 2 or 3 such zones
 +     * with 1D, 2D or 3D domain decomposition). As these volumes are not additive,
 +     * this is an overestimate, but it would only be significant in the limit
 +     * of small cells, where we anyhow need to split the lists into
 +     * as small parts as possible.
 +     */
 +
 +    for (z = 0; z < zones->n; z++)
 +    {
 +        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
 +        {
 +            cl = 0;
 +            ca = 1;
 +            za = 1;
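 +    /* nelem counts all used outputs, nkeep the blocks used only by
 +     * output 0, ncopy the blocks used by a single other output
 +     * (a plain copy) and nred the blocks needing an actual reduction.
 +     */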
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if (zones->shift[z][d] == 0)
 +                {
 +                    cl += 0.5*ls[d];
 +                    ca *= ls[d];
 +                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
 +                }
 +            }
 +
 +            /* 4 octants of a sphere */
 +            vold_est  = 0.25*M_PI*r*r*r*r;
 +            /* 4 quarter pie slices on the edges */
 +            vold_est += 4*cl*M_PI/6.0*r*r*r;
 +            /* One rectangular volume on a face */
 +            vold_est += ca*0.5*r*r;
 +
 +            vol2_est_tot += vold_est*za;
 +        }
 +    }
 +
 +    return vol2_est_tot;
 +}
 +
 +/* Estimates the average size of a full j-list for super/sub setup */
 +static int get_nsubpair_max(const nbnxn_search_t nbs,
 +                            int                  iloc,
 +                            real                 rlist,
 +                            int                  min_ci_balanced)
 +{
 +    const nbnxn_grid_t *grid;
 +    rvec ls;
 +    real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
 +    int  nsubpair_max;
 +
 +    grid = &nbs->grid[0];
 +
 +    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
 +    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
 +    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
 +
 +    /* The average squared length of the diagonal of a sub cell */
 +    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
 +
 +    /* The formulas below give a heuristic estimate of the average nsj per si */
 +    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
 +
 +    if (!nbs->DomDec || nbs->zones->n == 1)
 +    {
 +        nsp_est_nl = 0;
 +    }
 +    else
 +    {
 +        nsp_est_nl =
 +            sqr(grid->atom_density/grid->na_c)*
 +            nonlocal_vol2(nbs->zones, ls, r_eff_sup);
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Sub-cell interacts with itself */
 +        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
 +        /* 6/2 rectangular volume on the faces */
 +        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
 +        /* 12/2 quarter pie slices on the edges */
 +        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
 +        /* 4 octants of a sphere */
 +        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
 +
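 +        /* vol_est*atom_density/na_c is the average number of j-sub-cells
 +         * within range of one i-sub-cell; multiply by all i-sub-cells.
 +         */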
 +        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 +
 +        /* Subtract the non-local pair count */
 +        nsp_est -= nsp_est_nl;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
 +                    nsp_est, nsp_est_nl);
 +        }
 +    }
 +    else
 +    {
 +        nsp_est = nsp_est_nl;
 +    }
 +
 +    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
 +    {
 +        /* We don't need to worry */
 +        nsubpair_max = -1;
 +    }
 +    else
 +    {
 +        /* Thus the (average) maximum j-list size should be as follows */
 +        nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));
 +
-         nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
++        /* Since the target value is a maximum (this avoids high outliers,
++         * which lead to load imbalance), not average, we add half the
++         * number of pairs in a cj4 block to get the average about right.
 +         */
-     int i, j4, j;
++        nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
 +                nsp_est, nsubpair_max);
 +    }
 +
 +    return nsubpair_max;
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
 +{
 +    int i, j;
 +
 +    for (i = 0; i < nbl->nci; i++)
 +    {
 +        fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
 +                nbl->ci[i].ci, nbl->ci[i].shift,
 +                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
 +
 +        for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
 +        {
 +            fprintf(fp, "  cj %5d  imask %x\n",
 +                    nbl->cj[j].cj,
 +                    nbl->cj[j].excl);
 +        }
 +    }
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
 +{
-                                          (LOCAL_I(iloc) || nbs->zones->n <= 2),
-                                          min_ci_balanced,
++    int i, j4, j, ncp, si;
 +
 +    for (i = 0; i < nbl->nsci; i++)
 +    {
 +        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
 +                nbl->sci[i].sci, nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
 +
++        ncp = 0;
 +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                fprintf(fp, "  sj %5d  imask %x\n",
 +                        nbl->cj4[j4].cj[j],
 +                        nbl->cj4[j4].imei[0].imask);
++                for (si = 0; si < GPU_NSUBCELL; si++)
++                {
++                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
++                    {
++                        ncp++;
++                    }
++                }
 +            }
 +        }
++        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
++                nbl->sci[i].sci, nbl->sci[i].shift,
++                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
++                ncp);
 +    }
 +}
 +
 +/* Combine the pair lists *nbl generated on multiple threads into nblc */
 +static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
 +                            nbnxn_pairlist_t *nblc)
 +{
 +    int nsci, ncj4, nexcl;
 +    int n, i;
 +
 +    if (nblc->bSimple)
 +    {
 +        gmx_incons("combine_nblists does not support simple lists");
 +    }
 +
 +    nsci  = nblc->nsci;
 +    ncj4  = nblc->ncj4;
 +    nexcl = nblc->nexcl;
 +    for (i = 0; i < nnbl; i++)
 +    {
 +        nsci  += nbl[i]->nsci;
 +        ncj4  += nbl[i]->ncj4;
 +        nexcl += nbl[i]->nexcl;
 +    }
 +
 +    if (nsci > nblc->sci_nalloc)
 +    {
 +        nb_realloc_sci(nblc, nsci);
 +    }
 +    if (ncj4 > nblc->cj4_nalloc)
 +    {
 +        nblc->cj4_nalloc = over_alloc_small(ncj4);
 +        nbnxn_realloc_void((void **)&nblc->cj4,
 +                           nblc->ncj4*sizeof(*nblc->cj4),
 +                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
 +                           nblc->alloc, nblc->free);
 +    }
 +    if (nexcl > nblc->excl_nalloc)
 +    {
 +        nblc->excl_nalloc = over_alloc_small(nexcl);
 +        nbnxn_realloc_void((void **)&nblc->excl,
 +                           nblc->nexcl*sizeof(*nblc->excl),
 +                           nblc->excl_nalloc*sizeof(*nblc->excl),
 +                           nblc->alloc, nblc->free);
 +    }
 +
 +    /* Each thread should copy its own data to the combined arrays,
 +     * as otherwise data will go back and forth between different caches.
 +     */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for (n = 0; n < nnbl; n++)
 +    {
 +        int sci_offset;
 +        int cj4_offset;
 +        int ci_offset;
 +        int excl_offset;
 +        int i, j4;
 +        const nbnxn_pairlist_t *nbli;
 +
 +        /* Determine the offset in the combined data for our thread */
 +        sci_offset  = nblc->nsci;
 +        cj4_offset  = nblc->ncj4;
 +        ci_offset   = nblc->nci_tot;
 +        excl_offset = nblc->nexcl;
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            sci_offset  += nbl[i]->nsci;
 +            cj4_offset  += nbl[i]->ncj4;
 +            ci_offset   += nbl[i]->nci_tot;
 +            excl_offset += nbl[i]->nexcl;
 +        }
 +
 +        nbli = nbl[n];
 +
 +        for (i = 0; i < nbli->nsci; i++)
 +        {
 +            nblc->sci[sci_offset+i]                = nbli->sci[i];
 +            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
 +            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
 +        }
 +
 +        for (j4 = 0; j4 < nbli->ncj4; j4++)
 +        {
 +            nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
 +            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
 +            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
 +        }
 +
 +        for (j4 = 0; j4 < nbli->nexcl; j4++)
 +        {
 +            nblc->excl[excl_offset+j4] = nbli->excl[j4];
 +        }
 +    }
 +
 +    for (n = 0; n < nnbl; n++)
 +    {
 +        nblc->nsci    += nbl[n]->nsci;
 +        nblc->ncj4    += nbl[n]->ncj4;
 +        nblc->nci_tot += nbl[n]->nci_tot;
 +        nblc->nexcl   += nbl[n]->nexcl;
 +    }
 +}
 +
 +/* Returns the next ci to be processed by our thread */
 +static gmx_bool next_ci(const nbnxn_grid_t *grid,
 +                        int conv,
 +                        int nth, int ci_block,
 +                        int *ci_x, int *ci_y,
 +                        int *ci_b, int *ci)
 +{
 +    (*ci_b)++;
 +    (*ci)++;
 +
 +    if (*ci_b == ci_block)
 +    {
 +        /* Jump to the next block assigned to this task */
 +        *ci   += (nth - 1)*ci_block;
 +        *ci_b  = 0;
 +    }
 +
 +    if (*ci >= grid->nc*conv)
 +    {
 +        return FALSE;
 +    }
 +
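 +    /* Advance (ci_x,ci_y) to the grid column that contains cell *ci */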
 +    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
 +    {
 +        *ci_y += 1;
 +        if (*ci_y == grid->ncy)
 +        {
 +            *ci_x += 1;
 +            *ci_y  = 0;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +/* Returns the distance^2 for which we put cell pairs in the list
 + * without checking atom pair distances. This is usually < rlist^2.
 + */
 +static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
 +                                        const nbnxn_grid_t *gridj,
 +                                        real                rlist,
 +                                        gmx_bool            simple)
 +{
 +    /* If the distance between two sub-cell bounding boxes is less
 +     * than this distance, do not check the distance between
 +     * all particle pairs in the sub-cell, since then it is likely
 +     * that the box pair has atom pairs within the cut-off.
 +     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
 +     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
 +     * Using more than 0.5 gains at most 0.5%.
 +     * If forces are calculated more than twice, the performance gain
 +     * in the force calculation outweighs the cost of checking.
 +     * Note that with sub-cell lists, the atom-pair distance check
 +     * is only performed when only 1 out of 8 sub-cells is within range;
 +     * this is because the GPU is much faster than the CPU.
 +     */
 +    real bbx, bby;
 +    real rbb2;
 +
 +    bbx = 0.5*(gridi->sx + gridj->sx);
 +    bby = 0.5*(gridi->sy + gridj->sy);
 +    if (!simple)
 +    {
 +        bbx /= GPU_NSUBCELL_X;
 +        bby /= GPU_NSUBCELL_Y;
 +    }
 +
 +    rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
 +
 +#ifndef GMX_DOUBLE
 +    return rbb2;
 +#else
 +    return (float)((1+GMX_FLOAT_EPS)*rbb2);
 +#endif
 +}
 +
 +static int get_ci_block_size(const nbnxn_grid_t *gridi,
 +                             gmx_bool bDomDec, int nth)
 +{
 +    const int ci_block_enum      = 5;
 +    const int ci_block_denom     = 11;
 +    const int ci_block_min_atoms = 16;
 +    int ci_block;
 +
 +    /* Here we decide how to distribute the blocks over the threads.
 +     * We use prime numbers to try to avoid that the grid size becomes
 +     * a multiple of the number of threads, which would lead to some
 +     * threads getting "inner" pairs and others getting boundary pairs,
 +     * which in turn will lead to load imbalance between threads.
 +     * Set the block size as 5/11/ntask times the average number of cells
 +     * in a y,z slab. This should ensure a quite uniform distribution
 +     * of the grid parts over the different threads along all three grid
 +     * zone boundaries with 3D domain decomposition. At the same time
 +     * the blocks will not become too small.
 +     */
 +    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
 +
 +    /* Ensure the blocks are not too small: avoids cache invalidation */
 +    if (ci_block*gridi->na_sc < ci_block_min_atoms)
 +    {
 +        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
 +    }
 +
 +    /* Without domain decomposition
 +     * or with fewer than 3 blocks per task, divide into nth blocks.
 +     */
 +    if (!bDomDec || ci_block*3*nth > gridi->nc)
 +    {
 +        ci_block = (gridi->nc + nth - 1)/nth;
 +    }
 +
 +    return ci_block;
 +}
 +
 +/* Generates the part of pair-list nbl assigned to our thread */
 +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 +                                     const nbnxn_grid_t *gridi,
 +                                     const nbnxn_grid_t *gridj,
 +                                     nbnxn_search_work_t *work,
 +                                     const nbnxn_atomdata_t *nbat,
 +                                     const t_blocka *excl,
 +                                     real rlist,
 +                                     int nb_kernel_type,
 +                                     int ci_block,
 +                                     gmx_bool bFBufferFlag,
 +                                     int nsubpair_max,
 +                                     gmx_bool progBal,
 +                                     int min_ci_balanced,
 +                                     int th, int nth,
 +                                     nbnxn_pairlist_t *nbl)
 +{
 +    int  na_cj_2log;
 +    matrix box;
 +    real rl2;
 +    float rbb2;
 +    int  d;
 +    int  ci_b, ci, ci_x, ci_y, ci_xy, cj;
 +    ivec shp;
 +    int  tx, ty, tz;
 +    int  shift;
 +    gmx_bool bMakeList;
 +    real shx, shy, shz;
 +    int  conv_i, cell0_i;
 +    const float *bb_i, *bbcz_i, *bbcz_j;
 +    const int *flags_i;
 +    real bx0, bx1, by0, by1, bz0, bz1;
 +    real bz1_frac;
 +    real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
 +    int  cxf, cxl, cyf, cyf_x, cyl;
 +    int  cx, cy;
 +    int  c0, c1, cs, cf, cl;
 +    int  ndistc;
 +    int  ncpcheck;
 +    int  gridi_flag_shift = 0, gridj_flag_shift = 0;
 +    unsigned *gridj_flag  = NULL;
 +    int  ncj_old_i, ncj_old_j;
 +
 +    nbs_cycle_start(&work->cc[enbsCCsearch]);
 +
 +    if (gridj->bSimple != nbl->bSimple)
 +    {
 +        gmx_incons("Grid incompatible with pair-list");
 +    }
 +
 +    sync_work(nbl);
 +    nbl->na_sc = gridj->na_sc;
 +    nbl->na_ci = gridj->na_c;
 +    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    na_cj_2log = get_2log(nbl->na_cj);
 +
 +    nbl->rlist  = rlist;
 +
 +    if (bFBufferFlag)
 +    {
 +        /* Determine conversion of clusters to flag blocks */
 +        gridi_flag_shift = 0;
 +        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridi_flag_shift++;
 +        }
 +        gridj_flag_shift = 0;
 +        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridj_flag_shift++;
 +        }
 +
 +        gridj_flag = work->buffer_flags.flag;
 +    }
 +
 +    copy_mat(nbs->box, box);
 +
 +    rl2 = nbl->rlist*nbl->rlist;
 +
 +    rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
 +    }
 +
 +    /* Set the shift range */
 +    for (d = 0; d < DIM; d++)
 +    {
 +        /* Check if we need periodicity shifts.
 +         * Without PBC, or with domain decomposition along d, we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +
 +    if (nbl->bSimple && !gridi->bSimple)
 +    {
 +        conv_i  = gridi->na_sc/gridj->na_sc;
 +        bb_i    = gridi->bb_simple;
 +        bbcz_i  = gridi->bbcz_simple;
 +        flags_i = gridi->flags_simple;
 +    }
 +    else
 +    {
 +        conv_i  = 1;
 +        bb_i    = gridi->bb;
 +        bbcz_i  = gridi->bbcz;
 +        flags_i = gridi->flags;
 +    }
 +    cell0_i = gridi->cell0*conv_i;
 +
 +    bbcz_j = gridj->bbcz;
 +
 +    if (conv_i != 1)
 +    {
 +        /* Blocks of the conversion factor - 1 give a large repeat count
 +         * combined with a small block size. This should result in good
 +         * load balancing for both small and large domains.
 +         */
 +        ci_block = conv_i - 1;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
 +                gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
 +    }
 +
 +    ndistc   = 0;
 +    ncpcheck = 0;
 +
 +    /* Initialize ci_b and ci to 1 before where we want them to start,
 +     * as they will both be incremented in next_ci.
 +     */
 +    ci_b = -1;
 +    ci   = th*ci_block - 1;
 +    ci_x = 0;
 +    ci_y = 0;
 +    while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
 +    {
 +        if (nbl->bSimple && flags_i[ci] == 0)
 +        {
 +            continue;
 +        }
 +
 +        ncj_old_i = nbl->ncj;
 +
 +        d2cx = 0;
 +        if (gridj != gridi && shp[XX] == 0)
 +        {
 +            if (nbl->bSimple)
 +            {
 +                bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
 +            }
 +            else
 +            {
 +                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
 +            }
 +            if (bx1 < gridj->c0[XX])
 +            {
 +                d2cx = sqr(gridj->c0[XX] - bx1);
 +
 +                if (d2cx >= rl2)
 +                {
 +                    continue;
 +                }
 +            }
 +        }
 +
 +        ci_xy = ci_x*gridi->ncy + ci_y;
 +
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
 +        {
 +            shz = tz*box[ZZ][ZZ];
 +
 +            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
 +            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
 +
 +            if (tz == 0)
 +            {
 +                d2z = 0;
 +            }
 +            else if (tz < 0)
 +            {
 +                d2z = sqr(bz1);
 +            }
 +            else
 +            {
 +                d2z = sqr(bz0 - box[ZZ][ZZ]);
 +            }
 +
 +            d2z_cx = d2z + d2cx;
 +
 +            if (d2z_cx >= rl2)
 +            {
 +                continue;
 +            }
 +
 +            bz1_frac =
 +                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
 +            if (bz1_frac < 0)
 +            {
 +                bz1_frac = 0;
 +            }
 +            /* The check with bz1_frac close to or larger than 1 comes later */
 +
 +            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
 +            {
 +                shy = ty*box[YY][YY] + tz*box[ZZ][YY];
 +
 +                if (nbl->bSimple)
 +                {
 +                    by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
 +                    by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
 +                }
 +                else
 +                {
 +                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
 +                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
 +                }
 +
 +                get_cell_range(by0, by1,
 +                               gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
 +                               d2z_cx, rl2,
 +                               &cyf, &cyl);
 +
 +                if (cyf > cyl)
 +                {
 +                    continue;
 +                }
 +
 +                d2z_cy = d2z;
 +                if (by1 < gridj->c0[YY])
 +                {
 +                    d2z_cy += sqr(gridj->c0[YY] - by1);
 +                }
 +                else if (by0 > gridj->c1[YY])
 +                {
 +                    d2z_cy += sqr(by0 - gridj->c1[YY]);
 +                }
 +
 +                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
 +                {
 +                    shift = XYZ2IS(tx, ty, tz);
 +
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                    if (gridi == gridj && shift > CENTRAL)
 +                    {
 +                        continue;
 +                    }
 +#endif
 +
 +                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
 +                        bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
 +                    }
 +                    else
 +                    {
 +                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
 +                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
 +                    }
 +
 +                    get_cell_range(bx0, bx1,
 +                                   gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
 +                                   d2z_cy, rl2,
 +                                   &cxf, &cxl);
 +
 +                    if (cxf > cxl)
 +                    {
 +                        continue;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci],
 +                                     nbl->work);
 +                    }
 +                    else
 +                    {
 +                        new_sci_entry(nbl, cell0_i+ci, shift, flags_i[ci],
 +                                      nbl->work);
 +                    }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                    if (cxf < ci_x)
 +#else
 +                    if (shift == CENTRAL && gridi == gridj &&
 +                        cxf < ci_x)
 +#endif
 +                    {
 +                        /* Leave the pairs with i > j.
 +                         * x is the major index, so skip half of it.
 +                         */
 +                        cxf = ci_x;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        set_icell_bb_simple(bb_i, ci, shx, shy, shz,
 +                                            nbl->work->bb_ci);
 +                    }
 +                    else
 +                    {
 +                        set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
 +                                              nbl->work->bb_ci);
 +                    }
 +
 +                    nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
 +                                     gridi->na_c, nbat->xstride, nbat->x,
 +                                     nbl->work);
 +
 +                    for (cx = cxf; cx <= cxl; cx++)
 +                    {
 +                        d2zx = d2z;
 +                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
 +                        }
 +                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
 +                        }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                        if (gridi == gridj &&
 +                            cx == 0 && cyf < ci_y)
 +#else
 +                        if (gridi == gridj &&
 +                            cx == 0 && shift == CENTRAL && cyf < ci_y)
 +#endif
 +                        {
 +                            /* Leave the pairs with i > j.
 +                             * Skip half of y when i and j have the same x.
 +                             */
 +                            cyf_x = ci_y;
 +                        }
 +                        else
 +                        {
 +                            cyf_x = cyf;
 +                        }
 +
 +                        for (cy = cyf_x; cy <= cyl; cy++)
 +                        {
 +                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
 +                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                            if (gridi == gridj &&
 +                                shift == CENTRAL && c0 < ci)
 +                            {
 +                                c0 = ci;
 +                            }
 +#endif
 +
 +                            d2zxy = d2zx;
 +                            if (gridj->c0[YY] + cy*gridj->sy > by1)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
 +                            }
 +                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
 +                            }
 +                            if (c1 > c0 && d2zxy < rl2)
 +                            {
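 +                                /* Pick a starting j-cell at a z-height comparable to the i-cell */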
 +                                cs = c0 + (int)(bz1_frac*(c1 - c0));
 +                                if (cs >= c1)
 +                                {
 +                                    cs = c1 - 1;
 +                                }
 +
 +                                d2xy = d2zxy - d2z;
 +
 +                                /* Find the lowest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cf = cs;
 +                                while (cf > c0 &&
 +                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
 +                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
 +                                {
 +                                    cf--;
 +                                }
 +
 +                                /* Find the highest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cl = cs;
 +                                while (cl < c1-1 &&
 +                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
 +                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
 +                                {
 +                                    cl++;
 +                                }
 +
 +#ifdef NBNXN_REFCODE
 +                                {
 +                                    /* Simple reference code, for debugging,
 +                                     * overrides the more complex code above.
 +                                     */
 +                                    int k;
 +                                    cf = c1;
 +                                    cl = -1;
 +                                    for (k = c0; k < c1; k++)
 +                                    {
 +                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k < cf)
 +                                        {
 +                                            cf = k;
 +                                        }
 +                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k > cl)
 +                                        {
 +                                            cl = k;
 +                                        }
 +                                    }
 +                                }
 +#endif
 +
 +                                if (gridi == gridj)
 +                                {
 +                                    /* We want each atom/cell pair only once,
 +                                     * only use cj >= ci.
 +                                     */
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                                    cf = max(cf, ci);
 +#else
 +                                    if (shift == CENTRAL)
 +                                    {
 +                                        cf = max(cf, ci);
 +                                    }
 +#endif
 +                                }
 +
 +                                if (cf <= cl)
 +                                {
 +                                    /* For f buffer flags with simple lists */
 +                                    ncj_old_j = nbl->ncj;
 +
 +                                    switch (nb_kernel_type)
 +                                    {
 +                                        case nbnxnk4x4_PlainC:
 +                                            check_subcell_list_space_simple(nbl, cl-cf+1);
 +
 +                                            make_cluster_list_simple(gridj,
 +                                                                     nbl, ci, cf, cl,
 +                                                                     (gridi == gridj && shift == CENTRAL),
 +                                                                     nbat->x,
 +                                                                     rl2, rbb2,
 +                                                                     &ndistc);
 +                                            break;
 +#ifdef GMX_NBNXN_SIMD_4XN
 +                                        case nbnxnk4xN_SIMD_4xN:
 +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
 +                                            make_cluster_list_simd_4xn(gridj,
 +                                                                       nbl, ci, cf, cl,
 +                                                                       (gridi == gridj && shift == CENTRAL),
 +                                                                       nbat->x,
 +                                                                       rl2, rbb2,
 +                                                                       &ndistc);
 +                                            break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +                                        case nbnxnk4xN_SIMD_2xNN:
 +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
 +                                            make_cluster_list_simd_2xnn(gridj,
 +                                                                        nbl, ci, cf, cl,
 +                                                                        (gridi == gridj && shift == CENTRAL),
 +                                                                        nbat->x,
 +                                                                        rl2, rbb2,
 +                                                                        &ndistc);
 +                                            break;
 +#endif
 +                                        case nbnxnk8x8x8_PlainC:
 +                                        case nbnxnk8x8x8_CUDA:
 +                                            check_subcell_list_space_supersub(nbl, cl-cf+1);
 +                                            for (cj = cf; cj <= cl; cj++)
 +                                            {
 +                                                make_cluster_list_supersub(nbs, gridi, gridj,
 +                                                                           nbl, ci, cj,
 +                                                                           (gridi == gridj && shift == CENTRAL && ci == cj),
 +                                                                           nbat->xstride, nbat->x,
 +                                                                           rl2, rbb2,
 +                                                                           &ndistc);
 +                                            }
 +                                            break;
 +                                    }
 +                                    ncpcheck += cl - cf + 1;
 +
 +                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
 +                                    {
 +                                        int cbf, cbl, cb;
 +
 +                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
 +                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
 +                                        for (cb = cbf; cb <= cbl; cb++)
 +                                        {
 +                                            gridj_flag[cb] = 1U<<th;
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +
 +                    /* Set the exclusions for this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        set_ci_top_excls(nbs,
 +                                         nbl,
 +                                         shift == CENTRAL && gridi == gridj,
 +                                         gridj->na_c_2log,
 +                                         na_cj_2log,
 +                                         &(nbl->ci[nbl->nci]),
 +                                         excl);
 +                    }
 +                    else
 +                    {
 +                        set_sci_top_excls(nbs,
 +                                          nbl,
 +                                          shift == CENTRAL && gridi == gridj,
 +                                          gridj->na_c_2log,
 +                                          &(nbl->sci[nbl->nsci]),
 +                                          excl);
 +                    }
 +
 +                    /* Close this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        close_ci_entry_simple(nbl);
 +                    }
 +                    else
 +                    {
 +                        close_ci_entry_supersub(nbl,
 +                                                nsubpair_max,
 +                                                progBal, min_ci_balanced,
 +                                                th, nth);
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (bFBufferFlag && nbl->ncj > ncj_old_i)
 +        {
 +            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
 +        }
 +    }
 +
 +    work->ndistc = ndistc;
 +
 +    nbs_cycle_stop(&work->cc[enbsCCsearch]);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "number of distance checks %d\n", ndistc);
 +        fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
 +                ncpcheck);
 +
 +        if (nbl->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug, nbl, nbs, rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
 +        }
 +
 +    }
 +}
 +
 +static void reduce_buffer_flags(const nbnxn_search_t        nbs,
 +                                int                         nsrc,
 +                                const nbnxn_buffer_flags_t *dest)
 +{
 +    int s, b;
 +    const unsigned *flag;
 +
 +    for (s = 0; s < nsrc; s++)
 +    {
 +        flag = nbs->work[s].buffer_flags.flag;
 +
 +        for (b = 0; b < dest->nflag; b++)
 +        {
 +            dest->flag[b] |= flag[b];
 +        }
 +    }
 +}
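 +
 +/* Illustrative note (an added example, not from the original source):
 + * with 3 thread-local outputs, if block b was touched by threads 0 and 2,
 + * then after this reduction dest->flag[b] == (1U<<0 | 1U<<2) == 0x5,
 + * i.e. force outputs 0 and 2 must be summed for that block while
 + * output 1 can be skipped.
 + */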
 +
 +static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
 +{
 +    int nelem, nkeep, ncopy, nred, b, c, out;
 +
 +    nelem = 0;
 +    nkeep = 0;
 +    ncopy = 0;
 +    nred  = 0;
 +    for (b = 0; b < flags->nflag; b++)
 +    {
 +        if (flags->flag[b] == 1)
 +        {
 +            /* Only flag 0 is set, no copy or reduction required */
 +            nelem++;
 +            nkeep++;
 +        }
 +        else if (flags->flag[b] > 0)
 +        {
 +            c = 0;
 +            for (out = 0; out < nout; out++)
 +            {
 +                if (flags->flag[b] & (1U<<out))
 +                {
 +                    c++;
 +                }
 +            }
 +            nelem += c;
 +            if (c == 1)
 +            {
 +                ncopy++;
 +            }
 +            else
 +            {
 +                nred += c;
 +            }
 +        }
 +    }
 +
 +    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
 +            flags->nflag, nout,
 +            nelem/(double)(flags->nflag),
 +            nkeep/(double)(flags->nflag),
 +            ncopy/(double)(flags->nflag),
 +            nred/(double)(flags->nflag));
 +}
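 +
 +/* Worked example (an added illustration, not from the original source):
 + * with nout=2, if half of the blocks have flag==0x1 (only output 0) and
 + * the other half have flag==0x3 (both outputs), the line above prints
 + *   elem 1.50, keep 0.50 copy 0.00 red 1.00
 + * since every dual-output block contributes 2 elements needing reduction.
 + */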
 +
++/* Perform a count (linear) sort to sort the smaller lists to the end.
++ * This avoids load imbalance on the GPU, as large lists will be
++ * scheduled and executed first and the smaller lists later.
++ * Load balancing between multi-processors only happens at the end,
++ * and there smaller lists lead to more effective load balancing.
++ * The sorting is done on the cj4 count, not on the actual pair counts.
++ * Not only does this make the sort faster, but it also results in
++ * better load balancing than using a list sorted on exact load.
++ * This function swaps the pointers in the pair list to avoid a copy operation.
++ */
++static void sort_sci(nbnxn_pairlist_t *nbl)
++{
++    nbnxn_list_work_t *work;
++    int                m, i, s, s0, s1;
++    nbnxn_sci_t       *sci_sort;
++
++    if (nbl->ncj4 <= nbl->nsci)
++    {
++        /* nsci = 0 or all sci have size 1, sorting won't change the order */
++        return;
++    }
++
++    work = nbl->work;
++
++    /* We will distinguish differences up to double the average */
++    m = (2*nbl->ncj4)/nbl->nsci;
++
++    if (m + 1 > work->sort_nalloc)
++    {
++        work->sort_nalloc = over_alloc_large(m + 1);
++        srenew(work->sort, work->sort_nalloc);
++    }
++
++    if (work->sci_sort_nalloc != nbl->sci_nalloc)
++    {
++        work->sci_sort_nalloc = nbl->sci_nalloc;
++        nbnxn_realloc_void((void **)&work->sci_sort,
++                           0,
++                           work->sci_sort_nalloc*sizeof(*work->sci_sort),
++                           nbl->alloc, nbl->free);
++    }
++
++    /* Count the entries of each size */
++    for (i = 0; i <= m; i++)
++    {
++        work->sort[i] = 0;
++    }
++    for (s = 0; s < nbl->nsci; s++)
++    {
++        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
++        work->sort[i]++;
++    }
++    /* Calculate the offset for each count */
++    s0            = work->sort[m];
++    work->sort[m] = 0;
++    for (i = m - 1; i >= 0; i--)
++    {
++        s1            = work->sort[i];
++        work->sort[i] = work->sort[i + 1] + s0;
++        s0            = s1;
++    }
++
++    /* Sort entries directly into place */
++    sci_sort = work->sci_sort;
++    for (s = 0; s < nbl->nsci; s++)
++    {
++        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
++        sci_sort[work->sort[i]++] = nbl->sci[s];
++    }
++
++    /* Swap the sci pointers so we use the new, sorted list */
++    work->sci_sort = nbl->sci;
++    nbl->sci       = sci_sort;
++}
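++
++/* Worked example of the count sort above (an added illustration, not from
++ * the original source): with nsci=4 and cj4 counts {3, 1, 5, 1}, ncj4=10,
++ * so m = (2*10)/4 = 5. Bucket counts: sort[5]=1, sort[3]=1, sort[1]=2.
++ * The descending offsets become sort[5]=0, sort[3]=1, sort[1]=2, giving
++ * the order: the size-5 list first, then size 3, then the two size-1
++ * lists, so the largest work units are scheduled first.
++ */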
++
 +/* Make a local or non-local pair-list, depending on iloc */
 +void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
 +                         nbnxn_atomdata_t     *nbat,
 +                         const t_blocka       *excl,
 +                         real                  rlist,
 +                         int                   min_ci_balanced,
 +                         nbnxn_pairlist_set_t *nbl_list,
 +                         int                   iloc,
 +                         int                   nb_kernel_type,
 +                         t_nrnb               *nrnb)
 +{
 +    nbnxn_grid_t *gridi, *gridj;
 +    gmx_bool bGPUCPU;
 +    int nzi, zi, zj0, zj1, zj;
 +    int nsubpair_max;
 +    int th;
 +    int nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int ci_block;
 +    gmx_bool CombineNBLists;
++    gmx_bool progBal;
 +    int np_tot, np_noq, np_hlj, nap;
 +
 +    /* Check if we are running hybrid GPU + CPU nbnxn mode */
 +    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
 +
 +    nnbl            = nbl_list->nnbl;
 +    nbl             = nbl_list->nbl;
 +    CombineNBLists  = nbl_list->bCombined;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "ns making %d nblists\n", nnbl);
 +    }
 +
 +    nbat->bUseBufferFlags = (nbat->nout > 1);
 +    /* We should re-init the flags before making the first list */
 +    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
 +    {
 +        init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
 +    }
 +
 +    if (nbl_list->bSimple)
 +    {
 +        switch (nb_kernel_type)
 +        {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +            case nbnxnk4xN_SIMD_4xN:
 +                nbs->icell_set_x = icell_set_x_simd_4xn;
 +                break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +            case nbnxnk4xN_SIMD_2xNN:
 +                nbs->icell_set_x = icell_set_x_simd_2xnn;
 +                break;
 +#endif
 +            default:
 +                nbs->icell_set_x = icell_set_x_simple;
 +                break;
 +        }
 +    }
 +    else
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        nbs->icell_set_x = icell_set_x_supersub_sse8;
 +#else
 +        nbs->icell_set_x = icell_set_x_supersub;
 +#endif
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Only zone (grid) 0 vs 0 */
 +        nzi = 1;
 +        zj0 = 0;
 +        zj1 = 1;
 +    }
 +    else
 +    {
 +        nzi = nbs->zones->nizone;
 +    }
 +
 +    if (!nbl_list->bSimple && min_ci_balanced > 0)
 +    {
 +        nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
 +    }
 +    else
 +    {
 +        nsubpair_max = 0;
 +    }
 +
 +    /* Clear all pair-lists */
 +    for (th = 0; th < nnbl; th++)
 +    {
 +        clear_pairlist(nbl[th]);
 +    }
 +
 +    for (zi = 0; zi < nzi; zi++)
 +    {
 +        gridi = &nbs->grid[zi];
 +
 +        if (NONLOCAL_I(iloc))
 +        {
 +            zj0 = nbs->zones->izone[zi].j0;
 +            zj1 = nbs->zones->izone[zi].j1;
 +            if (zi == 0)
 +            {
 +                zj0++;
 +            }
 +        }
 +        for (zj = zj0; zj < zj1; zj++)
 +        {
 +            gridj = &nbs->grid[zj];
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
 +            }
 +
 +            nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 +
 +            if (nbl[0]->bSimple && !gridi->bSimple)
 +            {
 +                /* Hybrid list, determine blocking later */
 +                ci_block = 0;
 +            }
 +            else
 +            {
 +                ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
 +            }
 +
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                /* Re-init the thread-local work flag data before making
 +                 * the first list (not an elegant conditional).
 +                 */
 +                if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
 +                                              (bGPUCPU && zi == 0 && zj == 1)))
 +                {
 +                    init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
 +                }
 +
 +                if (CombineNBLists && th > 0)
 +                {
 +                    clear_pairlist(nbl[th]);
 +                }
 +
++                /* When running on a GPU: generate progressively smaller
++                 * lists for load balancing, for the local list or for the
++                 * non-local list when there are only 2 zones.
++                 */
++                progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
++
 +                /* Divide the i super cell equally over the nblists */
 +                nbnxn_make_pairlist_part(nbs, gridi, gridj,
 +                                         &nbs->work[th], nbat, excl,
 +                                         rlist,
 +                                         nb_kernel_type,
 +                                         ci_block,
 +                                         nbat->bUseBufferFlags,
 +                                         nsubpair_max,
-     /*
-        print_supersub_nsp("nsubpair",nbl[0],iloc);
-      */
++                                         progBal, min_ci_balanced,
 +                                         th, nnbl,
 +                                         nbl[th]);
 +            }
 +            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
 +
 +            np_tot = 0;
 +            np_noq = 0;
 +            np_hlj = 0;
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
 +
 +                if (nbl_list->bSimple)
 +                {
 +                    np_tot += nbl[th]->ncj;
 +                    np_noq += nbl[th]->work->ncj_noq;
 +                    np_hlj += nbl[th]->work->ncj_hlj;
 +                }
 +                else
 +                {
 +                    /* This count ignores potential subsequent pair pruning */
 +                    np_tot += nbl[th]->nci_tot;
 +                }
 +            }
 +            nap                   = nbl[0]->na_ci*nbl[0]->na_cj;
 +            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
 +            nbl_list->natpair_lj  = np_noq*nap;
 +            nbl_list->natpair_q   = np_hlj*nap/2;
 +
 +            if (CombineNBLists && nnbl > 1)
 +            {
 +                nbs_cycle_start(&nbs->cc[enbsCCcombine]);
 +
 +                combine_nblists(nnbl-1, nbl+1, nbl[0]);
 +
 +                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
 +            }
 +        }
 +    }
 +
++    if (!nbl_list->bSimple)
++    {
++        /* Sort the entries on size, large ones first */
++        if (CombineNBLists || nnbl == 1)
++        {
++            sort_sci(nbl[0]);
++        }
++        else
++        {
++#pragma omp parallel for num_threads(nnbl) schedule(static)
++            for (th = 0; th < nnbl; th++)
++            {
++                sort_sci(nbl[th]);
++            }
++        }
++    }
++
 +    if (nbat->bUseBufferFlags)
 +    {
 +        reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
 +    }
 +
 +    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
 +    if (LOCAL_I(iloc))
 +    {
 +        nbs->search_count++;
 +    }
 +    if (nbs->print_cycles &&
 +        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
 +        nbs->search_count % 100 == 0)
 +    {
 +        nbs_cycle_print(stderr, nbs);
 +    }
 +
 +    if (debug && (CombineNBLists && nnbl > 1))
 +    {
 +        if (nbl[0]->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (gmx_debug_at)
 +        {
 +            if (nbl[0]->bSimple)
 +            {
 +                print_nblist_ci_cj(debug, nbl[0]);
 +            }
 +            else
 +            {
 +                print_nblist_sci_cj(debug, nbl[0]);
 +            }
 +        }
 +
 +        if (nbat->bUseBufferFlags)
 +        {
 +            print_reduction_cost(&nbat->buffer_flags, nnbl);
 +        }
 +    }
 +}
index 3431f3e4e90432082225f049de35cac96e32cf53,0000000000000000000000000000000000000000..209534910a516b5c7680df44eafa81cbaa09d41a
mode 100644,000000..100644
--- /dev/null
@@@ -1,2957 -1,0 +1,2982 @@@
-                     b_hybrid = !((wf[i_atom] == 1 && wf[jj] == 1) || (wf[i_atom] == 0 && wf[jj] == 0));
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "network.h"
 +#include "nsgrid.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +#include "ns.h"
 +#include "pbc.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "nrnb.h"
 +#include "txtdump.h"
 +#include "mtop_util.h"
 +
 +#include "domdec.h"
 +#include "adress.h"
 +
 +
 +/*
 + *    E X C L U S I O N   H A N D L I N G
 + */
 +
 +#ifdef DEBUG
 +static void SETEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    e[j] = e[j] | (1<<i);
 +}
 +static void RMEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    e[j] = e[j] & ~(1<<i);
 +}
 +static gmx_bool ISEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    return (gmx_bool)(e[j] & (1<<i));
 +}
 +static gmx_bool NOTEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    return !(ISEXCL_(e, i, j));
 +}
 +#else
 +#define SETEXCL(e, i, j) (e)[((atom_id) (j))] |= (1<<((atom_id) (i)))
 +#define RMEXCL(e, i, j)  (e)[((atom_id) (j))] &= (~(1<<((atom_id) (i))))
 +#define ISEXCL(e, i, j)  (gmx_bool) ((e)[((atom_id) (j))] & (1<<((atom_id) (i))))
 +#define NOTEXCL(e, i, j) !(ISEXCL(e, i, j))
 +#endif
 +
 +static int
 +round_up_to_simd_width(int length, int simd_width)
 +{
 +    int offset, newlength;
 +
 +    offset = (simd_width > 0) ? length % simd_width : 0;
 +
 +    return (offset == 0) ? length : length-offset+simd_width;
 +}
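 +
 +/* Worked example (an added illustration, not from the original source):
 + * round_up_to_simd_width(10, 4) == 12, round_up_to_simd_width(8, 4) == 8,
 + * and with simd_width == 0 the length is returned unchanged.
 + */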
 +/************************************************
 + *
 + *  U T I L I T I E S    F O R    N S
 + *
 + ************************************************/
 +
 +static void reallocate_nblist(t_nblist *nl)
 +{
 +    if (gmx_debug_at)
 +    {
 +        fprintf(debug, "reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
 +                nl->ielec, nl->ivdw, nl->igeometry, nl->type, nl->maxnri);
 +    }
 +    srenew(nl->iinr,   nl->maxnri);
 +    if (nl->igeometry == GMX_NBLIST_GEOMETRY_CG_CG)
 +    {
 +        srenew(nl->iinr_end, nl->maxnri);
 +    }
 +    srenew(nl->gid,    nl->maxnri);
 +    srenew(nl->shift,  nl->maxnri);
 +    srenew(nl->jindex, nl->maxnri+1);
 +}
 +
 +
 +static void init_nblist(FILE *log, t_nblist *nl_sr, t_nblist *nl_lr,
 +                        int maxsr, int maxlr,
 +                        int ivdw, int ivdwmod,
 +                        int ielec, int ielecmod,
 +                        int igeometry, int type)
 +{
 +    t_nblist *nl;
 +    int       homenr;
 +    int       i, nn;
 +
 +    for (i = 0; (i < 2); i++)
 +    {
 +        nl     = (i == 0) ? nl_sr : nl_lr;
 +        homenr = (i == 0) ? maxsr : maxlr;
 +
 +        if (nl == NULL)
 +        {
 +            continue;
 +        }
 +
 +
 +        /* Set coul/vdw in neighborlist, and for the normal loops we determine
 +         * an index of which one to call.
 +         */
 +        nl->ivdw        = ivdw;
 +        nl->ivdwmod     = ivdwmod;
 +        nl->ielec       = ielec;
 +        nl->ielecmod    = ielecmod;
 +        nl->type        = type;
 +        nl->igeometry   = igeometry;
 +
 +        if (nl->type == GMX_NBLIST_INTERACTION_FREE_ENERGY)
 +        {
 +            nl->igeometry  = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +        }
 +
 +        /* This will also set the simd_padding_width field */
 +        gmx_nonbonded_set_kernel_pointers( (i == 0) ? log : NULL, nl);
 +
 +        /* maxnri is influenced by the number of shifts (maximum is 8)
 +         * and the number of energy groups.
 +         * If it is not enough, nl memory will be reallocated during the run.
 +         * 4 seems to be a reasonable factor, which only causes reallocation
 +         * during runs with tiny and many energygroups.
 +         */
 +        nl->maxnri      = homenr*4;
 +        nl->maxnrj      = 0;
 +        nl->maxlen      = 0;
 +        nl->nri         = -1;
 +        nl->nrj         = 0;
 +        nl->iinr        = NULL;
 +        nl->gid         = NULL;
 +        nl->shift       = NULL;
 +        nl->jindex      = NULL;
 +        reallocate_nblist(nl);
 +        nl->jindex[0] = 0;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Initiating neighbourlist (ielec=%d, ivdw=%d, type=%d) for %s interactions,\nwith %d SR, %d LR atoms.\n",
 +                    nl->ielec, nl->ivdw, nl->type, gmx_nblist_geometry_names[nl->igeometry], maxsr, maxlr);
 +        }
 +    }
 +}
 +
 +void init_neighbor_list(FILE *log, t_forcerec *fr, int homenr)
 +{
 +    /* Make maxlr tunable! (does not seem to make a big difference though)
 +     * This parameter determines the number of i particles in a long range
 +     * neighbourlist. Too few means many function calls, too many means
 +     * cache thrashing.
 +     */
 +    int        maxsr, maxsr_wat, maxlr, maxlr_wat;
 +    int        ielec, ielecf, ivdw, ielecmod, ielecmodf, ivdwmod, type;
 +    int        solvent;
 +    int        igeometry_def, igeometry_w, igeometry_ww;
 +    int        i;
 +    t_nblists *nbl;
 +
 +    /* maxsr     = homenr-fr->nWatMol*3; */
 +    maxsr     = homenr;
 +
 +    if (maxsr < 0)
 +    {
 +        gmx_fatal(FARGS, "%s, %d: Negative number of short range atoms.\n"
 +                  "Call your Gromacs dealer for assistance.", __FILE__, __LINE__);
 +    }
 +    /* This is just for initial allocation, so we do not reallocate
 +     * all the nlist arrays many times in a row.
 +     * The numbers may look very precise, but they are not critical.
 +     */
 +    maxsr_wat = min(fr->nWatMol, (homenr+2)/3);
 +    if (fr->bTwinRange)
 +    {
 +        maxlr     = 50;
 +        maxlr_wat = min(maxsr_wat, maxlr);
 +    }
 +    else
 +    {
 +        maxlr = maxlr_wat = 0;
 +    }
 +
 +    /* Determine the values for ielec/ivdw. */
 +    ielec    = fr->nbkernel_elec_interaction;
 +    ivdw     = fr->nbkernel_vdw_interaction;
 +    ielecmod = fr->nbkernel_elec_modifier;
 +    ivdwmod  = fr->nbkernel_vdw_modifier;
 +    type     = GMX_NBLIST_INTERACTION_STANDARD;
 +
 +    fr->ns.bCGlist = (getenv("GMX_NBLISTCG") != 0);
 +    if (!fr->ns.bCGlist)
 +    {
 +        igeometry_def = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +    }
 +    else
 +    {
 +        igeometry_def = GMX_NBLIST_GEOMETRY_CG_CG;
 +        if (log != NULL)
 +        {
 +            fprintf(log, "\nUsing charge-group - charge-group neighbor lists and kernels\n\n");
 +        }
 +    }
 +
 +    if (fr->solvent_opt == esolTIP4P)
 +    {
 +        igeometry_w  = GMX_NBLIST_GEOMETRY_WATER4_PARTICLE;
 +        igeometry_ww = GMX_NBLIST_GEOMETRY_WATER4_WATER4;
 +    }
 +    else
 +    {
 +        igeometry_w  = GMX_NBLIST_GEOMETRY_WATER3_PARTICLE;
 +        igeometry_ww = GMX_NBLIST_GEOMETRY_WATER3_WATER3;
 +    }
 +
 +    for (i = 0; i < fr->nnblists; i++)
 +    {
 +        nbl = &(fr->nblists[i]);
 +
 +        if ((fr->adress_type != eAdressOff) && (i >= fr->nnblists/2))
 +        {
 +            type = GMX_NBLIST_INTERACTION_ADRESS;
 +        }
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ], &nbl->nlist_lr[eNL_VDWQQ],
 +                    maxsr, maxlr, ivdw, ivdwmod, ielec, ielecmod, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDW], &nbl->nlist_lr[eNL_VDW],
 +                    maxsr, maxlr, ivdw, ivdwmod, GMX_NBKERNEL_ELEC_NONE, eintmodNONE, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ], &nbl->nlist_lr[eNL_QQ],
 +                    maxsr, maxlr, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_WATER], &nbl->nlist_lr[eNL_VDWQQ_WATER],
 +                    maxsr_wat, maxlr_wat, ivdw, ivdwmod, ielec, ielecmod, igeometry_w, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ_WATER], &nbl->nlist_lr[eNL_QQ_WATER],
 +                    maxsr_wat, maxlr_wat, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_w, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_WATERWATER], &nbl->nlist_lr[eNL_VDWQQ_WATERWATER],
 +                    maxsr_wat, maxlr_wat, ivdw, ivdwmod, ielec, ielecmod, igeometry_ww, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ_WATERWATER], &nbl->nlist_lr[eNL_QQ_WATERWATER],
 +                    maxsr_wat, maxlr_wat, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_ww, type);
 +
 +        /* Did we get the solvent loops so we can use optimized water kernels? */
 +        if (nbl->nlist_sr[eNL_VDWQQ_WATER].kernelptr_vf == NULL
 +            || nbl->nlist_sr[eNL_QQ_WATER].kernelptr_vf == NULL
 +#ifndef DISABLE_WATERWATER_NLIST
 +            || nbl->nlist_sr[eNL_VDWQQ_WATERWATER].kernelptr_vf == NULL
 +            || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf == NULL
 +#endif
 +            )
 +        {
 +            fr->solvent_opt = esolNO;
 +            if (log != NULL)
 +            {
 +                fprintf(log, "Note: The available nonbonded kernels do not support water optimization - disabling.\n");
 +            }
 +        }
 +
 +        if (fr->efep != efepNO)
 +        {
 +            if ((fr->bEwald) && (fr->sc_alphacoul > 0)) /* need to handle long range differently if using softcore */
 +            {
 +                ielecf    = GMX_NBKERNEL_ELEC_EWALD;
 +                ielecmodf = eintmodNONE;
 +            }
 +            else
 +            {
 +                ielecf    = ielec;
 +                ielecmodf = ielecmod;
 +            }
 +
 +            init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_FREE], &nbl->nlist_lr[eNL_VDWQQ_FREE],
 +                        maxsr, maxlr, ivdw, ivdwmod, ielecf, ielecmodf, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +            init_nblist(log, &nbl->nlist_sr[eNL_VDW_FREE], &nbl->nlist_lr[eNL_VDW_FREE],
 +                        maxsr, maxlr, ivdw, ivdwmod, GMX_NBKERNEL_ELEC_NONE, eintmodNONE, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +            init_nblist(log, &nbl->nlist_sr[eNL_QQ_FREE], &nbl->nlist_lr[eNL_QQ_FREE],
 +                        maxsr, maxlr, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielecf, ielecmodf, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +        }
 +    }
 +    /* QMMM MM list */
 +    if (fr->bQMMM && fr->qr->QMMMscheme != eQMMMschemeoniom)
 +    {
 +        init_nblist(log, &fr->QMMMlist, NULL,
 +                    maxsr, maxlr, 0, 0, ielec, ielecmod, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_STANDARD);
 +    }
 +
 +    if (log != NULL)
 +    {
 +        fprintf(log, "\n");
 +    }
 +
 +    fr->ns.nblist_initialized = TRUE;
 +}
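 +
 +/* Illustrative note (an added example, not from the original source): with
 + * AdResS enabled, the second half of fr->nblists (i >= nnblists/2) is
 + * initialized with type GMX_NBLIST_INTERACTION_ADRESS; put_in_list_adress()
 + * below fills the first half with pure interactions and the second half
 + * with the hybrid ones handled by the generic AdResS kernel.
 + */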
 +
 +static void reset_nblist(t_nblist *nl)
 +{
 +    nl->nri       = -1;
 +    nl->nrj       = 0;
 +    nl->maxlen    = 0;
 +    if (nl->jindex)
 +    {
 +        nl->jindex[0] = 0;
 +    }
 +}
 +
 +static void reset_neighbor_lists(t_forcerec *fr, gmx_bool bResetSR, gmx_bool bResetLR)
 +{
 +    int n, i;
 +
 +    if (fr->bQMMM)
 +    {
 +        /* only reset the short-range nblist */
 +        reset_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for (n = 0; n < fr->nnblists; n++)
 +    {
 +        for (i = 0; i < eNL_NR; i++)
 +        {
 +            if (bResetSR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_sr[i]) );
 +            }
 +            if (bResetLR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_lr[i]) );
 +            }
 +        }
 +    }
 +}
 +
 +
 +
 +
 +static inline void new_i_nblist(t_nblist *nlist,
 +                                gmx_bool bLR, atom_id i_atom, int shift, int gid)
 +{
 +    int    nri;
 +
 +    nri = nlist->nri;
 +
 +    /* Check whether we have to increase the i counter */
 +    if ((nri == -1) ||
 +        (nlist->iinr[nri]  != i_atom) ||
 +        (nlist->shift[nri] != shift) ||
 +        (nlist->gid[nri]   != gid))
 +    {
 +        /* This is something else. Now see if any entries have
 +         * been added in the list of the previous atom.
 +         */
 +        if ((nri == -1) ||
 +            ((nlist->jindex[nri+1] > nlist->jindex[nri]) &&
 +             (nlist->gid[nri] != -1)))
 +        {
 +            /* If so increase the counter */
 +            nlist->nri++;
 +            nri++;
 +            if (nlist->nri >= nlist->maxnri)
 +            {
 +                nlist->maxnri += over_alloc_large(nlist->nri);
 +                reallocate_nblist(nlist);
 +            }
 +        }
 +        /* Set the number of neighbours and the atom number */
 +        nlist->jindex[nri+1] = nlist->jindex[nri];
 +        nlist->iinr[nri]     = i_atom;
 +        nlist->gid[nri]      = gid;
 +        nlist->shift[nri]    = shift;
 +    }
 +}
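 +
 +/* Illustrative note (an added comment, not from the original source): if
 + * the previous i entry received no j entries (jindex[nri+1] == jindex[nri]),
 + * the code above reuses its slot instead of advancing nri, so empty i
 + * entries never accumulate in the list.
 + */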
 +
 +static inline void close_i_nblist(t_nblist *nlist)
 +{
 +    int nri = nlist->nri;
 +    int len;
 +
 +    if (nri >= 0)
 +    {
 +        /* Add elements up to padding. Since we allocate memory in units
 +         * of the simd_padding width, we do not have to check for possible
 +         * list reallocation here.
 +         */
 +        while ((nlist->nrj % nlist->simd_padding_width) != 0)
 +        {
 +            /* Use -4 here, so we can write forces for 4 atoms before real data */
 +            nlist->jjnr[nlist->nrj++] = -4;
 +        }
 +        nlist->jindex[nri+1] = nlist->nrj;
 +
 +        len = nlist->nrj -  nlist->jindex[nri];
 +
 +        /* nlist length for water i molecules is treated statically
 +         * in the innerloops
 +         */
 +        if (len > nlist->maxlen)
 +        {
 +            nlist->maxlen = len;
 +        }
 +    }
 +}
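 +
 +/* Worked example of the padding above (an added illustration, not from the
 + * original source): with simd_padding_width == 4 and nrj == 6 after the
 + * last real j atom, two filler entries jjnr[6] = jjnr[7] = -4 are appended
 + * so the kernels always process full SIMD widths; the -4 index makes the
 + * dummy force writes land in padding before the real force data.
 + */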
 +
 +static inline void close_nblist(t_nblist *nlist)
 +{
 +    /* Only close this nblist when it has been initialized.
 +     * Avoid the creation of i-lists with no j-particles.
 +     */
 +    if (nlist->nrj == 0)
 +    {
 +        /* Some assembly kernels do not support empty lists,
 +         * make sure here that we don't generate any empty lists.
 +         * With the current ns code this branch is taken in two cases:
 +         * No i-particles at all: nri=-1 here
 +         * There are i-particles, but no j-particles; nri=0 here
 +         */
 +        nlist->nri = 0;
 +    }
 +    else
 +    {
 +        /* Close list number nri by incrementing the count */
 +        nlist->nri++;
 +    }
 +}
 +
 +static inline void close_neighbor_lists(t_forcerec *fr, gmx_bool bMakeQMMMnblist)
 +{
 +    int n, i;
 +
 +    if (bMakeQMMMnblist)
 +    {
 +        close_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for (n = 0; n < fr->nnblists; n++)
 +    {
 +        for (i = 0; (i < eNL_NR); i++)
 +        {
 +            close_nblist(&(fr->nblists[n].nlist_sr[i]));
 +            close_nblist(&(fr->nblists[n].nlist_lr[i]));
 +        }
 +    }
 +}
 +
 +
 +static inline void add_j_to_nblist(t_nblist *nlist, atom_id j_atom, gmx_bool bLR)
 +{
 +    int nrj = nlist->nrj;
 +
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = round_up_to_simd_width(over_alloc_small(nlist->nrj + 1), nlist->simd_padding_width);
 +
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
 +                    bLR ? "LR" : "SR", nlist->ielec, nlist->ivdw, nlist->type, nlist->igeometry, nlist->maxnrj);
 +        }
 +
 +        srenew(nlist->jjnr, nlist->maxnrj);
 +    }
 +
 +    nlist->jjnr[nrj] = j_atom;
 +    nlist->nrj++;
 +}
 +
 +static inline void add_j_to_nblist_cg(t_nblist *nlist,
 +                                      atom_id j_start, int j_end,
 +                                      t_excl *bexcl, gmx_bool i_is_j,
 +                                      gmx_bool bLR)
 +{
 +    int nrj = nlist->nrj;
 +    int j;
 +
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = over_alloc_small(nlist->nrj + 1);
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
 +                    bLR ? "LR" : "SR", nlist->ielec, nlist->ivdw, nlist->type, nlist->igeometry, nlist->maxnrj);
 +        }
 +
 +        srenew(nlist->jjnr, nlist->maxnrj);
 +        srenew(nlist->jjnr_end, nlist->maxnrj);
 +        srenew(nlist->excl, nlist->maxnrj*MAX_CGCGSIZE);
 +    }
 +
 +    nlist->jjnr[nrj]     = j_start;
 +    nlist->jjnr_end[nrj] = j_end;
 +
 +    if (j_end - j_start > MAX_CGCGSIZE)
 +    {
 +        gmx_fatal(FARGS, "The charge-group - charge-group neighborlist do not support charge groups larger than %d, found a charge group of size %d", MAX_CGCGSIZE, j_end-j_start);
 +    }
 +
 +    /* Set the exclusions */
 +    for (j = j_start; j < j_end; j++)
 +    {
 +        nlist->excl[nrj*MAX_CGCGSIZE + j - j_start] = bexcl[j];
 +    }
 +    if (i_is_j)
 +    {
 +        /* Avoid double counting of intra-cg interactions */
 +        for (j = 1; j < j_end-j_start; j++)
 +        {
 +            nlist->excl[nrj*MAX_CGCGSIZE + j] |= (1<<j) - 1;
 +        }
 +    }
 +
 +    nlist->nrj++;
 +}
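 +
 +/* Worked example of the intra-cg masking above (an added illustration, not
 + * from the original source): for j == 3, (1<<3) - 1 == 0x7 sets the
 + * exclusion bits for the three preceding atoms of the same charge group,
 + * so each intra-cg pair is handled exactly once.
 + */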
 +
 +typedef void
 +    put_in_list_t (gmx_bool              bHaveVdW[],
 +                   int                   ngid,
 +                   t_mdatoms     *       md,
 +                   int                   icg,
 +                   int                   jgid,
 +                   int                   nj,
 +                   atom_id               jjcg[],
 +                   atom_id               index[],
 +                   t_excl                bExcl[],
 +                   int                   shift,
 +                   t_forcerec     *      fr,
 +                   gmx_bool              bLR,
 +                   gmx_bool              bDoVdW,
 +                   gmx_bool              bDoCoul,
 +                   int                   solvent_opt);
 +
 +static void
 +put_in_list_at(gmx_bool              bHaveVdW[],
 +               int                   ngid,
 +               t_mdatoms     *       md,
 +               int                   icg,
 +               int                   jgid,
 +               int                   nj,
 +               atom_id               jjcg[],
 +               atom_id               index[],
 +               t_excl                bExcl[],
 +               int                   shift,
 +               t_forcerec     *      fr,
 +               gmx_bool              bLR,
 +               gmx_bool              bDoVdW,
 +               gmx_bool              bDoCoul,
 +               int                   solvent_opt)
 +{
 +    /* The a[] index has been removed;
 +     * to put it back, i_atom should be a[i0] and jj should be a[jj].
 +     */
 +    t_nblist  *   vdwc;
 +    t_nblist  *   vdw;
 +    t_nblist  *   coul;
 +    t_nblist  *   vdwc_free  = NULL;
 +    t_nblist  *   vdw_free   = NULL;
 +    t_nblist  *   coul_free  = NULL;
 +    t_nblist  *   vdwc_ww    = NULL;
 +    t_nblist  *   coul_ww    = NULL;
 +
 +    int           i, j, jcg, igid, gid, nbl_ind, ind_ij;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg, len;
 +
 +    int          *cginfo;
 +    int          *type, *typeB;
 +    real         *charge, *chargeB;
 +    real          qi, qiB, qq, rlj;
 +    gmx_bool      bFreeEnergy, bFree, bFreeJ, bNotEx, *bPert;
 +    gmx_bool      bDoVdW_i, bDoCoul_i, bDoCoul_i_sol;
 +    int           iwater, jwater;
 +    t_nblist     *nlist;
 +
 +    /* Copy some pointers */
 +    cginfo  = fr->cginfo;
 +    charge  = md->chargeA;
 +    chargeB = md->chargeB;
 +    type    = md->typeA;
 +    typeB   = md->typeB;
 +    bPert   = md->bPerturbed;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(cginfo[icg]);
 +
 +    iwater = (solvent_opt != esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
 +
 +    bFreeEnergy = FALSE;
 +    if (md->nPerturbed)
 +    {
 +        /* Check if any of the particles involved are perturbed.
 +         * If not we can do the cheaper normal put_in_list
 +         * and use more solvent optimization.
 +         */
 +        for (i = 0; i < nicg; i++)
 +        {
 +            bFreeEnergy |= bPert[i0+i];
 +        }
 +        /* Loop over the j charge groups */
 +        for (j = 0; (j < nj && !bFreeEnergy); j++)
 +        {
 +            jcg = jjcg[j];
 +            jj0 = index[jcg];
 +            jj1 = index[jcg+1];
 +            /* Finally loop over the atoms in the j-charge group */
 +            for (jj = jj0; jj < jj1; jj++)
 +            {
 +                bFreeEnergy |= bPert[jj];
 +            }
 +        }
 +    }
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[GID(igid, jgid, ngid)];
 +    }
 +    if (bLR)
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_lr;
 +    }
 +    else
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_sr;
 +    }
 +
 +    if (iwater != esolNO)
 +    {
 +        vdwc = &nlist[eNL_VDWQQ_WATER];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ_WATER];
 +#ifndef DISABLE_WATERWATER_NLIST
 +        vdwc_ww = &nlist[eNL_VDWQQ_WATERWATER];
 +        coul_ww = &nlist[eNL_QQ_WATERWATER];
 +#endif
 +    }
 +    else
 +    {
 +        vdwc = &nlist[eNL_VDWQQ];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ];
 +    }
 +
 +    if (!bFreeEnergy)
 +    {
 +        if (iwater != esolNO)
 +        {
 +            /* Loop over the atoms in the i charge group */
 +            i_atom  = i0;
 +            gid     = GID(igid, jgid, ngid);
 +            /* Create new i_atom for each energy group */
 +            if (bDoCoul && bDoVdW)
 +            {
 +                new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(vdwc_ww, bLR, i_atom, shift, gid);
 +#endif
 +            }
 +            if (bDoVdW)
 +            {
 +                new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +            }
 +            if (bDoCoul)
 +            {
 +                new_i_nblist(coul, bLR, i_atom, shift, gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(coul_ww, bLR, i_atom, shift, gid);
 +#endif
 +            }
 +            /* Loop over the j charge groups */
 +            for (j = 0; (j < nj); j++)
 +            {
 +                jcg = jjcg[j];
 +
 +                if (jcg == icg)
 +                {
 +                    continue;
 +                }
 +
 +                jj0    = index[jcg];
 +                jwater = GET_CGINFO_SOLOPT(cginfo[jcg]);
 +
 +                if (iwater == esolSPC && jwater == esolSPC)
 +                {
 +                    /* Interaction between two SPC molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw, jj0, bLR);
 +                    }
 +                    else
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST
 +                        /* Add entries for the three atoms - only do VdW if we need to */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc, jj0, bLR);
 +                        }
 +                        add_j_to_nblist(coul, jj0+1, bLR);
 +                        add_j_to_nblist(coul, jj0+2, bLR);
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww, jj0, bLR);
 +                        }
 +#endif
 +                    }
 +                }
 +                else if (iwater == esolTIP4P && jwater == esolTIP4P)
 +                {
 +                    /* Interaction between two TIP4p molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw, jj0, bLR);
 +                    }
 +                    else
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST
 +                        /* Add entries for the four atoms - only do VdW if we need to */
 +                        if (bDoVdW)
 +                        {
 +                            add_j_to_nblist(vdw, jj0, bLR);
 +                        }
 +                        add_j_to_nblist(coul, jj0+1, bLR);
 +                        add_j_to_nblist(coul, jj0+2, bLR);
 +                        add_j_to_nblist(coul, jj0+3, bLR);
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww, jj0, bLR);
 +                        }
 +#endif
 +                    }
 +                }
 +                else
 +                {
 +                    /* j charge group is not water, but i is.
 +                     * Add entries to the water-other_atom lists; the geometry of the water
 +                     * molecule doesn't matter - that is taken care of in the nonbonded kernel,
 +                     * so we don't care if it is SPC or TIP4P...
 +                     */
 +
 +                    jj1 = index[jcg+1];
 +
 +                    if (!bDoVdW)
 +                    {
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                    else if (!bDoCoul)
 +                    {
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                add_j_to_nblist(vdw, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        /* _charge_ _groups_ interact with both coulomb and LJ */
 +                        /* Check which atoms we should add to the lists!       */
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (charge[jj] != 0)
 +                                {
 +                                    add_j_to_nblist(vdwc, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(vdw, jj, bLR);
 +                                }
 +                            }
 +                            else if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +#ifndef DISABLE_WATERWATER_NLIST
 +            close_i_nblist(coul_ww);
 +            close_i_nblist(vdwc_ww);
 +#endif
 +        }
 +        else
 +        {
 +            /* no solvent as i charge group */
 +            /* Loop over the atoms in the i charge group */
 +            for (i = 0; i < nicg; i++)
 +            {
 +                i_atom  = i0+i;
 +                gid     = GID(igid, jgid, ngid);
 +                qi      = charge[i_atom];
 +
 +                /* Create new i_atom for each energy group */
 +                if (bDoVdW && bDoCoul)
 +                {
 +                    new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +                }
 +                if (bDoVdW)
 +                {
 +                    new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +                }
 +                if (bDoCoul)
 +                {
 +                    new_i_nblist(coul, bLR, i_atom, shift, gid);
 +                }
 +                bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
 +                bDoCoul_i = (bDoCoul && qi != 0);
 +
 +                if (bDoVdW_i || bDoCoul_i)
 +                {
 +                    /* Loop over the j charge groups */
 +                    for (j = 0; (j < nj); j++)
 +                    {
 +                        jcg = jjcg[j];
 +
 +                        /* Check for large charge groups */
 +                        if (jcg == icg)
 +                        {
 +                            jj0 = i0 + i + 1;
 +                        }
 +                        else
 +                        {
 +                            jj0 = index[jcg];
 +                        }
 +
 +                        jj1 = index[jcg+1];
 +                        /* Finally loop over the atoms in the j-charge group */
 +                        for (jj = jj0; jj < jj1; jj++)
 +                        {
 +                            bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                            if (bNotEx)
 +                            {
 +                                if (!bDoVdW_i)
 +                                {
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i)
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc, jj, bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +                close_i_nblist(vdw);
 +                close_i_nblist(coul);
 +                close_i_nblist(vdwc);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* we are doing free energy */
 +        vdwc_free = &nlist[eNL_VDWQQ_FREE];
 +        vdw_free  = &nlist[eNL_VDW_FREE];
 +        coul_free = &nlist[eNL_QQ_FREE];
 +        /* Loop over the atoms in the i charge group */
 +        for (i = 0; i < nicg; i++)
 +        {
 +            i_atom  = i0+i;
 +            gid     = GID(igid, jgid, ngid);
 +            qi      = charge[i_atom];
 +            qiB     = chargeB[i_atom];
 +
 +            /* Create new i_atom for each energy group */
 +            if (bDoVdW && bDoCoul)
 +            {
 +                new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +            }
 +            if (bDoVdW)
 +            {
 +                new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +            }
 +            if (bDoCoul)
 +            {
 +                new_i_nblist(coul, bLR, i_atom, shift, gid);
 +            }
 +
 +            new_i_nblist(vdw_free, bLR, i_atom, shift, gid);
 +            new_i_nblist(coul_free, bLR, i_atom, shift, gid);
 +            new_i_nblist(vdwc_free, bLR, i_atom, shift, gid);
 +
 +            bDoVdW_i  = (bDoVdW  &&
 +                         (bHaveVdW[type[i_atom]] || bHaveVdW[typeB[i_atom]]));
 +            bDoCoul_i = (bDoCoul && (qi != 0 || qiB != 0));
 +            /* For TIP4P the first atom does not have a charge,
 +             * but the last three do. So we should still put an atom
 +             * without LJ but with charge in the water-atom neighborlist
 +             * for a TIP4P i charge group.
 +             * For SPC-type water the first atom has LJ and charge,
 +             * so there is no such problem.
 +             */
 +            if (iwater == esolNO)
 +            {
 +                bDoCoul_i_sol = bDoCoul_i;
 +            }
 +            else
 +            {
 +                bDoCoul_i_sol = bDoCoul;
 +            }
 +
 +            if (bDoVdW_i || bDoCoul_i_sol)
 +            {
 +                /* Loop over the j charge groups */
 +                for (j = 0; (j < nj); j++)
 +                {
 +                    jcg = jjcg[j];
 +
 +                    /* Check for large charge groups */
 +                    if (jcg == icg)
 +                    {
 +                        jj0 = i0 + i + 1;
 +                    }
 +                    else
 +                    {
 +                        jj0 = index[jcg];
 +                    }
 +
 +                    jj1 = index[jcg+1];
 +                    /* Finally loop over the atoms in the j-charge group */
 +                    bFree = bPert[i_atom];
 +                    for (jj = jj0; (jj < jj1); jj++)
 +                    {
 +                        bFreeJ = bFree || bPert[jj];
 +                        /* Complicated if, because the water H's should also
 +                         * see perturbed j-particles
 +                         */
 +                        if (iwater == esolNO || i == 0 || bFreeJ)
 +                        {
 +                            bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                            if (bNotEx)
 +                            {
 +                                if (bFreeJ)
 +                                {
 +                                    if (!bDoVdW_i)
 +                                    {
 +                                        if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(coul_free, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (!bDoCoul_i)
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]])
 +                                        {
 +                                            add_j_to_nblist(vdw_free, jj, bLR);
 +                                        }
 +                                    }
 +                                    else
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]])
 +                                        {
 +                                            if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                            {
 +                                                add_j_to_nblist(vdwc_free, jj, bLR);
 +                                            }
 +                                            else
 +                                            {
 +                                                add_j_to_nblist(vdw_free, jj, bLR);
 +                                            }
 +                                        }
 +                                        else if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(coul_free, jj, bLR);
 +                                        }
 +                                    }
 +                                }
 +                                else if (!bDoVdW_i)
 +                                {
 +                                    /* This is done whether or not bWater is set */
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i_sol)
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc, jj, bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +            close_i_nblist(vdw_free);
 +            close_i_nblist(coul_free);
 +            close_i_nblist(vdwc_free);
 +        }
 +    }
 +}
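 +
 +/* Illustrative summary of the dispatch above (an added note, not from the
 + * original source): a non-excluded j atom with both LJ parameters and
 + * charge goes to vdwc, LJ-only goes to vdw, charge-only goes to coul;
 + * with free energy, any pair involving a perturbed atom goes to the
 + * corresponding *_free list instead.
 + */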
 +
 +static void
 +put_in_list_adress(gmx_bool              bHaveVdW[],
 +                   int                   ngid,
 +                   t_mdatoms     *       md,
 +                   int                   icg,
 +                   int                   jgid,
 +                   int                   nj,
 +                   atom_id               jjcg[],
 +                   atom_id               index[],
 +                   t_excl                bExcl[],
 +                   int                   shift,
 +                   t_forcerec     *      fr,
 +                   gmx_bool              bLR,
 +                   gmx_bool              bDoVdW,
 +                   gmx_bool              bDoCoul,
 +                   int                   solvent_opt)
 +{
 +    /* The a[] index has been removed;
 +     * to put it back, i_atom should be a[i0] and jj should be a[jj].
 +     */
 +    t_nblist  *   vdwc;
 +    t_nblist  *   vdw;
 +    t_nblist  *   coul;
 +    t_nblist  *   vdwc_adress  = NULL;
 +    t_nblist  *   vdw_adress   = NULL;
 +    t_nblist  *   coul_adress  = NULL;
 +    t_nblist  *   vdwc_ww      = NULL;
 +    t_nblist  *   coul_ww      = NULL;
 +
 +    int           i, j, jcg, igid, gid, nbl_ind, nbl_ind_adress;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg, len;
 +
 +    int          *cginfo;
 +    int          *type, *typeB;
 +    real         *charge, *chargeB;
 +    real         *wf;
 +    real          qi, qiB, qq, rlj;
 +    gmx_bool      bFreeEnergy, bFree, bFreeJ, bNotEx, *bPert;
 +    gmx_bool      bDoVdW_i, bDoCoul_i, bDoCoul_i_sol;
 +    gmx_bool      b_hybrid;
 +    gmx_bool      j_all_atom;
 +    int           iwater, jwater;
 +    t_nblist     *nlist, *nlist_adress;
++    gmx_bool      bEnergyGroupCG;
 +
 +    /* Copy some pointers */
 +    cginfo  = fr->cginfo;
 +    charge  = md->chargeA;
 +    chargeB = md->chargeB;
 +    type    = md->typeA;
 +    typeB   = md->typeB;
 +    bPert   = md->bPerturbed;
 +    wf      = md->wf;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(cginfo[icg]);
 +
 +    iwater = (solvent_opt != esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
 +
 +    if (md->nPerturbed)
 +    {
 +        gmx_fatal(FARGS, "AdResS does not support free energy pertubation\n");
 +    }
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 2)
 +    {
 +        nbl_ind        = 0;
 +        nbl_ind_adress = 1;
 +    }
 +    else
 +    {
 +        nbl_ind        = fr->gid2nblists[GID(igid, jgid, ngid)];
 +        nbl_ind_adress = nbl_ind+fr->nnblists/2;
 +    }
 +    if (bLR)
 +    {
 +        nlist        = fr->nblists[nbl_ind].nlist_lr;
 +        nlist_adress = fr->nblists[nbl_ind_adress].nlist_lr;
 +    }
 +    else
 +    {
 +        nlist        = fr->nblists[nbl_ind].nlist_sr;
 +        nlist_adress = fr->nblists[nbl_ind_adress].nlist_sr;
 +    }
 +
 +
 +    vdwc = &nlist[eNL_VDWQQ];
 +    vdw  = &nlist[eNL_VDW];
 +    coul = &nlist[eNL_QQ];
 +
 +    vdwc_adress = &nlist_adress[eNL_VDWQQ];
 +    vdw_adress  = &nlist_adress[eNL_VDW];
 +    coul_adress = &nlist_adress[eNL_QQ];
 +
 +    /* We do not support solvent optimization with AdResS for now.
 +     * For this we would need hybrid solvent-other kernels.
 +     */
 +
 +    /* no solvent as i charge group */
 +    /* Loop over the atoms in the i charge group */
 +    for (i = 0; i < nicg; i++)
 +    {
 +        i_atom  = i0+i;
 +        gid     = GID(igid, jgid, ngid);
 +        qi      = charge[i_atom];
 +
 +        /* Create new i_atom for each energy group */
 +        if (bDoVdW && bDoCoul)
 +        {
 +            new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +            new_i_nblist(vdwc_adress, bLR, i_atom, shift, gid);
 +
 +        }
 +        if (bDoVdW)
 +        {
 +            new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +            new_i_nblist(vdw_adress, bLR, i_atom, shift, gid);
 +
 +        }
 +        if (bDoCoul)
 +        {
 +            new_i_nblist(coul, bLR, i_atom, shift, gid);
 +            new_i_nblist(coul_adress, bLR, i_atom, shift, gid);
 +        }
 +        bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
 +        bDoCoul_i = (bDoCoul && qi != 0);
 +
++        /* Here we find out whether the energy group interaction belongs to a
++         * coarse-grained (vsite) or an atomistic interaction. Note that, because
++         * interactions between coarse-grained and other (atomistic) energy
++         * groups are excluded automatically by grompp, it is sufficient to
++         * check the group id of atom i (igid) */
++        bEnergyGroupCG = !egp_explicit(fr, igid);
++
 +        if (bDoVdW_i || bDoCoul_i)
 +        {
 +            /* Loop over the j charge groups */
 +            for (j = 0; (j < nj); j++)
 +            {
 +                jcg = jjcg[j];
 +
 +                /* For the self charge group, start from the atom
 +                 * after i to avoid double counting */
 +                if (jcg == icg)
 +                {
 +                    jj0 = i0 + i + 1;
 +                }
 +                else
 +                {
 +                    jj0 = index[jcg];
 +                }
 +
 +                jj1 = index[jcg+1];
 +                /* Finally loop over the atoms in the j-charge group */
 +                for (jj = jj0; jj < jj1; jj++)
 +                {
 +                    bNotEx = NOTEXCL(bExcl, i, jj);
 +
-                         if (md->wf[cgs->index[icg]] == 0 && egp_explicit(fr, igid))
++                    /* Now we have to exclude interactions which will be zero
++                     * anyway due to the AdResS weights (in previous implementations
++                     * this was done in the force kernel). This is necessary as
++                     * pure interactions (those with b_hybrid=false, i.e. w_i*w_j==1 or 0)
++                     * are put into neighbour lists which will be passed to the
++                     * standard (optimized) kernels for speed. The interactions with
++                     * b_hybrid=true are placed into the _adress neighbour lists and
++                     * processed by the generic AdResS kernel.
++                     */
++                    if ((bEnergyGroupCG &&
++                         wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS) ||
++                        (!bEnergyGroupCG && wf[jj] <= GMX_REAL_EPS))
++                    {
++                        continue;
++                    }
++
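++                    /* Weight classification (illustrative note): w == 1 on
++                     * both atoms -> pure explicit pair, w == 0 on both ->
++                     * pure coarse-grained pair; any other combination lies
++                     * in the hybrid transition zone and must go to the
++                     * _adress lists below.
++                     */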
++                    b_hybrid = !((wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS) ||
++                                 (wf[i_atom] <= GMX_REAL_EPS && wf[jj] <= GMX_REAL_EPS));
 +
 +                    if (bNotEx)
 +                    {
 +                        if (!bDoVdW_i)
 +                        {
 +                            if (charge[jj] != 0)
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(coul, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(coul_adress, jj, bLR);
 +                                }
 +                            }
 +                        }
 +                        else if (!bDoCoul_i)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(vdw, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(vdw_adress, jj, bLR);
 +                                }
 +                            }
 +                        }
 +                        else
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (charge[jj] != 0)
 +                                {
 +                                    if (!b_hybrid)
 +                                    {
 +                                        add_j_to_nblist(vdwc, jj, bLR);
 +                                    }
 +                                    else
 +                                    {
 +                                        add_j_to_nblist(vdwc_adress, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (!b_hybrid)
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                    else
 +                                    {
 +                                        add_j_to_nblist(vdw_adress, jj, bLR);
 +                                    }
 +
 +                                }
 +                            }
 +                            else if (charge[jj] != 0)
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(coul, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(coul_adress, jj, bLR);
 +                                }
 +
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +            close_i_nblist(vdw_adress);
 +            close_i_nblist(coul_adress);
 +            close_i_nblist(vdwc_adress);
 +        }
 +    }
 +}
 +
 +static void
 +put_in_list_qmmm(gmx_bool              bHaveVdW[],
 +                 int                   ngid,
 +                 t_mdatoms     *       md,
 +                 int                   icg,
 +                 int                   jgid,
 +                 int                   nj,
 +                 atom_id               jjcg[],
 +                 atom_id               index[],
 +                 t_excl                bExcl[],
 +                 int                   shift,
 +                 t_forcerec     *      fr,
 +                 gmx_bool              bLR,
 +                 gmx_bool              bDoVdW,
 +                 gmx_bool              bDoCoul,
 +                 int                   solvent_opt)
 +{
 +    t_nblist  *   coul;
 +    int           i, j, jcg, igid, gid;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg;
 +    gmx_bool      bNotEx;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(fr->cginfo[icg]);
 +
 +    coul = &fr->QMMMlist;
 +
 +    /* Loop over atoms in the ith charge group */
 +    for (i = 0; i < nicg; i++)
 +    {
 +        i_atom = i0+i;
 +        gid    = GID(igid, jgid, ngid);
 +        /* Create new i_atom for each energy group */
 +        new_i_nblist(coul, bLR, i_atom, shift, gid);
 +
 +        /* Loop over the j charge groups */
 +        for (j = 0; j < nj; j++)
 +        {
 +            jcg = jjcg[j];
 +
 +            /* Charge groups cannot have QM and MM atoms simultaneously */
 +            if (jcg != icg)
 +            {
 +                jj0 = index[jcg];
 +                jj1 = index[jcg+1];
 +                /* Finally loop over the atoms in the j-charge group */
 +                for (jj = jj0; jj < jj1; jj++)
 +                {
 +                    bNotEx = NOTEXCL(bExcl, i, jj);
 +                    if (bNotEx)
 +                    {
 +                        add_j_to_nblist(coul, jj, bLR);
 +                    }
 +                }
 +            }
 +        }
 +        close_i_nblist(coul);
 +    }
 +}
 +
 +static void
 +put_in_list_cg(gmx_bool              bHaveVdW[],
 +               int                   ngid,
 +               t_mdatoms     *       md,
 +               int                   icg,
 +               int                   jgid,
 +               int                   nj,
 +               atom_id               jjcg[],
 +               atom_id               index[],
 +               t_excl                bExcl[],
 +               int                   shift,
 +               t_forcerec     *      fr,
 +               gmx_bool              bLR,
 +               gmx_bool              bDoVdW,
 +               gmx_bool              bDoCoul,
 +               int                   solvent_opt)
 +{
 +    int          cginfo;
 +    int          igid, gid, nbl_ind;
 +    t_nblist *   vdwc;
 +    int          j, jcg;
 +
 +    cginfo = fr->cginfo[icg];
 +
 +    igid = GET_CGINFO_GID(cginfo);
 +    gid  = GID(igid, jgid, ngid);
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[gid];
 +    }
 +    if (bLR)
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_lr[eNL_VDWQQ];
 +    }
 +    else
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_sr[eNL_VDWQQ];
 +    }
 +
 +    /* Make a new neighbor list for charge group icg.
 +     * Currently simply one neighbor list is made with LJ and Coulomb.
 +     * If required, zero interactions could be removed here
 +     * or in the force loop.
 +     */
 +    new_i_nblist(vdwc, bLR, index[icg], shift, gid);
 +    vdwc->iinr_end[vdwc->nri] = index[icg+1];
 +
 +    for (j = 0; (j < nj); j++)
 +    {
 +        jcg = jjcg[j];
 +        /* Skip the icg-icg pairs if all self interactions are excluded */
 +        if (!(jcg == icg && GET_CGINFO_EXCL_INTRA(cginfo)))
 +        {
 +            /* Here we add the j charge group jcg to the list,
 +             * exclusions are also added to the list.
 +             */
 +            add_j_to_nblist_cg(vdwc, index[jcg], index[jcg+1], bExcl, icg == jcg, bLR);
 +        }
 +    }
 +
 +    close_i_nblist(vdwc);
 +}
 +
 +static void setexcl(atom_id start, atom_id end, t_blocka *excl, gmx_bool b,
 +                    t_excl bexcl[])
 +{
 +    atom_id i, k;
 +
 +    if (b)
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            for (k = excl->index[i]; k < excl->index[i+1]; k++)
 +            {
 +                SETEXCL(bexcl, i-start, excl->a[k]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            for (k = excl->index[i]; k < excl->index[i+1]; k++)
 +            {
 +                RMEXCL(bexcl, i-start, excl->a[k]);
 +            }
 +        }
 +    }
 +}
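 +
 +/* Illustration of the exclusion bitmask used above: bexcl is indexed by
 + * the global j-atom number, and bit (i - start) of bexcl[j] is set when
 + * atom i of the current i charge group is excluded from interacting with
 + * j. Storing one bit per i atom is what limits charge groups to
 + * sizeof(t_excl)*8 atoms (checked in init_ns() below).
 + */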
 +
 +int calc_naaj(int icg, int cgtot)
 +{
 +    int naaj;
 +
 +    if ((cgtot % 2) == 1)
 +    {
 +        /* Odd number of charge groups, easy */
 +        naaj = 1 + (cgtot/2);
 +    }
 +    else if ((cgtot % 4) == 0)
 +    {
 +        /* Multiple of four is hard */
 +        if (icg < cgtot/2)
 +        {
 +            if ((icg % 2) == 0)
 +            {
 +                naaj = 1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj = cgtot/2;
 +            }
 +        }
 +        else
 +        {
 +            if ((icg % 2) == 1)
 +            {
 +                naaj = 1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj = cgtot/2;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* cgtot/2 = odd */
 +        if ((icg % 2) == 0)
 +        {
 +            naaj = 1+(cgtot/2);
 +        }
 +        else
 +        {
 +            naaj = cgtot/2;
 +        }
 +    }
 +#ifdef DEBUG
 +    fprintf(debug, "naaj=%d\n", naaj);
 +#endif
 +
 +    return naaj;
 +}
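 +
 +/* Worked examples for the pair counting above (illustrative, not from the
 + * original source): cgtot=5 gives naaj=3 for every icg; cgtot=6 gives
 + * naaj=4 for even icg and 3 for odd icg; cgtot=8 alternates 5/4 in the
 + * lower half and 4/5 in the upper half. In each case the sum over icg is
 + * cgtot*(cgtot-1)/2 + cgtot, i.e. every pair plus every self interaction
 + * is assigned to exactly one i charge group.
 + */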
 +
 +/************************************************
 + *
 + *  S I M P L E      C O R E     S T U F F
 + *
 + ************************************************/
 +
 +static real calc_image_tric(rvec xi, rvec xj, matrix box,
 +                            rvec b_inv, int *shift)
 +{
 +    /* This code assumes that the cut-off is smaller than
 +     * half of the smallest diagonal element of the box.
 +     */
 +    const real h25 = 2.5;
 +    real       dx, dy, dz;
 +    real       r2;
 +    int        tx, ty, tz;
 +
 +    /* Compute diff vector */
 +    dz = xj[ZZ] - xi[ZZ];
 +    dy = xj[YY] - xi[YY];
 +    dx = xj[XX] - xi[XX];
 +
 +    /* Perform the NINT operation using truncation: we first add 2.5,
 +     * then subtract 2 again
 +     */
 +    tz  = dz*b_inv[ZZ] + h25;
 +    tz -= 2;
 +    dz -= tz*box[ZZ][ZZ];
 +    dy -= tz*box[ZZ][YY];
 +    dx -= tz*box[ZZ][XX];
 +
 +    ty  = dy*b_inv[YY] + h25;
 +    ty -= 2;
 +    dy -= ty*box[YY][YY];
 +    dx -= ty*box[YY][XX];
 +
 +    tx  = dx*b_inv[XX]+h25;
 +    tx -= 2;
 +    dx -= tx*box[XX][XX];
 +
 +    /* Distance squared */
 +    r2 = (dx*dx) + (dy*dy) + (dz*dz);
 +
 +    *shift = XYZ2IS(tx, ty, tz);
 +
 +    return r2;
 +}
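 +
 +/* Note on the z,y,x order above: the box is lower triangular, so the z
 + * shift also contributes to the y and x components and must be removed
 + * first, then y (which still contributes to x), then x.
 + */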
 +
 +static real calc_image_rect(rvec xi, rvec xj, rvec box_size,
 +                            rvec b_inv, int *shift)
 +{
 +    const real h15 = 1.5;
 +    real       ddx, ddy, ddz;
 +    real       dx, dy, dz;
 +    real       r2;
 +    int        tx, ty, tz;
 +
 +    /* Compute diff vector */
 +    dx = xj[XX] - xi[XX];
 +    dy = xj[YY] - xi[YY];
 +    dz = xj[ZZ] - xi[ZZ];
 +
 +    /* Perform the NINT operation using truncation: we first add 1.5,
 +     * then subtract 1 again
 +     */
 +    tx = dx*b_inv[XX] + h15;
 +    ty = dy*b_inv[YY] + h15;
 +    tz = dz*b_inv[ZZ] + h15;
 +    tx--;
 +    ty--;
 +    tz--;
 +
 +    /* Correct diff vector for translation */
 +    ddx = tx*box_size[XX] - dx;
 +    ddy = ty*box_size[YY] - dy;
 +    ddz = tz*box_size[ZZ] - dz;
 +
 +    /* Distance squared */
 +    r2 = (ddx*ddx) + (ddy*ddy) + (ddz*ddz);
 +
 +    *shift = XYZ2IS(tx, ty, tz);
 +
 +    return r2;
 +}
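 +
 +/* Worked example of the NINT-via-trunc trick (illustrative): with
 + * box_size[XX]=3.0 and dx=-1.7, dx*b_inv[XX] = -0.567; adding 1.5 gives
 + * 0.933, which truncates to tx=0, and tx-- yields -1 = nint(-0.567).
 + * This only requires |dx*b_inv| < 1.5, which is guaranteed when the
 + * cut-off is less than half the box length.
 + */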
 +
 +static void add_simple(t_ns_buf *nsbuf, int nrj, atom_id cg_j,
 +                       gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                       int icg, int jgid, t_block *cgs, t_excl bexcl[],
 +                       int shift, t_forcerec *fr, put_in_list_t *put_in_list)
 +{
 +    if (nsbuf->nj + nrj > MAX_CG)
 +    {
 +        put_in_list(bHaveVdW, ngid, md, icg, jgid, nsbuf->ncg, nsbuf->jcg,
 +                    cgs->index, bexcl, shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +        /* Reset buffer contents */
 +        nsbuf->ncg = nsbuf->nj = 0;
 +    }
 +    nsbuf->jcg[nsbuf->ncg++] = cg_j;
 +    nsbuf->nj               += nrj;
 +}
 +
 +static void ns_inner_tric(rvec x[], int icg, int *i_egp_flags,
 +                          int njcg, atom_id jcg[],
 +                          matrix box, rvec b_inv, real rcut2,
 +                          t_block *cgs, t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                          t_excl bexcl[], t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{
 +    int       shift;
 +    int       j, nrj, jgid;
 +    int      *cginfo = fr->cginfo;
 +    atom_id   cg_j, *cgindex;
 +    t_ns_buf *nsbuf;
 +
 +    cgindex = cgs->index;
 +    shift   = CENTRAL;
 +    for (j = 0; (j < njcg); j++)
 +    {
 +        cg_j   = jcg[j];
 +        nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +        if (calc_image_tric(x[icg], x[cg_j], box, b_inv, &shift) < rcut2)
 +        {
 +            jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +            if (!(i_egp_flags[jgid] & EGP_EXCL))
 +            {
 +                add_simple(&ns_buf[jgid][shift], nrj, cg_j,
 +                           bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, shift, fr,
 +                           put_in_list);
 +            }
 +        }
 +    }
 +}
 +
 +static void ns_inner_rect(rvec x[], int icg, int *i_egp_flags,
 +                          int njcg, atom_id jcg[],
 +                          gmx_bool bBox, rvec box_size, rvec b_inv, real rcut2,
 +                          t_block *cgs, t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                          t_excl bexcl[], t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{
 +    int       shift;
 +    int       j, nrj, jgid;
 +    int      *cginfo = fr->cginfo;
 +    atom_id   cg_j, *cgindex;
 +    t_ns_buf *nsbuf;
 +
 +    cgindex = cgs->index;
 +    if (bBox)
 +    {
 +        shift = CENTRAL;
 +        for (j = 0; (j < njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if (calc_image_rect(x[icg], x[cg_j], box_size, b_inv, &shift) < rcut2)
 +            {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][shift], nrj, cg_j,
 +                               bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, shift, fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (j = 0; (j < njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if ((rcut2 == 0) || (distance2(x[icg], x[cg_j]) < rcut2))
 +            {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][CENTRAL], nrj, cg_j,
 +                               bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, CENTRAL, fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* ns_simple_core still needs to be adapted for QMMM (2005) */
 +
 +static int ns_simple_core(t_forcerec *fr,
 +                          gmx_localtop_t *top,
 +                          t_mdatoms *md,
 +                          matrix box, rvec box_size,
 +                          t_excl bexcl[], atom_id *aaj,
 +                          int ngid, t_ns_buf **ns_buf,
 +                          put_in_list_t *put_in_list, gmx_bool bHaveVdW[])
 +{
 +    int          naaj, k;
 +    real         rlist2;
 +    int          nsearch, icg, jcg, igid, i0, nri, nn;
 +    int         *cginfo;
 +    t_ns_buf    *nsbuf;
 +    /* atom_id  *i_atoms; */
 +    t_block     *cgs  = &(top->cgs);
 +    t_blocka    *excl = &(top->excls);
 +    rvec         b_inv;
 +    int          m;
 +    gmx_bool     bBox, bTriclinic;
 +    int         *i_egp_flags;
 +
 +    rlist2 = sqr(fr->rlist);
 +
 +    bBox = (fr->ePBC != epbcNONE);
 +    if (bBox)
 +    {
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            b_inv[m] = divide_err(1.0, box_size[m]);
 +        }
 +        bTriclinic = TRICLINIC(box);
 +    }
 +    else
 +    {
 +        bTriclinic = FALSE;
 +    }
 +
 +    cginfo = fr->cginfo;
 +
 +    nsearch = 0;
 +    for (icg = fr->cg0; (icg < fr->hcg); icg++)
 +    {
 +        /*
 +           i0        = cgs->index[icg];
 +           nri       = cgs->index[icg+1]-i0;
 +           i_atoms   = &(cgs->a[i0]);
 +           i_eg_excl = fr->eg_excl + ngid*md->cENER[*i_atoms];
 +           setexcl(nri,i_atoms,excl,TRUE,bexcl);
 +         */
 +        igid        = GET_CGINFO_GID(cginfo[icg]);
 +        i_egp_flags = fr->egp_flags + ngid*igid;
 +        setexcl(cgs->index[icg], cgs->index[icg+1], excl, TRUE, bexcl);
 +
 +        naaj = calc_naaj(icg, cgs->nr);
 +        if (bTriclinic)
 +        {
 +            ns_inner_tric(fr->cg_cm, icg, i_egp_flags, naaj, &(aaj[icg]),
 +                          box, b_inv, rlist2, cgs, ns_buf,
 +                          bHaveVdW, ngid, md, bexcl, fr, put_in_list);
 +        }
 +        else
 +        {
 +            ns_inner_rect(fr->cg_cm, icg, i_egp_flags, naaj, &(aaj[icg]),
 +                          bBox, box_size, b_inv, rlist2, cgs, ns_buf,
 +                          bHaveVdW, ngid, md, bexcl, fr, put_in_list);
 +        }
 +        nsearch += naaj;
 +
 +        for (nn = 0; (nn < ngid); nn++)
 +        {
 +            for (k = 0; (k < SHIFTS); k++)
 +            {
 +                nsbuf = &(ns_buf[nn][k]);
 +                if (nsbuf->ncg > 0)
 +                {
 +                    put_in_list(bHaveVdW, ngid, md, icg, nn, nsbuf->ncg, nsbuf->jcg,
 +                                cgs->index, bexcl, k, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                    nsbuf->ncg = nsbuf->nj = 0;
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg], cgs->index[icg+1], excl, FALSE, bexcl);
 +    }
 +    close_neighbor_lists(fr, FALSE);
 +
 +    return nsearch;
 +}
 +
 +/************************************************
 + *
 + *    N S 5     G R I D     S T U F F
 + *
 + ************************************************/
 +
 +static inline void get_dx(int Nx, real gridx, real rc2, int xgi, real x,
 +                          int *dx0, int *dx1, real *dcx2)
 +{
 +    real dcx, tmp;
 +    int  xgi0, xgi1, i;
 +
 +    if (xgi < 0)
 +    {
 +        *dx0 = 0;
 +        xgi0 = -1;
 +        *dx1 = -1;
 +        xgi1 = 0;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        *dx0 = Nx;
 +        xgi0 = Nx-1;
 +        *dx1 = Nx-1;
 +        xgi1 = Nx;
 +    }
 +    else
 +    {
 +        dcx2[xgi] = 0;
 +        *dx0      = xgi;
 +        xgi0      = xgi-1;
 +        *dx1      = xgi;
 +        xgi1      = xgi+1;
 +    }
 +
 +    for (i = xgi0; i >= 0; i--)
 +    {
 +        dcx = (i+1)*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        *dx0    = i;
 +        dcx2[i] = tmp;
 +    }
 +    for (i = xgi1; i < Nx; i++)
 +    {
 +        dcx = i*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        *dx1    = i;
 +        dcx2[i] = tmp;
 +    }
 +}
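 +
 +/* Example for get_dx (illustrative): Nx=10, gridx=0.5, rc2=1.0, xgi=4,
 + * x=2.25 yields dx0=2, dx1=6. Cell 2 is kept because its near edge at
 + * 1.5 is 0.75 from x (0.5625 < rc2), while cell 1's near edge at 1.0 is
 + * 1.25 away (1.5625 >= rc2); the upward scan is symmetric. The squared
 + * edge distances are cached in dcx2[] for the triple loop in nsgrid_core.
 + */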
 +
 +static inline void get_dx_dd(int Nx, real gridx, real rc2, int xgi, real x,
 +                             int ncpddc, int shift_min, int shift_max,
 +                             int *g0, int *g1, real *dcx2)
 +{
 +    real dcx, tmp;
 +    int  g_min, g_max, shift_home;
 +
 +    if (xgi < 0)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = 0;
 +        *g1   = -1;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = Nx;
 +        *g1   = Nx - 1;
 +    }
 +    else
 +    {
 +        if (ncpddc == 0)
 +        {
 +            g_min = 0;
 +            g_max = Nx - 1;
 +        }
 +        else
 +        {
 +            if (xgi < ncpddc)
 +            {
 +                shift_home = 0;
 +            }
 +            else
 +            {
 +                shift_home = -1;
 +            }
 +            g_min = (shift_min == shift_home ? 0          : ncpddc);
 +            g_max = (shift_max == shift_home ? ncpddc - 1 : Nx - 1);
 +        }
 +        if (shift_min > 0)
 +        {
 +            *g0 = g_min;
 +            *g1 = g_min - 1;
 +        }
 +        else if (shift_max < 0)
 +        {
 +            *g0 = g_max + 1;
 +            *g1 = g_max;
 +        }
 +        else
 +        {
 +            *g0       = xgi;
 +            *g1       = xgi;
 +            dcx2[xgi] = 0;
 +        }
 +    }
 +
 +    while (*g0 > g_min)
 +    {
 +        /* Check one grid cell down */
 +        dcx = ((*g0 - 1) + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g0)--;
 +        dcx2[*g0] = tmp;
 +    }
 +
 +    while (*g1 < g_max)
 +    {
 +        /* Check one grid cell up */
 +        dcx = (*g1 + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g1)++;
 +        dcx2[*g1] = tmp;
 +    }
 +}
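 +
 +/* With domain decomposition a grid dimension can hold two zone parts:
 + * cells [0,ncpddc) have home shift 0 and cells [ncpddc,Nx) home shift -1
 + * (shift_home above). g_min and g_max then restrict the scan to the
 + * cells compatible with the allowed shift range passed in as
 + * shift_min/shift_max (sh0/sh1 at the call site).
 + */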
 +
 +
 +#define sqr(x) ((x)*(x))
 +#define calc_dx2(XI, YI, ZI, y) (sqr(XI-y[XX]) + sqr(YI-y[YY]) + sqr(ZI-y[ZZ]))
 +#define calc_cyl_dx2(XI, YI, y) (sqr(XI-y[XX]) + sqr(YI-y[YY]))
 +/****************************************************
 + *
 + *    F A S T   N E I G H B O R  S E A R C H I N G
 + *
 + *    Optimized neighboursearching routine using grid
 + *    at least 1x1x1, see GROMACS manual
 + *
 + ****************************************************/
 +
 +
 +static void get_cutoff2(t_forcerec *fr, gmx_bool bDoLongRange,
 +                        real *rvdw2, real *rcoul2,
 +                        real *rs2, real *rm2, real *rl2)
 +{
 +    *rs2 = sqr(fr->rlist);
 +
 +    if (bDoLongRange && fr->bTwinRange)
 +    {
 +        /* The VdW and electrostatic long-range cut-offs could be different,
 +         * so we cannot simply set them to rlistlong.
 +         */
 +        if (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(fr->vdwtype) &&
 +            fr->rvdw > fr->rlist)
 +        {
 +            *rvdw2  = sqr(fr->rlistlong);
 +        }
 +        else
 +        {
 +            *rvdw2  = sqr(fr->rvdw);
 +        }
 +        if (EEL_MIGHT_BE_ZERO_AT_CUTOFF(fr->eeltype) &&
 +            fr->rcoulomb > fr->rlist)
 +        {
 +            *rcoul2 = sqr(fr->rlistlong);
 +        }
 +        else
 +        {
 +            *rcoul2 = sqr(fr->rcoulomb);
 +        }
 +    }
 +    else
 +    {
 +        /* Workaround for a gcc -O3 or -ffast-math problem */
 +        *rvdw2  = *rs2;
 +        *rcoul2 = *rs2;
 +    }
 +    *rm2 = min(*rvdw2, *rcoul2);
 +    *rl2 = max(*rvdw2, *rcoul2);
 +}
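 +
 +/* Example (illustrative, assuming plain cut-offs): rlist=0.9, rvdw=1.2,
 + * rcoulomb=1.4 (rlistlong=1.4) give rs2=0.81, rm2=1.44, rl2=1.96.
 + * nsgrid_core then sorts pairs into the short-range list (r2 < rs2),
 + * the LJ+Coulomb long-range list (r2 < rm2) and the single-interaction
 + * long-range list (r2 < rl2).
 + */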
 +
 +static void init_nsgrid_lists(t_forcerec *fr, int ngid, gmx_ns_t *ns)
 +{
 +    real rvdw2, rcoul2, rs2, rm2, rl2;
 +    int  j;
 +
 +    get_cutoff2(fr, TRUE, &rvdw2, &rcoul2, &rs2, &rm2, &rl2);
 +
 +    /* Short range buffers */
 +    snew(ns->nl_sr, ngid);
 +    /* Counters */
 +    snew(ns->nsr, ngid);
 +    snew(ns->nlr_ljc, ngid);
 +    snew(ns->nlr_one, ngid);
 +
 +    /* Always allocate both list types, since rcoulomb might now change with PME load balancing */
 +    /* Long range VdW and Coul buffers */
 +    snew(ns->nl_lr_ljc, ngid);
 +    /* Long range VdW or Coul only buffers */
 +    snew(ns->nl_lr_one, ngid);
 +
 +    for (j = 0; (j < ngid); j++)
 +    {
 +        snew(ns->nl_sr[j], MAX_CG);
 +        snew(ns->nl_lr_ljc[j], MAX_CG);
 +        snew(ns->nl_lr_one[j], MAX_CG);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "ns5_core: rs2 = %g, rm2 = %g, rl2 = %g (nm^2)\n",
 +                rs2, rm2, rl2);
 +    }
 +}
 +
 +static int nsgrid_core(FILE *log, t_commrec *cr, t_forcerec *fr,
 +                       matrix box, rvec box_size, int ngid,
 +                       gmx_localtop_t *top,
 +                       t_grid *grid, rvec x[],
 +                       t_excl bexcl[], gmx_bool *bExcludeAlleg,
 +                       t_nrnb *nrnb, t_mdatoms *md,
 +                       real *lambda, real *dvdlambda,
 +                       gmx_grppairener_t *grppener,
 +                       put_in_list_t *put_in_list,
 +                       gmx_bool bHaveVdW[],
 +                       gmx_bool bDoLongRange, gmx_bool bMakeQMMMnblist)
 +{
 +    gmx_ns_t     *ns;
 +    atom_id     **nl_lr_ljc, **nl_lr_one, **nl_sr;
 +    int          *nlr_ljc, *nlr_one, *nsr;
 +    gmx_domdec_t *dd     = NULL;
 +    t_block      *cgs    = &(top->cgs);
 +    int          *cginfo = fr->cginfo;
 +    /* atom_id *i_atoms,*cgsindex=cgs->index; */
 +    ivec          sh0, sh1, shp;
 +    int           cell_x, cell_y, cell_z;
 +    int           d, tx, ty, tz, dx, dy, dz, cj;
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    int           zsh_ty, zsh_tx, ysh_tx;
 +#endif
 +    int           dx0, dx1, dy0, dy1, dz0, dz1;
 +    int           Nx, Ny, Nz, shift = -1, j, nrj, nns, nn = -1;
 +    real          gridx, gridy, gridz, grid_x, grid_y, grid_z;
 +    real         *dcx2, *dcy2, *dcz2;
 +    int           zgi, ygi, xgi;
 +    int           cg0, cg1, icg = -1, cgsnr, i0, igid, nri, naaj, max_jcg;
 +    int           jcg0, jcg1, jjcg, cgj0, jgid;
 +    int          *grida, *gridnra, *gridind;
 +    gmx_bool      rvdw_lt_rcoul, rcoul_lt_rvdw;
 +    rvec          xi, *cgcm, grid_offset;
 +    real          r2, rs2, rvdw2, rcoul2, rm2, rl2, XI, YI, ZI, dcx, dcy, dcz, tmp1, tmp2;
 +    int          *i_egp_flags;
 +    gmx_bool      bDomDec, bTriclinicX, bTriclinicY;
 +    ivec          ncpddc;
 +
 +    ns = &fr->ns;
 +
 +    bDomDec = DOMAINDECOMP(cr);
 +    if (bDomDec)
 +    {
 +        dd = cr->dd;
 +    }
 +
 +    bTriclinicX = ((YY < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[YY] == 1) && box[YY][XX] != 0) ||
 +                   (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ] == 1) && box[ZZ][XX] != 0));
 +    bTriclinicY =  (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ] == 1) && box[ZZ][YY] != 0);
 +
 +    cgsnr    = cgs->nr;
 +
 +    get_cutoff2(fr, bDoLongRange, &rvdw2, &rcoul2, &rs2, &rm2, &rl2);
 +
 +    rvdw_lt_rcoul = (rvdw2 >= rcoul2);
 +    rcoul_lt_rvdw = (rcoul2 >= rvdw2);
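 +    /* Despite the "_lt_" names, the tests above are >=: each flag records
 +     * whether VdW (resp. Coulomb) has the longer cut-off and therefore
 +     * still needs to be evaluated for pairs between rm2 and rl2, where
 +     * the flags are passed as bDoVdW/bDoCoul for the "one" long-range
 +     * list below.
 +     */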
 +
 +    if (bMakeQMMMnblist)
 +    {
 +        rm2 = rl2;
 +        rs2 = rl2;
 +    }
 +
 +    nl_sr     = ns->nl_sr;
 +    nsr       = ns->nsr;
 +    nl_lr_ljc = ns->nl_lr_ljc;
 +    nl_lr_one = ns->nl_lr_one;
 +    nlr_ljc   = ns->nlr_ljc;
 +    nlr_one   = ns->nlr_one;
 +
 +    /* Unpack arrays */
 +    cgcm    = fr->cg_cm;
 +    Nx      = grid->n[XX];
 +    Ny      = grid->n[YY];
 +    Nz      = grid->n[ZZ];
 +    grida   = grid->a;
 +    gridind = grid->index;
 +    gridnra = grid->nra;
 +    nns     = 0;
 +
 +    gridx      = grid->cell_size[XX];
 +    gridy      = grid->cell_size[YY];
 +    gridz      = grid->cell_size[ZZ];
 +    grid_x     = 1/gridx;
 +    grid_y     = 1/gridy;
 +    grid_z     = 1/gridz;
 +    copy_rvec(grid->cell_offset, grid_offset);
 +    copy_ivec(grid->ncpddc, ncpddc);
 +    dcx2       = grid->dcx2;
 +    dcy2       = grid->dcy2;
 +    dcz2       = grid->dcz2;
 +
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    zsh_ty = floor(-box[ZZ][YY]/box[YY][YY]+0.5);
 +    zsh_tx = floor(-box[ZZ][XX]/box[XX][XX]+0.5);
 +    ysh_tx = floor(-box[YY][XX]/box[XX][XX]+0.5);
 +    if (zsh_tx != 0 && ysh_tx != 0)
 +    {
 +        /* This could happen due to rounding, when both ratios are 0.5 */
 +        ysh_tx = 0;
 +    }
 +#endif
 +
 +    debug_gmx();
 +
 +    if (fr->n_tpi)
 +    {
 +        /* We only want a list for the test particle */
 +        cg0 = cgsnr - 1;
 +    }
 +    else
 +    {
 +        cg0 = grid->icg0;
 +    }
 +    cg1 = grid->icg1;
 +
 +    /* Set the shift range */
 +    for (d = 0; d < DIM; d++)
 +    {
 +        sh0[d] = -1;
 +        sh1[d] = 1;
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(fr->ePBC) || (bDomDec && dd->nc[d] > 1))
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
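 +    /* Note on shp[d] == 2 above: for a strongly skewed triclinic box,
 +     * box_xx minus the magnitudes of the x off-diagonal components of the
 +     * y and z box vectors can be shorter than the cut-off, so x images
 +     * two shifts away may still be within range.
 +     */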
 +
 +    /* Loop over charge groups */
 +    for (icg = cg0; (icg < cg1); icg++)
 +    {
 +        igid = GET_CGINFO_GID(cginfo[icg]);
 +        /* Skip this charge group if all energy groups are excluded! */
 +        if (bExcludeAlleg[igid])
 +        {
 +            continue;
 +        }
 +
 +        i0   = cgs->index[icg];
 +
 +        if (bMakeQMMMnblist)
 +        {
 +            /* Skip this charge group if its first atom is not a QM atom
 +             * while making a QM/MM neighbourlist
 +             */
 +            if (md->bQM[i0] == FALSE)
 +            {
 +                continue; /* MM particle, go to next particle */
 +            }
 +
 +            /* Compute the number of charge groups that fall within the control
 +             * of this one (icg)
 +             */
 +            naaj    = calc_naaj(icg, cgsnr);
 +            jcg0    = icg;
 +            jcg1    = icg + naaj;
 +            max_jcg = cgsnr;
 +        }
 +        else
 +        {
 +            /* make a normal neighbourlist */
 +
 +            if (bDomDec)
 +            {
 +                /* Get the j charge-group and dd cell shift ranges */
 +                dd_get_ns_ranges(cr->dd, icg, &jcg0, &jcg1, sh0, sh1);
 +                max_jcg = 0;
 +            }
 +            else
 +            {
 +                /* Compute the number of charge groups that fall within the control
 +                 * of this one (icg)
 +                 */
 +                naaj = calc_naaj(icg, cgsnr);
 +                jcg0 = icg;
 +                jcg1 = icg + naaj;
 +
 +                if (fr->n_tpi)
 +                {
 +                    /* The i-particle is always the test particle,
 +                     * so we want all j-particles
 +                     */
 +                    max_jcg = cgsnr - 1;
 +                }
 +                else
 +                {
 +                    max_jcg  = jcg1 - cgsnr;
 +                }
 +            }
 +        }
 +
 +        i_egp_flags = fr->egp_flags + igid*ngid;
 +
 +        /* Set the exclusions for the atoms in charge group icg using a bitmask */
 +        setexcl(i0, cgs->index[icg+1], &top->excls, TRUE, bexcl);
 +
 +        ci2xyz(grid, icg, &cell_x, &cell_y, &cell_z);
 +
 +        /* Changed iicg to icg, DvdS 990115
 +         * (but see consistency check above, DvdS 990330)
 +         */
 +#ifdef NS5DB
 +        fprintf(log, "icg=%5d, naaj=%5d, cell %d %d %d\n",
 +                icg, naaj, cell_x, cell_y, cell_z);
 +#endif
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
 +        {
 +            ZI = cgcm[icg][ZZ]+tz*box[ZZ][ZZ];
 +            /* Calculate range of cells in Z direction that have the shift tz */
 +            zgi = cell_z + tz*Nz;
 +#define FAST_DD_NS
 +#ifndef FAST_DD_NS
 +            get_dx(Nz, gridz, rl2, zgi, ZI, &dz0, &dz1, dcz2);
 +#else
 +            get_dx_dd(Nz, gridz, rl2, zgi, ZI-grid_offset[ZZ],
 +                      ncpddc[ZZ], sh0[ZZ], sh1[ZZ], &dz0, &dz1, dcz2);
 +#endif
 +            if (dz0 > dz1)
 +            {
 +                continue;
 +            }
 +            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
 +            {
 +                YI = cgcm[icg][YY]+ty*box[YY][YY]+tz*box[ZZ][YY];
 +                /* Calculate range of cells in Y direction that have the shift ty */
 +                if (bTriclinicY)
 +                {
 +                    ygi = (int)(Ny + (YI - grid_offset[YY])*grid_y) - Ny;
 +                }
 +                else
 +                {
 +                    ygi = cell_y + ty*Ny;
 +                }
 +#ifndef FAST_DD_NS
 +                get_dx(Ny, gridy, rl2, ygi, YI, &dy0, &dy1, dcy2);
 +#else
 +                get_dx_dd(Ny, gridy, rl2, ygi, YI-grid_offset[YY],
 +                          ncpddc[YY], sh0[YY], sh1[YY], &dy0, &dy1, dcy2);
 +#endif
 +                if (dy0 > dy1)
 +                {
 +                    continue;
 +                }
 +                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
 +                {
 +                    XI = cgcm[icg][XX]+tx*box[XX][XX]+ty*box[YY][XX]+tz*box[ZZ][XX];
 +                    /* Calculate range of cells in X direction that have the shift tx */
 +                    if (bTriclinicX)
 +                    {
 +                        xgi = (int)(Nx + (XI - grid_offset[XX])*grid_x) - Nx;
 +                    }
 +                    else
 +                    {
 +                        xgi = cell_x + tx*Nx;
 +                    }
 +#ifndef FAST_DD_NS
 +                    get_dx(Nx, gridx, rl2, xgi, XI, &dx0, &dx1, dcx2);
 +#else
 +                    get_dx_dd(Nx, gridx, rl2, xgi, XI-grid_offset[XX],
 +                              ncpddc[XX], sh0[XX], sh1[XX], &dx0, &dx1, dcx2);
 +#endif
 +                    if (dx0 > dx1)
 +                    {
 +                        continue;
 +                    }
 +                    /* AdResS: an explicit cg that has a weighting function of 0 is
 +                     * excluded from the neighbour list as it will not interact */
 +                    if (fr->adress_type != eAdressOff)
 +                    {
++                        if (md->wf[cgs->index[icg]] <= GMX_REAL_EPS && egp_explicit(fr, igid))
 +                        {
 +                            continue;
 +                        }
 +                    }
 +                    /* Get shift vector */
 +                    shift = XYZ2IS(tx, ty, tz);
 +#ifdef NS5DB
 +                    range_check(shift, 0, SHIFTS);
 +#endif
 +                    for (nn = 0; (nn < ngid); nn++)
 +                    {
 +                        nsr[nn]      = 0;
 +                        nlr_ljc[nn]  = 0;
 +                        nlr_one[nn]  = 0;
 +                    }
 +#ifdef NS5DB
 +                    fprintf(log, "shift: %2d, dx0,1: %2d,%2d, dy0,1: %2d,%2d, dz0,1: %2d,%2d\n",
 +                            shift, dx0, dx1, dy0, dy1, dz0, dz1);
 +                    fprintf(log, "cgcm: %8.3f  %8.3f  %8.3f\n", cgcm[icg][XX],
 +                            cgcm[icg][YY], cgcm[icg][ZZ]);
 +                    fprintf(log, "xi:   %8.3f  %8.3f  %8.3f\n", XI, YI, ZI);
 +#endif
 +                    for (dx = dx0; (dx <= dx1); dx++)
 +                    {
 +                        tmp1 = rl2 - dcx2[dx];
 +                        for (dy = dy0; (dy <= dy1); dy++)
 +                        {
 +                            tmp2 = tmp1 - dcy2[dy];
 +                            if (tmp2 > 0)
 +                            {
 +                                for (dz = dz0; (dz <= dz1); dz++)
 +                                {
 +                                    if (tmp2 > dcz2[dz])
 +                                    {
 +                                        /* Find grid-cell cj in which possible neighbours are */
 +                                        cj   = xyz2ci(Ny, Nz, dx, dy, dz);
 +
 +                                        /* Check how many cgs (nrj) there are in this cell */
 +                                        nrj  = gridnra[cj];
 +
 +                                        /* Find the offset in the cg list */
 +                                        cgj0 = gridind[cj];
 +
 +                                        /* Check if all j's are out of range so we
 +                                         * can skip the whole cell.
 +                                         * Should save some time, especially with DD.
 +                                         */
 +                                        if (nrj == 0 ||
 +                                            (grida[cgj0] >= max_jcg &&
 +                                             (grida[cgj0] >= jcg1 || grida[cgj0+nrj-1] < jcg0)))
 +                                        {
 +                                            continue;
 +                                        }
 +
 +                                        /* Loop over cgs */
 +                                        for (j = 0; (j < nrj); j++)
 +                                        {
 +                                            jjcg = grida[cgj0+j];
 +
 +                                            /* check whether this guy is in range! */
 +                                            if ((jjcg >= jcg0 && jjcg < jcg1) ||
 +                                                (jjcg < max_jcg))
 +                                            {
 +                                                r2 = calc_dx2(XI, YI, ZI, cgcm[jjcg]);
 +                                                if (r2 < rl2)
 +                                                {
 +                                                    /* jgid = gid[cgsatoms[cgsindex[jjcg]]]; */
 +                                                    jgid = GET_CGINFO_GID(cginfo[jjcg]);
 +                                                    /* check energy group exclusions */
 +                                                    if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                                                    {
 +                                                        if (r2 < rs2)
 +                                                        {
 +                                                            if (nsr[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to short-range list */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nsr[jgid], nl_sr[jgid],
 +                                                                            cgs->index, /* cgsatoms, */ bexcl,
 +                                                                            shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                                                                nsr[jgid] = 0;
 +                                                            }
 +                                                            nl_sr[jgid][nsr[jgid]++] = jjcg;
 +                                                        }
 +                                                        else if (r2 < rm2)
 +                                                        {
 +                                                            if (nlr_ljc[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to LJ+coulomb long-range list */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nlr_ljc[jgid], nl_lr_ljc[jgid], top->cgs.index,
 +                                                                            bexcl, shift, fr, TRUE, TRUE, TRUE, fr->solvent_opt);
 +                                                                nlr_ljc[jgid] = 0;
 +                                                            }
 +                                                            nl_lr_ljc[jgid][nlr_ljc[jgid]++] = jjcg;
 +                                                        }
 +                                                        else
 +                                                        {
 +                                                            if (nlr_one[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to long-range list with only coul, or only LJ */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nlr_one[jgid], nl_lr_one[jgid], top->cgs.index,
 +                                                                            bexcl, shift, fr, TRUE, rvdw_lt_rcoul, rcoul_lt_rvdw, fr->solvent_opt);
 +                                                                nlr_one[jgid] = 0;
 +                                                            }
 +                                                            nl_lr_one[jgid][nlr_one[jgid]++] = jjcg;
 +                                                        }
 +                                                    }
 +                                                }
 +                                                nns++;
 +                                            }
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                    /* CHECK whether there is anything left in the buffers */
 +                    for (nn = 0; (nn < ngid); nn++)
 +                    {
 +                        if (nsr[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nsr[nn], nl_sr[nn],
 +                                        cgs->index, /* cgsatoms, */ bexcl,
 +                                        shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                        }
 +
 +                        if (nlr_ljc[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nlr_ljc[nn],
 +                                        nl_lr_ljc[nn], top->cgs.index,
 +                                        bexcl, shift, fr, TRUE, TRUE, TRUE, fr->solvent_opt);
 +                        }
 +
 +                        if (nlr_one[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nlr_one[nn],
 +                                        nl_lr_one[nn], top->cgs.index,
 +                                        bexcl, shift, fr, TRUE, rvdw_lt_rcoul, rcoul_lt_rvdw, fr->solvent_opt);
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,&top->atoms.excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg], cgs->index[icg+1], &top->excls, FALSE, bexcl);
 +    }
 +    /* No need to perform any left-over force calculations anymore (as we used to do here)
 +     * since we now save the proper long-range lists for later evaluation.
 +     */
 +
 +    debug_gmx();
 +
 +    /* Close neighbourlists */
 +    close_neighbor_lists(fr, bMakeQMMMnblist);
 +
 +    return nns;
 +}
 +
 +void ns_realloc_natoms(gmx_ns_t *ns, int natoms)
 +{
 +    int i;
 +
 +    if (natoms > ns->nra_alloc)
 +    {
 +        ns->nra_alloc = over_alloc_dd(natoms);
 +        srenew(ns->bexcl, ns->nra_alloc);
 +        for (i = 0; i < ns->nra_alloc; i++)
 +        {
 +            ns->bexcl[i] = 0;
 +        }
 +    }
 +}
 +
 +void init_ns(FILE *fplog, const t_commrec *cr,
 +             gmx_ns_t *ns, t_forcerec *fr,
 +             const gmx_mtop_t *mtop,
 +             matrix box)
 +{
 +    int  mt, icg, nr_in_cg, maxcg, i, j, jcg, ngid, ncg;
 +    t_block *cgs;
 +    char *ptr;
 +
 +    /* Compute the largest charge group size (# atoms) */
 +    nr_in_cg = 1;
 +    for (mt = 0; mt < mtop->nmoltype; mt++)
 +    {
 +        cgs = &mtop->moltype[mt].cgs;
 +        for (icg = 0; (icg < cgs->nr); icg++)
 +        {
 +            nr_in_cg = max(nr_in_cg, (int)(cgs->index[icg+1]-cgs->index[icg]));
 +        }
 +    }
 +
 +    /* Verify whether the largest charge group is <= max cg.
 +     * This is determined by the local exclusion type:
 +     * exclusions are stored in bits. (If the type is not large
 +     * enough, enlarge it: unsigned char -> unsigned short -> unsigned long)
 +     */
 +    maxcg = sizeof(t_excl)*8;
 +    if (nr_in_cg > maxcg)
 +    {
 +        gmx_fatal(FARGS, "Max #atoms in a charge group: %d > %d\n",
 +                  nr_in_cg, maxcg);
 +    }
 +
 +    ngid = mtop->groups.grps[egcENER].nr;
 +    snew(ns->bExcludeAlleg, ngid);
 +    for (i = 0; i < ngid; i++)
 +    {
 +        ns->bExcludeAlleg[i] = TRUE;
 +        for (j = 0; j < ngid; j++)
 +        {
 +            if (!(fr->egp_flags[i*ngid+j] & EGP_EXCL))
 +            {
 +                ns->bExcludeAlleg[i] = FALSE;
 +            }
 +        }
 +    }
 +
 +    if (fr->bGrid)
 +    {
 +        /* Grid search */
 +        ns->grid = init_grid(fplog, fr);
 +        init_nsgrid_lists(fr, ngid, ns);
 +    }
 +    else
 +    {
 +        /* Simple search */
 +        snew(ns->ns_buf, ngid);
 +        for (i = 0; (i < ngid); i++)
 +        {
 +            snew(ns->ns_buf[i], SHIFTS);
 +        }
 +        ncg = ncg_mtop(mtop);
 +        snew(ns->simple_aaj, 2*ncg);
 +        for (jcg = 0; (jcg < ncg); jcg++)
 +        {
 +            ns->simple_aaj[jcg]     = jcg;
 +            ns->simple_aaj[jcg+ncg] = jcg;
 +        }
 +    }
 +
 +    /* Create array that determines whether or not atoms have VdW */
 +    snew(ns->bHaveVdW, fr->ntype);
 +    for (i = 0; (i < fr->ntype); i++)
 +    {
 +        for (j = 0; (j < fr->ntype); j++)
 +        {
 +            ns->bHaveVdW[i] = (ns->bHaveVdW[i] ||
 +                               (fr->bBHAM ?
 +                                ((BHAMA(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (BHAMB(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (BHAMC(fr->nbfp, fr->ntype, i, j) != 0)) :
 +                                ((C6(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (C12(fr->nbfp, fr->ntype, i, j) != 0))));
 +        }
 +    }
 +    if (debug)
 +    {
 +        pr_bvec(debug, 0, "bHaveVdW", ns->bHaveVdW, fr->ntype, TRUE);
 +    }
 +
 +    ns->nra_alloc = 0;
 +    ns->bexcl     = NULL;
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* This could be reduced with particle decomposition */
 +        ns_realloc_natoms(ns, mtop->natoms);
 +    }
 +
 +    ns->nblist_initialized = FALSE;
 +
 +    /* nbr list debug dump */
 +    {
 +        char *ptr = getenv("GMX_DUMP_NL");
 +        if (ptr)
 +        {
 +            ns->dump_nl = strtol(ptr, NULL, 10);
 +            if (fplog)
 +            {
 +                fprintf(fplog, "GMX_DUMP_NL = %d", ns->dump_nl);
 +            }
 +        }
 +        else
 +        {
 +            ns->dump_nl = 0;
 +        }
 +    }
 +}
 +
 +
 +int search_neighbours(FILE *log, t_forcerec *fr,
 +                      rvec x[], matrix box,
 +                      gmx_localtop_t *top,
 +                      gmx_groups_t *groups,
 +                      t_commrec *cr,
 +                      t_nrnb *nrnb, t_mdatoms *md,
 +                      real *lambda, real *dvdlambda,
 +                      gmx_grppairener_t *grppener,
 +                      gmx_bool bFillGrid,
 +                      gmx_bool bDoLongRangeNS,
 +                      gmx_bool bPadListsForKernels)
 +{
 +    t_block  *cgs = &(top->cgs);
 +    rvec     box_size, grid_x0, grid_x1;
 +    int      i, j, m, ngid;
 +    real     min_size, grid_dens;
 +    int      nsearch;
 +    gmx_bool bGrid;
 +    char     *ptr;
 +    gmx_bool *i_egp_flags;
 +    int      cg_start, cg_end, start, end;
 +    gmx_ns_t *ns;
 +    t_grid   *grid;
 +    gmx_domdec_zones_t *dd_zones;
 +    put_in_list_t *put_in_list;
 +
 +    ns = &fr->ns;
 +
 +    /* Set some local variables */
 +    bGrid = fr->bGrid;
 +    ngid  = groups->grps[egcENER].nr;
 +
 +    for (m = 0; (m < DIM); m++)
 +    {
 +        box_size[m] = box[m][m];
 +    }
 +
 +    if (fr->ePBC != epbcNONE)
 +    {
 +        if (sqr(fr->rlistlong) >= max_cutoff2(fr->ePBC, box))
 +        {
 +            gmx_fatal(FARGS, "One of the box vectors has become shorter than twice the cut-off length or box_yy-|box_zy| or box_zz has become smaller than the cut-off.");
 +        }
 +        if (!bGrid)
 +        {
 +            min_size = min(box_size[XX], min(box_size[YY], box_size[ZZ]));
 +            if (2*fr->rlistlong >= min_size)
 +            {
 +                gmx_fatal(FARGS, "One of the box diagonal elements has become smaller than twice the cut-off length.");
 +            }
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        ns_realloc_natoms(ns, cgs->index[cgs->nr]);
 +    }
 +    debug_gmx();
 +
 +    /* Reset the neighbourlists */
 +    reset_neighbor_lists(fr, TRUE, TRUE);
 +
 +    if (bGrid && bFillGrid)
 +    {
 +
 +        grid = ns->grid;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_zones = domdec_zones(cr->dd);
 +        }
 +        else
 +        {
 +            dd_zones = NULL;
 +
 +            get_nsgrid_boundaries(grid->nboundeddim, box, NULL, NULL, NULL, NULL,
 +                                  cgs->nr, fr->cg_cm, grid_x0, grid_x1, &grid_dens);
 +
 +            grid_first(log, grid, NULL, NULL, fr->ePBC, box, grid_x0, grid_x1,
 +                       fr->rlistlong, grid_dens);
 +        }
 +        debug_gmx();
 +
 +        /* Don't know why this all is... (DvdS 3/99) */
 +#ifndef SEGV
 +        start = 0;
 +        end   = cgs->nr;
 +#else
 +        start = fr->cg0;
 +        end   = (cgs->nr+1)/2;
 +#endif
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            end = cgs->nr;
 +            fill_grid(log, dd_zones, grid, end, -1, end, fr->cg_cm);
 +            grid->icg0 = 0;
 +            grid->icg1 = dd_zones->izone[dd_zones->nizone-1].cg1;
 +        }
 +        else
 +        {
 +            fill_grid(log, NULL, grid, cgs->nr, fr->cg0, fr->hcg, fr->cg_cm);
 +            grid->icg0 = fr->cg0;
 +            grid->icg1 = fr->hcg;
 +            debug_gmx();
 +
 +            if (PARTDECOMP(cr))
 +            {
 +                mv_grid(cr, grid);
 +            }
 +            debug_gmx();
 +        }
 +
 +        calc_elemnr(log, grid, start, end, cgs->nr);
 +        calc_ptrs(grid);
 +        grid_last(log, grid, start, end, cgs->nr);
 +
 +        if (gmx_debug_at)
 +        {
 +            check_grid(debug, grid);
 +            print_grid(debug, grid);
 +        }
 +    }
 +    else if (fr->n_tpi)
 +    {
 +        /* Set the grid cell index for the test particle only.
 +         * The cell to cg index is not corrected, but that does not matter.
 +         */
 +        fill_grid(log, NULL, ns->grid, fr->hcg, fr->hcg-1, fr->hcg, fr->cg_cm);
 +    }
 +    debug_gmx();
 +
 +    if (fr->adress_type == eAdressOff)
 +    {
 +        if (!fr->ns.bCGlist)
 +        {
 +            put_in_list = put_in_list_at;
 +        }
 +        else
 +        {
 +            put_in_list = put_in_list_cg;
 +        }
 +    }
 +    else
 +    {
 +        put_in_list = put_in_list_adress;
 +    }
 +
 +    /* Do the core! */
 +    if (bGrid)
 +    {
 +        grid    = ns->grid;
 +        nsearch = nsgrid_core(log, cr, fr, box, box_size, ngid, top,
 +                              grid, x, ns->bexcl, ns->bExcludeAlleg,
 +                              nrnb, md, lambda, dvdlambda, grppener,
 +                              put_in_list, ns->bHaveVdW,
 +                              bDoLongRangeNS, FALSE);
 +
 +        /* Neighbour searching without QMMM! QM atoms have zero charge in
 +         * the classical calculation. The charge-charge interaction
 +         * between QM and MM atoms is handled in the QMMM core calculation
 +         * (see QMMM.c). The VdW, however, we'd like to compute classically,
 +         * and the QM-MM atom pairs have just been put in the
 +         * corresponding neighbourlists. In case of QMMM we still need to
 +         * fill a special QMMM neighbourlist that contains all neighbours
 +         * of the QM atoms. If bQMMM is true, this list will now be made:
 +         */
 +        if (fr->bQMMM && fr->qr->QMMMscheme != eQMMMschemeoniom)
 +        {
 +            nsearch += nsgrid_core(log, cr, fr, box, box_size, ngid, top,
 +                                   grid, x, ns->bexcl, ns->bExcludeAlleg,
 +                                   nrnb, md, lambda, dvdlambda, grppener,
 +                                   put_in_list_qmmm, ns->bHaveVdW,
 +                                   bDoLongRangeNS, TRUE);
 +        }
 +    }
 +    else
 +    {
 +        nsearch = ns_simple_core(fr, top, md, box, box_size,
 +                                 ns->bexcl, ns->simple_aaj,
 +                                 ngid, ns->ns_buf, put_in_list, ns->bHaveVdW);
 +    }
 +    debug_gmx();
 +
 +#ifdef DEBUG
 +    pr_nsblock(log);
 +#endif
 +
 +    inc_nrnb(nrnb, eNR_NS, nsearch);
 +    /* inc_nrnb(nrnb,eNR_LR,fr->nlr); */
 +
 +    return nsearch;
 +}
 +
 +int natoms_beyond_ns_buffer(t_inputrec *ir, t_forcerec *fr, t_block *cgs,
 +                            matrix scale_tot, rvec *x)
 +{
 +    int  cg0, cg1, cg, a0, a1, a, i, j;
 +    real rint, hbuf2, scale;
 +    rvec *cg_cm, cgsc;
 +    gmx_bool bIsotropic;
 +    int  nBeyond;
 +
 +    nBeyond = 0;
 +
 +    rint = max(ir->rcoulomb, ir->rvdw);
 +    if (ir->rlist < rint)
 +    {
 +        gmx_fatal(FARGS, "The neighbor search buffer has negative size: %f nm",
 +                  ir->rlist - rint);
 +    }
 +    cg_cm = fr->cg_cm;
 +
 +    cg0 = fr->cg0;
 +    cg1 = fr->hcg;
 +
 +    if (!EI_DYNAMICS(ir->eI) || !DYNAMIC_BOX(*ir))
 +    {
 +        hbuf2 = sqr(0.5*(ir->rlist - rint));
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            a0 = cgs->index[cg];
 +            a1 = cgs->index[cg+1];
 +            for (a = a0; a < a1; a++)
 +            {
 +                if (distance2(cg_cm[cg], x[a]) > hbuf2)
 +                {
 +                    nBeyond++;
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        bIsotropic = TRUE;
 +        scale      = scale_tot[0][0];
 +        for (i = 1; i < DIM; i++)
 +        {
 +            /* With anisotropic scaling, the original spherical ns volumes become
 +             * ellipsoids. To avoid costly transformations we use the minimum
 +             * eigenvalue of the scaling matrix for determining the buffer size.
 +             * Since the lower half is 0, the eigenvalues are the diagonal elements.
 +             */
 +            scale = min(scale, scale_tot[i][i]);
 +            if (scale_tot[i][i] != scale_tot[i-1][i-1])
 +            {
 +                bIsotropic = FALSE;
 +            }
 +            for (j = 0; j < i; j++)
 +            {
 +                if (scale_tot[i][j] != 0)
 +                {
 +                    bIsotropic = FALSE;
 +                }
 +            }
 +        }
 +        hbuf2 = sqr(0.5*(scale*ir->rlist - rint));
 +        if (bIsotropic)
 +        {
 +            for (cg = cg0; cg < cg1; cg++)
 +            {
 +                svmul(scale, cg_cm[cg], cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for (a = a0; a < a1; a++)
 +                {
 +                    if (distance2(cgsc, x[a]) > hbuf2)
 +                    {
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Anisotropic scaling */
 +            for (cg = cg0; cg < cg1; cg++)
 +            {
 +                /* Since scale_tot contains the transpose of the scaling matrix,
 +                 * we need to multiply with the transpose.
 +                 */
 +                tmvmul_ur0(scale_tot, cg_cm[cg], cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for (a = a0; a < a1; a++)
 +                {
 +                    if (distance2(cgsc, x[a]) > hbuf2)
 +                    {
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return nBeyond;
 +}
index bd92238b99f6508c349ed94921d0ff938a04b70a,0000000000000000000000000000000000000000..1f10a1796d7397ab09c680d2ae621ab420e7ebea
mode 100644,000000..100644
--- /dev/null
@@@ -1,4071 -1,0 +1,4071 @@@
-             /*         v x (xi - xcn)          */
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include "domdec.h"
 +#include "gmx_wallcycle.h"
 +#include "gmx_cyclecounter.h"
 +#include "trnio.h"
 +#include "smalloc.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "futil.h"
 +#include "mdrun.h"
 +#include "txtdump.h"
 +#include "names.h"
 +#include "mtop_util.h"
 +#include "nrjac.h"
 +#include "vec.h"
 +#include "gmx_ga2la.h"
 +#include "xvgr.h"
 +#include "gmxfio.h"
 +#include "groupcoord.h"
 +#include "pull_rotation.h"
 +#include "gmx_sort.h"
 +#include "copyrite.h"
 +#include "macros.h"
 +
 +
 +static char *RotStr = {"Enforced rotation:"};
 +
 +
 +/* Set the minimum weight for the determination of the slab centers */
 +#define WEIGHT_MIN (10*GMX_FLOAT_MIN)
 +
 +/* Helper structure for sorting positions along rotation vector             */
 +typedef struct {
 +    real xcproj;            /* Projection of xc on the rotation vector        */
 +    int  ind;               /* Index of xc                                    */
 +    real m;                 /* Mass                                           */
 +    rvec x;                 /* Position                                       */
 +    rvec x_ref;             /* Reference position                             */
 +} sort_along_vec_t;
 +
 +
 +/* Enforced rotation / flexible: determine the angle of each slab             */
 +typedef struct gmx_slabdata
 +{
 +    int   nat;              /* Number of atoms belonging to this slab         */
 +    rvec *x;                /* The positions belonging to this slab. In
 +                               general, this should be all positions of the
 +                               whole rotation group, but we leave out those
 +                               that have a sufficiently small weight          */
 +    rvec *ref;              /* Same for reference                             */
 +    real *weight;           /* The weight for each atom                       */
 +} t_gmx_slabdata;
 +
 +
 +/* Helper structure for potential fitting */
 +typedef struct gmx_potfit
 +{
 +    real   *degangle;       /* Set of angles for which the potential is
 +                               calculated. The optimum fit is determined as
 +                               the angle for which the potential is minimal   */
 +    real   *V;              /* Potential for the different angles             */
 +    matrix *rotmat;         /* Rotation matrix corresponding to the angles    */
 +} t_gmx_potfit;
 +
 +
 +/* Enforced rotation data for all groups                                      */
 +typedef struct gmx_enfrot
 +{
 +    FILE             *out_rot;     /* Output file for rotation data                  */
 +    FILE             *out_torque;  /* Output file for torque data                    */
 +    FILE             *out_angles;  /* Output file for slab angles for flexible type  */
 +    FILE             *out_slabs;   /* Output file for slab centers                   */
 +    int               bufsize;     /* Allocation size of buf                         */
 +    rvec             *xbuf;        /* Coordinate buffer variable for sorting         */
 +    real             *mbuf;        /* Masses buffer variable for sorting             */
 +    sort_along_vec_t *data;        /* Buffer variable needed for position sorting    */
 +    real             *mpi_inbuf;   /* MPI buffer                                     */
 +    real             *mpi_outbuf;  /* MPI buffer                                     */
 +    int               mpi_bufsize; /* Allocation size of in & outbuf                 */
 +    unsigned long     Flags;       /* mdrun flags                                    */
 +    gmx_bool          bOut;        /* Used to skip first output when appending to
 +                                    * avoid duplicate entries in rotation outfiles   */
 +} t_gmx_enfrot;
 +
 +
 +/* Global enforced rotation data for a single rotation group                  */
 +typedef struct gmx_enfrotgrp
 +{
 +    real     degangle;      /* Rotation angle in degrees                      */
 +    matrix   rotmat;        /* Rotation matrix                                */
 +    atom_id *ind_loc;       /* Local rotation indices                         */
 +    int      nat_loc;       /* Number of local group atoms                    */
 +    int      nalloc_loc;    /* Allocation size for ind_loc and weight_loc     */
 +
 +    real     V;             /* Rotation potential for this rotation group     */
 +    rvec    *f_rot_loc;     /* Array to store the forces on the local atoms
 +                               resulting from enforced rotation potential     */
 +
 +    /* Collective coordinates for the whole rotation group */
 +    real  *xc_ref_length;   /* Length of each x_rotref vector after x_rotref
 +                               has been put into origin                       */
 +    int   *xc_ref_ind;      /* Position of each local atom in the collective
 +                               array                                          */
 +    rvec   xc_center;       /* Center of the rotation group positions, may
 +                               be mass weighted                               */
 +    rvec   xc_ref_center;   /* ditto, for the reference positions             */
 +    rvec  *xc;              /* Current (collective) positions                 */
 +    ivec  *xc_shifts;       /* Current (collective) shifts                    */
 +    ivec  *xc_eshifts;      /* Extra shifts since last DD step                */
 +    rvec  *xc_old;          /* Old (collective) positions                     */
 +    rvec  *xc_norm;         /* Normalized form of the current positions       */
 +    rvec  *xc_ref_sorted;   /* Reference positions (sorted in the same order
 +                               as xc when sorted)                             */
 +    int   *xc_sortind;      /* Where is a position found after sorting?       */
 +    real  *mc;              /* Collective masses                              */
 +    real  *mc_sorted;
 +    real   invmass;         /* one over the total mass of the rotation group  */
 +
 +    real   torque_v;        /* Torque in the direction of rotation vector     */
 +    real   angle_v;         /* Actual angle of the whole rotation group       */
 +    /* Fixed rotation only */
 +    real   weight_v;        /* Weights for angle determination                */
 +    rvec  *xr_loc;          /* Local reference coords, correctly rotated      */
 +    rvec  *x_loc_pbc;       /* Local current coords, correct PBC image        */
 +    real  *m_loc;           /* Masses of the current local atoms              */
 +
 +    /* Flexible rotation only */
 +    int    nslabs_alloc;              /* For this many slabs memory is allocated        */
 +    int    slab_first;                /* Lowermost slab for that the calculation needs
 +                                         to be performed at a given time step           */
 +    int    slab_last;                 /* Uppermost slab ...                             */
 +    int    slab_first_ref;            /* First slab for which ref. center is stored     */
 +    int    slab_last_ref;             /* Last ...                                       */
 +    int    slab_buffer;               /* Slab buffer region around reference slabs      */
 +    int   *firstatom;                 /* First relevant atom for a slab                 */
 +    int   *lastatom;                  /* Last relevant atom for a slab                  */
 +    rvec  *slab_center;               /* Gaussian-weighted slab center                  */
 +    rvec  *slab_center_ref;           /* Gaussian-weighted slab center for the
 +                                         reference positions                            */
 +    real  *slab_weights;              /* Sum of gaussian weights in a slab              */
 +    real  *slab_torque_v;             /* Torque T = r x f for each slab,
 +                                         projected onto the rotation vector v           */
 +    real  max_beta;                   /* min_gaussian from inputrec->rotgrp is the
 +                                         minimum value the gaussian must have so that
 +                                         the force is actually evaluated; max_beta
 +                                         expresses the same cutoff as a beta value      */
 +    real           *gn_atom;          /* Precalculated gaussians for a single atom      */
 +    int            *gn_slabind;       /* Tells to which slab each precalculated gaussian
 +                                         belongs                                        */
 +    rvec           *slab_innersumvec; /* Inner sum of the flexible2 potential per slab;
 +                                         this is precalculated for optimization reasons */
 +    t_gmx_slabdata *slab_data;        /* Holds atom positions and gaussian weights
 +                                         of atoms belonging to a slab                   */
 +
 +    /* For potential fits with varying angle: */
 +    t_gmx_potfit *PotAngleFit;  /* Used for fit type 'potential'              */
 +} t_gmx_enfrotgrp;
 +
 +
 +/* Activate output of forces for correctness checks */
 +/* #define PRINT_FORCES */
 +#ifdef PRINT_FORCES
 +#define PRINT_FORCE_J  fprintf(stderr, "f%d = %15.8f %15.8f %15.8f\n", erg->xc_ref_ind[j], erg->f_rot_loc[j][XX], erg->f_rot_loc[j][YY], erg->f_rot_loc[j][ZZ]);
 +#define PRINT_POT_TAU  if (MASTER(cr)) { \
 +        fprintf(stderr, "potential = %15.8f\n" "torque    = %15.8f\n", erg->V, erg->torque_v); \
 +}
 +#else
 +#define PRINT_FORCE_J
 +#define PRINT_POT_TAU
 +#endif
 +
 +/* Shortcuts for often used queries */
 +#define ISFLEX(rg) ( (rg->eType == erotgFLEX) || (rg->eType == erotgFLEXT) || (rg->eType == erotgFLEX2) || (rg->eType == erotgFLEX2T) )
 +#define ISCOLL(rg) ( (rg->eType == erotgFLEX) || (rg->eType == erotgFLEXT) || (rg->eType == erotgFLEX2) || (rg->eType == erotgFLEX2T) || (rg->eType == erotgRMPF) || (rg->eType == erotgRM2PF) )
 +
 +
 +/* Does any of the rotation groups use slab decomposition? */
 +static gmx_bool HaveFlexibleGroups(t_rot *rot)
 +{
 +    int       g;
 +    t_rotgrp *rotg;
 +
 +
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg = &rot->grp[g];
 +        if (ISFLEX(rotg))
 +        {
 +            return TRUE;
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +
 +/* Is for any group the fit angle determined by finding the minimum of the
 + * rotation potential? */
 +static gmx_bool HavePotFitGroups(t_rot *rot)
 +{
 +    int       g;
 +    t_rotgrp *rotg;
 +
 +
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg = &rot->grp[g];
 +        if (erotgFitPOT == rotg->eFittype)
 +        {
 +            return TRUE;
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +
 +static double** allocate_square_matrix(int dim)
 +{
 +    int      i;
 +    double** mat = NULL;
 +
 +
 +    snew(mat, dim);
 +    for (i = 0; i < dim; i++)
 +    {
 +        snew(mat[i], dim);
 +    }
 +
 +    return mat;
 +}
 +
 +
 +static void free_square_matrix(double** mat, int dim)
 +{
 +    int i;
 +
 +
 +    for (i = 0; i < dim; i++)
 +    {
 +        sfree(mat[i]);
 +    }
 +    sfree(mat);
 +}
 +
 +
 +/* Return the angle for which the potential is minimal */
 +static real get_fitangle(t_rotgrp *rotg, gmx_enfrotgrp_t erg)
 +{
 +    int           i;
 +    real          fitangle = -999.9;
 +    real          pot_min  = GMX_FLOAT_MAX;
 +    t_gmx_potfit *fit;
 +
 +
 +    fit = erg->PotAngleFit;
 +
 +    for (i = 0; i < rotg->PotAngle_nstep; i++)
 +    {
 +        if (fit->V[i] < pot_min)
 +        {
 +            pot_min  = fit->V[i];
 +            fitangle = fit->degangle[i];
 +        }
 +    }
 +
 +    return fitangle;
 +}
 +
 +
 +/* Reduce potential angle fit data for this group at this time step? */
 +static gmx_inline gmx_bool bPotAngle(t_rot *rot, t_rotgrp *rotg, gmx_large_int_t step)
 +{
 +    return ( (erotgFitPOT == rotg->eFittype) && (do_per_step(step, rot->nstsout) || do_per_step(step, rot->nstrout)) );
 +}
 +
 +/* Reduce slab torque data for this group at this time step? */
 +static gmx_inline gmx_bool bSlabTau(t_rot *rot, t_rotgrp *rotg, gmx_large_int_t step)
 +{
 +    return ( (ISFLEX(rotg)) && do_per_step(step, rot->nstsout) );
 +}
 +
 +/* Output rotation energy, torques, etc. for each rotation group */
 +static void reduce_output(t_commrec *cr, t_rot *rot, real t, gmx_large_int_t step)
 +{
 +    int             g, i, islab, nslabs = 0;
 +    int             count; /* MPI element counter                               */
 +    t_rotgrp       *rotg;
 +    gmx_enfrot_t    er;    /* Pointer to the enforced rotation buffer variables */
 +    gmx_enfrotgrp_t erg;   /* Pointer to enforced rotation group data           */
 +    real            fitangle;
 +    gmx_bool        bFlex;
 +
 +
 +    er = rot->enfrot;
 +
 +    /* Fill the MPI buffer with stuff to reduce. If items are added for reduction
 +     * here, the MPI buffer size has to be enlarged also in calc_mpi_bufsize() */
 +    if (PAR(cr))
 +    {
 +        count = 0;
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg                   = &rot->grp[g];
 +            erg                    = rotg->enfrotgrp;
 +            nslabs                 = erg->slab_last - erg->slab_first + 1;
 +            er->mpi_inbuf[count++] = erg->V;
 +            er->mpi_inbuf[count++] = erg->torque_v;
 +            er->mpi_inbuf[count++] = erg->angle_v;
 +            er->mpi_inbuf[count++] = erg->weight_v; /* weights are not needed for flex types, but this is just a single value */
 +
 +            if (bPotAngle(rot, rotg, step))
 +            {
 +                for (i = 0; i < rotg->PotAngle_nstep; i++)
 +                {
 +                    er->mpi_inbuf[count++] = erg->PotAngleFit->V[i];
 +                }
 +            }
 +            if (bSlabTau(rot, rotg, step))
 +            {
 +                for (i = 0; i < nslabs; i++)
 +                {
 +                    er->mpi_inbuf[count++] = erg->slab_torque_v[i];
 +                }
 +            }
 +        }
 +        if (count > er->mpi_bufsize)
 +        {
 +            gmx_fatal(FARGS, "%s MPI buffer overflow, please report this error.", RotStr);
 +        }
 +
 +#ifdef GMX_MPI
 +        MPI_Reduce(er->mpi_inbuf, er->mpi_outbuf, count, GMX_MPI_REAL, MPI_SUM, MASTERRANK(cr), cr->mpi_comm_mygroup);
 +#endif
 +
 +        /* Copy back the reduced data from the buffer on the master */
 +        if (MASTER(cr))
 +        {
 +            count = 0;
 +            for (g = 0; g < rot->ngrp; g++)
 +            {
 +                rotg          = &rot->grp[g];
 +                erg           = rotg->enfrotgrp;
 +                nslabs        = erg->slab_last - erg->slab_first + 1;
 +                erg->V        = er->mpi_outbuf[count++];
 +                erg->torque_v = er->mpi_outbuf[count++];
 +                erg->angle_v  = er->mpi_outbuf[count++];
 +                erg->weight_v = er->mpi_outbuf[count++];
 +
 +                if (bPotAngle(rot, rotg, step))
 +                {
 +                    for (i = 0; i < rotg->PotAngle_nstep; i++)
 +                    {
 +                        erg->PotAngleFit->V[i] = er->mpi_outbuf[count++];
 +                    }
 +                }
 +                if (bSlabTau(rot, rotg, step))
 +                {
 +                    for (i = 0; i < nslabs; i++)
 +                    {
 +                        erg->slab_torque_v[i] = er->mpi_outbuf[count++];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Output */
 +    if (MASTER(cr))
 +    {
 +        /* Angle and torque for each rotation group */
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg  = &rot->grp[g];
 +            bFlex = ISFLEX(rotg);
 +
 +            erg = rotg->enfrotgrp;
 +
 +            /* Output to main rotation output file: */
 +            if (do_per_step(step, rot->nstrout) )
 +            {
 +                if (erotgFitPOT == rotg->eFittype)
 +                {
 +                    fitangle = get_fitangle(rotg, erg);
 +                }
 +                else
 +                {
 +                    if (bFlex)
 +                    {
 +                        fitangle = erg->angle_v; /* RMSD fit angle */
 +                    }
 +                    else
 +                    {
 +                        fitangle = (erg->angle_v/erg->weight_v)*180.0*M_1_PI;
 +                    }
 +                }
 +                fprintf(er->out_rot, "%12.4f", fitangle);
 +                fprintf(er->out_rot, "%12.3e", erg->torque_v);
 +                fprintf(er->out_rot, "%12.3e", erg->V);
 +            }
 +
 +            if (do_per_step(step, rot->nstsout) )
 +            {
 +                /* Output to torque log file: */
 +                if (bFlex)
 +                {
 +                    fprintf(er->out_torque, "%12.3e%6d", t, g);
 +                    for (i = erg->slab_first; i <= erg->slab_last; i++)
 +                    {
 +                        islab = i - erg->slab_first;  /* slab index */
 +                        /* Only output if enough weight is in slab */
 +                        if (erg->slab_weights[islab] > rotg->min_gaussian)
 +                        {
 +                            fprintf(er->out_torque, "%6d%12.3e", i, erg->slab_torque_v[islab]);
 +                        }
 +                    }
 +                    fprintf(er->out_torque, "\n");
 +                }
 +
 +                /* Output to angles log file: */
 +                if (erotgFitPOT == rotg->eFittype)
 +                {
 +                    fprintf(er->out_angles, "%12.3e%6d%12.4f", t, g, erg->degangle);
 +                    /* Output energies at a set of angles around the reference angle */
 +                    for (i = 0; i < rotg->PotAngle_nstep; i++)
 +                    {
 +                        fprintf(er->out_angles, "%12.3e", erg->PotAngleFit->V[i]);
 +                    }
 +                    fprintf(er->out_angles, "\n");
 +                }
 +            }
 +        }
 +        if (do_per_step(step, rot->nstrout) )
 +        {
 +            fprintf(er->out_rot, "\n");
 +        }
 +    }
 +}
 +
 +
 +/* Add the forces from enforced rotation potential to the local forces.
 + * Should be called after the SR forces have been evaluated */
 +extern real add_rot_forces(t_rot *rot, rvec f[], t_commrec *cr, gmx_large_int_t step, real t)
 +{
 +    int             g, l, ii;
 +    t_rotgrp       *rotg;
 +    gmx_enfrot_t    er;         /* Pointer to the enforced rotation buffer variables */
 +    gmx_enfrotgrp_t erg;        /* Pointer to enforced rotation group data           */
 +    real            Vrot = 0.0; /* If more than one rotation group is present, Vrot
 +                                   assembles the local parts from all groups         */
 +
 +
 +    er = rot->enfrot;
 +
 +    /* Loop over enforced rotation groups (usually 1, though)
 +     * Apply the forces from rotation potentials */
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg  = &rot->grp[g];
 +        erg   = rotg->enfrotgrp;
 +        Vrot += erg->V;  /* add the local parts from the nodes */
 +        for (l = 0; l < erg->nat_loc; l++)
 +        {
 +            /* Get the right index of the local force */
 +            ii = erg->ind_loc[l];
 +            /* Add */
 +            rvec_inc(f[ii], erg->f_rot_loc[l]);
 +        }
 +    }
 +
 +    /* Reduce energy, torque, angles etc. to get the sum values (per rotation group)
 +     * on the master and output these values to file. */
 +    if ( (do_per_step(step, rot->nstrout) || do_per_step(step, rot->nstsout)) && er->bOut)
 +    {
 +        reduce_output(cr, rot, t, step);
 +    }
 +
 +    /* When appending, er->bOut is FALSE the first time to avoid duplicate entries */
 +    er->bOut = TRUE;
 +
 +    PRINT_POT_TAU
 +
 +    return Vrot;
 +}
 +
 +
 +/* The Gaussian norm is chosen such that the sum of the gaussian functions
 + * over the slabs is approximately 1.0 everywhere */
 +#define GAUSS_NORM   0.569917543430618
 +
 +
 +/* Calculate the maximum beta that still leads to a gaussian larger than
 + * min_gaussian; also does some checks
 + */
 +static double calc_beta_max(real min_gaussian, real slab_dist)
 +{
 +    double sigma;
 +    double arg;
 +
 +
 +    /* Actually the next two checks are already made in grompp */
 +    if (slab_dist <= 0)
 +    {
 +        gmx_fatal(FARGS, "Slab distance of flexible rotation groups must be > 0!");
 +    }
 +    if (min_gaussian <= 0)
 +    {
 +        gmx_fatal(FARGS, "Cutoff value for Gaussian must be > 0. (You requested %f)", min_gaussian);
 +    }
 +
 +    /* Define the sigma value */
 +    sigma = 0.7*slab_dist;
 +
 +    /* Calculate the argument for the logarithm and check that the log() result is negative or 0 */
 +    arg = min_gaussian/GAUSS_NORM;
 +    if (arg > 1.0)
 +    {
 +        gmx_fatal(FARGS, "min_gaussian of flexible rotation groups must be <%g", GAUSS_NORM);
 +    }
 +
 +    return sqrt(-2.0*sigma*sigma*log(arg));
 +}
 +
 +
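 +/* Beta is the signed distance of position curr_x from the center plane of
 + * slab n, measured along the (normalized) rotation vector */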
 +static gmx_inline real calc_beta(rvec curr_x, t_rotgrp *rotg, int n)
 +{
 +    return iprod(curr_x, rotg->vec) - rotg->slab_dist * n;
 +}
 +
 +
 +static gmx_inline real gaussian_weight(rvec curr_x, t_rotgrp *rotg, int n)
 +{
 +    const real norm = GAUSS_NORM;
 +    real       sigma;
 +
 +
 +    /* Define the sigma value */
 +    sigma = 0.7*rotg->slab_dist;
 +    /* Calculate the Gaussian value of slab n for position curr_x */
 +    return norm * exp( -0.5 * sqr( calc_beta(curr_x, rotg, n)/sigma ) );
 +}
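 +
 +/* A minimal numerical sketch (for illustration only, not used by the code):
 + * with sigma = 0.7*slab_dist, GAUSS_NORM is chosen such that the slab
 + * gaussians sum to approximately 1 for any position along the rotation
 + * vector; the deviation is only of the order of 1e-4:
 + *
 + *     real sum = 0.0;
 + *     for (n = -10; n <= 10; n++)
 + *     {
 + *         sum += gaussian_weight(curr_x, rotg, n);
 + *     }
 + *     // sum is approximately 1.0
 + */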
 +
 +
 +/* Returns the weight in a single slab, also calculates the Gaussian- and mass-
 + * weighted sum of positions for that slab */
 +static real get_slab_weight(int j, t_rotgrp *rotg, rvec xc[], real mc[], rvec *x_weighted_sum)
 +{
 +    rvec            curr_x;           /* The position of an atom                      */
 +    rvec            curr_x_weighted;  /* The gaussian-weighted position               */
 +    real            gaussian;         /* A single gaussian weight                     */
 +    real            wgauss;           /* gaussian times current mass                  */
 +    real            slabweight = 0.0; /* The sum of weights in the slab               */
 +    int             i, islab;
 +    gmx_enfrotgrp_t erg;              /* Pointer to enforced rotation group data      */
 +
 +
 +    erg = rotg->enfrotgrp;
 +    clear_rvec(*x_weighted_sum);
 +
 +    /* Slab index */
 +    islab = j - erg->slab_first;
 +
 +    /* Loop over all atoms in the rotation group */
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        copy_rvec(xc[i], curr_x);
 +        gaussian = gaussian_weight(curr_x, rotg, j);
 +        wgauss   = gaussian * mc[i];
 +        svmul(wgauss, curr_x, curr_x_weighted);
 +        rvec_add(*x_weighted_sum, curr_x_weighted, *x_weighted_sum);
 +        slabweight += wgauss;
 +    }  /* END of loop over rotation group atoms */
 +
 +    return slabweight;
 +}
 +
 +
 +static void get_slab_centers(
 +        t_rotgrp  *rotg,       /* The rotation group information               */
 +        rvec      *xc,         /* The rotation group positions; will
 +                                  typically be enfrotgrp->xc, but at first call
 +                                  it is enfrotgrp->xc_ref                      */
 +        real      *mc,         /* The masses of the rotation group atoms       */
 +        int        g,          /* The number of the rotation group             */
 +        real       time,       /* Used for output only                         */
 +        FILE      *out_slabs,  /* For outputting center per slab information   */
 +        gmx_bool   bOutStep,   /* Is this an output step?                      */
 +        gmx_bool   bReference) /* If this routine is called from
 +                                  init_rot_group we need to store
 +                                  the reference slab centers                   */
 +{
 +    int             j, islab;
 +    gmx_enfrotgrp_t erg;      /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Loop over slabs */
 +    for (j = erg->slab_first; j <= erg->slab_last; j++)
 +    {
 +        islab                    = j - erg->slab_first;
 +        erg->slab_weights[islab] = get_slab_weight(j, rotg, xc, mc, &erg->slab_center[islab]);
 +
 +        /* We can do the calculations ONLY if there is weight in the slab! */
 +        if (erg->slab_weights[islab] > WEIGHT_MIN)
 +        {
 +            svmul(1.0/erg->slab_weights[islab], erg->slab_center[islab], erg->slab_center[islab]);
 +        }
 +        else
 +        {
 +            /* We need to check this here, since we divide by slab_weights
 +             * in the flexible low-level routines! */
 +            gmx_fatal(FARGS, "Not enough weight in slab %d. Slab center cannot be determined!", j);
 +        }
 +
 +        /* At first time step: save the centers of the reference structure */
 +        if (bReference)
 +        {
 +            copy_rvec(erg->slab_center[islab], erg->slab_center_ref[islab]);
 +        }
 +    } /* END of loop over slabs */
 +
 +    /* Output on the master */
 +    if ( (NULL != out_slabs) && bOutStep)
 +    {
 +        fprintf(out_slabs, "%12.3e%6d", time, g);
 +        for (j = erg->slab_first; j <= erg->slab_last; j++)
 +        {
 +            islab = j - erg->slab_first;
 +            fprintf(out_slabs, "%6d%12.3e%12.3e%12.3e",
 +                    j, erg->slab_center[islab][XX], erg->slab_center[islab][YY], erg->slab_center[islab][ZZ]);
 +        }
 +        fprintf(out_slabs, "\n");
 +    }
 +}
 +
 +
 +static void calc_rotmat(
 +        rvec   vec,
 +        real   degangle,      /* Angle alpha of rotation at time t in degrees       */
 +        matrix rotmat)        /* Rotation matrix                                    */
 +{
 +    real radangle;            /* Rotation angle in radians */
 +    real cosa;                /* cosine alpha              */
 +    real sina;                /* sine alpha                */
 +    real OMcosa;              /* 1 - cos(alpha)            */
 +    real dumxy, dumxz, dumyz; /* save computations         */
 +    rvec rot_vec;             /* Rotate around rot_vec ... */
 +
 +
 +    radangle = degangle * M_PI/180.0;
 +    copy_rvec(vec, rot_vec );
 +
 +    /* Precompute some variables: */
 +    cosa   = cos(radangle);
 +    sina   = sin(radangle);
 +    OMcosa = 1.0 - cosa;
 +    dumxy  = rot_vec[XX]*rot_vec[YY]*OMcosa;
 +    dumxz  = rot_vec[XX]*rot_vec[ZZ]*OMcosa;
 +    dumyz  = rot_vec[YY]*rot_vec[ZZ]*OMcosa;
 +
 +    /* Construct the rotation matrix for this rotation group: */
 +    /* 1st column: */
 +    rotmat[XX][XX] = cosa  + rot_vec[XX]*rot_vec[XX]*OMcosa;
 +    rotmat[YY][XX] = dumxy + rot_vec[ZZ]*sina;
 +    rotmat[ZZ][XX] = dumxz - rot_vec[YY]*sina;
 +    /* 2nd column: */
 +    rotmat[XX][YY] = dumxy - rot_vec[ZZ]*sina;
 +    rotmat[YY][YY] = cosa  + rot_vec[YY]*rot_vec[YY]*OMcosa;
 +    rotmat[ZZ][YY] = dumyz + rot_vec[XX]*sina;
 +    /* 3rd column: */
 +    rotmat[XX][ZZ] = dumxz + rot_vec[YY]*sina;
 +    rotmat[YY][ZZ] = dumyz - rot_vec[XX]*sina;
 +    rotmat[ZZ][ZZ] = cosa  + rot_vec[ZZ]*rot_vec[ZZ]*OMcosa;
 +
 +#ifdef PRINTMATRIX
 +    int iii, jjj;
 +
 +    for (iii = 0; iii < 3; iii++)
 +    {
 +        for (jjj = 0; jjj < 3; jjj++)
 +        {
 +            fprintf(stderr, " %10.8f ",  rotmat[iii][jjj]);
 +        }
 +        fprintf(stderr, "\n");
 +    }
 +#endif
 +}
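 +
 +/* A minimal usage sketch (for illustration only): a 90 degree rotation
 + * around the z-axis maps the x unit vector onto the y unit vector. Note
 + * that the rotation vector passed in must already be normalized:
 + *
 + *     matrix R;
 + *     rvec   v = {0.0, 0.0, 1.0}, x = {1.0, 0.0, 0.0}, xrot;
 + *
 + *     calc_rotmat(v, 90.0, R);
 + *     mvmul(R, x, xrot);    // xrot == (0, 1, 0)
 + */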
 +
 +
 +/* Calculates torque on the rotation axis tau = position x force */
 +static gmx_inline real torque(
 +        rvec rotvec,  /* rotation vector; MUST be normalized!                 */
 +        rvec force,   /* force                                                */
 +        rvec x,       /* position of atom on which the force acts             */
 +        rvec pivot)   /* pivot point of rotation axis                         */
 +{
 +    rvec vectmp, tau;
 +
 +
 +    /* Subtract offset */
 +    rvec_sub(x, pivot, vectmp);
 +
 +    /* position x force */
 +    cprod(vectmp, force, tau);
 +
 +    /* Return the part of the torque which is parallel to the rotation vector */
 +    return iprod(tau, rotvec);
 +}
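 +
 +/* A minimal usage sketch (for illustration only): a unit force along y,
 + * acting at (1,0,0) with the pivot at the origin and the rotation vector
 + * along z, yields a torque of 1 around the rotation axis:
 + *
 + *     rvec v = {0, 0, 1}, f = {0, 1, 0}, x = {1, 0, 0}, piv = {0, 0, 0};
 + *     real tau = torque(v, f, x, piv);    // tau == 1.0
 + */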
 +
 +
 +/* Right-aligned output of value with standard width */
 +static void print_aligned(FILE *fp, char *str)
 +{
 +    fprintf(fp, "%12s", str);
 +}
 +
 +
 +/* Right-aligned output of value with standard short width */
 +static void print_aligned_short(FILE *fp, char *str)
 +{
 +    fprintf(fp, "%6s", str);
 +}
 +
 +
 +static FILE *open_output_file(const char *fn, int steps, const char what[])
 +{
 +    FILE *fp;
 +
 +
 +    fp = ffopen(fn, "w");
 +
 +    fprintf(fp, "# Output of %s is written in intervals of %d time step%s.\n#\n",
 +            what, steps, steps > 1 ? "s" : "");
 +
 +    return fp;
 +}
 +
 +
 +/* Open output file for slab center data. Call on master only */
 +static FILE *open_slab_out(const char *fn, t_rot *rot, const output_env_t oenv)
 +{
 +    FILE      *fp;
 +    int        g, i;
 +    t_rotgrp  *rotg;
 +
 +
 +    if (rot->enfrot->Flags & MD_APPENDFILES)
 +    {
 +        fp = gmx_fio_fopen(fn, "a");
 +    }
 +    else
 +    {
 +        fp = open_output_file(fn, rot->nstsout, "gaussian weighted slab centers");
 +
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg = &rot->grp[g];
 +            if (ISFLEX(rotg))
 +            {
 +                fprintf(fp, "# Rotation group %d (%s), slab distance %f nm, %s.\n",
 +                        g, erotg_names[rotg->eType], rotg->slab_dist,
 +                        rotg->bMassW ? "centers of mass" : "geometrical centers");
 +            }
 +        }
 +
 +        fprintf(fp, "# Reference centers are listed first (t=-1).\n");
 +        fprintf(fp, "# The following columns have the syntax:\n");
 +        fprintf(fp, "#     ");
 +        print_aligned_short(fp, "t");
 +        print_aligned_short(fp, "grp");
 +        /* Print legend for the first two entries only ... */
 +        for (i = 0; i < 2; i++)
 +        {
 +            print_aligned_short(fp, "slab");
 +            print_aligned(fp, "X center");
 +            print_aligned(fp, "Y center");
 +            print_aligned(fp, "Z center");
 +        }
 +        fprintf(fp, " ...\n");
 +        fflush(fp);
 +    }
 +
 +    return fp;
 +}
 +
 +
 +/* Adds 'buf' to 'str' */
 +static void add_to_string(char **str, char *buf)
 +{
 +    int len;
 +
 +
 +    len = strlen(*str) + strlen(buf) + 1;
 +    srenew(*str, len);
 +    strcat(*str, buf);
 +}
 +
 +
 +static void add_to_string_aligned(char **str, char *buf)
 +{
 +    char buf_aligned[STRLEN];
 +
 +    sprintf(buf_aligned, "%12s", buf);
 +    add_to_string(str, buf_aligned);
 +}
 +
 +
 +/* Open output file and print some general information about the rotation groups.
 + * Call on master only */
 +static FILE *open_rot_out(const char *fn, t_rot *rot, const output_env_t oenv)
 +{
 +    FILE           *fp;
 +    int             g, nsets;
 +    t_rotgrp       *rotg;
 +    const char    **setname;
 +    char            buf[50], buf2[75];
 +    gmx_enfrotgrp_t erg;       /* Pointer to enforced rotation group data */
 +    gmx_bool        bFlex;
 +    char           *LegendStr = NULL;
 +
 +
 +    if (rot->enfrot->Flags & MD_APPENDFILES)
 +    {
 +        fp = gmx_fio_fopen(fn, "a");
 +    }
 +    else
 +    {
 +        fp = xvgropen(fn, "Rotation angles and energy", "Time (ps)", "angles (degrees) and energies (kJ/mol)", oenv);
 +        fprintf(fp, "# Output of enforced rotation data is written in intervals of %d time step%s.\n#\n", rot->nstrout, rot->nstrout > 1 ? "s" : "");
 +        fprintf(fp, "# The scalar tau is the torque (kJ/mol) in the direction of the rotation vector v.\n");
 +        fprintf(fp, "# To obtain the vectorial torque, multiply tau with the group's rot_vec.\n");
 +        fprintf(fp, "# For flexible groups, tau(t,n) from all slabs n have been summed in a single value tau(t) here.\n");
 +        fprintf(fp, "# The torques tau(t,n) are found in the rottorque.log (-rt) output file\n");
 +
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg  = &rot->grp[g];
 +            erg   = rotg->enfrotgrp;
 +            bFlex = ISFLEX(rotg);
 +
 +            fprintf(fp, "#\n");
 +            fprintf(fp, "# ROTATION GROUP %d, potential type '%s':\n", g, erotg_names[rotg->eType]);
 +            fprintf(fp, "# rot_massw%d          %s\n", g, yesno_names[rotg->bMassW]);
 +            fprintf(fp, "# rot_vec%d            %12.5e %12.5e %12.5e\n", g, rotg->vec[XX], rotg->vec[YY], rotg->vec[ZZ]);
 +            fprintf(fp, "# rot_rate%d           %12.5e degrees/ps\n", g, rotg->rate);
 +            fprintf(fp, "# rot_k%d              %12.5e kJ/(mol*nm^2)\n", g, rotg->k);
 +            if (rotg->eType == erotgISO || rotg->eType == erotgPM || rotg->eType == erotgRM || rotg->eType == erotgRM2)
 +            {
 +                fprintf(fp, "# rot_pivot%d          %12.5e %12.5e %12.5e  nm\n", g, rotg->pivot[XX], rotg->pivot[YY], rotg->pivot[ZZ]);
 +            }
 +
 +            if (bFlex)
 +            {
 +                fprintf(fp, "# rot_slab_distance%d   %f nm\n", g, rotg->slab_dist);
 +                fprintf(fp, "# rot_min_gaussian%d   %12.5e\n", g, rotg->min_gaussian);
 +            }
 +
 +            /* Output the centers of the rotation groups for the pivot-free potentials */
 +            if ((rotg->eType == erotgISOPF) || (rotg->eType == erotgPMPF) || (rotg->eType == erotgRMPF) ||
 +                (rotg->eType == erotgRM2PF) || (rotg->eType == erotgFLEXT) || (rotg->eType == erotgFLEX2T))
 +            {
 +                fprintf(fp, "# ref. grp. %d center  %12.5e %12.5e %12.5e\n", g,
 +                        erg->xc_ref_center[XX], erg->xc_ref_center[YY], erg->xc_ref_center[ZZ]);
 +
 +                fprintf(fp, "# grp. %d init.center  %12.5e %12.5e %12.5e\n", g,
 +                        erg->xc_center[XX], erg->xc_center[YY], erg->xc_center[ZZ]);
 +            }
 +
 +            if ( (rotg->eType == erotgRM2) || (rotg->eType == erotgFLEX2) || (rotg->eType == erotgFLEX2T) )
 +            {
 +                fprintf(fp, "# rot_eps%d            %12.5e nm^2\n", g, rotg->eps);
 +            }
 +            if (erotgFitPOT == rotg->eFittype)
 +            {
 +                fprintf(fp, "#\n");
 +                fprintf(fp, "# theta_fit%d is determined by first evaluating the potential for %d angles around theta_ref%d.\n",
 +                        g, rotg->PotAngle_nstep, g);
 +                fprintf(fp, "# The fit angle is the one with the smallest potential. It is given as the deviation\n");
 +                fprintf(fp, "# from the reference angle, i.e. if theta_ref=X and theta_fit=Y, then the angle with\n");
 +                fprintf(fp, "# minimal value of the potential is X+Y. Angular resolution is %g degrees.\n", rotg->PotAngle_step);
 +            }
 +        }
 +
 +        /* Print a nice legend */
 +        snew(LegendStr, 1);
 +        LegendStr[0] = '\0';
 +        sprintf(buf, "#     %6s", "time");
 +        add_to_string_aligned(&LegendStr, buf);
 +
 +        nsets = 0;
 +        snew(setname, 4*rot->ngrp);
 +
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg = &rot->grp[g];
 +            sprintf(buf, "theta_ref%d", g);
 +            add_to_string_aligned(&LegendStr, buf);
 +
 +            sprintf(buf2, "%s (degrees)", buf);
 +            setname[nsets] = strdup(buf2);
 +            nsets++;
 +        }
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg  = &rot->grp[g];
 +            bFlex = ISFLEX(rotg);
 +
 +            /* For flexible axis rotation we use RMSD fitting to determine the
 +             * actual angle of the rotation group */
 +            if (bFlex || erotgFitPOT == rotg->eFittype)
 +            {
 +                sprintf(buf, "theta_fit%d", g);
 +            }
 +            else
 +            {
 +                sprintf(buf, "theta_av%d", g);
 +            }
 +            add_to_string_aligned(&LegendStr, buf);
 +            sprintf(buf2, "%s (degrees)", buf);
 +            setname[nsets] = strdup(buf2);
 +            nsets++;
 +
 +            sprintf(buf, "tau%d", g);
 +            add_to_string_aligned(&LegendStr, buf);
 +            sprintf(buf2, "%s (kJ/mol)", buf);
 +            setname[nsets] = strdup(buf2);
 +            nsets++;
 +
 +            sprintf(buf, "energy%d", g);
 +            add_to_string_aligned(&LegendStr, buf);
 +            sprintf(buf2, "%s (kJ/mol)", buf);
 +            setname[nsets] = strdup(buf2);
 +            nsets++;
 +        }
 +        fprintf(fp, "#\n");
 +
 +        if (nsets > 1)
 +        {
 +            xvgr_legend(fp, nsets, setname, oenv);
 +        }
 +        sfree(setname);
 +
 +        fprintf(fp, "#\n# Legend for the following data columns:\n");
 +        fprintf(fp, "%s\n", LegendStr);
 +        sfree(LegendStr);
 +
 +        fflush(fp);
 +    }
 +
 +    return fp;
 +}
 +
 +
 +/* Call on master only */
 +static FILE *open_angles_out(const char *fn, t_rot *rot, const output_env_t oenv)
 +{
 +    int             g, i;
 +    FILE           *fp;
 +    t_rotgrp       *rotg;
 +    gmx_enfrotgrp_t erg;        /* Pointer to enforced rotation group data */
 +    char            buf[100];
 +
 +
 +    if (rot->enfrot->Flags & MD_APPENDFILES)
 +    {
 +        fp = gmx_fio_fopen(fn, "a");
 +    }
 +    else
 +    {
 +        /* Open output file and write some information about its structure: */
 +        fp = open_output_file(fn, rot->nstsout, "rotation group angles");
 +        fprintf(fp, "# All angles given in degrees, time in ps.\n");
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg = &rot->grp[g];
 +            erg  = rotg->enfrotgrp;
 +
 +            /* Output for this group happens only if potential type is flexible or
 +             * if fit type is potential! */
 +            if (ISFLEX(rotg) || (erotgFitPOT == rotg->eFittype) )
 +            {
 +                if (ISFLEX(rotg))
 +                {
 +                    sprintf(buf, " slab distance %f nm, ", rotg->slab_dist);
 +                }
 +                else
 +                {
 +                    buf[0] = '\0';
 +                }
 +
 +                fprintf(fp, "#\n# ROTATION GROUP %d '%s',%s fit type '%s'.\n",
 +                        g, erotg_names[rotg->eType], buf, erotg_fitnames[rotg->eFittype]);
 +
 +                /* Special type of fitting using the potential minimum. This is
 +                 * done for the whole group only, not for the individual slabs. */
 +                if (erotgFitPOT == rotg->eFittype)
 +                {
 +                    fprintf(fp, "#    To obtain theta_fit%d, the potential is evaluated for %d angles around theta_ref%d\n", g, rotg->PotAngle_nstep, g);
 +                    fprintf(fp, "#    The fit angle in the rotation standard outfile is the one with minimal energy E(theta_fit) [kJ/mol].\n");
 +                    fprintf(fp, "#\n");
 +                }
 +
 +                fprintf(fp, "# Legend for the group %d data columns:\n", g);
 +                fprintf(fp, "#     ");
 +                print_aligned_short(fp, "time");
 +                print_aligned_short(fp, "grp");
 +                print_aligned(fp, "theta_ref");
 +
 +                if (erotgFitPOT == rotg->eFittype)
 +                {
 +                    /* Output the set of angles around the reference angle */
 +                    for (i = 0; i < rotg->PotAngle_nstep; i++)
 +                    {
 +                        sprintf(buf, "E(%g)", erg->PotAngleFit->degangle[i]);
 +                        print_aligned(fp, buf);
 +                    }
 +                }
 +                else
 +                {
 +                    /* Output fit angle for each slab */
 +                    print_aligned_short(fp, "slab");
 +                    print_aligned_short(fp, "atoms");
 +                    print_aligned(fp, "theta_fit");
 +                    print_aligned_short(fp, "slab");
 +                    print_aligned_short(fp, "atoms");
 +                    print_aligned(fp, "theta_fit");
 +                    fprintf(fp, " ...");
 +                }
 +                fprintf(fp, "\n");
 +            }
 +        }
 +        fflush(fp);
 +    }
 +
 +    return fp;
 +}
 +
 +
 +/* Open torque output file and write some information about its structure.
 + * Call on master only */
 +static FILE *open_torque_out(const char *fn, t_rot *rot, const output_env_t oenv)
 +{
 +    FILE      *fp;
 +    int        g;
 +    t_rotgrp  *rotg;
 +
 +
 +    if (rot->enfrot->Flags & MD_APPENDFILES)
 +    {
 +        fp = gmx_fio_fopen(fn, "a");
 +    }
 +    else
 +    {
 +        fp = open_output_file(fn, rot->nstsout, "torques");
 +
 +        for (g = 0; g < rot->ngrp; g++)
 +        {
 +            rotg = &rot->grp[g];
 +            if (ISFLEX(rotg))
 +            {
 +                fprintf(fp, "# Rotation group %d (%s), slab distance %f nm.\n", g, erotg_names[rotg->eType], rotg->slab_dist);
 +                fprintf(fp, "# The scalar tau is the torque (kJ/mol) in the direction of the rotation vector.\n");
 +                fprintf(fp, "# To obtain the vectorial torque, multiply tau with\n");
 +                fprintf(fp, "# rot_vec%d            %10.3e %10.3e %10.3e\n", g, rotg->vec[XX], rotg->vec[YY], rotg->vec[ZZ]);
 +                fprintf(fp, "#\n");
 +            }
 +        }
 +        fprintf(fp, "# Legend for the following data columns: (tau=torque for that slab):\n");
 +        fprintf(fp, "#     ");
 +        print_aligned_short(fp, "t");
 +        print_aligned_short(fp, "grp");
 +        print_aligned_short(fp, "slab");
 +        print_aligned(fp, "tau");
 +        print_aligned_short(fp, "slab");
 +        print_aligned(fp, "tau");
 +        fprintf(fp, " ...\n");
 +        fflush(fp);
 +    }
 +
 +    return fp;
 +}
 +
 +
 +static void swap_val(double* vec, int i, int j)
 +{
 +    double tmp = vec[j];
 +
 +
 +    vec[j] = vec[i];
 +    vec[i] = tmp;
 +}
 +
 +
 +static void swap_col(double **mat, int i, int j)
 +{
 +    double tmp[3] = {mat[0][j], mat[1][j], mat[2][j]};
 +
 +
 +    mat[0][j] = mat[0][i];
 +    mat[1][j] = mat[1][i];
 +    mat[2][j] = mat[2][i];
 +
 +    mat[0][i] = tmp[0];
 +    mat[1][i] = tmp[1];
 +    mat[2][i] = tmp[2];
 +}
 +
 +
 +/* Eigenvectors are stored in columns of eigen_vec */
 +static void diagonalize_symmetric(
 +        double **matrix,
 +        double **eigen_vec,
 +        double   eigenval[3])
 +{
 +    int n_rot;
 +
 +
 +    jacobi(matrix, 3, eigenval, eigen_vec, &n_rot);
 +
 +    /* sort in ascending order */
 +    if (eigenval[0] > eigenval[1])
 +    {
 +        swap_val(eigenval, 0, 1);
 +        swap_col(eigen_vec, 0, 1);
 +    }
 +    if (eigenval[1] > eigenval[2])
 +    {
 +        swap_val(eigenval, 1, 2);
 +        swap_col(eigen_vec, 1, 2);
 +    }
 +    if (eigenval[0] > eigenval[1])
 +    {
 +        swap_val(eigenval, 0, 1);
 +        swap_col(eigen_vec, 0, 1);
 +    }
 +}
 +
 +
 +static void align_with_z(
 +        rvec* s,           /* Structure to align */
 +        int   natoms,
 +        rvec  axis)
 +{
 +    int     i, j, k;
 +    rvec    zet         = {0.0, 0.0, 1.0};
 +    rvec    rot_axis    = {0.0, 0.0, 0.0};
 +    rvec   *rotated_str = NULL;
 +    real    ooanorm;
 +    real    angle;
 +    matrix  rotmat;
 +
 +
 +    snew(rotated_str, natoms);
 +
 +    /* Normalize the axis */
 +    ooanorm = 1.0/norm(axis);
 +    svmul(ooanorm, axis, axis);
 +
 +    /* Calculate the angle for the fitting procedure */
 +    cprod(axis, zet, rot_axis);
 +    angle = acos(axis[2]);
 +    if (angle < 0.0)
 +    {
 +        angle += M_PI;
 +    }
 +
 +    /* Calculate the rotation matrix */
 +    calc_rotmat(rot_axis, angle*180.0/M_PI, rotmat);
 +
 +    /* Apply the rotation matrix to s */
 +    for (i = 0; i < natoms; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            for (k = 0; k < 3; k++)
 +            {
 +                rotated_str[i][j] += rotmat[j][k]*s[i][k];
 +            }
 +        }
 +    }
 +
 +    /* Rewrite the rotated structure to s */
 +    for (i = 0; i < natoms; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            s[i][j] = rotated_str[i][j];
 +        }
 +    }
 +
 +    sfree(rotated_str);
 +}
 +
 +
 +static void calc_correl_matrix(rvec* Xstr, rvec* Ystr, double** Rmat, int natoms)
 +{
 +    int i, j, k;
 +
 +
 +    for (i = 0; i < 3; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            Rmat[i][j] = 0.0;
 +        }
 +    }
 +
 +    for (i = 0; i < 3; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            for (k = 0; k < natoms; k++)
 +            {
 +                Rmat[i][j] += Ystr[k][i] * Xstr[k][j];
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void weigh_coords(rvec* str, real* weight, int natoms)
 +{
 +    int i, j;
 +
 +
 +    for (i = 0; i < natoms; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            str[i][j] *= sqrt(weight[i]);
 +        }
 +    }
 +}
 +
 +
 +static real opt_angle_analytic(
 +        rvec* ref_s,
 +        rvec* act_s,
 +        real* weight,
 +        int   natoms,
 +        rvec  ref_com,
 +        rvec  act_com,
 +        rvec  axis)
 +{
 +    int      i, j, k;
 +    rvec    *ref_s_1 = NULL;
 +    rvec    *act_s_1 = NULL;
 +    rvec     shift;
 +    double **Rmat, **RtR, **eigvec;
 +    double   eigval[3];
 +    double   V[3][3], WS[3][3];
 +    double   rot_matrix[3][3];
 +    double   opt_angle;
 +
 +
 +    /* Do not change the original coordinates */
 +    snew(ref_s_1, natoms);
 +    snew(act_s_1, natoms);
 +    for (i = 0; i < natoms; i++)
 +    {
 +        copy_rvec(ref_s[i], ref_s_1[i]);
 +        copy_rvec(act_s[i], act_s_1[i]);
 +    }
 +
 +    /* Translate the structures to the origin */
 +    shift[XX] = -ref_com[XX];
 +    shift[YY] = -ref_com[YY];
 +    shift[ZZ] = -ref_com[ZZ];
 +    translate_x(ref_s_1, natoms, shift);
 +
 +    shift[XX] = -act_com[XX];
 +    shift[YY] = -act_com[YY];
 +    shift[ZZ] = -act_com[ZZ];
 +    translate_x(act_s_1, natoms, shift);
 +
 +    /* Align rotation axis with z */
 +    align_with_z(ref_s_1, natoms, axis);
 +    align_with_z(act_s_1, natoms, axis);
 +
 +    /* Correlation matrix */
 +    Rmat = allocate_square_matrix(3);
 +
 +    for (i = 0; i < natoms; i++)
 +    {
 +        ref_s_1[i][2] = 0.0;
 +        act_s_1[i][2] = 0.0;
 +    }
 +
 +    /* Weight positions with sqrt(weight) */
 +    if (NULL != weight)
 +    {
 +        weigh_coords(ref_s_1, weight, natoms);
 +        weigh_coords(act_s_1, weight, natoms);
 +    }
 +
 +    /* Calculate correlation matrices R=YXt (X=ref_s; Y=act_s) */
 +    calc_correl_matrix(ref_s_1, act_s_1, Rmat, natoms);
 +
 +    /* Calculate RtR */
 +    RtR = allocate_square_matrix(3);
 +    for (i = 0; i < 3; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            for (k = 0; k < 3; k++)
 +            {
 +                RtR[i][j] += Rmat[k][i] * Rmat[k][j];
 +            }
 +        }
 +    }
 +    /* Diagonalize RtR */
 +    snew(eigvec, 3);
 +    for (i = 0; i < 3; i++)
 +    {
 +        snew(eigvec[i], 3);
 +    }
 +
 +    diagonalize_symmetric(RtR, eigvec, eigval);
 +    swap_col(eigvec, 0, 1);
 +    swap_col(eigvec, 1, 2);
 +    swap_val(eigval, 0, 1);
 +    swap_val(eigval, 1, 2);
 +
 +    /* Calculate V */
 +    for (i = 0; i < 3; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            V[i][j]  = 0.0;
 +            WS[i][j] = 0.0;
 +        }
 +    }
 +
 +    for (i = 0; i < 2; i++)
 +    {
 +        for (j = 0; j < 2; j++)
 +        {
 +            WS[i][j] = eigvec[i][j] / sqrt(eigval[j]);
 +        }
 +    }
 +
 +    for (i = 0; i < 3; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            for (k = 0; k < 3; k++)
 +            {
 +                V[i][j] += Rmat[i][k]*WS[k][j];
 +            }
 +        }
 +    }
 +    free_square_matrix(Rmat, 3);
 +
 +    /* Calculate optimal rotation matrix */
 +    for (i = 0; i < 3; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            rot_matrix[i][j] = 0.0;
 +        }
 +    }
 +
 +    for (i = 0; i < 3; i++)
 +    {
 +        for (j = 0; j < 3; j++)
 +        {
 +            for (k = 0; k < 3; k++)
 +            {
 +                rot_matrix[i][j] += eigvec[i][k]*V[j][k];
 +            }
 +        }
 +    }
 +    rot_matrix[2][2] = 1.0;
 +
 +    /* In some cases abs(rot_matrix[0][0]) can be slightly larger
 +     * than unity due to numerical inaccuracies. To be able to calculate
 +     * the acos function, we put these values back in range. */
 +    if (rot_matrix[0][0] > 1.0)
 +    {
 +        rot_matrix[0][0] = 1.0;
 +    }
 +    else if (rot_matrix[0][0] < -1.0)
 +    {
 +        rot_matrix[0][0] = -1.0;
 +    }
 +
 +    /* Determine the optimal rotation angle: */
 +    opt_angle = (-1.0)*acos(rot_matrix[0][0])*180.0/M_PI;
 +    if (rot_matrix[0][1] < 0.0)
 +    {
 +        opt_angle = (-1.0)*opt_angle;
 +    }
 +
 +    /* Give back some memory */
 +    free_square_matrix(RtR, 3);
 +    sfree(ref_s_1);
 +    sfree(act_s_1);
 +    for (i = 0; i < 3; i++)
 +    {
 +        sfree(eigvec[i]);
 +    }
 +    sfree(eigvec);
 +
 +    return (real) opt_angle;
 +}
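 +
 +/* A compact cross-check of the analytic fit above (a sketch, assuming the
 + * positions are already centered and projected into the plane perpendicular
 + * to the rotation axis, as done in opt_angle_analytic): the angle that
 + * maximizes sum_i w_i * y_i . R(theta) x_i is
 + *
 + *     theta = atan2( sum_i w_i (x_i x y_i)_z , sum_i w_i (x_i . y_i) )
 + *
 + * which agrees with the result above up to the sign convention chosen
 + * for the output angle. */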
 +
 +
 +/* Determine angle of the group by RMSD fit to the reference */
 +/* Not parallelized, call this routine only on the master */
 +static real flex_fit_angle(t_rotgrp *rotg)
 +{
 +    int             i;
 +    rvec           *fitcoords = NULL;
 +    rvec            center;     /* Center of positions passed to the fit routine */
 +    real            fitangle;   /* Angle of the rotation group derived by fitting */
 +    rvec            coord;
 +    real            scal;
 +    gmx_enfrotgrp_t erg;        /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Get the center of the rotation group.
 +     * Note, again, erg->xc has been sorted in do_flexible */
 +    get_center(erg->xc, erg->mc_sorted, rotg->nat, center);
 +
 +    /* === Determine the optimal fit angle for the rotation group === */
 +    if (rotg->eFittype == erotgFitNORM)
 +    {
 +        /* Normalize every position to its reference length */
 +        for (i = 0; i < rotg->nat; i++)
 +        {
 +            /* Put the center of the positions into the origin */
 +            rvec_sub(erg->xc[i], center, coord);
 +            /* Determine the scaling factor for the length: */
 +            scal = erg->xc_ref_length[erg->xc_sortind[i]] / norm(coord);
 +            /* Get position, multiply with the scaling factor and save  */
 +            svmul(scal, coord, erg->xc_norm[i]);
 +        }
 +        fitcoords = erg->xc_norm;
 +    }
 +    else
 +    {
 +        fitcoords = erg->xc;
 +    }
 +    /* From the point of view of the current positions, the reference has rotated
 +     * backwards. Since we output the angle relative to the fixed reference,
 +     * we need the minus sign. */
 +    fitangle = -opt_angle_analytic(erg->xc_ref_sorted, fitcoords, erg->mc_sorted,
 +                                   rotg->nat, erg->xc_ref_center, center, rotg->vec);
 +
 +    return fitangle;
 +}
 +
 +
 +/* Determine actual angle of each slab by RMSD fit to the reference */
 +/* Not parallelized, call this routine only on the master */
 +static void flex_fit_angle_perslab(
 +        int       g,
 +        t_rotgrp *rotg,
 +        double    t,
 +        real      degangle,
 +        FILE     *fp)
 +{
 +    int             i, l, n, islab, ind;
 +    rvec            curr_x, ref_x;
 +    rvec            act_center; /* Center of actual positions that are passed to the fit routine */
 +    rvec            ref_center; /* Same for the reference positions */
 +    real            fitangle;   /* Angle of a slab derived from an RMSD fit to
 +                                 * the reference structure at t=0  */
 +    t_gmx_slabdata *sd;
 +    gmx_enfrotgrp_t erg;        /* Pointer to enforced rotation group data */
 +    real            OOm_av;     /* 1/average_mass of a rotation group atom */
 +    real            m_rel;      /* Relative mass of a rotation group atom  */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Average mass of a rotation group atom: */
 +    OOm_av = erg->invmass*rotg->nat;
 +
 +    /**********************************/
 +    /* First collect the data we need */
 +    /**********************************/
 +
 +    /* Collect the data for the individual slabs */
 +    for (n = erg->slab_first; n <= erg->slab_last; n++)
 +    {
 +        islab   = n - erg->slab_first; /* slab index */
 +        sd      = &(rotg->enfrotgrp->slab_data[islab]);
 +        sd->nat = erg->lastatom[islab]-erg->firstatom[islab]+1;
 +        ind     = 0;
 +
 +        /* Loop over the relevant atoms in the slab */
 +        for (l = erg->firstatom[islab]; l <= erg->lastatom[islab]; l++)
 +        {
 +            /* Current position of this atom: x[ii][XX/YY/ZZ] */
 +            copy_rvec(erg->xc[l], curr_x);
 +
 +            /* The (unrotated) reference position of this atom is copied to ref_x.
 +             * Beware, the xc coords have been sorted in do_flexible */
 +            copy_rvec(erg->xc_ref_sorted[l], ref_x);
 +
 +            /* Save data for doing angular RMSD fit later */
 +            /* Save the current atom position */
 +            copy_rvec(curr_x, sd->x[ind]);
 +            /* Save the corresponding reference position */
 +            copy_rvec(ref_x, sd->ref[ind]);
 +
 +            /* If mass-weighting was requested, additionally multiply the
 +             * weights by the relative mass of the atom; if not, the relative
 +             * mass is unity. */
 +            m_rel = erg->mc_sorted[l]*OOm_av;
 +
 +            /* Save the weight for this atom in this slab */
 +            sd->weight[ind] = gaussian_weight(curr_x, rotg, n) * m_rel;
 +
 +            /* Next atom in this slab */
 +            ind++;
 +        }
 +    }
 +
 +    /******************************/
 +    /* Now do the fit calculation */
 +    /******************************/
 +
 +    fprintf(fp, "%12.3e%6d%12.3f", t, g, degangle);
 +
 +    /* === Now do RMSD fitting for each slab === */
 +    /* We require at least SLAB_MIN_ATOMS in a slab, such that the fit makes sense. */
 +#define SLAB_MIN_ATOMS 4
 +
 +    for (n = erg->slab_first; n <= erg->slab_last; n++)
 +    {
 +        islab = n - erg->slab_first; /* slab index */
 +        sd    = &(rotg->enfrotgrp->slab_data[islab]);
 +        if (sd->nat >= SLAB_MIN_ATOMS)
 +        {
 +            /* Get the center of the slab's reference and current positions */
 +            get_center(sd->ref, sd->weight, sd->nat, ref_center);
 +            get_center(sd->x, sd->weight, sd->nat, act_center);
 +            if (rotg->eFittype == erotgFitNORM)
 +            {
 +                /* Normalize every position to its reference length
 +                 * prior to performing the fit */
 +                for (i = 0; i < sd->nat; i++) /* Center */
 +                {
 +                    rvec_dec(sd->ref[i], ref_center);
 +                    rvec_dec(sd->x[i], act_center);
 +                    /* Normalize x_i such that it gets the same length as ref_i */
 +                    svmul( norm(sd->ref[i])/norm(sd->x[i]), sd->x[i], sd->x[i] );
 +                }
 +                /* We already subtracted the centers */
 +                clear_rvec(ref_center);
 +                clear_rvec(act_center);
 +            }
 +            fitangle = -opt_angle_analytic(sd->ref, sd->x, sd->weight, sd->nat,
 +                                           ref_center, act_center, rotg->vec);
 +            fprintf(fp, "%6d%6d%12.3f", n, sd->nat, fitangle);
 +        }
 +    }
 +    fprintf(fp, "\n");
 +
 +#undef SLAB_MIN_ATOMS
 +}
 +
 +
 +/* Shift the position x by the box shift vector is */
 +static gmx_inline void shift_single_coord(matrix box, rvec x, const ivec is)
 +{
 +    int tx, ty, tz;
 +
 +
 +    tx = is[XX];
 +    ty = is[YY];
 +    tz = is[ZZ];
 +
 +    if (TRICLINIC(box))
 +    {
 +        x[XX] += tx*box[XX][XX]+ty*box[YY][XX]+tz*box[ZZ][XX];
 +        x[YY] += ty*box[YY][YY]+tz*box[ZZ][YY];
 +        x[ZZ] += tz*box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        x[XX] += tx*box[XX][XX];
 +        x[YY] += ty*box[YY][YY];
 +        x[ZZ] += tz*box[ZZ][ZZ];
 +    }
 +}
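 +
 +/* Example of the shift above: in a rectangular box only the diagonal terms
 + * survive, so, assuming a 3 x 4 x 5 nm box and is = (1, 0, -1), the atom is
 + * moved by
 + *
 + *   x[XX] += 1*3.0;   x[YY] += 0*4.0;   x[ZZ] += -1*5.0;
 + *
 + * In a triclinic box the off-diagonal entries couple the dimensions, which
 + * is why x[XX] above also picks up ty*box[YY][XX] and tz*box[ZZ][XX]. */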
 +
 +
 +/* Determine the 'home' slab of this atom which is the
 + * slab with the highest Gaussian weight of all */
 +#define round(a) (int)(a+0.5)
 +static gmx_inline int get_homeslab(
 +        rvec curr_x,   /* The position for which the home slab shall be determined */
 +        rvec rotvec,   /* The rotation vector */
 +        real slabdist) /* The slab distance */
 +{
 +    real dist;
 +
 +
 +    /* The distance of the atom from the coordinate center (where the
 +     * slab with index 0 is located) is */
 +    dist = iprod(rotvec, curr_x);
 +
 +    return round(dist / slabdist);
 +}
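 +
 +/* Worked example of the home slab calculation, assuming a unit rotation
 + * vector along z, slabdist = 1.5 nm and an atom at (0.3, 0.2, 2.0):
 + *
 + *   dist     = iprod(rotvec, curr_x) = 2.0
 + *   homeslab = round(2.0/1.5) = (int)(1.33 + 0.5) = 1
 + *
 + * i.e. the atom contributes most strongly to slab n = 1. */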
 +
 +
 +/* For a local atom determine the relevant slabs, i.e. slabs in
 + * which the gaussian is larger than min_gaussian
 + */
 +static int get_single_atom_gaussians(
 +        rvec       curr_x,
 +        t_rotgrp  *rotg)
 +{
 +    int             slab, homeslab;
 +    real            g;
 +    int             count = 0;
 +    gmx_enfrotgrp_t erg;      /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Determine the 'home' slab of this atom: */
 +    homeslab = get_homeslab(curr_x, rotg->vec, rotg->slab_dist);
 +
 +    /* First determine the weight in the atoms home slab: */
 +    g = gaussian_weight(curr_x, rotg, homeslab);
 +
 +    erg->gn_atom[count]    = g;
 +    erg->gn_slabind[count] = homeslab;
 +    count++;
 +
 +
 +    /* Determine the max slab */
 +    slab = homeslab;
 +    while (g > rotg->min_gaussian)
 +    {
 +        slab++;
 +        g = gaussian_weight(curr_x, rotg, slab);
 +        erg->gn_slabind[count] = slab;
 +        erg->gn_atom[count]    = g;
 +        count++;
 +    }
 +    count--;
 +
 +    /* Determine the min slab */
 +    slab = homeslab;
 +    do
 +    {
 +        slab--;
 +        g = gaussian_weight(curr_x, rotg, slab);
 +        erg->gn_slabind[count] = slab;
 +        erg->gn_atom[count]    = g;
 +        count++;
 +    }
 +    while (g > rotg->min_gaussian);
 +    count--;
 +
 +    return count;
 +}
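 +
 +/* The scan above walks outward from the home slab in both directions and
 + * stops one slab past the min_gaussian cutoff each time, which is why count
 + * is decremented twice. A sketch, assuming homeslab = 2 and the cutoff being
 + * reached at slabs 5 and -1:
 + *
 + *   upward:   slabs 2, 3, 4 kept; slab  5 stored but removed by count--
 + *   downward: slabs 1, 0    kept; slab -1 stored but removed by count--
 + *
 + * so the returned count is the number of slabs with significant weight. */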
 +
 +
 +static void flex2_precalc_inner_sum(t_rotgrp *rotg)
 +{
 +    int             i, n, islab;
 +    rvec            xi;       /* positions in the i-sum                        */
 +    rvec            xcn, ycn; /* the current and the reference slab centers    */
 +    real            gaussian_xi;
 +    rvec            yi0;
 +    rvec            rin;     /* Helper variables                              */
 +    real            fac, fac2;
 +    rvec            innersumvec;
 +    real            OOpsii, OOpsiistar;
 +    real            sin_rin; /* s_in . r_in */
 +    rvec            s_in, tmpvec, tmpvec2;
 +    real            mi, wi;  /* Mass-weighting of the positions                 */
 +    real            N_M;     /* N/M                                             */
 +    gmx_enfrotgrp_t erg;     /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +    N_M = rotg->nat * erg->invmass;
 +
 +    /* Loop over all slabs that contain something */
 +    for (n = erg->slab_first; n <= erg->slab_last; n++)
 +    {
 +        islab = n - erg->slab_first; /* slab index */
 +
 +        /* The current center of this slab is saved in xcn: */
 +        copy_rvec(erg->slab_center[islab], xcn);
 +        /* ... and the reference center in ycn: */
 +        copy_rvec(erg->slab_center_ref[islab+erg->slab_buffer], ycn);
 +
 +        /*** D. Calculate the whole inner sum used for second and third sum */
 +        /* For slab n, we need to loop over all atoms i again. Since we sorted
 +         * the atoms with respect to the rotation vector, we know that it is sufficient
 +         * to calculate from firstatom to lastatom only. All other contributions will
 +         * be very small. */
 +        clear_rvec(innersumvec);
 +        for (i = erg->firstatom[islab]; i <= erg->lastatom[islab]; i++)
 +        {
 +            /* Coordinate xi of this atom */
 +            copy_rvec(erg->xc[i], xi);
 +
 +            /* The i-weights */
 +            gaussian_xi = gaussian_weight(xi, rotg, n);
 +            mi          = erg->mc_sorted[i]; /* need the sorted mass here */
 +            wi          = N_M*mi;
 +
 +            /* Calculate rin */
 +            copy_rvec(erg->xc_ref_sorted[i], yi0); /* Reference position yi0   */
 +            rvec_sub(yi0, ycn, tmpvec2);           /* tmpvec2 = yi0 - ycn      */
 +            mvmul(erg->rotmat, tmpvec2, rin);      /* rin = Omega.(yi0 - ycn)  */
 +
 +            /* Calculate psi_i* and sin */
 +            rvec_sub(xi, xcn, tmpvec2);           /* tmpvec2 = xi - xcn       */
 +            cprod(rotg->vec, tmpvec2, tmpvec);    /* tmpvec = v x (xi - xcn)  */
 +            OOpsiistar = norm2(tmpvec)+rotg->eps; /* OOpsii* = 1/psii* = |v x (xi-xcn)|^2 + eps */
 +            OOpsii     = norm(tmpvec);            /* OOpsii = 1 / psii = |v x (xi - xcn)| */
 +
-             /*        v x Omega*(yi0-ycn)    */
++            /*                           *         v x (xi - xcn)          */
 +            unitv(tmpvec, s_in);        /*  sin = ----------------         */
 +                                        /*        |v x (xi - xcn)|         */
 +
 +            sin_rin = iprod(s_in, rin); /* sin_rin = sin . rin             */
 +
 +            /* Now the whole sum */
 +            fac = OOpsii/OOpsiistar;
 +            svmul(fac, rin, tmpvec);
 +            fac2 = fac*fac*OOpsii;
 +            svmul(fac2*sin_rin, s_in, tmpvec2);
 +            rvec_dec(tmpvec, tmpvec2);
 +
 +            svmul(wi*gaussian_xi*sin_rin, tmpvec, tmpvec2);
 +
 +            rvec_inc(innersumvec, tmpvec2);
 +        } /* now we have the inner sum, used both for sum2 and sum3 */
 +
 +        /* Save it to be used in do_flex2_lowlevel */
 +        copy_rvec(innersumvec, erg->slab_innersumvec[islab]);
 +    } /* END of loop over slabs */
 +}
 +
 +
 +static void flex_precalc_inner_sum(t_rotgrp *rotg)
 +{
 +    int             i, n, islab;
 +    rvec            xi;       /* position                                      */
 +    rvec            xcn, ycn; /* the current and the reference slab centers    */
 +    rvec            qin, rin; /* q_i^n and r_i^n                               */
 +    real            bin;
 +    rvec            tmpvec;
 +    rvec            innersumvec; /* Inner part of sum_n2                          */
 +    real            gaussian_xi; /* Gaussian weight gn(xi)                        */
 +    real            mi, wi;      /* Mass-weighting of the positions               */
 +    real            N_M;         /* N/M                                           */
 +
 +    gmx_enfrotgrp_t erg;         /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +    N_M = rotg->nat * erg->invmass;
 +
 +    /* Loop over all slabs that contain something */
 +    for (n = erg->slab_first; n <= erg->slab_last; n++)
 +    {
 +        islab = n - erg->slab_first; /* slab index */
 +
 +        /* The current center of this slab is saved in xcn: */
 +        copy_rvec(erg->slab_center[islab], xcn);
 +        /* ... and the reference center in ycn: */
 +        copy_rvec(erg->slab_center_ref[islab+erg->slab_buffer], ycn);
 +
 +        /* For slab n, we need to loop over all atoms i again. Since we sorted
 +         * the atoms with respect to the rotation vector, we know that it is sufficient
 +         * to calculate from firstatom to lastatom only. All other contributions will
 +         * be very small. */
 +        clear_rvec(innersumvec);
 +        for (i = erg->firstatom[islab]; i <= erg->lastatom[islab]; i++)
 +        {
 +            /* Coordinate xi of this atom */
 +            copy_rvec(erg->xc[i], xi);
 +
 +            /* The i-weights */
 +            gaussian_xi = gaussian_weight(xi, rotg, n);
 +            mi          = erg->mc_sorted[i]; /* need the sorted mass here */
 +            wi          = N_M*mi;
 +
 +            /* Calculate rin and qin */
 +            rvec_sub(erg->xc_ref_sorted[i], ycn, tmpvec); /* tmpvec = yi0-ycn */
 +            mvmul(erg->rotmat, tmpvec, rin);              /* rin = Omega.(yi0 - ycn)  */
 +            cprod(rotg->vec, rin, tmpvec);                /* tmpvec = v x Omega*(yi0-ycn) */
 +
-             /*         v x (xj - xcn)          */
++            /*                                *        v x Omega*(yi0-ycn)    */
 +            unitv(tmpvec, qin);              /* qin = ---------------------   */
 +                                             /*       |v x Omega*(yi0-ycn)|   */
 +
 +            /* Calculate bin */
 +            rvec_sub(xi, xcn, tmpvec);            /* tmpvec = xi-xcn          */
 +            bin = iprod(qin, tmpvec);             /* bin  = qin*(xi-xcn)      */
 +
 +            svmul(wi*gaussian_xi*bin, qin, tmpvec);
 +
 +            /* Add this contribution to the inner sum: */
 +            rvec_add(innersumvec, tmpvec, innersumvec);
 +        } /* now we have the inner sum vector S^n for this slab */
 +          /* Save it to be used in do_flex_lowlevel */
 +        copy_rvec(innersumvec, erg->slab_innersumvec[islab]);
 +    }
 +}
 +
 +
 +static real do_flex2_lowlevel(
 +        t_rotgrp  *rotg,
 +        real       sigma,   /* The Gaussian width sigma */
 +        rvec       x[],
 +        gmx_bool   bOutstepRot,
 +        gmx_bool   bOutstepSlab,
 +        matrix     box)
 +{
 +    int             count, ic, ii, j, m, n, islab, iigrp, ifit;
 +    rvec            xj;          /* position in the i-sum                         */
 +    rvec            yj0;         /* the reference position in the j-sum           */
 +    rvec            xcn, ycn;    /* the current and the reference slab centers    */
 +    real            V;           /* This node's part of the rotation pot. energy  */
 +    real            gaussian_xj; /* Gaussian weight                               */
 +    real            beta;
 +
 +    real            numerator, fit_numerator;
 +    rvec            rjn, fit_rjn; /* Helper variables                              */
 +    real            fac, fac2;
 +
 +    real            OOpsij, OOpsijstar;
 +    real            OOsigma2; /* 1/(sigma^2)                                   */
 +    real            sjn_rjn;
 +    real            betasigpsi;
 +    rvec            sjn, tmpvec, tmpvec2, yj0_ycn;
 +    rvec            sum1vec_part, sum1vec, sum2vec_part, sum2vec, sum3vec, sum4vec, innersumvec;
 +    real            sum3, sum4;
 +    gmx_enfrotgrp_t erg;     /* Pointer to enforced rotation group data       */
 +    real            mj, wj;  /* Mass-weighting of the positions               */
 +    real            N_M;     /* N/M                                           */
 +    real            Wjn;     /* g_n(x_j) m_j / Mjn                            */
 +    gmx_bool        bCalcPotFit;
 +
 +    /* To calculate the torque per slab */
 +    rvec slab_force;         /* Single force from slab n on one atom          */
 +    rvec slab_sum1vec_part;
 +    real slab_sum3part, slab_sum4part;
 +    rvec slab_sum1vec, slab_sum2vec, slab_sum3vec, slab_sum4vec;
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Pre-calculate the inner sums, so that we do not have to calculate
 +     * them again for every atom */
 +    flex2_precalc_inner_sum(rotg);
 +
 +    bCalcPotFit = (bOutstepRot || bOutstepSlab) && (erotgFitPOT == rotg->eFittype);
 +
 +    /********************************************************/
 +    /* Main loop over all local atoms of the rotation group */
 +    /********************************************************/
 +    N_M      = rotg->nat * erg->invmass;
 +    V        = 0.0;
 +    OOsigma2 = 1.0 / (sigma*sigma);
 +    for (j = 0; j < erg->nat_loc; j++)
 +    {
 +        /* Local index of a rotation group atom  */
 +        ii = erg->ind_loc[j];
 +        /* Position of this atom in the collective array */
 +        iigrp = erg->xc_ref_ind[j];
 +        /* Mass-weighting */
 +        mj = erg->mc[iigrp];  /* need the unsorted mass here */
 +        wj = N_M*mj;
 +
 +        /* Current position of this atom: x[ii][XX/YY/ZZ]
 +         * Note that erg->xc_center contains the center of mass in case the flex2-t
 +         * potential was chosen. For the flex2 potential erg->xc_center must be
 +         * zero. */
 +        rvec_sub(x[ii], erg->xc_center, xj);
 +
 +        /* Shift this atom such that it is near its reference */
 +        shift_single_coord(box, xj, erg->xc_shifts[iigrp]);
 +
 +        /* Determine the slabs to loop over, i.e. the ones with contributions
 +         * larger than min_gaussian */
 +        count = get_single_atom_gaussians(xj, rotg);
 +
 +        clear_rvec(sum1vec_part);
 +        clear_rvec(sum2vec_part);
 +        sum3 = 0.0;
 +        sum4 = 0.0;
 +        /* Loop over the relevant slabs for this atom */
 +        for (ic = 0; ic < count; ic++)
 +        {
 +            n = erg->gn_slabind[ic];
 +
 +            /* Get the precomputed Gaussian for xj in slab n */
 +            gaussian_xj = erg->gn_atom[ic];
 +
 +            islab = n - erg->slab_first; /* slab index */
 +
 +            /* The (unrotated) reference position of this atom is copied to yj0: */
 +            copy_rvec(rotg->x_ref[iigrp], yj0);
 +
 +            beta = calc_beta(xj, rotg, n);
 +
 +            /* The current center of this slab is saved in xcn: */
 +            copy_rvec(erg->slab_center[islab], xcn);
 +            /* ... and the reference center in ycn: */
 +            copy_rvec(erg->slab_center_ref[islab+erg->slab_buffer], ycn);
 +
 +            rvec_sub(yj0, ycn, yj0_ycn);          /* yj0_ycn = yj0 - ycn      */
 +
 +            /* Rotate: */
 +            mvmul(erg->rotmat, yj0_ycn, rjn);     /* rjn = Omega.(yj0 - ycn)  */
 +
 +            /* Subtract the slab center from xj */
 +            rvec_sub(xj, xcn, tmpvec2);           /* tmpvec2 = xj - xcn       */
 +
 +            /* Calculate sjn */
 +            cprod(rotg->vec, tmpvec2, tmpvec);    /* tmpvec = v x (xj - xcn)  */
 +
 +            OOpsijstar = norm2(tmpvec)+rotg->eps; /* OOpsij* = 1/psij* = |v x (xj-xcn)|^2 + eps */
 +
 +            numerator = sqr(iprod(tmpvec, rjn));
 +
 +            /*********************************/
 +            /* Add to the rotation potential */
 +            /*********************************/
 +            V += 0.5*rotg->k*wj*gaussian_xj*numerator/OOpsijstar;
 +
 +            /* If requested, also calculate the potential for a set of angles
 +             * near the current reference angle */
 +            if (bCalcPotFit)
 +            {
 +                for (ifit = 0; ifit < rotg->PotAngle_nstep; ifit++)
 +                {
 +                    mvmul(erg->PotAngleFit->rotmat[ifit], yj0_ycn, fit_rjn);
 +                    fit_numerator              = sqr(iprod(tmpvec, fit_rjn));
 +                    erg->PotAngleFit->V[ifit] += 0.5*rotg->k*wj*gaussian_xj*fit_numerator/OOpsijstar;
 +                }
 +            }
 +
 +            /*************************************/
 +            /* Now calculate the force on atom j */
 +            /*************************************/
 +
 +            OOpsij = norm(tmpvec);    /* OOpsij = 1 / psij = |v x (xj - xcn)| */
 +
-     rvec            tmpvec, tmpvec2, tmp_f; /* Helper variables                         */
++            /*                              *         v x (xj - xcn)          */
 +            unitv(tmpvec, sjn);            /*  sjn = ----------------         */
 +                                           /*        |v x (xj - xcn)|         */
 +
 +            sjn_rjn = iprod(sjn, rjn);     /* sjn_rjn = sjn . rjn             */
 +
 +
 +            /*** A. Calculate the first of the four sum terms: ****************/
 +            fac = OOpsij/OOpsijstar;
 +            svmul(fac, rjn, tmpvec);
 +            fac2 = fac*fac*OOpsij;
 +            svmul(fac2*sjn_rjn, sjn, tmpvec2);
 +            rvec_dec(tmpvec, tmpvec2);
 +            fac2 = wj*gaussian_xj; /* also needed for sum4 */
 +            svmul(fac2*sjn_rjn, tmpvec, slab_sum1vec_part);
 +            /********************/
 +            /*** Add to sum1: ***/
 +            /********************/
 +            rvec_inc(sum1vec_part, slab_sum1vec_part); /* sum1 still needs to be cross-multiplied with v */
 +
 +            /*** B. Calculate the fourth of the four sum terms: ***************/
 +            betasigpsi = beta*OOsigma2*OOpsij; /* this is also needed for sum3 */
 +            /********************/
 +            /*** Add to sum4: ***/
 +            /********************/
 +            slab_sum4part = fac2*betasigpsi*fac*sjn_rjn*sjn_rjn; /* Note that fac is still valid from above */
 +            sum4         += slab_sum4part;
 +
 +            /*** C. Calculate Wjn for second and third sum */
 +            /* Note that we can safely divide by slab_weights since we check in
 +             * get_slab_centers that it is non-zero. */
 +            Wjn = gaussian_xj*mj/erg->slab_weights[islab];
 +
 +            /* We already have precalculated the inner sum for slab n */
 +            copy_rvec(erg->slab_innersumvec[islab], innersumvec);
 +
 +            /* Weigh the inner sum vector with Wjn */
 +            svmul(Wjn, innersumvec, innersumvec);
 +
 +            /*** E. Calculate the second of the four sum terms: */
 +            /********************/
 +            /*** Add to sum2: ***/
 +            /********************/
 +            rvec_inc(sum2vec_part, innersumvec); /* sum2 still needs to be cross-multiplied with v */
 +
 +            /*** F. Calculate the third of the four sum terms: */
 +            slab_sum3part = betasigpsi * iprod(sjn, innersumvec);
 +            sum3         += slab_sum3part; /* still needs to be multiplied with v */
 +
 +            /*** G. Calculate the torque on the local slab's axis: */
 +            if (bOutstepRot)
 +            {
 +                /* Sum1 */
 +                cprod(slab_sum1vec_part, rotg->vec, slab_sum1vec);
 +                /* Sum2 */
 +                cprod(innersumvec, rotg->vec, slab_sum2vec);
 +                /* Sum3 */
 +                svmul(slab_sum3part, rotg->vec, slab_sum3vec);
 +                /* Sum4 */
 +                svmul(slab_sum4part, rotg->vec, slab_sum4vec);
 +
 +                /* The force on atom ii from slab n only: */
 +                for (m = 0; m < DIM; m++)
 +                {
 +                    slab_force[m] = rotg->k * (-slab_sum1vec[m] + slab_sum2vec[m] - slab_sum3vec[m] + 0.5*slab_sum4vec[m]);
 +                }
 +
 +                erg->slab_torque_v[islab] += torque(rotg->vec, slab_force, xj, xcn);
 +            }
 +        } /* END of loop over slabs */
 +
 +        /* Construct the four individual parts of the vector sum: */
 +        cprod(sum1vec_part, rotg->vec, sum1vec);      /* sum1vec =   { } x v  */
 +        cprod(sum2vec_part, rotg->vec, sum2vec);      /* sum2vec =   { } x v  */
 +        svmul(sum3, rotg->vec, sum3vec);              /* sum3vec =   { } . v  */
 +        svmul(sum4, rotg->vec, sum4vec);              /* sum4vec =   { } . v  */
 +
 +        /* Store the additional force so that it can be added to the force
 +         * array after the normal forces have been evaluated */
 +        for (m = 0; m < DIM; m++)
 +        {
 +            erg->f_rot_loc[j][m] = rotg->k * (-sum1vec[m] + sum2vec[m] - sum3vec[m] + 0.5*sum4vec[m]);
 +        }
 +
 +#ifdef SUM_PARTS
 +        fprintf(stderr, "sum1: %15.8f %15.8f %15.8f\n",    -rotg->k*sum1vec[XX],    -rotg->k*sum1vec[YY],    -rotg->k*sum1vec[ZZ]);
 +        fprintf(stderr, "sum2: %15.8f %15.8f %15.8f\n",     rotg->k*sum2vec[XX],     rotg->k*sum2vec[YY],     rotg->k*sum2vec[ZZ]);
 +        fprintf(stderr, "sum3: %15.8f %15.8f %15.8f\n",    -rotg->k*sum3vec[XX],    -rotg->k*sum3vec[YY],    -rotg->k*sum3vec[ZZ]);
 +        fprintf(stderr, "sum4: %15.8f %15.8f %15.8f\n", 0.5*rotg->k*sum4vec[XX], 0.5*rotg->k*sum4vec[YY], 0.5*rotg->k*sum4vec[ZZ]);
 +#endif
 +
 +        PRINT_FORCE_J
 +
 +    } /* END of loop over local atoms */
 +
 +    return V;
 +}
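 +
 +/* The flex2 potential accumulated in the loop above can be read off the
 + * V += line, in the notation used in the comments:
 + *
 + *                                 [ (v x (xj-xcn)) . Omega.(yj0-ycn) ]^2
 + *   V = k/2 sum_n sum_j wj gn(xj) --------------------------------------
 + *                                        | v x (xj-xcn) |^2 + eps
 + */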
 +
 +
 +static real do_flex_lowlevel(
 +        t_rotgrp *rotg,
 +        real      sigma,     /* The Gaussian width sigma                      */
 +        rvec      x[],
 +        gmx_bool  bOutstepRot,
 +        gmx_bool  bOutstepSlab,
 +        matrix    box)
 +{
 +    int             count, ic, ifit, ii, j, m, n, islab, iigrp;
 +    rvec            xj, yj0;                /* current and reference position                */
 +    rvec            xcn, ycn;               /* the current and the reference slab centers    */
 +    rvec            yj0_ycn;                /* yj0 - ycn                                     */
 +    rvec            xj_xcn;                 /* xj - xcn                                      */
 +    rvec            qjn, fit_qjn;           /* q_j^n                                         */
 +    rvec            sum_n1, sum_n2;         /* Two contributions to the rotation force       */
 +    rvec            innersumvec;            /* Inner part of sum_n2                          */
 +    rvec            s_n;
 +    rvec            force_n;                /* Single force from slab n on one atom          */
 +    rvec            force_n1, force_n2;     /* First and second part of force_n              */
-             /*         v x Omega.(yj0-ycn)    */
++    rvec            tmpvec, tmpvec2, tmp_f; /* Helper variables                              */
 +    real            V;                      /* The rotation potential energy                 */
 +    real            OOsigma2;               /* 1/(sigma^2)                                   */
 +    real            beta;                   /* beta_n(xj)                                    */
 +    real            bjn, fit_bjn;           /* b_j^n                                         */
 +    real            gaussian_xj;            /* Gaussian weight gn(xj)                        */
 +    real            betan_xj_sigma2;
 +    real            mj, wj;                 /* Mass-weighting of the positions               */
 +    real            N_M;                    /* N/M                                           */
 +    gmx_enfrotgrp_t erg;                    /* Pointer to enforced rotation group data       */
 +    gmx_bool        bCalcPotFit;
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Pre-calculate the inner sums, so that we do not have to calculate
 +     * them again for every atom */
 +    flex_precalc_inner_sum(rotg);
 +
 +    bCalcPotFit = (bOutstepRot || bOutstepSlab) && (erotgFitPOT == rotg->eFittype);
 +
 +    /********************************************************/
 +    /* Main loop over all local atoms of the rotation group */
 +    /********************************************************/
 +    OOsigma2 = 1.0/(sigma*sigma);
 +    N_M      = rotg->nat * erg->invmass;
 +    V        = 0.0;
 +    for (j = 0; j < erg->nat_loc; j++)
 +    {
 +        /* Local index of a rotation group atom  */
 +        ii = erg->ind_loc[j];
 +        /* Position of this atom in the collective array */
 +        iigrp = erg->xc_ref_ind[j];
 +        /* Mass-weighting */
 +        mj = erg->mc[iigrp];  /* need the unsorted mass here */
 +        wj = N_M*mj;
 +
 +        /* Current position of this atom: x[ii][XX/YY/ZZ]
 +         * Note that erg->xc_center contains the center of mass in case the flex-t
 +         * potential was chosen. For the flex potential erg->xc_center must be
 +         * zero. */
 +        rvec_sub(x[ii], erg->xc_center, xj);
 +
 +        /* Shift this atom such that it is near its reference */
 +        shift_single_coord(box, xj, erg->xc_shifts[iigrp]);
 +
 +        /* Determine the slabs to loop over, i.e. the ones with contributions
 +         * larger than min_gaussian */
 +        count = get_single_atom_gaussians(xj, rotg);
 +
 +        clear_rvec(sum_n1);
 +        clear_rvec(sum_n2);
 +
 +        /* Loop over the relevant slabs for this atom */
 +        for (ic = 0; ic < count; ic++)
 +        {
 +            n = erg->gn_slabind[ic];
 +
 +            /* Get the precomputed Gaussian for xj in slab n */
 +            gaussian_xj = erg->gn_atom[ic];
 +
 +            islab = n - erg->slab_first; /* slab index */
 +
 +            /* The (unrotated) reference position of this atom is saved in yj0: */
 +            copy_rvec(rotg->x_ref[iigrp], yj0);
 +
 +            beta = calc_beta(xj, rotg, n);
 +
 +            /* The current center of this slab is saved in xcn: */
 +            copy_rvec(erg->slab_center[islab], xcn);
 +            /* ... and the reference center in ycn: */
 +            copy_rvec(erg->slab_center_ref[islab+erg->slab_buffer], ycn);
 +
 +            rvec_sub(yj0, ycn, yj0_ycn); /* yj0_ycn = yj0 - ycn */
 +
 +            /* Rotate: */
 +            mvmul(erg->rotmat, yj0_ycn, tmpvec2); /* tmpvec2= Omega.(yj0-ycn) */
 +
 +            /* Subtract the slab center from xj */
 +            rvec_sub(xj, xcn, xj_xcn);           /* xj_xcn = xj - xcn         */
 +
 +            /* Calculate qjn */
 +            cprod(rotg->vec, tmpvec2, tmpvec); /* tmpvec= v x Omega.(yj0-ycn) */
 +
-                     /*             v x Omega.(yj0-ycn)    */
++            /*                         *         v x Omega.(yj0-ycn)    */
 +            unitv(tmpvec, qjn);       /*  qjn = ---------------------   */
 +                                      /*        |v x Omega.(yj0-ycn)|   */
 +
 +            bjn = iprod(qjn, xj_xcn); /* bjn = qjn * (xj - xcn) */
 +
 +            /*********************************/
 +            /* Add to the rotation potential */
 +            /*********************************/
 +            V += 0.5*rotg->k*wj*gaussian_xj*sqr(bjn);
 +
 +            /* If requested, also calculate the potential for a set of angles
 +             * near the current reference angle */
 +            if (bCalcPotFit)
 +            {
 +                for (ifit = 0; ifit < rotg->PotAngle_nstep; ifit++)
 +                {
 +                    /* As above calculate Omega.(yj0-ycn), now for the other angles */
 +                    mvmul(erg->PotAngleFit->rotmat[ifit], yj0_ycn, tmpvec2); /* tmpvec2= Omega.(yj0-ycn) */
 +                    /* As above calculate qjn */
 +                    cprod(rotg->vec, tmpvec2, tmpvec);                       /* tmpvec= v x Omega.(yj0-ycn) */
-             /*          beta_n(xj)              */
++                    /*                                                        *             v x Omega.(yj0-ycn)    */
 +                    unitv(tmpvec, fit_qjn);                                  /*  fit_qjn = ---------------------   */
 +                                                                             /*            |v x Omega.(yj0-ycn)|   */
 +                    fit_bjn = iprod(fit_qjn, xj_xcn);                        /* fit_bjn = fit_qjn * (xj - xcn) */
 +                    /* Add to the rotation potential for this angle */
 +                    erg->PotAngleFit->V[ifit] += 0.5*rotg->k*wj*gaussian_xj*sqr(fit_bjn);
 +                }
 +            }
 +
 +            /****************************************************************/
 +            /* sum_n1 will typically be the main contribution to the force: */
 +            /****************************************************************/
 +            betan_xj_sigma2 = beta*OOsigma2;  /*  beta_n(xj)/sigma^2  */
 +
 +            /* The next lines calculate
 +             *  qjn - (bjn*beta(xj)/(2sigma^2))v  */
 +            svmul(bjn*0.5*betan_xj_sigma2, rotg->vec, tmpvec2);
 +            rvec_sub(qjn, tmpvec2, tmpvec);
 +
 +            /* Multiply with gn(xj)*bjn: */
 +            svmul(gaussian_xj*bjn, tmpvec, tmpvec2);
 +
 +            /* Sum over n: */
 +            rvec_inc(sum_n1, tmpvec2);
 +
 +            /* We already have precalculated the Sn term for slab n */
 +            copy_rvec(erg->slab_innersumvec[islab], s_n);
-         /*         v x Omega.(yj0-u)     */
++            /*                                                             *          beta_n(xj)              */
 +            svmul(betan_xj_sigma2*iprod(s_n, xj_xcn), rotg->vec, tmpvec); /* tmpvec = ---------- s_n (xj-xcn) */
 +                                                                          /*            sigma^2               */
 +
 +            rvec_sub(s_n, tmpvec, innersumvec);
 +
 +            /* We can safely divide by slab_weights since we check in get_slab_centers
 +             * that it is non-zero. */
 +            svmul(gaussian_xj/erg->slab_weights[islab], innersumvec, innersumvec);
 +
 +            rvec_add(sum_n2, innersumvec, sum_n2);
 +
 +            /* Calculate the torque: */
 +            if (bOutstepRot)
 +            {
 +                /* The force on atom ii from slab n only: */
 +                svmul(-rotg->k*wj, tmpvec2, force_n1);     /* part 1 */
 +                svmul( rotg->k*mj, innersumvec, force_n2); /* part 2 */
 +                rvec_add(force_n1, force_n2, force_n);
 +                erg->slab_torque_v[islab] += torque(rotg->vec, force_n, xj, xcn);
 +            }
 +        } /* END of loop over slabs */
 +
 +        /* Put both contributions together: */
 +        svmul(wj, sum_n1, sum_n1);
 +        svmul(mj, sum_n2, sum_n2);
 +        rvec_sub(sum_n2, sum_n1, tmp_f); /* F = -grad V */
 +
 +        /* Store the additional force so that it can be added to the force
 +         * array after the normal forces have been evaluated */
 +        for (m = 0; m < DIM; m++)
 +        {
 +            erg->f_rot_loc[j][m] = rotg->k*tmp_f[m];
 +        }
 +
 +        PRINT_FORCE_J
 +
 +    } /* END of loop over local atoms */
 +
 +    return V;
 +}
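 +
 +/* The flex potential accumulated in the loop above, read off the V += line:
 + *
 + *   V = k/2 sum_n sum_j wj gn(xj) bjn^2,    bjn = qjn . (xj - xcn)
 + *
 + * with qjn the unit vector along v x Omega.(yj0-ycn). */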
 +
 +#ifdef PRINT_COORDS
 +static void print_coordinates(t_rotgrp *rotg, rvec x[], matrix box, int step)
 +{
 +    int             i;
 +    static FILE    *fp;
 +    static char     buf[STRLEN];
 +    static gmx_bool bFirst = 1;
 +    gmx_enfrotgrp_t erg;        /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    if (bFirst)
 +    {
 +        sprintf(buf, "coords%d.txt", cr->nodeid);
 +        fp     = fopen(buf, "w");
 +        bFirst = 0;
 +    }
 +
 +    fprintf(fp, "\nStep %d\n", step);
 +    fprintf(fp, "box: %f %f %f %f %f %f %f %f %f\n",
 +            box[XX][XX], box[XX][YY], box[XX][ZZ],
 +            box[YY][XX], box[YY][YY], box[YY][ZZ],
 +            box[ZZ][XX], box[ZZ][YY], box[ZZ][ZZ]);
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        fprintf(fp, "%4d  %f %f %f\n", i,
 +                erg->xc[i][XX], erg->xc[i][YY], erg->xc[i][ZZ]);
 +    }
 +    fflush(fp);
 +
 +}
 +#endif
 +
 +
 +static int projection_compare(const void *a, const void *b)
 +{
 +    sort_along_vec_t *xca, *xcb;
 +
 +
 +    xca = (sort_along_vec_t *)a;
 +    xcb = (sort_along_vec_t *)b;
 +
 +    if (xca->xcproj < xcb->xcproj)
 +    {
 +        return -1;
 +    }
 +    else if (xca->xcproj > xcb->xcproj)
 +    {
 +        return 1;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +
 +static void sort_collective_coordinates(
 +        t_rotgrp         *rotg, /* Rotation group */
 +        sort_along_vec_t *data) /* Buffer for sorting the positions */
 +{
 +    int             i;
 +    gmx_enfrotgrp_t erg;       /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* The projection of the position vector on the rotation vector is
 +     * the relevant value for sorting. Fill the 'data' structure */
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        data[i].xcproj = iprod(erg->xc[i], rotg->vec);  /* sort criterion */
 +        data[i].m      = erg->mc[i];
 +        data[i].ind    = i;
 +        copy_rvec(erg->xc[i], data[i].x    );
 +        copy_rvec(rotg->x_ref[i], data[i].x_ref);
 +    }
 +    /* Sort the 'data' structure */
 +    gmx_qsort(data, rotg->nat, sizeof(sort_along_vec_t), projection_compare);
 +
 +    /* Copy back the sorted values */
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        copy_rvec(data[i].x, erg->xc[i]           );
 +        copy_rvec(data[i].x_ref, erg->xc_ref_sorted[i]);
 +        erg->mc_sorted[i]  = data[i].m;
 +        erg->xc_sortind[i] = data[i].ind;
 +    }
 +}
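 +
 +/* A self-contained sketch of the projection sort above, using the C library
 + * qsort in place of gmx_qsort and a hypothetical toy struct; it sorts three
 + * entries by their projection value and prints the permuted indices: */
 +#if 0   /* illustration only */
 +#include <stdio.h>
 +#include <stdlib.h>
 +
 +typedef struct { double xcproj; int ind; } proj_t;
 +
 +static int cmp_proj(const void *a, const void *b)
 +{
 +    const proj_t *pa = (const proj_t *)a;
 +    const proj_t *pb = (const proj_t *)b;
 +
 +    /* Same ordering as projection_compare: ascending in xcproj */
 +    return (pa->xcproj > pb->xcproj) - (pa->xcproj < pb->xcproj);
 +}
 +
 +int main(void)
 +{
 +    proj_t d[3] = { {1.7, 0}, {-0.4, 1}, {0.9, 2} };
 +    int    i;
 +
 +    qsort(d, 3, sizeof(proj_t), cmp_proj);
 +    for (i = 0; i < 3; i++)
 +    {
 +        printf("%d ", d[i].ind); /* prints: 1 2 0 */
 +    }
 +    return 0;
 +}
 +#endif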
 +
 +
 +/* For each slab, get the first and the last index of the sorted atom
 + * indices */
 +static void get_firstlast_atom_per_slab(t_rotgrp *rotg)
 +{
 +    int             i, islab, n;
 +    real            beta;
 +    gmx_enfrotgrp_t erg;     /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Find the first atom that needs to enter the calculation for each slab */
 +    n = erg->slab_first; /* slab */
 +    i = 0;               /* start with the first atom */
 +    do
 +    {
 +        /* Find the first atom that significantly contributes to this slab */
 +        do /* move forward in position until a large enough beta is found */
 +        {
 +            beta = calc_beta(erg->xc[i], rotg, n);
 +            i++;
 +        }
 +        while ((beta < -erg->max_beta) && (i < rotg->nat));
 +        i--;
 +        islab                 = n - erg->slab_first; /* slab index */
 +        erg->firstatom[islab] = i;
 +        /* Proceed to the next slab */
 +        n++;
 +    }
 +    while (n <= erg->slab_last);
 +
 +    /* Find the last atom for each slab */
 +    n = erg->slab_last; /* start with last slab */
 +    i = rotg->nat-1;    /* start with the last atom */
 +    do
 +    {
 +        do  /* move backward in position until a large enough beta is found */
 +        {
 +            beta = calc_beta(erg->xc[i], rotg, n);
 +            i--;
 +        }
 +        while ((beta > erg->max_beta) && (i > -1));
 +        i++;
 +        islab                = n - erg->slab_first; /* slab index */
 +        erg->lastatom[islab] = i;
 +        /* Proceed to the next slab */
 +        n--;
 +    }
 +    while (n >= erg->slab_first);
 +}
 +
 +
 +/* Determine the very first and very last slab that needs to be considered.
 + * For the first slab that needs to be considered, we have to find the smallest
 + * n that obeys:
 + *
 + * x_first * v - n*Delta_x <= beta_max
 + *
 + * slab index n, slab distance Delta_x, rotation vector v. For the last slab we
 + * have to find the largest n that obeys
 + *
 + * x_last * v - n*Delta_x >= -beta_max
 + *
 + */
 +static gmx_inline int get_first_slab(
 +        t_rotgrp *rotg,      /* The rotation group (inputrec data) */
 +        real      max_beta,  /* The max_beta value, instead of min_gaussian */
 +        rvec      firstatom) /* First atom after sorting along the rotation vector v */
 +{
 +    /* Find the first slab for the first atom */
 +    return ceil((iprod(firstatom, rotg->vec) - max_beta)/rotg->slab_dist);
 +}
 +
 +
 +static gmx_inline int get_last_slab(
 +        t_rotgrp *rotg,     /* The rotation group (inputrec data) */
 +        real      max_beta, /* The max_beta value, instead of min_gaussian */
 +        rvec      lastatom) /* Last atom along v */
 +{
 +    /* Find the last slab for the last atom */
 +    return floor((iprod(lastatom, rotg->vec) + max_beta)/rotg->slab_dist);
 +}
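 +
 +/* Worked example for the two slab bounds, assuming slab_dist = 1.5,
 + * max_beta = 2.0, the first atom at projection x.v = -1.0 and the last
 + * atom at x.v = 4.0:
 + *
 + *   slab_first = ceil ((-1.0 - 2.0)/1.5) = ceil (-2.0) = -2
 + *   slab_last  = floor(( 4.0 + 2.0)/1.5) = floor( 4.0) =  4
 + */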
 +
 +
 +static void get_firstlast_slab_check(
 +        t_rotgrp        *rotg,      /* The rotation group (inputrec data) */
 +        t_gmx_enfrotgrp *erg,       /* The rotation group (data only accessible in this file) */
 +        rvec             firstatom, /* First atom after sorting along the rotation vector v */
 +        rvec             lastatom,  /* Last atom along v */
 +        int              g)         /* The rotation group number */
 +{
 +    erg->slab_first = get_first_slab(rotg, erg->max_beta, firstatom);
 +    erg->slab_last  = get_last_slab(rotg, erg->max_beta, lastatom);
 +
 +    /* Check whether we have reference data to compare against */
 +    if (erg->slab_first < erg->slab_first_ref)
 +    {
 +        gmx_fatal(FARGS, "%s No reference data for first slab (n=%d), unable to proceed.",
 +                  RotStr, erg->slab_first);
 +    }
 +
 +    /* Check whether we have reference data to compare against */
 +    if (erg->slab_last > erg->slab_last_ref)
 +    {
 +        gmx_fatal(FARGS, "%s No reference data for last slab (n=%d), unable to proceed.",
 +                  RotStr, erg->slab_last);
 +    }
 +}
 +
 +
 +/* Enforced rotation with a flexible axis */
 +static void do_flexible(
 +        gmx_bool        bMaster,
 +        gmx_enfrot_t    enfrot,       /* Other rotation data                        */
 +        t_rotgrp       *rotg,         /* The rotation group                         */
 +        int             g,            /* Group number                               */
 +        rvec            x[],          /* The local positions                        */
 +        matrix          box,
 +        double          t,            /* Time in picoseconds                        */
 +        gmx_large_int_t step,         /* The time step                              */
 +        gmx_bool        bOutstepRot,  /* Output to main rotation output file        */
 +        gmx_bool        bOutstepSlab) /* Output per-slab data                       */
 +{
 +    int             l, nslabs;
 +    real            sigma;    /* The Gaussian width sigma */
 +    gmx_enfrotgrp_t erg;      /* Pointer to enforced rotation group data */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Define the sigma value */
 +    sigma = 0.7*rotg->slab_dist;
 +
 +    /* Sort the collective coordinates erg->xc along the rotation vector. This is
 +     * an optimization for the inner loop. */
 +    sort_collective_coordinates(rotg, enfrot->data);
 +
 +    /* Determine the first relevant slab for the first atom and the last
 +     * relevant slab for the last atom */
 +    get_firstlast_slab_check(rotg, erg, erg->xc[0], erg->xc[rotg->nat-1], g);
 +
 +    /* For each slab, determine the first and the last atom index between
 +     * which the calculation needs to be done, based on the min_gaussian
 +     * cutoff criterion */
 +    get_firstlast_atom_per_slab(rotg);
 +
 +    /* Determine the gaussian-weighted center of positions for all slabs */
 +    get_slab_centers(rotg, erg->xc, erg->mc_sorted, g, t, enfrot->out_slabs, bOutstepSlab, FALSE);
 +
 +    /* Clear the torque per slab from last time step: */
 +    nslabs = erg->slab_last - erg->slab_first + 1;
 +    for (l = 0; l < nslabs; l++)
 +    {
 +        erg->slab_torque_v[l] = 0.0;
 +    }
 +
 +    /* Call the rotational forces kernel */
 +    if (rotg->eType == erotgFLEX || rotg->eType == erotgFLEXT)
 +    {
 +        erg->V = do_flex_lowlevel(rotg, sigma, x, bOutstepRot, bOutstepSlab, box);
 +    }
 +    else if (rotg->eType == erotgFLEX2 || rotg->eType == erotgFLEX2T)
 +    {
 +        erg->V = do_flex2_lowlevel(rotg, sigma, x, bOutstepRot, bOutstepSlab, box);
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS, "Unknown flexible rotation type");
 +    }
 +
 +    /* Determine angle by RMSD fit to the reference - Let's hope this */
 +    /* only happens once in a while, since this is not parallelized! */
 +    if (bMaster && (erotgFitPOT != rotg->eFittype) )
 +    {
 +        if (bOutstepRot)
 +        {
 +            /* Fit angle of the whole rotation group */
 +            erg->angle_v = flex_fit_angle(rotg);
 +        }
 +        if (bOutstepSlab)
 +        {
 +            /* Fit angle of each slab */
 +            flex_fit_angle_perslab(g, rotg, t, erg->degangle, enfrot->out_angles);
 +        }
 +    }
 +
 +    /* Lump together the torques from all slabs: */
 +    erg->torque_v = 0.0;
 +    for (l = 0; l < nslabs; l++)
 +    {
 +        erg->torque_v += erg->slab_torque_v[l];
 +    }
 +}
 +
 +
 +/* Calculate the angle between reference and actual rotation group atom,
 + * both projected into a plane perpendicular to the rotation vector: */
 +static void angle(t_rotgrp *rotg,
 +                  rvec      x_act,
 +                  rvec      x_ref,
 +                  real     *alpha,
 +                  real     *weight) /* atoms near the rotation axis should count less than atoms far away */
 +{
 +    rvec xp, xrp;                   /* current and reference positions projected on a plane perpendicular to pg->vec */
 +    rvec dum;
 +
 +
 +    /* Project x_ref and x into a plane through the origin perpendicular to rot_vec: */
 +    /* Project x_ref: xrp = x_ref - (vec * x_ref) * vec */
 +    svmul(iprod(rotg->vec, x_ref), rotg->vec, dum);
 +    rvec_sub(x_ref, dum, xrp);
 +    /* Project x_act: */
 +    svmul(iprod(rotg->vec, x_act), rotg->vec, dum);
 +    rvec_sub(x_act, dum, xp);
 +
 +    /* Retrieve information about which vector precedes. gmx_angle always
 +     * returns a positive angle. */
 +    cprod(xp, xrp, dum); /* if the reference precedes, this points in the same direction as vec */
 +
 +    if (iprod(rotg->vec, dum) >= 0)
 +    {
 +        *alpha = -gmx_angle(xrp, xp);
 +    }
 +    else
 +    {
 +        *alpha = +gmx_angle(xrp, xp);
 +    }
 +
 +    /* Also return the weight */
 +    *weight = norm(xp);
 +}
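 +
 +/* Example of the sign convention above, with v = (0,0,1), x_ref = (1,0,0)
 + * and x_act = (0,1,0): both vectors already lie in the plane, and
 + * xp x xrp = (0,0,-1) points against v, so alpha = +90 degrees, i.e. the
 + * atom has rotated a quarter turn in the positive sense around v relative
 + * to its reference. The returned weight is |xp| = 1. */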
 +
 +
 +/* Project first vector onto a plane perpendicular to the second vector
 + * dr = dr - (dr.v)v
 + * Note that v must be of unit length.
 + */
 +static gmx_inline void project_onto_plane(rvec dr, const rvec v)
 +{
 +    rvec tmp;
 +
 +
 +    svmul(iprod(dr, v), v, tmp); /* tmp = (dr.v)v */
 +    rvec_dec(dr, tmp);           /* dr = dr - (dr.v)v */
 +}
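 +
 +/* Example with v = (0,0,1) and dr = (1,2,3): (dr.v)v = (0,0,3), so dr
 + * becomes (1,2,0); the component along v is removed and only the in-plane
 + * part remains. */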
 +
 +
 +/* Fixed rotation: The rotation reference group rotates around the v axis. */
 +/* The atoms of the actual rotation group are attached with imaginary  */
 +/* springs to the reference atoms.                                     */
 +static void do_fixed(
 +        t_rotgrp       *rotg,         /* The rotation group                         */
 +        rvec            x[],          /* The positions                              */
 +        matrix          box,          /* The simulation box                         */
 +        double          t,            /* Time in picoseconds                        */
 +        gmx_large_int_t step,         /* The time step                              */
 +        gmx_bool        bOutstepRot,  /* Output to main rotation output file        */
 +        gmx_bool        bOutstepSlab) /* Output per-slab data                       */
 +{
 +    int             ifit, j, jj, m;
 +    rvec            dr;
 +    rvec            tmp_f;     /* Force */
 +    real            alpha;     /* a single angle between an actual and a reference position */
 +    real            weight;    /* single weight for a single angle */
 +    gmx_enfrotgrp_t erg;       /* Pointer to enforced rotation group data */
 +    rvec            xi_xc;     /* xi - xc */
 +    gmx_bool        bCalcPotFit;
 +    rvec            fit_xr_loc;
 +
 +    /* for mass weighting: */
 +    real      wi;              /* Mass-weighting of the positions */
 +    real      N_M;             /* N/M */
 +    real      k_wi;            /* k times wi */
 +
 +    gmx_bool  bProject;
 +
 +
 +    erg         = rotg->enfrotgrp;
 +    bProject    = (rotg->eType == erotgPM) || (rotg->eType == erotgPMPF);
 +    bCalcPotFit = (bOutstepRot || bOutstepSlab) && (erotgFitPOT == rotg->eFittype);
 +
 +    N_M = rotg->nat * erg->invmass;
 +
 +    /* Each process calculates the forces on its local atoms */
 +    for (j = 0; j < erg->nat_loc; j++)
 +    {
 +        /* Calculate (x_i-x_c) or (x_i-u), respectively */
 +        rvec_sub(erg->x_loc_pbc[j], erg->xc_center, xi_xc);
 +
 +        /* Calculate Omega*(y_i-y_c)-(x_i-x_c) */
 +        rvec_sub(erg->xr_loc[j], xi_xc, dr);
 +
 +        if (bProject)
 +        {
 +            project_onto_plane(dr, rotg->vec);
 +        }
 +
 +        /* Mass-weighting */
 +        wi = N_M*erg->m_loc[j];
 +
 +        /* Store the additional force so that it can be added to the force
 +         * array after the normal forces have been evaluated */
 +        k_wi = rotg->k*wi;
 +        for (m = 0; m < DIM; m++)
 +        {
 +            tmp_f[m]             = k_wi*dr[m];
 +            erg->f_rot_loc[j][m] = tmp_f[m];
 +            erg->V              += 0.5*k_wi*sqr(dr[m]);
 +        }
 +
 +        /* If requested, also calculate the potential for a set of angles
 +         * near the current reference angle */
 +        if (bCalcPotFit)
 +        {
 +            for (ifit = 0; ifit < rotg->PotAngle_nstep; ifit++)
 +            {
 +                /* Index of this rotation group atom with respect to the whole rotation group */
 +                jj = erg->xc_ref_ind[j];
 +
 +                /* Rotate with the alternative angle. Like rotate_local_reference(),
 +                 * just for a single local atom */
 +                mvmul(erg->PotAngleFit->rotmat[ifit], rotg->x_ref[jj], fit_xr_loc); /* fit_xr_loc = Omega*(y_i-y_c) */
 +
 +                /* Calculate Omega*(y_i-y_c)-(x_i-x_c) */
 +                rvec_sub(fit_xr_loc, xi_xc, dr);
 +
 +                if (bProject)
 +                {
 +                    project_onto_plane(dr, rotg->vec);
 +                }
 +
 +                /* Add to the rotation potential for this angle: */
 +                erg->PotAngleFit->V[ifit] += 0.5*k_wi*norm2(dr);
 +            }
 +        }
 +
 +        if (bOutstepRot)
 +        {
 +            /* Add to the torque of this rotation group */
 +            erg->torque_v += torque(rotg->vec, tmp_f, erg->x_loc_pbc[j], erg->xc_center);
 +
 +            /* Calculate the angle between reference and actual rotation group atom. */
 +            angle(rotg, xi_xc, erg->xr_loc[j], &alpha, &weight);  /* angle in rad, weighted */
 +            erg->angle_v  += alpha * weight;
 +            erg->weight_v += weight;
 +        }
 +        /* If you want enforced rotation to contribute to the virial,
 +         * activate the following lines:
 +            if (MASTER(cr))
 +            {
 +               Add the rotation contribution to the virial
 +              for(j=0; j<DIM; j++)
 +                for(m=0;m<DIM;m++)
 +                  vir[j][m] += 0.5*f[ii][j]*dr[m];
 +            }
 +         */
 +
 +        PRINT_FORCE_J
 +
 +    } /* end of loop over local rotation group atoms */
 +}
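 +
 +/* The fixed-rotation potential accumulated above, read off the erg->V line:
 + *
 + *   V = k/2 sum_i wi | Omega.(yi0-yc) - (xi-xc) |^2
 + *
 + * where for the erotgPM/erotgPMPF variants the difference vector is first
 + * projected onto the plane perpendicular to v. */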
 +
 +
 +/* Calculate the radial motion potential and forces */
 +static void do_radial_motion(
 +        t_rotgrp       *rotg,         /* The rotation group                         */
 +        rvec            x[],          /* The positions                              */
 +        matrix          box,          /* The simulation box                         */
 +        double          t,            /* Time in picoseconds                        */
 +        gmx_large_int_t step,         /* The time step                              */
 +        gmx_bool        bOutstepRot,  /* Output to main rotation output file        */
 +        gmx_bool        bOutstepSlab) /* Output per-slab data                       */
 +{
 +    int             j, jj, ifit;
 +    rvec            tmp_f;     /* Force */
 +    real            alpha;     /* a single angle between an actual and a reference position */
 +    real            weight;    /* single weight for a single angle */
 +    gmx_enfrotgrp_t erg;       /* Pointer to enforced rotation group data */
 +    rvec            xj_u;      /* xj - u */
 +    rvec            tmpvec, fit_tmpvec;
 +    real            fac, fac2, sum = 0.0;
 +    rvec            pj;
 +    gmx_bool        bCalcPotFit;
 +
 +    /* For mass weighting: */
 +    real      wj;              /* Mass-weighting of the positions */
 +    real      N_M;             /* N/M */
 +
 +
 +    erg         = rotg->enfrotgrp;
 +    bCalcPotFit = (bOutstepRot || bOutstepSlab) && (erotgFitPOT == rotg->eFittype);
 +
 +    N_M = rotg->nat * erg->invmass;
 +
 +    /* Each process calculates the forces on its local atoms */
 +    for (j = 0; j < erg->nat_loc; j++)
 +    {
 +        /* Calculate (xj-u) */
 +        rvec_sub(erg->x_loc_pbc[j], erg->xc_center, xj_u);  /* xj_u = xj-u */
 +
 +        /* Calculate v x Omega.(yj0-u); xr_loc already holds Omega.(yj0-u) */
 +        cprod(rotg->vec, erg->xr_loc[j], tmpvec);  /* tmpvec = v x Omega.(yj0-u) */
 +
-                 /*         v x Omega.(yj0-u)     */
++        /*                       *         v x Omega.(yj0-u)     */
 +        unitv(tmpvec, pj);      /*  pj = ---------------------   */
 +                                /*       | v x Omega.(yj0-u) |   */
 +
 +        fac  = iprod(pj, xj_u); /* fac = pj.(xj-u) */
 +        fac2 = fac*fac;
 +
 +        /* Mass-weighting */
 +        wj = N_M*erg->m_loc[j];
 +
 +        /* Store the additional force so that it can be added to the force
 +         * array after the normal forces have been evaluated */
 +        svmul(-rotg->k*wj*fac, pj, tmp_f);
 +        copy_rvec(tmp_f, erg->f_rot_loc[j]);
 +        sum += wj*fac2;
 +
 +        /* If requested, also calculate the potential for a set of angles
 +         * near the current reference angle */
 +        if (bCalcPotFit)
 +        {
 +            for (ifit = 0; ifit < rotg->PotAngle_nstep; ifit++)
 +            {
 +                /* Index of this rotation group atom with respect to the whole rotation group */
 +                jj = erg->xc_ref_ind[j];
 +
 +                /* Rotate with the alternative angle. Like rotate_local_reference(),
 +                 * just for a single local atom */
 +                mvmul(erg->PotAngleFit->rotmat[ifit], rotg->x_ref[jj], fit_tmpvec); /* fit_tmpvec = Omega*(yj0-u) */
 +
 +                /* Calculate v x Omega.(yj0-u) */
 +                cprod(rotg->vec, fit_tmpvec, tmpvec); /* tmpvec = v x Omega.(yj0-u) */
-         /*         v x Omega.(yi0-yc0)     */
++                /*                                     *         v x Omega.(yj0-u)     */
 +                unitv(tmpvec, pj);                    /*  pj = ---------------------   */
 +                                                      /*       | v x Omega.(yj0-u) |   */
 +
 +                fac  = iprod(pj, xj_u);               /* fac = pj.(xj-u) */
 +                fac2 = fac*fac;
 +
 +                /* Add to the rotation potential for this angle: */
 +                erg->PotAngleFit->V[ifit] += 0.5*rotg->k*wj*fac2;
 +            }
 +        }
 +
 +        if (bOutstepRot)
 +        {
 +            /* Add to the torque of this rotation group */
 +            erg->torque_v += torque(rotg->vec, tmp_f, erg->x_loc_pbc[j], erg->xc_center);
 +
 +            /* Calculate the angle between reference and actual rotation group atom. */
 +            angle(rotg, xj_u, erg->xr_loc[j], &alpha, &weight);  /* angle in rad, weighted */
 +            erg->angle_v  += alpha * weight;
 +            erg->weight_v += weight;
 +        }
 +
 +        PRINT_FORCE_J
 +
 +    } /* end of loop over local rotation group atoms */
 +    erg->V = 0.5*rotg->k*sum;
 +}
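 +
 +/* The radial-motion potential computed above (erg->V = 0.5*rotg->k*sum):
 + *
 + *   V = k/2 sum_j wj ( pj . (xj-u) )^2
 + *
 + * with pj the unit vector along v x Omega.(yj0-u). */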
 +
 +
 +/* Calculate the radial motion pivot-free potential and forces */
 +static void do_radial_motion_pf(
 +        t_rotgrp       *rotg,         /* The rotation group                         */
 +        rvec            x[],          /* The positions                              */
 +        matrix          box,          /* The simulation box                         */
 +        double          t,            /* Time in picoseconds                        */
 +        gmx_large_int_t step,         /* The time step                              */
 +        gmx_bool        bOutstepRot,  /* Output to main rotation output file        */
 +        gmx_bool        bOutstepSlab) /* Output per-slab data                       */
 +{
 +    int             i, ii, iigrp, ifit, j;
 +    rvec            xj;          /* Current position */
 +    rvec            xj_xc;       /* xj  - xc  */
 +    rvec            yj0_yc0;     /* yj0 - yc0 */
 +    rvec            tmp_f;       /* Force */
 +    real            alpha;       /* a single angle between an actual and a reference position */
 +    real            weight;      /* single weight for a single angle */
 +    gmx_enfrotgrp_t erg;         /* Pointer to enforced rotation group data */
 +    rvec            tmpvec, tmpvec2;
 +    rvec            innersumvec; /* Precalculation of the inner sum */
 +    rvec            innersumveckM;
 +    real            fac, fac2, V = 0.0;
 +    rvec            qi, qj;
 +    gmx_bool        bCalcPotFit;
 +
 +    /* For mass weighting: */
 +    real      mj, wi, wj;      /* Mass-weighting of the positions */
 +    real      N_M;             /* N/M */
 +
 +
 +    erg         = rotg->enfrotgrp;
 +    bCalcPotFit = (bOutstepRot || bOutstepSlab) && (erotgFitPOT == rotg->eFittype);
 +
 +    N_M = rotg->nat * erg->invmass;
 +
 +    /* Get the current center of the rotation group: */
 +    get_center(erg->xc, erg->mc, rotg->nat, erg->xc_center);
 +
 +    /* Precalculate Sum_i [ wi qi.(xi-xc) qi ] which is needed for every single j */
 +    clear_rvec(innersumvec);
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        /* Mass-weighting */
 +        wi = N_M*erg->mc[i];
 +
 +        /* Calculate qi. Note that xc_ref_center has already been subtracted from
 +         * x_ref in init_rot_group.*/
 +        mvmul(erg->rotmat, rotg->x_ref[i], tmpvec); /* tmpvec  = Omega.(yi0-yc0) */
 +
 +        cprod(rotg->vec, tmpvec, tmpvec2);          /* tmpvec2 = v x Omega.(yi0-yc0) */
 +
-         /*         v x Omega.(yj0-yc0)     */
++        /*                                             *         v x Omega.(yi0-yc0)     */
 +        unitv(tmpvec2, qi);                           /*  qi = -----------------------   */
 +                                                      /*       | v x Omega.(yi0-yc0) |   */
 +
 +        rvec_sub(erg->xc[i], erg->xc_center, tmpvec); /* tmpvec = xi-xc */
 +
 +        svmul(wi*iprod(qi, tmpvec), qi, tmpvec2);
 +
 +        rvec_inc(innersumvec, tmpvec2);
 +    }
 +    svmul(rotg->k*erg->invmass, innersumvec, innersumveckM);
 +
 +    /* Each process calculates the forces on its local atoms */
 +    for (j = 0; j < erg->nat_loc; j++)
 +    {
 +        /* Local index of a rotation group atom  */
 +        ii = erg->ind_loc[j];
 +        /* Position of this atom in the collective array */
 +        iigrp = erg->xc_ref_ind[j];
 +        /* Mass-weighting */
 +        mj = erg->mc[iigrp];  /* need the unsorted mass here */
 +        wj = N_M*mj;
 +
 +        /* Current position of this atom: x[ii][XX/YY/ZZ] */
 +        copy_rvec(x[ii], xj);
 +
 +        /* Shift this atom such that it is near its reference */
 +        shift_single_coord(box, xj, erg->xc_shifts[iigrp]);
 +
 +        /* The (unrotated) reference position is yj0. yc0 has already
 +         * been subtracted in init_rot_group */
 +        copy_rvec(rotg->x_ref[iigrp], yj0_yc0);   /* yj0_yc0 = yj0 - yc0      */
 +
 +        /* Calculate Omega.(yj0-yc0) */
 +        mvmul(erg->rotmat, yj0_yc0, tmpvec2); /* tmpvec2 = Omega.(yj0 - yc0)  */
 +
 +        cprod(rotg->vec, tmpvec2, tmpvec);    /* tmpvec = v x Omega.(yj0-yc0) */
 +
-                 /*         v x Omega.(yj0-yc0)     */
++                              /*         v x Omega.(yj0-yc0)     */
 +        unitv(tmpvec, qj);    /*  qj = -----------------------   */
 +                              /*       | v x Omega.(yj0-yc0) |   */
 +
 +        /* Calculate (xj-xc) */
 +        rvec_sub(xj, erg->xc_center, xj_xc); /* xj_xc = xj-xc */
 +
 +        fac  = iprod(qj, xj_xc);             /* fac = qj.(xj-xc) */
 +        fac2 = fac*fac;
 +
 +        /* Store the additional force so that it can be added to the force
 +         * array after the normal forces have been evaluated */
 +        svmul(-rotg->k*wj*fac, qj, tmp_f); /* part 1 of force */
 +        svmul(mj, innersumveckM, tmpvec);  /* part 2 of force */
 +        rvec_inc(tmp_f, tmpvec);
 +        copy_rvec(tmp_f, erg->f_rot_loc[j]);
 +        V += wj*fac2;
 +
 +        /* If requested, also calculate the potential for a set of angles
 +         * near the current reference angle */
 +        if (bCalcPotFit)
 +        {
 +            for (ifit = 0; ifit < rotg->PotAngle_nstep; ifit++)
 +            {
 +                /* Rotate with the alternative angle. Like rotate_local_reference(),
 +                 * just for a single local atom */
 +                mvmul(erg->PotAngleFit->rotmat[ifit], yj0_yc0, tmpvec2); /* tmpvec2 = Omega*(yj0-yc0) */
 +
 +                /* Calculate v x Omega.(yj0-yc0) */
 +                cprod(rotg->vec, tmpvec2, tmpvec); /* tmpvec = v x Omega.(yj0-yc0) */
-         /*                      1           */
++                                                   /*         v x Omega.(yj0-yc0)     */
 +                unitv(tmpvec, qj);                 /*  qj = -----------------------   */
 +                                                   /*       | v x Omega.(yj0-yc0) |   */
 +
 +                fac  = iprod(qj, xj_xc);           /* fac = qj.(xj-xc) */
 +                fac2 = fac*fac;
 +
 +                /* Add to the rotation potential for this angle: */
 +                erg->PotAngleFit->V[ifit] += 0.5*rotg->k*wj*fac2;
 +            }
 +        }
 +
 +        if (bOutstepRot)
 +        {
 +            /* Add to the torque of this rotation group */
 +            erg->torque_v += torque(rotg->vec, tmp_f, xj, erg->xc_center);
 +
 +            /* Calculate the angle between reference and actual rotation group atom. */
 +            angle(rotg, xj_xc, yj0_yc0, &alpha, &weight);  /* angle in rad, weighted */
 +            erg->angle_v  += alpha * weight;
 +            erg->weight_v += weight;
 +        }
 +
 +        PRINT_FORCE_J
 +
 +    } /* end of loop over local rotation group atoms */
 +    erg->V = 0.5*rotg->k*V;
 +}
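 +
 +/* Sketch (not from the original file; the helper name sketch_calc_qj is
 + * hypothetical): the normalized direction qj used above, built in isolation
 + * with the same rvec helpers (mvmul, cprod, unitv) this file already uses. */
 +static void sketch_calc_qj(matrix rotmat, const rvec v, const rvec yj0_yc0, rvec qj)
 +{
 +    rvec tmp;
 +
 +    mvmul(rotmat, yj0_yc0, tmp); /* tmp = Omega.(yj0-yc0)     */
 +    cprod(v, tmp, qj);           /* qj  = v x Omega.(yj0-yc0) */
 +    unitv(qj, qj);               /* qj  = qj / |qj|           */
 +}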
 +
 +
 +/* Precalculate the inner sum for the radial motion 2 forces */
 +static void radial_motion2_precalc_inner_sum(t_rotgrp  *rotg, rvec innersumvec)
 +{
 +    int             i;
 +    gmx_enfrotgrp_t erg;       /* Pointer to enforced rotation group data */
 +    rvec            xi_xc;     /* xi - xc */
 +    rvec            tmpvec, tmpvec2;
 +    real            fac, fac2;
 +    rvec            ri, si;
 +    real            siri;
 +    rvec            v_xi_xc;   /* v x (xi - xc) */
 +    real            psii, psiistar;
 +    real            wi;        /* Mass-weighting of the positions */
 +    real            N_M;       /* N/M */
 +    rvec            sumvec;
 +
 +    erg = rotg->enfrotgrp;
 +    N_M = rotg->nat * erg->invmass;
 +
 +    /* Loop over the collective set of positions */
 +    clear_rvec(sumvec);
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        /* Mass-weighting */
 +        wi = N_M*erg->mc[i];
 +
 +        rvec_sub(erg->xc[i], erg->xc_center, xi_xc); /* xi_xc = xi-xc         */
 +
 +        /* Calculate ri. Note that xc_ref_center has already been subtracted from
 +         * x_ref in init_rot_group.*/
 +        mvmul(erg->rotmat, rotg->x_ref[i], ri);      /* ri  = Omega.(yi0-yc0) */
 +
 +        cprod(rotg->vec, xi_xc, v_xi_xc);            /* v_xi_xc = v x (xi-xc) */
 +
 +        fac = norm2(v_xi_xc);
-         /*                      1           */
++                                          /*                      1           */
 +        psiistar = 1.0/(fac + rotg->eps); /* psiistar = --------------------- */
 +                                          /*            |v x (xi-xc)|^2 + eps */
 +
 +        psii = gmx_invsqrt(fac);          /*                 1                */
 +                                          /*  psii    = -------------         */
 +                                          /*            |v x (xi-xc)|         */
 +
 +        svmul(psii, v_xi_xc, si);         /*  si = psii * (v x (xi-xc) )     */
 +
 +        fac  = iprod(v_xi_xc, ri);        /* fac = (v x (xi-xc)).ri */
 +        fac2 = fac*fac;
 +
 +        siri = iprod(si, ri);                       /* siri = si.ri           */
 +
 +        svmul(psiistar/psii, ri, tmpvec);
 +        svmul(psiistar*psiistar/(psii*psii*psii) * siri, si, tmpvec2);
 +        rvec_dec(tmpvec, tmpvec2);
 +        cprod(tmpvec, rotg->vec, tmpvec2);
 +
 +        svmul(wi*siri, tmpvec2, tmpvec);
 +
 +        rvec_inc(sumvec, tmpvec);
 +    }
 +    svmul(rotg->k*erg->invmass, sumvec, innersumvec);
 +}
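 +
 +/* Worked summary (editorial sketch; restates the loop above): with
 + * si = (v x (xi-xc)) / |v x (xi-xc)|, psii = 1/|v x (xi-xc)| and
 + * psiistar = 1/(|v x (xi-xc)|^2 + eps), the accumulated result is
 + *
 + *    innersumvec = (k/M) * Sum_i w_i (si.ri)
 + *                  * [ (psiistar/psii) ri - (psiistar^2/psii^3)(si.ri) si ] x v,
 + *
 + * i.e. the j-independent part of the pivot-free radial-motion-2 force. */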
 +
 +
 +/* Calculate the radial motion 2 potential and forces */
 +static void do_radial_motion2(
 +        t_rotgrp       *rotg,         /* The rotation group                         */
 +        rvec            x[],          /* The positions                              */
 +        matrix          box,          /* The simulation box                         */
 +        double          t,            /* Time in picoseconds                        */
 +        gmx_large_int_t step,         /* The time step                              */
 +        gmx_bool        bOutstepRot,  /* Output to main rotation output file        */
 +        gmx_bool        bOutstepSlab) /* Output per-slab data                       */
 +{
 +    int             ii, iigrp, ifit, j;
 +    rvec            xj;        /* Position */
 +    real            alpha;     /* a single angle between an actual and a reference position */
 +    real            weight;    /* single weight for a single angle */
 +    gmx_enfrotgrp_t erg;       /* Pointer to enforced rotation group data */
 +    rvec            xj_u;      /* xj - u */
 +    rvec            yj0_yc0;   /* yj0 - yc0 */
 +    rvec            tmpvec, tmpvec2;
 +    real            fac, fit_fac, fac2, Vpart = 0.0;
 +    rvec            rj, fit_rj, sj;
 +    real            sjrj;
 +    rvec            v_xj_u;    /* v x (xj - u) */
 +    real            psij, psijstar;
 +    real            mj, wj;    /* For mass-weighting of the positions */
 +    real            N_M;       /* N/M */
 +    gmx_bool        bPF;
 +    rvec            innersumvec;
 +    gmx_bool        bCalcPotFit;
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    bPF         = rotg->eType == erotgRM2PF;
 +    bCalcPotFit = (bOutstepRot || bOutstepSlab) && (erotgFitPOT == rotg->eFittype);
 +
 +
 +    clear_rvec(yj0_yc0); /* Make the compiler happy */
 +
 +    clear_rvec(innersumvec);
 +    if (bPF)
 +    {
 +        /* For the pivot-free variant we have to use the current center of
 +         * mass of the rotation group instead of the pivot u */
 +        get_center(erg->xc, erg->mc, rotg->nat, erg->xc_center);
 +
 +        /* Also, we precalculate the second term of the forces that is identical
 +         * (up to the weight factor mj) for all forces */
 +        radial_motion2_precalc_inner_sum(rotg, innersumvec);
 +    }
 +
 +    N_M = rotg->nat * erg->invmass;
 +
 +    /* Each process calculates the forces on its local atoms */
 +    for (j = 0; j < erg->nat_loc; j++)
 +    {
 +        if (bPF)
 +        {
 +            /* Local index of a rotation group atom  */
 +            ii = erg->ind_loc[j];
 +            /* Position of this atom in the collective array */
 +            iigrp = erg->xc_ref_ind[j];
 +            /* Mass-weighting */
 +            mj = erg->mc[iigrp];
 +
 +            /* Current position of this atom: x[ii] */
 +            copy_rvec(x[ii], xj);
 +
 +            /* Shift this atom such that it is near its reference */
 +            shift_single_coord(box, xj, erg->xc_shifts[iigrp]);
 +
 +            /* The (unrotated) reference position is yj0. yc0 has already
 +             * been subtracted in init_rot_group */
 +            copy_rvec(rotg->x_ref[iigrp], yj0_yc0);   /* yj0_yc0 = yj0 - yc0  */
 +
 +            /* Calculate Omega.(yj0-yc0) */
 +            mvmul(erg->rotmat, yj0_yc0, rj);         /* rj = Omega.(yj0-yc0)  */
 +        }
 +        else
 +        {
 +            mj = erg->m_loc[j];
 +            copy_rvec(erg->x_loc_pbc[j], xj);
 +            copy_rvec(erg->xr_loc[j], rj);           /* rj = Omega.(yj0-u)    */
 +        }
 +        /* Mass-weighting */
 +        wj = N_M*mj;
 +
 +        /* Calculate (xj-u) resp. (xj-xc) */
 +        rvec_sub(xj, erg->xc_center, xj_u);          /* xj_u = xj-u           */
 +
 +        cprod(rotg->vec, xj_u, v_xj_u);              /* v_xj_u = v x (xj-u)   */
 +
 +        fac = norm2(v_xj_u);
++                                          /*                      1           */
 +        psijstar = 1.0/(fac + rotg->eps); /* psijstar = -------------------- */
 +                                          /*            |v x (xj-u)|^2 + eps  */
 +
 +        psij = gmx_invsqrt(fac);          /*                 1                */
 +                                          /*  psij    = ------------          */
 +                                          /*            |v x (xj-u)|          */
 +
 +        svmul(psij, v_xj_u, sj);          /*  sj = psij * (v x (xj-u) )       */
 +
 +        fac  = iprod(v_xj_u, rj);         /* fac = (v x (xj-u)).rj */
 +        fac2 = fac*fac;
 +
 +        sjrj = iprod(sj, rj);                        /* sjrj = sj.rj          */
 +
 +        svmul(psijstar/psij, rj, tmpvec);
 +        svmul(psijstar*psijstar/(psij*psij*psij) * sjrj, sj, tmpvec2);
 +        rvec_dec(tmpvec, tmpvec2);
 +        cprod(tmpvec, rotg->vec, tmpvec2);
 +
 +        /* Store the additional force so that it can be added to the force
 +         * array after the normal forces have been evaluated */
 +        svmul(-rotg->k*wj*sjrj, tmpvec2, tmpvec);
 +        svmul(mj, innersumvec, tmpvec2);  /* This is != 0 only for the pivot-free variant */
 +
 +        rvec_add(tmpvec2, tmpvec, erg->f_rot_loc[j]);
 +        Vpart += wj*psijstar*fac2;
 +
 +        /* If requested, also calculate the potential for a set of angles
 +         * near the current reference angle */
 +        if (bCalcPotFit)
 +        {
 +            for (ifit = 0; ifit < rotg->PotAngle_nstep; ifit++)
 +            {
 +                if (bPF)
 +                {
 +                    mvmul(erg->PotAngleFit->rotmat[ifit], yj0_yc0, fit_rj); /* fit_rj = Omega.(yj0-yc0) */
 +                }
 +                else
 +                {
 +                    /* Position of this atom in the collective array */
 +                    iigrp = erg->xc_ref_ind[j];
 +                    /* Rotate with the alternative angle. Like rotate_local_reference(),
 +                     * just for a single local atom */
 +                    mvmul(erg->PotAngleFit->rotmat[ifit], rotg->x_ref[iigrp], fit_rj); /* fit_rj = Omega*(yj0-u) */
 +                }
 +                fit_fac = iprod(v_xj_u, fit_rj);                                       /* fac = (v x (xj-u)).fit_rj */
 +                /* Add to the rotation potential for this angle: */
 +                erg->PotAngleFit->V[ifit] += 0.5*rotg->k*wj*psijstar*fit_fac*fit_fac;
 +            }
 +        }
 +
 +        if (bOutstepRot)
 +        {
 +            /* Add to the torque of this rotation group */
 +            erg->torque_v += torque(rotg->vec, erg->f_rot_loc[j], xj, erg->xc_center);
 +
 +            /* Calculate the angle between reference and actual rotation group atom. */
 +            angle(rotg, xj_u, rj, &alpha, &weight);  /* angle in rad, weighted */
 +            erg->angle_v  += alpha * weight;
 +            erg->weight_v += weight;
 +        }
 +
 +        PRINT_FORCE_J
 +
 +    } /* end of loop over local rotation group atoms */
 +    erg->V = 0.5*rotg->k*Vpart;
 +}
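 +
 +/* Worked summary (editorial sketch; restates the accumulation above): with
 + * sj, psij and psijstar defined as in the loop, the potential is
 + *
 + *    V^rm2 = (k/2) * Sum_j w_j psijstar [ (v x (xj-u)).rj ]^2,
 + *
 + * matching Vpart += wj*psijstar*fac2 and erg->V = 0.5*rotg->k*Vpart. */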
 +
 +
 +/* Determine the smallest and largest position vector (with respect to the
 + * rotation vector) for the reference group */
 +static void get_firstlast_atom_ref(
 +        t_rotgrp  *rotg,
 +        int       *firstindex,
 +        int       *lastindex)
 +{
 +    gmx_enfrotgrp_t erg;              /* Pointer to enforced rotation group data */
 +    int             i;
 +    real            xcproj;           /* The projection of a reference position on the
 +                                         rotation vector */
 +    real            minproj, maxproj; /* Smallest and largest projection on v */
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Start with some value */
 +    minproj = iprod(rotg->x_ref[0], rotg->vec);
 +    maxproj = minproj;
 +
 +    /* This is just to ensure that it still works if all the atoms of the
 +     * reference structure are situated in a plane perpendicular to the rotation
 +     * vector */
 +    *firstindex = 0;
 +    *lastindex  = rotg->nat-1;
 +
 +    /* Loop over all atoms of the reference group,
 +     * project them on the rotation vector to find the extremes */
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        xcproj = iprod(rotg->x_ref[i], rotg->vec);
 +        if (xcproj < minproj)
 +        {
 +            minproj     = xcproj;
 +            *firstindex = i;
 +        }
 +        if (xcproj > maxproj)
 +        {
 +            maxproj    = xcproj;
 +            *lastindex = i;
 +        }
 +    }
 +}
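 +
 +/* Note (editorial sketch, hypothetical example): the extremes are plain
 + * scalar projections xcproj = x_ref[i] . vec; for vec = (0,0,1) this returns
 + * the atoms with the smallest and largest z coordinate. The presets
 + * firstindex = 0 and lastindex = nat-1 only matter in the degenerate case
 + * where every projection is identical. */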
 +
 +
 +/* Allocate memory for the slabs */
 +static void allocate_slabs(
 +        t_rotgrp  *rotg,
 +        FILE      *fplog,
 +        int        g,
 +        gmx_bool   bVerbose)
 +{
 +    gmx_enfrotgrp_t erg;      /* Pointer to enforced rotation group data */
 +    int             i, nslabs;
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* More slabs than are defined for the reference are never needed */
 +    nslabs = erg->slab_last_ref - erg->slab_first_ref + 1;
 +
 +    /* Remember how many we allocated */
 +    erg->nslabs_alloc = nslabs;
 +
 +    if ( (NULL != fplog) && bVerbose)
 +    {
 +        fprintf(fplog, "%s allocating memory to store data for %d slabs (rotation group %d).\n",
 +                RotStr, nslabs, g);
 +    }
 +    snew(erg->slab_center, nslabs);
 +    snew(erg->slab_center_ref, nslabs);
 +    snew(erg->slab_weights, nslabs);
 +    snew(erg->slab_torque_v, nslabs);
 +    snew(erg->slab_data, nslabs);
 +    snew(erg->gn_atom, nslabs);
 +    snew(erg->gn_slabind, nslabs);
 +    snew(erg->slab_innersumvec, nslabs);
 +    for (i = 0; i < nslabs; i++)
 +    {
 +        snew(erg->slab_data[i].x, rotg->nat);
 +        snew(erg->slab_data[i].ref, rotg->nat);
 +        snew(erg->slab_data[i].weight, rotg->nat);
 +    }
 +    snew(erg->xc_ref_sorted, rotg->nat);
 +    snew(erg->xc_sortind, rotg->nat);
 +    snew(erg->firstatom, nslabs);
 +    snew(erg->lastatom, nslabs);
 +}
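 +
 +/* Note (editorial sketch): memory scales as O(nslabs) for the per-slab
 + * arrays plus O(nslabs * nat) for the slab_data x/ref/weight arrays;
 + * nslabs is fixed for the whole run since it derives from the reference. */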
 +
 +
 +/* From the extreme coordinates of the reference group, determine the first
 + * and last slab of the reference. We can never have more slabs in the real
 + * simulation than calculated here for the reference.
 + */
 +static void get_firstlast_slab_ref(t_rotgrp *rotg, real mc[], int ref_firstindex, int ref_lastindex)
 +{
 +    gmx_enfrotgrp_t erg;      /* Pointer to enforced rotation group data */
 +    int             first, last, firststart;
 +    rvec            dummy;
 +
 +
 +    erg        = rotg->enfrotgrp;
 +    first      = get_first_slab(rotg, erg->max_beta, rotg->x_ref[ref_firstindex]);
 +    last       = get_last_slab( rotg, erg->max_beta, rotg->x_ref[ref_lastindex ]);
 +    firststart = first;
 +
 +    while (get_slab_weight(first, rotg, rotg->x_ref, mc, &dummy) > WEIGHT_MIN)
 +    {
 +        first--;
 +    }
 +    erg->slab_first_ref = first+1;
 +    while (get_slab_weight(last, rotg, rotg->x_ref, mc, &dummy) > WEIGHT_MIN)
 +    {
 +        last++;
 +    }
 +    erg->slab_last_ref  = last-1;
 +
 +    erg->slab_buffer = firststart - erg->slab_first_ref;
 +}
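 +
 +/* Illustration (hypothetical numbers, not from the source): if the extreme
 + * reference atoms fall into slabs first = 3 and last = 7, but their
 + * Gaussians still carry weight > WEIGHT_MIN in slabs 2 and 8, the loops
 + * above widen the range to slab_first_ref = 2 and slab_last_ref = 8;
 + * slab_buffer then records how far the lower end grew (3 - 2 = 1). */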
 +
 +
 +/* Special version of copy_rvec:
 + * During the copy procedure of xcurr to b, the correct PBC image is chosen
 + * such that the copied vector ends up near its reference position xref */
 +static inline void copy_correct_pbc_image(
 +        const rvec  xcurr,  /* copy vector xcurr ...                */
 +        rvec        b,      /* ... to b ...                         */
 +        const rvec  xref,   /* choosing the PBC image such that b ends up near xref */
 +        matrix      box,
 +        int         npbcdim)
 +{
 +    rvec  dx;
 +    int   d, m;
 +    ivec  shift;
 +
 +
 +    /* Shortest PBC distance between the atom and its reference */
 +    rvec_sub(xcurr, xref, dx);
 +
 +    /* Determine the shift for this atom */
 +    clear_ivec(shift);
 +    for (m = npbcdim-1; m >= 0; m--)
 +    {
 +        while (dx[m] < -0.5*box[m][m])
 +        {
 +            for (d = 0; d < DIM; d++)
 +            {
 +                dx[d] += box[m][d];
 +            }
 +            shift[m]++;
 +        }
 +        while (dx[m] >= 0.5*box[m][m])
 +        {
 +            for (d = 0; d < DIM; d++)
 +            {
 +                dx[d] -= box[m][d];
 +            }
 +            shift[m]--;
 +        }
 +    }
 +
 +    /* Apply the shift to the position */
 +    copy_rvec(xcurr, b);
 +    shift_single_coord(box, b, shift);
 +}
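 +
 +/* Usage sketch (editorial; assumes the caller passes a 3x3x3 nm cubic box,
 + * and the function name sketch_pbc_image_usage is hypothetical): */
 +static void sketch_pbc_image_usage(matrix box)
 +{
 +    rvec xref  = {0.1, 0.0, 0.0}; /* reference position               */
 +    rvec xcurr = {3.0, 0.0, 0.0}; /* current position, one image away */
 +    rvec b;
 +
 +    copy_correct_pbc_image(xcurr, b, xref, box, 3);
 +    /* b is now (0.0, 0.0, 0.0), the image of xcurr nearest to xref */
 +}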
 +
 +
 +static void init_rot_group(FILE *fplog, t_commrec *cr, int g, t_rotgrp *rotg,
 +                           rvec *x, gmx_mtop_t *mtop, gmx_bool bVerbose, FILE *out_slabs, matrix box,
 +                           gmx_bool bOutputCenters)
 +{
 +    int                   i, ii;
 +    rvec                  coord, *xdum;
 +    gmx_bool              bFlex, bColl;
 +    t_atom               *atom;
 +    gmx_enfrotgrp_t       erg; /* Pointer to enforced rotation group data */
 +    int                   ref_firstindex, ref_lastindex;
 +    gmx_mtop_atomlookup_t alook = NULL;
 +    real                  mass, totalmass;
 +    real                  start = 0.0;
 +
 +
 +    /* Do we have a flexible axis? */
 +    bFlex = ISFLEX(rotg);
 +    /* Do we use a global set of coordinates? */
 +    bColl = ISCOLL(rotg);
 +
 +    erg = rotg->enfrotgrp;
 +
 +    /* Allocate space for collective coordinates if needed */
 +    if (bColl)
 +    {
 +        snew(erg->xc, rotg->nat);
 +        snew(erg->xc_shifts, rotg->nat);
 +        snew(erg->xc_eshifts, rotg->nat);
 +
 +        /* Save the original (whole) set of positions such that later the
 +         * molecule can always be made whole again */
 +        snew(erg->xc_old, rotg->nat);
 +        if (MASTER(cr))
 +        {
 +            for (i = 0; i < rotg->nat; i++)
 +            {
 +                ii = rotg->ind[i];
 +                copy_correct_pbc_image(x[ii], erg->xc_old[i], rotg->x_ref[i], box, 3);
 +            }
 +        }
 +#ifdef GMX_MPI
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(rotg->nat*sizeof(erg->xc_old[0]), erg->xc_old, cr);
 +        }
 +#endif
 +
 +        if (rotg->eFittype == erotgFitNORM)
 +        {
 +            snew(erg->xc_ref_length, rotg->nat); /* in case fit type NORM is chosen */
 +            snew(erg->xc_norm, rotg->nat);
 +        }
 +    }
 +    else
 +    {
 +        snew(erg->xr_loc, rotg->nat);
 +        snew(erg->x_loc_pbc, rotg->nat);
 +    }
 +
 +    snew(erg->f_rot_loc, rotg->nat);
 +    snew(erg->xc_ref_ind, rotg->nat);
 +
 +    /* Make space for the calculation of the potential at other angles (used
 +     * for fitting only) */
 +    if (erotgFitPOT == rotg->eFittype)
 +    {
 +        snew(erg->PotAngleFit, 1);
 +        snew(erg->PotAngleFit->degangle, rotg->PotAngle_nstep);
 +        snew(erg->PotAngleFit->V, rotg->PotAngle_nstep);
 +        snew(erg->PotAngleFit->rotmat, rotg->PotAngle_nstep);
 +
 +        /* Get the set of angles around the reference angle */
 +        start = -0.5 * (rotg->PotAngle_nstep - 1)*rotg->PotAngle_step;
 +        for (i = 0; i < rotg->PotAngle_nstep; i++)
 +        {
 +            erg->PotAngleFit->degangle[i] = start + i*rotg->PotAngle_step;
 +        }
 +    }
 +    else
 +    {
 +        erg->PotAngleFit = NULL;
 +    }
 +
 +    /* xc_ref_ind needs to be set to identity in the serial case */
 +    if (!PAR(cr))
 +    {
 +        for (i = 0; i < rotg->nat; i++)
 +        {
 +            erg->xc_ref_ind[i] = i;
 +        }
 +    }
 +
 +    /* Copy the masses so that the center can be determined. For all types of
 +     * enforced rotation, we store the masses in the erg->mc array. */
 +    if (rotg->bMassW)
 +    {
 +        alook = gmx_mtop_atomlookup_init(mtop);
 +    }
 +    snew(erg->mc, rotg->nat);
 +    if (bFlex)
 +    {
 +        snew(erg->mc_sorted, rotg->nat);
 +    }
 +    if (!bColl)
 +    {
 +        snew(erg->m_loc, rotg->nat);
 +    }
 +    totalmass = 0.0;
 +    for (i = 0; i < rotg->nat; i++)
 +    {
 +        if (rotg->bMassW)
 +        {
 +            gmx_mtop_atomnr_to_atom(alook, rotg->ind[i], &atom);
 +            mass = atom->m;
 +        }
 +        else
 +        {
 +            mass = 1.0;
 +        }
 +        erg->mc[i] = mass;
 +        totalmass += mass;
 +    }
 +    erg->invmass = 1.0/totalmass;
 +
 +    if (rotg->bMassW)
 +    {
 +        gmx_mtop_atomlookup_destroy(alook);
 +    }
 +
 +    /* Set xc_ref_center for any rotation potential */
 +    if ((rotg->eType == erotgISO) || (rotg->eType == erotgPM) || (rotg->eType == erotgRM) || (rotg->eType == erotgRM2))
 +    {
 +        /* Set the pivot point for the fixed, stationary-axis potentials. This
 +         * won't change during the simulation */
 +        copy_rvec(rotg->pivot, erg->xc_ref_center);
 +        copy_rvec(rotg->pivot, erg->xc_center    );
 +    }
 +    else
 +    {
 +        /* Center of the reference positions */
 +        get_center(rotg->x_ref, erg->mc, rotg->nat, erg->xc_ref_center);
 +
 +        /* Center of the actual positions */
 +        if (MASTER(cr))
 +        {
 +            snew(xdum, rotg->nat);
 +            for (i = 0; i < rotg->nat; i++)
 +            {
 +                ii = rotg->ind[i];
 +                copy_rvec(x[ii], xdum[i]);
 +            }
 +            get_center(xdum, erg->mc, rotg->nat, erg->xc_center);
 +            sfree(xdum);
 +        }
 +#ifdef GMX_MPI
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(erg->xc_center), erg->xc_center, cr);
 +        }
 +#endif
 +    }
 +
 +    if ( (rotg->eType != erotgFLEX) && (rotg->eType != erotgFLEX2) )
 +    {
 +        /* Put the reference positions into origin: */
 +        for (i = 0; i < rotg->nat; i++)
 +        {
 +            rvec_dec(rotg->x_ref[i], erg->xc_ref_center);
 +        }
 +    }
 +
 +    /* Enforced rotation with flexible axis */
 +    if (bFlex)
 +    {
 +        /* Calculate maximum beta value from minimum gaussian (performance opt.) */
 +        erg->max_beta = calc_beta_max(rotg->min_gaussian, rotg->slab_dist);
 +
 +        /* Determine the smallest and largest coordinate with respect to the rotation vector */
 +        get_firstlast_atom_ref(rotg, &ref_firstindex, &ref_lastindex);
 +
 +        /* From the extreme coordinates of the reference group, determine the first
 +         * and last slab of the reference. */
 +        get_firstlast_slab_ref(rotg, erg->mc, ref_firstindex, ref_lastindex);
 +
 +        /* Allocate memory for the slabs */
 +        allocate_slabs(rotg, fplog, g, bVerbose);
 +
 +        /* Flexible rotation: determine the reference centers for the rest of the simulation */
 +        erg->slab_first = erg->slab_first_ref;
 +        erg->slab_last  = erg->slab_last_ref;
 +        get_slab_centers(rotg, rotg->x_ref, erg->mc, g, -1, out_slabs, bOutputCenters, TRUE);
 +
 +        /* Length of each x_rotref vector from center (needed if fit routine NORM is chosen): */
 +        if (rotg->eFittype == erotgFitNORM)
 +        {
 +            for (i = 0; i < rotg->nat; i++)
 +            {
 +                rvec_sub(rotg->x_ref[i], erg->xc_ref_center, coord);
 +                erg->xc_ref_length[i] = norm(coord);
 +            }
 +        }
 +    }
 +}
 +
 +
 +extern void dd_make_local_rotation_groups(gmx_domdec_t *dd, t_rot *rot)
 +{
 +    gmx_ga2la_t     ga2la;
 +    int             g;
 +    t_rotgrp       *rotg;
 +    gmx_enfrotgrp_t erg;      /* Pointer to enforced rotation group data */
 +
 +    ga2la = dd->ga2la;
 +
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg = &rot->grp[g];
 +        erg  = rotg->enfrotgrp;
 +
 +
 +        dd_make_local_group_indices(ga2la, rotg->nat, rotg->ind,
 +                                    &erg->nat_loc, &erg->ind_loc, &erg->nalloc_loc, erg->xc_ref_ind);
 +    }
 +}
 +
 +
 +/* Calculate the size of the MPI buffer needed in reduce_output() */
 +static int calc_mpi_bufsize(t_rot *rot)
 +{
 +    int             g;
 +    int             count_group, count_total;
 +    t_rotgrp       *rotg;
 +    gmx_enfrotgrp_t erg;      /* Pointer to enforced rotation group data */
 +
 +
 +    count_total = 0;
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg = &rot->grp[g];
 +        erg  = rotg->enfrotgrp;
 +
 +        /* Count the items that are transferred for this group: */
 +        count_group = 4; /* V, torque, angle, weight */
 +
 +        /* Add the maximum number of slabs for flexible groups */
 +        if (ISFLEX(rotg))
 +        {
 +            count_group += erg->slab_last_ref - erg->slab_first_ref + 1;
 +        }
 +
 +        /* Add space for the potentials at different angles: */
 +        if (erotgFitPOT == rotg->eFittype)
 +        {
 +            count_group += rotg->PotAngle_nstep;
 +        }
 +
 +        /* Add to the total number: */
 +        count_total += count_group;
 +    }
 +
 +    return count_total;
 +}
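 +
 +/* Worked example (hypothetical group setup, editorial): one flexible group
 + * whose reference spans 11 slabs and which uses fit type POT with
 + * PotAngle_nstep = 21 contributes 4 + 11 + 21 = 36 items; init_rot() later
 + * adds a safety margin of 100 on top of the summed total. */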
 +
 +
 +extern void init_rot(FILE *fplog, t_inputrec *ir, int nfile, const t_filenm fnm[],
 +                     t_commrec *cr, rvec *x, matrix box, gmx_mtop_t *mtop, const output_env_t oenv,
 +                     gmx_bool bVerbose, unsigned long Flags)
 +{
 +    t_rot          *rot;
 +    t_rotgrp       *rotg;
 +    int             g;
 +    int             nat_max = 0;  /* Size of biggest rotation group */
 +    gmx_enfrot_t    er;           /* Pointer to the enforced rotation buffer variables */
 +    gmx_enfrotgrp_t erg;          /* Pointer to enforced rotation group data */
 +    rvec           *x_pbc = NULL; /* Space for the pbc-correct atom positions */
 +
 +
 +    if ( (PAR(cr)) && !DOMAINDECOMP(cr) )
 +    {
 +        gmx_fatal(FARGS, "Enforced rotation is only implemented for domain decomposition!");
 +    }
 +
 +    if (MASTER(cr) && bVerbose)
 +    {
 +        fprintf(stdout, "%s Initializing ...\n", RotStr);
 +    }
 +
 +    rot = ir->rot;
 +    snew(rot->enfrot, 1);
 +    er        = rot->enfrot;
 +    er->Flags = Flags;
 +
 +    /* When appending, skip first output to avoid duplicate entries in the data files */
 +    if (er->Flags & MD_APPENDFILES)
 +    {
 +        er->bOut = FALSE;
 +    }
 +    else
 +    {
 +        er->bOut = TRUE;
 +    }
 +
 +    if (MASTER(cr) && er->bOut)
 +    {
 +        please_cite(fplog, "Kutzner2011");
 +    }
 +
 +    /* Output every step for reruns */
 +    if (er->Flags & MD_RERUN)
 +    {
 +        if (NULL != fplog)
 +        {
 +            fprintf(fplog, "%s rerun - will write rotation output every available step.\n", RotStr);
 +        }
 +        rot->nstrout = 1;
 +        rot->nstsout = 1;
 +    }
 +
 +    er->out_slabs = NULL;
 +    if (MASTER(cr) && HaveFlexibleGroups(rot) )
 +    {
 +        er->out_slabs = open_slab_out(opt2fn("-rs", nfile, fnm), rot, oenv);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        /* Remove pbc, make molecule whole.
 +         * When ir->bContinuation=TRUE this has already been done; repeating it is harmless. */
 +        snew(x_pbc, mtop->natoms);
 +        m_rveccopy(mtop->natoms, x, x_pbc);
 +        do_pbc_first_mtop(NULL, ir->ePBC, box, mtop, x_pbc);
 +        /* All molecules will be whole now, but not necessarily in the home box.
 +         * Additionally, if a rotation group consists of more than one molecule
 +         * (e.g. two strands of DNA), each one of them can end up in a different
 +         * periodic box. This is taken care of in init_rot_group.  */
 +    }
 +
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg = &rot->grp[g];
 +
 +        if (NULL != fplog)
 +        {
 +            fprintf(fplog, "%s group %d type '%s'\n", RotStr, g, erotg_names[rotg->eType]);
 +        }
 +
 +        if (rotg->nat > 0)
 +        {
 +            /* Allocate space for the rotation group's data: */
 +            snew(rotg->enfrotgrp, 1);
 +            erg  = rotg->enfrotgrp;
 +
 +            nat_max = max(nat_max, rotg->nat);
 +
 +            if (PAR(cr))
 +            {
 +                erg->nat_loc    = 0;
 +                erg->nalloc_loc = 0;
 +                erg->ind_loc    = NULL;
 +            }
 +            else
 +            {
 +                erg->nat_loc = rotg->nat;
 +                erg->ind_loc = rotg->ind;
 +            }
 +            init_rot_group(fplog, cr, g, rotg, x_pbc, mtop, bVerbose, er->out_slabs, box,
 +                           !(er->Flags & MD_APPENDFILES) ); /* Do not output the reference centers
 +                                                             * again if we are appending */
 +        }
 +    }
 +
 +    /* Allocate space for enforced rotation buffer variables */
 +    er->bufsize = nat_max;
 +    snew(er->data, nat_max);
 +    snew(er->xbuf, nat_max);
 +    snew(er->mbuf, nat_max);
 +
 +    /* Buffers for MPI reducing torques, angles, weights (for each group), and V */
 +    if (PAR(cr))
 +    {
 +        er->mpi_bufsize = calc_mpi_bufsize(rot) + 100; /* larger to catch errors */
 +        snew(er->mpi_inbuf, er->mpi_bufsize);
 +        snew(er->mpi_outbuf, er->mpi_bufsize);
 +    }
 +    else
 +    {
 +        er->mpi_bufsize = 0;
 +        er->mpi_inbuf   = NULL;
 +        er->mpi_outbuf  = NULL;
 +    }
 +
 +    /* Only do I/O on the MASTER */
 +    er->out_angles  = NULL;
 +    er->out_rot     = NULL;
 +    er->out_torque  = NULL;
 +    if (MASTER(cr))
 +    {
 +        er->out_rot = open_rot_out(opt2fn("-ro", nfile, fnm), rot, oenv);
 +
 +        if (rot->nstsout > 0)
 +        {
 +            if (HaveFlexibleGroups(rot) || HavePotFitGroups(rot) )
 +            {
 +                er->out_angles  = open_angles_out(opt2fn("-ra", nfile, fnm), rot, oenv);
 +            }
 +            if (HaveFlexibleGroups(rot) )
 +            {
 +                er->out_torque  = open_torque_out(opt2fn("-rt", nfile, fnm), rot, oenv);
 +            }
 +        }
 +
 +        sfree(x_pbc);
 +    }
 +}
 +
 +
 +extern void finish_rot(FILE *fplog, t_rot *rot)
 +{
 +    gmx_enfrot_t er;        /* Pointer to the enforced rotation buffer variables */
 +
 +
 +    er = rot->enfrot;
 +    if (er->out_rot)
 +    {
 +        gmx_fio_fclose(er->out_rot);
 +    }
 +    if (er->out_slabs)
 +    {
 +        gmx_fio_fclose(er->out_slabs);
 +    }
 +    if (er->out_angles)
 +    {
 +        gmx_fio_fclose(er->out_angles);
 +    }
 +    if (er->out_torque)
 +    {
 +        gmx_fio_fclose(er->out_torque);
 +    }
 +}
 +
 +
 +/* Rotate the local reference positions and store them in
 + * erg->xr_loc[0...(nat_loc-1)]
 + *
 + * Note that we already subtracted u or y_c from the reference positions
 + * in init_rot_group().
 + */
 +static void rotate_local_reference(t_rotgrp *rotg)
 +{
 +    gmx_enfrotgrp_t erg;
 +    int             i, ii;
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    for (i = 0; i < erg->nat_loc; i++)
 +    {
 +        /* Index of this rotation group atom with respect to the whole rotation group */
 +        ii = erg->xc_ref_ind[i];
 +        /* Rotate */
 +        mvmul(erg->rotmat, rotg->x_ref[ii], erg->xr_loc[i]);
 +    }
 +}
 +
 +
 +/* Select the PBC representation for each local x position and store that
 + * for later usage. We assume the right PBC image of an x is the one nearest to
 + * its rotated reference */
 +static void choose_pbc_image(rvec x[], t_rotgrp *rotg, matrix box, int npbcdim)
 +{
 +    int             i, ii;
 +    gmx_enfrotgrp_t erg;       /* Pointer to enforced rotation group data */
 +    rvec            xref;
 +
 +
 +    erg = rotg->enfrotgrp;
 +
 +    for (i = 0; i < erg->nat_loc; i++)
 +    {
 +        /* Index of a rotation group atom  */
 +        ii = erg->ind_loc[i];
 +
 +        /* Get the reference position. The pivot was already
 +         * subtracted in init_rot_group() from the reference positions. Also,
 +         * the reference positions have already been rotated in
 +         * rotate_local_reference() */
 +        copy_rvec(erg->xr_loc[i], xref);
 +
 +        copy_correct_pbc_image(x[ii], erg->x_loc_pbc[i], xref, box, npbcdim);
 +    }
 +}
 +
 +
 +extern void do_rotation(
 +        t_commrec      *cr,
 +        t_inputrec     *ir,
 +        matrix          box,
 +        rvec            x[],
 +        real            t,
 +        gmx_large_int_t step,
 +        gmx_wallcycle_t wcycle,
 +        gmx_bool        bNS)
 +{
 +    int             g, i, ii;
 +    t_rot          *rot;
 +    t_rotgrp       *rotg;
 +    gmx_bool        outstep_slab, outstep_rot;
 +    gmx_bool        bFlex, bColl;
 +    gmx_enfrot_t    er;         /* Pointer to the enforced rotation buffer variables */
 +    gmx_enfrotgrp_t erg;        /* Pointer to enforced rotation group data           */
 +    rvec            transvec;
 +    t_gmx_potfit   *fit = NULL; /* For fit type 'potential' determine the fit
 +                                   angle via the potential minimum            */
 +
 +    /* Enforced rotation cycle counting: */
 +    gmx_cycles_t cycles_comp;   /* Cycles for the enf. rotation computation
 +                                   only, does not count communication. This
 +                                   counter is used for load-balancing         */
 +
 +#ifdef TAKETIME
 +    double t0;
 +#endif
 +
 +    rot = ir->rot;
 +    er  = rot->enfrot;
 +
 +    /* When to output in main rotation output file */
 +    outstep_rot  = do_per_step(step, rot->nstrout) && er->bOut;
 +    /* When to output per-slab data */
 +    outstep_slab = do_per_step(step, rot->nstsout) && er->bOut;
 +
 +    /* Output time into rotation output file */
 +    if (outstep_rot && MASTER(cr))
 +    {
 +        fprintf(er->out_rot, "%12.3e", t);
 +    }
 +
 +    /**************************************************************************/
 +    /* First do ALL the communication! */
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg = &rot->grp[g];
 +        erg  = rotg->enfrotgrp;
 +
 +        /* Do we have a flexible axis? */
 +        bFlex = ISFLEX(rotg);
 +        /* Do we use a collective (global) set of coordinates? */
 +        bColl = ISCOLL(rotg);
 +
 +        /* Calculate the rotation matrix for this angle: */
 +        erg->degangle = rotg->rate * t;
 +        calc_rotmat(rotg->vec, erg->degangle, erg->rotmat);
 +
 +        if (bColl)
 +        {
 +            /* Transfer the rotation group's positions such that every node has
 +             * all of them. Every node contributes its local positions x and stores
 +             * it in the collective erg->xc array. */
 +            communicate_group_positions(cr, erg->xc, erg->xc_shifts, erg->xc_eshifts, bNS,
 +                                        x, rotg->nat, erg->nat_loc, erg->ind_loc, erg->xc_ref_ind, erg->xc_old, box);
 +        }
 +        else
 +        {
 +            /* Fill the local masses array;
 +             * this array changes in DD/neighborsearching steps */
 +            if (bNS)
 +            {
 +                for (i = 0; i < erg->nat_loc; i++)
 +                {
 +                    /* Index of local atom w.r.t. the collective rotation group */
 +                    ii            = erg->xc_ref_ind[i];
 +                    erg->m_loc[i] = erg->mc[ii];
 +                }
 +            }
 +
 +            /* Calculate Omega*(y_i-y_c) for the local positions */
 +            rotate_local_reference(rotg);
 +
 +            /* Choose the nearest PBC images of the group atoms with respect
 +             * to the rotated reference positions */
 +            choose_pbc_image(x, rotg, box, 3);
 +
 +            /* Get the center of the rotation group */
 +            if ( (rotg->eType == erotgISOPF) || (rotg->eType == erotgPMPF) )
 +            {
 +                get_center_comm(cr, erg->x_loc_pbc, erg->m_loc, erg->nat_loc, rotg->nat, erg->xc_center);
 +            }
 +        }
 +
 +    } /* End of loop over rotation groups */
 +
 +    /**************************************************************************/
 +    /* Done communicating; we can start to count cycles for load balancing now ... */
 +    cycles_comp = gmx_cycles_read();
 +
 +
 +#ifdef TAKETIME
 +    t0 = MPI_Wtime();
 +#endif
 +
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        rotg = &rot->grp[g];
 +        erg  = rotg->enfrotgrp;
 +
 +        bFlex = ISFLEX(rotg);
 +        bColl = ISCOLL(rotg);
 +
 +        if (outstep_rot && MASTER(cr))
 +        {
 +            fprintf(er->out_rot, "%12.4f", erg->degangle);
 +        }
 +
 +        /* Calculate angles and rotation matrices for potential fitting: */
 +        if ( (outstep_rot || outstep_slab) && (erotgFitPOT == rotg->eFittype) )
 +        {
 +            fit = erg->PotAngleFit;
 +            for (i = 0; i < rotg->PotAngle_nstep; i++)
 +            {
 +                calc_rotmat(rotg->vec, erg->degangle + fit->degangle[i], fit->rotmat[i]);
 +
 +                /* Clear value from last step */
 +                erg->PotAngleFit->V[i] = 0.0;
 +            }
 +        }
 +
 +        /* Clear values from last time step */
 +        erg->V        = 0.0;
 +        erg->torque_v = 0.0;
 +        erg->angle_v  = 0.0;
 +        erg->weight_v = 0.0;
 +
 +        switch (rotg->eType)
 +        {
 +            case erotgISO:
 +            case erotgISOPF:
 +            case erotgPM:
 +            case erotgPMPF:
 +                do_fixed(rotg, x, box, t, step, outstep_rot, outstep_slab);
 +                break;
 +            case erotgRM:
 +                do_radial_motion(rotg, x, box, t, step, outstep_rot, outstep_slab);
 +                break;
 +            case erotgRMPF:
 +                do_radial_motion_pf(rotg, x, box, t, step, outstep_rot, outstep_slab);
 +                break;
 +            case erotgRM2:
 +            case erotgRM2PF:
 +                do_radial_motion2(rotg, x, box, t, step, outstep_rot, outstep_slab);
 +                break;
 +            case erotgFLEXT:
 +            case erotgFLEX2T:
 +                /* Subtract the center of the rotation group from the collective positions array
 +                 * Also store the center in erg->xc_center since it needs to be subtracted
 +                 * in the low level routines from the local coordinates as well */
 +                get_center(erg->xc, erg->mc, rotg->nat, erg->xc_center);
 +                svmul(-1.0, erg->xc_center, transvec);
 +                translate_x(erg->xc, rotg->nat, transvec);
 +                do_flexible(MASTER(cr), er, rotg, g, x, box, t, step, outstep_rot, outstep_slab);
 +                break;
 +            case erotgFLEX:
 +            case erotgFLEX2:
 +                /* Do NOT subtract the center of mass in the low level routines! */
 +                clear_rvec(erg->xc_center);
 +                do_flexible(MASTER(cr), er, rotg, g, x, box, t, step, outstep_rot, outstep_slab);
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "No such rotation potential.");
 +                break;
 +        }
 +    }
 +
 +#ifdef TAKETIME
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "%s calculation (step %d) took %g seconds.\n", RotStr, step, MPI_Wtime()-t0);
 +    }
 +#endif
 +
 +    /* Stop the enforced rotation cycle counter and add the computation-only
 +     * cycles to the force cycles for load balancing */
 +    cycles_comp  = gmx_cycles_read() - cycles_comp;
 +
 +    if (DOMAINDECOMP(cr) && wcycle)
 +    {
 +        dd_cycles_add(cr->dd, cycles_comp, ddCyclF);
 +    }
 +}
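 +
 +/* Call-sequence sketch (editorial; the mdrun plumbing around it is
 + * simplified and hypothetical, only the three function names are from
 + * this file):
 + *
 + *   init_rot(fplog, ir, nfile, fnm, cr, x, box, mtop, oenv, bVerbose, Flags);
 + *   for each MD step:
 + *       do_rotation(cr, ir, box, x, t, step, wcycle, bNS);
 + *       ...add erg->f_rot_loc[] to the force array...
 + *   finish_rot(fplog, rot);
 + */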