Merge release-4-6 into master
author Roland Schulz <roland@utk.edu>
Sun, 10 Mar 2013 21:25:44 +0000 (17:25 -0400)
committer Roland Schulz <roland@utk.edu>
Wed, 20 Mar 2013 06:09:06 +0000 (02:09 -0400)
Conflicts:
CMakeLists.txt (trivial)
bootstrap (deleted)

Reverted: tests/CMakeLists.txt (GMXLIB - not necessary)

Change-Id: I2f0810ac6343b3629f01169b13c06784ccfd56cf

34 files changed:
1  2 
CMakeLists.txt
src/gromacs/gmxlib/mvdata.c
src/gromacs/gmxlib/nonbonded/nb_kernel_adress_c/make_nb_kernel_adress_c.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/make_nb_kernel_avx_128_fma_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/make_nb_kernel_avx_128_fma_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/make_nb_kernel_avx_256_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/make_nb_kernel_avx_256_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_avx_256_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_c/make_nb_kernel_c.py
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_c.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/make_nb_kernel_sse2_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_sse2_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/make_nb_kernel_sse2_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/make_nb_kernel_sse4_1_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_sse4_1_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/make_nb_kernel_sse4_1_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.c
src/gromacs/gmxlib/thread_mpi/pthreads.c
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/gmx_wallcycle.c
src/gromacs/mdlib/nbnxn_atomdata.c
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
src/gromacs/mdlib/nbnxn_internal.h
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/ns.c
src/tools/gmx_dipoles.c
src/tools/gmx_order.c
src/tools/gmx_tune_pme.c
tests/CMakeLists.txt

diff --cc CMakeLists.txt
index f663aa6bd133d978b7eb53c7de093d3a8cff2066,df12172a9a74370c7bfcaa51507c9db3d1b19a74..97ab59b4fd8a09a150a6c2f610d4b6455f90e19b
@@@ -22,7 -56,12 +22,12 @@@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CM
  # machine with no git. 
  #
  # NOTE: when releasing the "-dev" suffix needs to be stripped off!
 -set(PROJECT_VERSION "4.6.2-dev")
 +set(PROJECT_VERSION "5.0-dev")
+ # The version number of the regressiontest tarball against which this
+ # git branch can be tested. Normally, this will be the version of the
+ # last patch release. Comment the next line out for branches leading
+ # to a major/minor release.
+ set(REGRESSIONTEST_VERSION "4.6.1")
  set(CUSTOM_VERSION_STRING ""
      CACHE STRING "Custom version string (if empty, use hard-coded default)")
  mark_as_advanced(CUSTOM_VERSION_STRING)
index 80e3b2bd1b97ae1b90024312a308b150e1d5adfa,0000000000000000000000000000000000000000..dcbcc03491060029c4b5e1c152d72276b88c9ae8
mode 100644,000000..100644
--- /dev/null
@@@ -1,797 -1,0 +1,800 @@@
-     snew_bc(cr, fep->all_lambda, efptNR);
-     nblock_bc(cr, efptNR, fep->all_lambda);
-     for (i = 0; i < efptNR; i++)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <sysstuff.h>
 +#include <string.h>
 +#include "typedefs.h"
 +#include "main.h"
 +#include "mvdata.h"
 +#include "network.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "symtab.h"
 +#include "vec.h"
 +#include "tgroup.h"
 +
 +#define   block_bc(cr,   d) gmx_bcast(     sizeof(d),     &(d), (cr))
 +/* Probably the test for (nr) > 0 in the next macro is only needed
 + * on BlueGene(/L), where IBM's MPI_Bcast will segfault after
 + * dereferencing a null pointer, even when no data is to be transferred. */
 +#define  nblock_bc(cr, nr, d) { if ((nr) > 0) {gmx_bcast((nr)*sizeof((d)[0]), (d), (cr)); }}
 +#define    snew_bc(cr, d, nr) { if (!MASTER(cr)) {snew((d), (nr)); }}
 +/* Dirty macro with bAlloc not as an argument */
 +#define nblock_abc(cr, nr, d) { if (bAlloc) {snew((d), (nr)); } nblock_bc(cr, (nr), (d)); }
 +
 +static void bc_string(const t_commrec *cr, t_symtab *symtab, char ***s)
 +{
 +    int handle;
 +
 +    if (MASTER(cr))
 +    {
 +        handle = lookup_symtab(symtab, *s);
 +    }
 +    block_bc(cr, handle);
 +    if (!MASTER(cr))
 +    {
 +        *s = get_symtab_handle(symtab, handle);
 +    }
 +}
 +
 +static void bc_strings(const t_commrec *cr, t_symtab *symtab, int nr, char ****nm)
 +{
 +    int     i;
 +    int    *handle;
 +    char ***NM;
 +
 +    snew(handle, nr);
 +    if (MASTER(cr))
 +    {
 +        NM = *nm;
 +        for (i = 0; (i < nr); i++)
 +        {
 +            handle[i] = lookup_symtab(symtab, NM[i]);
 +        }
 +    }
 +    nblock_bc(cr, nr, handle);
 +
 +    if (!MASTER(cr))
 +    {
 +        snew_bc(cr, *nm, nr);
 +        NM = *nm;
 +        for (i = 0; (i < nr); i++)
 +        {
 +            (*nm)[i] = get_symtab_handle(symtab, handle[i]);
 +        }
 +    }
 +    sfree(handle);
 +}
 +
 +static void bc_strings_resinfo(const t_commrec *cr, t_symtab *symtab,
 +                               int nr, t_resinfo *resinfo)
 +{
 +    int   i;
 +    int  *handle;
 +
 +    snew(handle, nr);
 +    if (MASTER(cr))
 +    {
 +        for (i = 0; (i < nr); i++)
 +        {
 +            handle[i] = lookup_symtab(symtab, resinfo[i].name);
 +        }
 +    }
 +    nblock_bc(cr, nr, handle);
 +
 +    if (!MASTER(cr))
 +    {
 +        for (i = 0; (i < nr); i++)
 +        {
 +            resinfo[i].name = get_symtab_handle(symtab, handle[i]);
 +        }
 +    }
 +    sfree(handle);
 +}
 +
 +static void bc_symtab(const t_commrec *cr, t_symtab *symtab)
 +{
 +    int       i, nr, len;
 +    t_symbuf *symbuf;
 +
 +    block_bc(cr, symtab->nr);
 +    nr = symtab->nr;
 +    snew_bc(cr, symtab->symbuf, 1);
 +    symbuf          = symtab->symbuf;
 +    symbuf->bufsize = nr;
 +    snew_bc(cr, symbuf->buf, nr);
 +    for (i = 0; i < nr; i++)
 +    {
 +        if (MASTER(cr))
 +        {
 +            len = strlen(symbuf->buf[i]) + 1;
 +        }
 +        block_bc(cr, len);
 +        snew_bc(cr, symbuf->buf[i], len);
 +        nblock_bc(cr, len, symbuf->buf[i]);
 +    }
 +}
 +
 +static void bc_block(const t_commrec *cr, t_block *block)
 +{
 +    block_bc(cr, block->nr);
 +    snew_bc(cr, block->index, block->nr+1);
 +    nblock_bc(cr, block->nr+1, block->index);
 +}
 +
 +static void bc_blocka(const t_commrec *cr, t_blocka *block)
 +{
 +    block_bc(cr, block->nr);
 +    snew_bc(cr, block->index, block->nr+1);
 +    nblock_bc(cr, block->nr+1, block->index);
 +    block_bc(cr, block->nra);
 +    if (block->nra)
 +    {
 +        snew_bc(cr, block->a, block->nra);
 +        nblock_bc(cr, block->nra, block->a);
 +    }
 +}
 +
 +static void bc_grps(const t_commrec *cr, t_grps grps[])
 +{
 +    int i;
 +
 +    for (i = 0; (i < egcNR); i++)
 +    {
 +        block_bc(cr, grps[i].nr);
 +        snew_bc(cr, grps[i].nm_ind, grps[i].nr);
 +        nblock_bc(cr, grps[i].nr, grps[i].nm_ind);
 +    }
 +}
 +
 +static void bc_atoms(const t_commrec *cr, t_symtab *symtab, t_atoms *atoms)
 +{
 +    int dummy;
 +
 +    block_bc(cr, atoms->nr);
 +    snew_bc(cr, atoms->atom, atoms->nr);
 +    nblock_bc(cr, atoms->nr, atoms->atom);
 +    bc_strings(cr, symtab, atoms->nr, &atoms->atomname);
 +    block_bc(cr, atoms->nres);
 +    snew_bc(cr, atoms->resinfo, atoms->nres);
 +    nblock_bc(cr, atoms->nres, atoms->resinfo);
 +    bc_strings_resinfo(cr, symtab, atoms->nres, atoms->resinfo);
 +    /* QMMM requires atomtypes to be known on all nodes as well */
 +    bc_strings(cr, symtab, atoms->nr, &atoms->atomtype);
 +    bc_strings(cr, symtab, atoms->nr, &atoms->atomtypeB);
 +}
 +
 +static void bc_groups(const t_commrec *cr, t_symtab *symtab,
 +                      int natoms, gmx_groups_t *groups)
 +{
 +    int dummy;
 +    int g, n;
 +
 +    bc_grps(cr, groups->grps);
 +    block_bc(cr, groups->ngrpname);
 +    bc_strings(cr, symtab, groups->ngrpname, &groups->grpname);
 +    for (g = 0; g < egcNR; g++)
 +    {
 +        if (MASTER(cr))
 +        {
 +            if (groups->grpnr[g])
 +            {
 +                n = natoms;
 +            }
 +            else
 +            {
 +                n = 0;
 +            }
 +        }
 +        block_bc(cr, n);
 +        if (n == 0)
 +        {
 +            groups->grpnr[g] = NULL;
 +        }
 +        else
 +        {
 +            snew_bc(cr, groups->grpnr[g], n);
 +            nblock_bc(cr, n, groups->grpnr[g]);
 +        }
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_groups\n");
 +    }
 +}
 +
 +void bcast_state_setup(const t_commrec *cr, t_state *state)
 +{
 +    block_bc(cr, state->natoms);
 +    block_bc(cr, state->ngtc);
 +    block_bc(cr, state->nnhpres);
 +    block_bc(cr, state->nhchainlength);
 +    block_bc(cr, state->nrng);
 +    block_bc(cr, state->nrngi);
 +    block_bc(cr, state->flags);
 +    if (state->lambda == NULL)
 +    {
 +        snew_bc(cr, state->lambda, efptNR)
 +    }
 +}
 +
 +void bcast_state(const t_commrec *cr, t_state *state, gmx_bool bAlloc)
 +{
 +    int i, nnht, nnhtp;
 +
 +    bcast_state_setup(cr, state);
 +
 +    nnht  = (state->ngtc)*(state->nhchainlength);
 +    nnhtp = (state->nnhpres)*(state->nhchainlength);
 +
 +    if (MASTER(cr))
 +    {
 +        bAlloc = FALSE;
 +    }
 +    if (bAlloc)
 +    {
 +        state->nalloc = state->natoms;
 +    }
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (state->flags & (1<<i))
 +        {
 +            switch (i)
 +            {
 +                case estLAMBDA:  nblock_bc(cr, efptNR, state->lambda); break;
 +                case estFEPSTATE: block_bc(cr, state->fep_state); break;
 +                case estBOX:     block_bc(cr, state->box); break;
 +                case estBOX_REL: block_bc(cr, state->box_rel); break;
 +                case estBOXV:    block_bc(cr, state->boxv); break;
 +                case estPRES_PREV: block_bc(cr, state->pres_prev); break;
 +                case estSVIR_PREV: block_bc(cr, state->svir_prev); break;
 +                case estFVIR_PREV: block_bc(cr, state->fvir_prev); break;
 +                case estNH_XI:   nblock_abc(cr, nnht, state->nosehoover_xi); break;
 +                case estNH_VXI:  nblock_abc(cr, nnht, state->nosehoover_vxi); break;
 +                case estNHPRES_XI:   nblock_abc(cr, nnhtp, state->nhpres_xi); break;
 +                case estNHPRES_VXI:  nblock_abc(cr, nnhtp, state->nhpres_vxi); break;
 +                case estTC_INT:  nblock_abc(cr, state->ngtc, state->therm_integral); break;
 +                case estVETA:    block_bc(cr, state->veta); break;
 +                case estVOL0:    block_bc(cr, state->vol0); break;
 +                case estX:       nblock_abc(cr, state->natoms, state->x); break;
 +                case estV:       nblock_abc(cr, state->natoms, state->v); break;
 +                case estSDX:     nblock_abc(cr, state->natoms, state->sd_X); break;
 +                case estCGP:     nblock_abc(cr, state->natoms, state->cg_p); break;
 +                case estLD_RNG:  if (state->nrngi == 1)
 +                    {
 +                        nblock_abc(cr, state->nrng, state->ld_rng);
 +                    }
 +                    break;
 +                case estLD_RNGI: if (state->nrngi == 1)
 +                    {
 +                        nblock_abc(cr, state->nrngi, state->ld_rngi);
 +                    }
 +                    break;
 +                case estDISRE_INITF: block_bc(cr, state->hist.disre_initf); break;
 +                case estDISRE_RM3TAV:
 +                    block_bc(cr, state->hist.ndisrepairs);
 +                    nblock_abc(cr, state->hist.ndisrepairs, state->hist.disre_rm3tav);
 +                    break;
 +                case estORIRE_INITF: block_bc(cr, state->hist.orire_initf); break;
 +                case estORIRE_DTAV:
 +                    block_bc(cr, state->hist.norire_Dtav);
 +                    nblock_abc(cr, state->hist.norire_Dtav, state->hist.orire_Dtav);
 +                    break;
 +                default:
 +                    gmx_fatal(FARGS,
 +                              "Communication is not implemented for %s in bcast_state",
 +                              est_names[i]);
 +            }
 +        }
 +    }
 +}
 +
 +static void bc_ilists(const t_commrec *cr, t_ilist *ilist)
 +{
 +    int ftype;
 +
 +    /* Here we only communicate the non-zero length ilists */
 +    if (MASTER(cr))
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if (ilist[ftype].nr > 0)
 +            {
 +                block_bc(cr, ftype);
 +                block_bc(cr, ilist[ftype].nr);
 +                nblock_bc(cr, ilist[ftype].nr, ilist[ftype].iatoms);
 +            }
 +        }
 +        ftype = -1;
 +        block_bc(cr, ftype);
 +    }
 +    else
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            ilist[ftype].nr = 0;
 +        }
 +        do
 +        {
 +            block_bc(cr, ftype);
 +            if (ftype >= 0)
 +            {
 +                block_bc(cr, ilist[ftype].nr);
 +                snew_bc(cr, ilist[ftype].iatoms, ilist[ftype].nr);
 +                nblock_bc(cr, ilist[ftype].nr, ilist[ftype].iatoms);
 +            }
 +        }
 +        while (ftype >= 0);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_ilists\n");
 +    }
 +}
 +
 +static void bc_cmap(const t_commrec *cr, gmx_cmap_t *cmap_grid)
 +{
 +    int i, j, nelem, ngrid;
 +
 +    block_bc(cr, cmap_grid->ngrid);
 +    block_bc(cr, cmap_grid->grid_spacing);
 +
 +    ngrid = cmap_grid->ngrid;
 +    nelem = cmap_grid->grid_spacing * cmap_grid->grid_spacing;
 +
 +    if (ngrid > 0)
 +    {
 +        snew_bc(cr, cmap_grid->cmapdata, ngrid);
 +
 +        for (i = 0; i < ngrid; i++)
 +        {
 +            snew_bc(cr, cmap_grid->cmapdata[i].cmap, 4*nelem);
 +            nblock_bc(cr, 4*nelem, cmap_grid->cmapdata[i].cmap);
 +        }
 +    }
 +}
 +
 +static void bc_ffparams(const t_commrec *cr, gmx_ffparams_t *ffp)
 +{
 +    int i;
 +
 +    block_bc(cr, ffp->ntypes);
 +    block_bc(cr, ffp->atnr);
 +    snew_bc(cr, ffp->functype, ffp->ntypes);
 +    snew_bc(cr, ffp->iparams, ffp->ntypes);
 +    nblock_bc(cr, ffp->ntypes, ffp->functype);
 +    nblock_bc(cr, ffp->ntypes, ffp->iparams);
 +    block_bc(cr, ffp->reppow);
 +    block_bc(cr, ffp->fudgeQQ);
 +    bc_cmap(cr, &ffp->cmap_grid);
 +}
 +
 +static void bc_grpopts(const t_commrec *cr, t_grpopts *g)
 +{
 +    int i, n;
 +
 +    block_bc(cr, g->ngtc);
 +    block_bc(cr, g->ngacc);
 +    block_bc(cr, g->ngfrz);
 +    block_bc(cr, g->ngener);
 +    snew_bc(cr, g->nrdf, g->ngtc);
 +    snew_bc(cr, g->tau_t, g->ngtc);
 +    snew_bc(cr, g->ref_t, g->ngtc);
 +    snew_bc(cr, g->acc, g->ngacc);
 +    snew_bc(cr, g->nFreeze, g->ngfrz);
 +    snew_bc(cr, g->egp_flags, g->ngener*g->ngener);
 +
 +    nblock_bc(cr, g->ngtc, g->nrdf);
 +    nblock_bc(cr, g->ngtc, g->tau_t);
 +    nblock_bc(cr, g->ngtc, g->ref_t);
 +    nblock_bc(cr, g->ngacc, g->acc);
 +    nblock_bc(cr, g->ngfrz, g->nFreeze);
 +    nblock_bc(cr, g->ngener*g->ngener, g->egp_flags);
 +    snew_bc(cr, g->annealing, g->ngtc);
 +    snew_bc(cr, g->anneal_npoints, g->ngtc);
 +    snew_bc(cr, g->anneal_time, g->ngtc);
 +    snew_bc(cr, g->anneal_temp, g->ngtc);
 +    nblock_bc(cr, g->ngtc, g->annealing);
 +    nblock_bc(cr, g->ngtc, g->anneal_npoints);
 +    for (i = 0; (i < g->ngtc); i++)
 +    {
 +        n = g->anneal_npoints[i];
 +        if (n > 0)
 +        {
 +            snew_bc(cr, g->anneal_time[i], n);
 +            snew_bc(cr, g->anneal_temp[i], n);
 +            nblock_bc(cr, n, g->anneal_time[i]);
 +            nblock_bc(cr, n, g->anneal_temp[i]);
 +        }
 +    }
 +
 +    /* QMMM stuff, see inputrec */
 +    block_bc(cr, g->ngQM);
 +    snew_bc(cr, g->QMmethod, g->ngQM);
 +    snew_bc(cr, g->QMbasis, g->ngQM);
 +    snew_bc(cr, g->QMcharge, g->ngQM);
 +    snew_bc(cr, g->QMmult, g->ngQM);
 +    snew_bc(cr, g->bSH, g->ngQM);
 +    snew_bc(cr, g->CASorbitals, g->ngQM);
 +    snew_bc(cr, g->CASelectrons, g->ngQM);
 +    snew_bc(cr, g->SAon, g->ngQM);
 +    snew_bc(cr, g->SAoff, g->ngQM);
 +    snew_bc(cr, g->SAsteps, g->ngQM);
 +
 +    if (g->ngQM)
 +    {
 +        nblock_bc(cr, g->ngQM, g->QMmethod);
 +        nblock_bc(cr, g->ngQM, g->QMbasis);
 +        nblock_bc(cr, g->ngQM, g->QMcharge);
 +        nblock_bc(cr, g->ngQM, g->QMmult);
 +        nblock_bc(cr, g->ngQM, g->bSH);
 +        nblock_bc(cr, g->ngQM, g->CASorbitals);
 +        nblock_bc(cr, g->ngQM, g->CASelectrons);
 +        nblock_bc(cr, g->ngQM, g->SAon);
 +        nblock_bc(cr, g->ngQM, g->SAoff);
 +        nblock_bc(cr, g->ngQM, g->SAsteps);
 +        /* end of QMMM stuff */
 +    }
 +}
 +
 +static void bc_cosines(const t_commrec *cr, t_cosines *cs)
 +{
 +    block_bc(cr, cs->n);
 +    snew_bc(cr, cs->a, cs->n);
 +    snew_bc(cr, cs->phi, cs->n);
 +    if (cs->n > 0)
 +    {
 +        nblock_bc(cr, cs->n, cs->a);
 +        nblock_bc(cr, cs->n, cs->phi);
 +    }
 +}
 +
 +static void bc_pullgrp(const t_commrec *cr, t_pullgrp *pgrp)
 +{
 +    block_bc(cr, *pgrp);
 +    if (pgrp->nat > 0)
 +    {
 +        snew_bc(cr, pgrp->ind, pgrp->nat);
 +        nblock_bc(cr, pgrp->nat, pgrp->ind);
 +    }
 +    if (pgrp->nweight > 0)
 +    {
 +        snew_bc(cr, pgrp->weight, pgrp->nweight);
 +        nblock_bc(cr, pgrp->nweight, pgrp->weight);
 +    }
 +}
 +
 +static void bc_pull(const t_commrec *cr, t_pull *pull)
 +{
 +    int g;
 +
 +    block_bc(cr, *pull);
 +    snew_bc(cr, pull->grp, pull->ngrp+1);
 +    for (g = 0; g < pull->ngrp+1; g++)
 +    {
 +        bc_pullgrp(cr, &pull->grp[g]);
 +    }
 +}
 +
 +static void bc_rotgrp(const t_commrec *cr, t_rotgrp *rotg)
 +{
 +    block_bc(cr, *rotg);
 +    if (rotg->nat > 0)
 +    {
 +        snew_bc(cr, rotg->ind, rotg->nat);
 +        nblock_bc(cr, rotg->nat, rotg->ind);
 +        snew_bc(cr, rotg->x_ref, rotg->nat);
 +        nblock_bc(cr, rotg->nat, rotg->x_ref);
 +    }
 +}
 +
 +static void bc_rot(const t_commrec *cr, t_rot *rot)
 +{
 +    int g;
 +
 +    block_bc(cr, *rot);
 +    snew_bc(cr, rot->grp, rot->ngrp);
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        bc_rotgrp(cr, &rot->grp[g]);
 +    }
 +}
 +
 +static void bc_adress(const t_commrec *cr, t_adress *adress)
 +{
 +    block_bc(cr, *adress);
 +    if (adress->n_tf_grps > 0)
 +    {
 +        snew_bc(cr, adress->tf_table_index, adress->n_tf_grps);
 +        nblock_bc(cr, adress->n_tf_grps, adress->tf_table_index);
 +    }
 +    if (adress->n_energy_grps > 0)
 +    {
 +        snew_bc(cr, adress->group_explicit, adress->n_energy_grps);
 +        nblock_bc(cr, adress->n_energy_grps, adress->group_explicit);
 +    }
 +}
 +static void bc_fepvals(const t_commrec *cr, t_lambda *fep)
 +{
 +    gmx_bool bAlloc = TRUE;
 +    int      i;
 +
 +    block_bc(cr, fep->nstdhdl);
 +    block_bc(cr, fep->init_lambda);
 +    block_bc(cr, fep->init_fep_state);
 +    block_bc(cr, fep->delta_lambda);
 +    block_bc(cr, fep->bPrintEnergy);
 +    block_bc(cr, fep->n_lambda);
-         snew_bc(cr, fep->all_lambda[i], fep->n_lambda);
-         nblock_bc(cr, fep->n_lambda, fep->all_lambda[i]);
++    if (fep->n_lambda > 0)
 +    {
++        snew_bc(cr, fep->all_lambda, efptNR);
++        nblock_bc(cr, efptNR, fep->all_lambda);
++        for (i = 0; i < efptNR; i++)
++        {
++            snew_bc(cr, fep->all_lambda[i], fep->n_lambda);
++            nblock_bc(cr, fep->n_lambda, fep->all_lambda[i]);
++        }
 +    }
 +    block_bc(cr, fep->sc_alpha);
 +    block_bc(cr, fep->sc_power);
 +    block_bc(cr, fep->sc_r_power);
 +    block_bc(cr, fep->sc_sigma);
 +    block_bc(cr, fep->sc_sigma_min);
 +    block_bc(cr, fep->bScCoul);
 +    nblock_bc(cr, efptNR, &(fep->separate_dvdl[0]));
 +    block_bc(cr, fep->dhdl_derivatives);
 +    block_bc(cr, fep->dh_hist_size);
 +    block_bc(cr, fep->dh_hist_spacing);
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_fepvals\n");
 +    }
 +}
 +
 +static void bc_expandedvals(const t_commrec *cr, t_expanded *expand, int n_lambda)
 +{
 +    gmx_bool bAlloc = TRUE;
 +    int      i;
 +
 +    block_bc(cr, expand->nstexpanded);
 +    block_bc(cr, expand->elamstats);
 +    block_bc(cr, expand->elmcmove);
 +    block_bc(cr, expand->elmceq);
 +    block_bc(cr, expand->equil_n_at_lam);
 +    block_bc(cr, expand->equil_wl_delta);
 +    block_bc(cr, expand->equil_ratio);
 +    block_bc(cr, expand->equil_steps);
 +    block_bc(cr, expand->equil_samples);
 +    block_bc(cr, expand->lmc_seed);
 +    block_bc(cr, expand->minvar);
 +    block_bc(cr, expand->minvar_const);
 +    block_bc(cr, expand->c_range);
 +    block_bc(cr, expand->bSymmetrizedTMatrix);
 +    block_bc(cr, expand->nstTij);
 +    block_bc(cr, expand->lmc_repeats);
 +    block_bc(cr, expand->lmc_forced_nstart);
 +    block_bc(cr, expand->gibbsdeltalam);
 +    block_bc(cr, expand->wl_scale);
 +    block_bc(cr, expand->wl_ratio);
 +    block_bc(cr, expand->init_wl_delta);
 +    block_bc(cr, expand->bInit_weights);
 +    snew_bc(cr, expand->init_lambda_weights, n_lambda);
 +    nblock_bc(cr, n_lambda, expand->init_lambda_weights);
 +    block_bc(cr, expand->mc_temp);
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_expandedvals\n");
 +    }
 +}
 +
 +static void bc_simtempvals(const t_commrec *cr, t_simtemp *simtemp, int n_lambda)
 +{
 +    gmx_bool bAlloc = TRUE;
 +    int      i;
 +
 +    block_bc(cr, simtemp->simtemp_low);
 +    block_bc(cr, simtemp->simtemp_high);
 +    block_bc(cr, simtemp->eSimTempScale);
 +    snew_bc(cr, simtemp->temperatures, n_lambda);
 +    nblock_bc(cr, n_lambda, simtemp->temperatures);
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_simtempvals\n");
 +    }
 +}
 +
 +static void bc_inputrec(const t_commrec *cr, t_inputrec *inputrec)
 +{
 +    gmx_bool bAlloc = TRUE;
 +    int      i;
 +
 +    block_bc(cr, *inputrec);
 +
 +    bc_grpopts(cr, &(inputrec->opts));
 +
 +    /* even if efep is efepNO, we need to initialize to make sure that
 +     * n_lambda is set to zero */
 +
 +    snew_bc(cr, inputrec->fepvals, 1);
 +    if (inputrec->efep != efepNO || inputrec->bSimTemp)
 +    {
 +        bc_fepvals(cr, inputrec->fepvals);
 +    }
 +    /* need to initialize this as well because of data checked for in the logic */
 +    snew_bc(cr, inputrec->expandedvals, 1);
 +    if (inputrec->bExpanded)
 +    {
 +        bc_expandedvals(cr, inputrec->expandedvals, inputrec->fepvals->n_lambda);
 +    }
 +    snew_bc(cr, inputrec->simtempvals, 1);
 +    if (inputrec->bSimTemp)
 +    {
 +        bc_simtempvals(cr, inputrec->simtempvals, inputrec->fepvals->n_lambda);
 +    }
 +    if (inputrec->ePull != epullNO)
 +    {
 +        snew_bc(cr, inputrec->pull, 1);
 +        bc_pull(cr, inputrec->pull);
 +    }
 +    if (inputrec->bRot)
 +    {
 +        snew_bc(cr, inputrec->rot, 1);
 +        bc_rot(cr, inputrec->rot);
 +    }
 +    for (i = 0; (i < DIM); i++)
 +    {
 +        bc_cosines(cr, &(inputrec->ex[i]));
 +        bc_cosines(cr, &(inputrec->et[i]));
 +    }
 +    if (inputrec->bAdress)
 +    {
 +        snew_bc(cr, inputrec->adress, 1);
 +        bc_adress(cr, inputrec->adress);
 +    }
 +}
 +
 +static void bc_moltype(const t_commrec *cr, t_symtab *symtab,
 +                       gmx_moltype_t *moltype)
 +{
 +    bc_string(cr, symtab, &moltype->name);
 +    bc_atoms(cr, symtab, &moltype->atoms);
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_atoms\n");
 +    }
 +
 +    bc_ilists(cr, moltype->ilist);
 +    bc_block(cr, &moltype->cgs);
 +    bc_blocka(cr, &moltype->excls);
 +}
 +
 +static void bc_molblock(const t_commrec *cr, gmx_molblock_t *molb)
 +{
 +    gmx_bool bAlloc = TRUE;
 +
 +    block_bc(cr, molb->type);
 +    block_bc(cr, molb->nmol);
 +    block_bc(cr, molb->natoms_mol);
 +    block_bc(cr, molb->nposres_xA);
 +    if (molb->nposres_xA > 0)
 +    {
 +        snew_bc(cr, molb->posres_xA, molb->nposres_xA);
 +        nblock_bc(cr, molb->nposres_xA*DIM, molb->posres_xA[0]);
 +    }
 +    block_bc(cr, molb->nposres_xB);
 +    if (molb->nposres_xB > 0)
 +    {
 +        snew_bc(cr, molb->posres_xB, molb->nposres_xB);
 +        nblock_bc(cr, molb->nposres_xB*DIM, molb->posres_xB[0]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_molblock\n");
 +    }
 +}
 +
 +static void bc_atomtypes(const t_commrec *cr, t_atomtypes *atomtypes)
 +{
 +    int nr;
 +
 +    block_bc(cr, atomtypes->nr);
 +
 +    nr = atomtypes->nr;
 +
 +    snew_bc(cr, atomtypes->radius, nr);
 +    snew_bc(cr, atomtypes->vol, nr);
 +    snew_bc(cr, atomtypes->surftens, nr);
 +    snew_bc(cr, atomtypes->gb_radius, nr);
 +    snew_bc(cr, atomtypes->S_hct, nr);
 +
 +    nblock_bc(cr, nr, atomtypes->radius);
 +    nblock_bc(cr, nr, atomtypes->vol);
 +    nblock_bc(cr, nr, atomtypes->surftens);
 +    nblock_bc(cr, nr, atomtypes->gb_radius);
 +    nblock_bc(cr, nr, atomtypes->S_hct);
 +}
 +
 +
 +void bcast_ir_mtop(const t_commrec *cr, t_inputrec *inputrec, gmx_mtop_t *mtop)
 +{
 +    int i;
 +    if (debug)
 +    {
 +        fprintf(debug, "in bc_data\n");
 +    }
 +    bc_inputrec(cr, inputrec);
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_inputrec\n");
 +    }
 +    bc_symtab(cr, &mtop->symtab);
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_symtab\n");
 +    }
 +    bc_string(cr, &mtop->symtab, &mtop->name);
 +    if (debug)
 +    {
 +        fprintf(debug, "after bc_name\n");
 +    }
 +
 +    bc_ffparams(cr, &mtop->ffparams);
 +
 +    block_bc(cr, mtop->nmoltype);
 +    snew_bc(cr, mtop->moltype, mtop->nmoltype);
 +    for (i = 0; i < mtop->nmoltype; i++)
 +    {
 +        bc_moltype(cr, &mtop->symtab, &mtop->moltype[i]);
 +    }
 +
 +    block_bc(cr, mtop->nmolblock);
 +    snew_bc(cr, mtop->molblock, mtop->nmolblock);
 +    for (i = 0; i < mtop->nmolblock; i++)
 +    {
 +        bc_molblock(cr, &mtop->molblock[i]);
 +    }
 +
 +    block_bc(cr, mtop->natoms);
 +
 +    bc_atomtypes(cr, &mtop->atomtypes);
 +
 +    bc_block(cr, &mtop->mols);
 +    bc_groups(cr, &mtop->symtab, mtop->natoms, &mtop->groups);
 +}
index ceb52b591ef4831c910b6d8faa0773d9c9cf2af0,0000000000000000000000000000000000000000..a58de0c174787be25ee248910ee4661a21449988
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_avx_128_fma_double[] =
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_avx_128_fma_double_h
 +#define nb_kernel_avx_128_fma_double_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double; /* name fields: Elec<...>_Vdw<...>_Geom<...>_<VF|F>; the kernellist_avx_128_fma_double rows spell each one out, e.g. Ew = "Ewald", LJ = "LennardJones", CSTab = "CubicSplineTable", Sh/Sw = "PotentialShift"/"PotentialSwitch", W3P1 = "Water3Particle", VF = "PotentialAndForce", F = "Force" */
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_128_fma_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_double;
 +
 +
 +nb_kernel_info_t
- kernellist_avx_128_fma_double_size = sizeof(kernellist_avx_128_fma_double)/sizeof(kernellist_avx_128_fma_double[0]);
++    kernellist_avx_128_fma_double[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_avx_128_fma_double_size = sizeof(kernellist_avx_128_fma_double)/sizeof(kernellist_avx_128_fma_double[0]);
 +
 +#endif
index 20add7f0a6ac5240f3314a33351baba79600e682,0000000000000000000000000000000000000000..dfc40d926119bbe94f8536f6f2e1b62b298ee48d
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_avx_128_fma_single[] =
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_avx_128_fma_single_h
 +#define nb_kernel_avx_128_fma_single_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_128_fma_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_single;
 +
 +
 +nb_kernel_info_t
- kernellist_avx_128_fma_single_size = sizeof(kernellist_avx_128_fma_single)/sizeof(kernellist_avx_128_fma_single[0]);
++    kernellist_avx_128_fma_single[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_avx_128_fma_single_size = sizeof(kernellist_avx_128_fma_single)/sizeof(kernellist_avx_128_fma_single[0]);
 +
 +#endif
index a887b3b9e4cee08d3a965722309e2c52377a7fc9,0000000000000000000000000000000000000000..9fef448af5b975454c29c169fc4624a23d65f2fe
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_avx_256_double[] =
 +/*
 + * Note: this file was generated by the Gromacs avx_256_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_avx_256_double_h
 +#define nb_kernel_avx_256_double_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double;
 +
 +
 +nb_kernel_info_t
- kernellist_avx_256_double_size = sizeof(kernellist_avx_256_double)/sizeof(kernellist_avx_256_double[0]);
++    kernellist_avx_256_double[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double", "avx_256_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_double", "avx_256_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_double", "avx_256_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double", "avx_256_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_double", "avx_256_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_double", "avx_256_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_double", "avx_256_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double", "avx_256_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int /* Element count of kernellist_avx_256_double: total array bytes divided by the bytes of one entry. The sizeof quotient is a size_t that is stored into a plain int here. */
++    kernellist_avx_256_double_size = sizeof(kernellist_avx_256_double)/sizeof(kernellist_avx_256_double[0]);
 +
 +#endif
index f316f44605247bdd8188431ab415f50d2faaaecf,0000000000000000000000000000000000000000..7d71b2ce66de5a631d42f0f51bd1615972911d08
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_avx_256_single[] =
 +/*
 + * Note: this file was generated by the Gromacs avx_256_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_avx_256_single_h
 +#define nb_kernel_avx_256_single_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single;
 +
 +
 +nb_kernel_info_t
- kernellist_avx_256_single_size = sizeof(kernellist_avx_256_single)/sizeof(kernellist_avx_256_single[0]);
++    kernellist_avx_256_single[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_single", "avx_256_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_single", "avx_256_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_single", "avx_256_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single", "avx_256_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_single", "avx_256_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_single", "avx_256_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single", "avx_256_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single", "avx_256_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_avx_256_single_size = sizeof(kernellist_avx_256_single)/sizeof(kernellist_avx_256_single[0]);
 +
 +#endif
index c33cadd829e9c5d5cfd2d4a4b0190a48227cdf53,0000000000000000000000000000000000000000..82a008c3c123dae0606b36578081c0920598f0b5
mode 100644,000000..100644
--- /dev/null
@@@ -1,641 -1,0 +1,641 @@@
- kernellist_c[] =
 +/*
 + * Note: this file was generated by the Gromacs c kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_c_h
 +#define nb_kernel_c_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwBham_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwBham_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwBhamSh_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwBhamSw_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCoul_VdwBham_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwBham_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwBham_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecGB_VdwBham_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSh_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwBhamSw_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomP1P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomP1P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW3P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW3P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW3W3_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW3W3_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW4P1_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW4P1_F_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW4W4_VF_c;
 +nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW4W4_F_c;
 +
 +
 +nb_kernel_info_t
- kernellist_c_size = sizeof(kernellist_c)/sizeof(kernellist_c[0]);
++    kernellist_c[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c", "c", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c", "c", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_c", "c", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_c", "c", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_c", "c", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_c", "c", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_c", "c", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_c", "c", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwBham_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwBham_GeomP1P1_VF_c", "c", "None", "None", "Buckingham", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwBham_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwBham_GeomP1P1_F_c", "c", "None", "None", "Buckingham", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_VF_c", "c", "None", "None", "Buckingham", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_F_c", "c", "None", "None", "Buckingham", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_VF_c", "c", "None", "None", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_F_c", "c", "None", "None", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_c", "c", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_c", "c", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_c, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_c", "c", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_c, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_c", "c", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_c, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_c", "c", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_c, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_c", "c", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_c, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_c", "c", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_c, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_c", "c", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_c, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_c", "c", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_c, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_c", "c", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_c", "c", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_c", "c", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_c, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_c", "c", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_c, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_c", "c", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_c, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_c", "c", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_c, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_c", "c", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_c, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_c", "c", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_c, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_c", "c", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_c, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_c", "c", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_c, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_c", "c", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_c, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_c", "c", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_c, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_c", "c", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_c, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_c, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_c, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_c, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_c, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_c, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_c, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_c, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_c", "c", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwBham_GeomP1P1_VF_c, "nb_kernel_ElecEw_VdwBham_GeomP1P1_VF_c", "c", "Ewald", "None", "Buckingham", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwBham_GeomP1P1_F_c, "nb_kernel_ElecEw_VdwBham_GeomP1P1_F_c", "c", "Ewald", "None", "Buckingham", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW3P1_VF_c, "nb_kernel_ElecEw_VdwBham_GeomW3P1_VF_c", "c", "Ewald", "None", "Buckingham", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW3P1_F_c, "nb_kernel_ElecEw_VdwBham_GeomW3P1_F_c", "c", "Ewald", "None", "Buckingham", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW3W3_VF_c, "nb_kernel_ElecEw_VdwBham_GeomW3W3_VF_c", "c", "Ewald", "None", "Buckingham", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW3W3_F_c, "nb_kernel_ElecEw_VdwBham_GeomW3W3_F_c", "c", "Ewald", "None", "Buckingham", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW4P1_VF_c, "nb_kernel_ElecEw_VdwBham_GeomW4P1_VF_c", "c", "Ewald", "None", "Buckingham", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW4P1_F_c, "nb_kernel_ElecEw_VdwBham_GeomW4P1_F_c", "c", "Ewald", "None", "Buckingham", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW4W4_VF_c, "nb_kernel_ElecEw_VdwBham_GeomW4W4_VF_c", "c", "Ewald", "None", "Buckingham", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwBham_GeomW4W4_F_c, "nb_kernel_ElecEw_VdwBham_GeomW4W4_F_c", "c", "Ewald", "None", "Buckingham", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_c", "c", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_c", "c", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_c, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_c", "c", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_c, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_c", "c", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_c, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_c", "c", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_c, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_c", "c", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_c, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_c", "c", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_c, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_c", "c", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_c, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_c", "c", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_c, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_c", "c", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomP1P1_VF_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomP1P1_VF_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomP1P1_F_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomP1P1_F_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW3P1_VF_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW3P1_VF_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW3P1_F_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW3P1_F_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW3W3_VF_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW3W3_VF_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW3W3_F_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW3W3_F_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW4P1_VF_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW4P1_VF_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW4P1_F_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW4P1_F_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW4W4_VF_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW4W4_VF_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwBhamSh_GeomW4W4_F_c, "nb_kernel_ElecEwSh_VdwBhamSh_GeomW4W4_F_c", "c", "Ewald", "PotentialShift", "Buckingham", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_c, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_c", "c", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_c", "c", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_c", "c", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_c, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_c, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_c, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_c, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_c, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_c, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_c, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_c, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_c", "c", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomP1P1_VF_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomP1P1_VF_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomP1P1_F_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomP1P1_F_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW3P1_VF_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW3P1_VF_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW3P1_F_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW3P1_F_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW3W3_VF_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW3W3_VF_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW3W3_F_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW3W3_F_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW4P1_VF_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW4P1_VF_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW4P1_F_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW4P1_F_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW4W4_VF_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW4W4_VF_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwBhamSw_GeomW4W4_F_c, "nb_kernel_ElecEwSw_VdwBhamSw_GeomW4W4_F_c", "c", "Ewald", "PotentialSwitch", "Buckingham", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_c", "c", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_c", "c", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_c, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_c", "c", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_c, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_c", "c", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_c, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_c", "c", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_c, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_c", "c", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_c, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_c", "c", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_c, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_c", "c", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_c, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_c", "c", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_c, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_c", "c", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_c", "c", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_c", "c", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_c, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_c", "c", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_c, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_c", "c", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_c, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_c", "c", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_c, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_c", "c", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_c, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_c", "c", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_c, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_c", "c", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_c, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_c", "c", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_c, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_c", "c", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_c, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_c, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_c, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_c", "c", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomP1P1_VF_c, "nb_kernel_ElecCoul_VdwBham_GeomP1P1_VF_c", "c", "Coulomb", "None", "Buckingham", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomP1P1_F_c, "nb_kernel_ElecCoul_VdwBham_GeomP1P1_F_c", "c", "Coulomb", "None", "Buckingham", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW3P1_VF_c, "nb_kernel_ElecCoul_VdwBham_GeomW3P1_VF_c", "c", "Coulomb", "None", "Buckingham", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW3P1_F_c, "nb_kernel_ElecCoul_VdwBham_GeomW3P1_F_c", "c", "Coulomb", "None", "Buckingham", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW3W3_VF_c, "nb_kernel_ElecCoul_VdwBham_GeomW3W3_VF_c", "c", "Coulomb", "None", "Buckingham", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW3W3_F_c, "nb_kernel_ElecCoul_VdwBham_GeomW3W3_F_c", "c", "Coulomb", "None", "Buckingham", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW4P1_VF_c, "nb_kernel_ElecCoul_VdwBham_GeomW4P1_VF_c", "c", "Coulomb", "None", "Buckingham", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW4P1_F_c, "nb_kernel_ElecCoul_VdwBham_GeomW4P1_F_c", "c", "Coulomb", "None", "Buckingham", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW4W4_VF_c, "nb_kernel_ElecCoul_VdwBham_GeomW4W4_VF_c", "c", "Coulomb", "None", "Buckingham", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwBham_GeomW4W4_F_c, "nb_kernel_ElecCoul_VdwBham_GeomW4W4_F_c", "c", "Coulomb", "None", "Buckingham", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_c, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_c", "c", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_c", "c", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_c", "c", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_c, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_c", "c", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_c, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_c", "c", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_c, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_c", "c", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_c, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_c", "c", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_c, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_c", "c", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_c, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_c", "c", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_c, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_c", "c", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_c, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_c", "c", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_c, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_c", "c", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomP1P1_VF_c, "nb_kernel_ElecCSTab_VdwBham_GeomP1P1_VF_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomP1P1_F_c, "nb_kernel_ElecCSTab_VdwBham_GeomP1P1_F_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW3P1_VF_c, "nb_kernel_ElecCSTab_VdwBham_GeomW3P1_VF_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW3P1_F_c, "nb_kernel_ElecCSTab_VdwBham_GeomW3P1_F_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW3W3_VF_c, "nb_kernel_ElecCSTab_VdwBham_GeomW3W3_VF_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW3W3_F_c, "nb_kernel_ElecCSTab_VdwBham_GeomW3W3_F_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW4P1_VF_c, "nb_kernel_ElecCSTab_VdwBham_GeomW4P1_VF_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW4P1_F_c, "nb_kernel_ElecCSTab_VdwBham_GeomW4P1_F_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW4W4_VF_c, "nb_kernel_ElecCSTab_VdwBham_GeomW4W4_VF_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwBham_GeomW4W4_F_c, "nb_kernel_ElecCSTab_VdwBham_GeomW4W4_F_c", "c", "CubicSplineTable", "None", "Buckingham", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c", "c", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_c", "c", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_c", "c", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_c", "c", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_c, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_c", "c", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_c, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_c", "c", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwBham_GeomP1P1_VF_c, "nb_kernel_ElecGB_VdwBham_GeomP1P1_VF_c", "c", "GeneralizedBorn", "None", "Buckingham", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwBham_GeomP1P1_F_c, "nb_kernel_ElecGB_VdwBham_GeomP1P1_F_c", "c", "GeneralizedBorn", "None", "Buckingham", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_c, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_c, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_c", "c", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_c", "c", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_c", "c", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_c, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_c, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_c, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_c, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_c, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_c, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_c, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_c, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_c", "c", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_c, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_c", "c", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomP1P1_VF_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomP1P1_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomP1P1_F_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomP1P1_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW3P1_VF_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW3P1_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW3P1_F_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW3P1_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW3W3_VF_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW3W3_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW3W3_F_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW3W3_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW4P1_VF_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW4P1_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW4P1_F_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW4P1_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW4W4_VF_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW4W4_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSh_GeomW4W4_F_c, "nb_kernel_ElecRFCut_VdwBhamSh_GeomW4W4_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomP1P1_VF_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomP1P1_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomP1P1_F_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomP1P1_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW3P1_VF_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW3P1_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW3P1_F_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW3P1_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW3W3_VF_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW3W3_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW3W3_F_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW3W3_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW4P1_VF_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW4P1_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW4P1_F_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW4P1_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW4W4_VF_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW4W4_VF_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwBhamSw_GeomW4W4_F_c, "nb_kernel_ElecRFCut_VdwBhamSw_GeomW4W4_F_c", "c", "ReactionField", "ExactCutoff", "Buckingham", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_c", "c", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_c", "c", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_c, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_c", "c", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_c, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_c", "c", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_c, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_c", "c", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_c, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_c", "c", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_c, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_c", "c", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_c, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_c", "c", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_c, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_c", "c", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_c, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_c", "c", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_c, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_c", "c", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_c, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_c", "c", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_c, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_c", "c", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_c, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_c", "c", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_c, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_c", "c", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_c, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_c", "c", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_c, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_c", "c", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_c, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_c", "c", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_c, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_c", "c", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_c, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_c", "c", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_c, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_c, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_c, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_c, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_c, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_c, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_c, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_c, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_c, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_c, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_c", "c", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwBham_GeomP1P1_VF_c, "nb_kernel_ElecRF_VdwBham_GeomP1P1_VF_c", "c", "ReactionField", "None", "Buckingham", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwBham_GeomP1P1_F_c, "nb_kernel_ElecRF_VdwBham_GeomP1P1_F_c", "c", "ReactionField", "None", "Buckingham", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW3P1_VF_c, "nb_kernel_ElecRF_VdwBham_GeomW3P1_VF_c", "c", "ReactionField", "None", "Buckingham", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW3P1_F_c, "nb_kernel_ElecRF_VdwBham_GeomW3P1_F_c", "c", "ReactionField", "None", "Buckingham", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW3W3_VF_c, "nb_kernel_ElecRF_VdwBham_GeomW3W3_VF_c", "c", "ReactionField", "None", "Buckingham", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW3W3_F_c, "nb_kernel_ElecRF_VdwBham_GeomW3W3_F_c", "c", "ReactionField", "None", "Buckingham", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW4P1_VF_c, "nb_kernel_ElecRF_VdwBham_GeomW4P1_VF_c", "c", "ReactionField", "None", "Buckingham", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW4P1_F_c, "nb_kernel_ElecRF_VdwBham_GeomW4P1_F_c", "c", "ReactionField", "None", "Buckingham", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW4W4_VF_c, "nb_kernel_ElecRF_VdwBham_GeomW4W4_VF_c", "c", "ReactionField", "None", "Buckingham", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwBham_GeomW4W4_F_c, "nb_kernel_ElecRF_VdwBham_GeomW4W4_F_c", "c", "ReactionField", "None", "Buckingham", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_c_size = sizeof(kernellist_c)/sizeof(kernellist_c[0]);
 +
 +#endif
index 95a7ef2f7c29df3bd6fc166142d4872fa41ad9bc,0000000000000000000000000000000000000000..bc3dd87a783d7f0c58c1ddce8957d12a7100fbdd
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_sse2_double[] =
 +/*
 + * Note: this file was generated by the Gromacs sse2_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_sse2_double_h
 +#define nb_kernel_sse2_double_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_double;
 +
 +
 +nb_kernel_info_t
- kernellist_sse2_double_size = sizeof(kernellist_sse2_double)/sizeof(kernellist_sse2_double[0]);
++    kernellist_sse2_double[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double", "sse2_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_double", "sse2_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_double", "sse2_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_double", "sse2_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double", "sse2_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_double", "sse2_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_double", "sse2_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_double", "sse2_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_sse2_double_size = sizeof(kernellist_sse2_double)/sizeof(kernellist_sse2_double[0]);
 +
 +#endif
index ddaa369874fb1c0fc1c981d2c238668c12d63770,0000000000000000000000000000000000000000..2a5aa767d9ab2efe9f88148c5d355dd5e8501844
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_sse2_single[] =
 +/*
 + * Note: this file was generated by the Gromacs sse2_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_sse2_single_h
 +#define nb_kernel_sse2_single_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single;
 +
 +
 +nb_kernel_info_t
- kernellist_sse2_single_size = sizeof(kernellist_sse2_single)/sizeof(kernellist_sse2_single[0]);
++    kernellist_sse2_single[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_sse2_single_size = sizeof(kernellist_sse2_single)/sizeof(kernellist_sse2_single[0]);
 +
 +#endif
index cf5b9086093c6c9df0d5aab1e526ee37d98caee1,0000000000000000000000000000000000000000..8d02013eb83ff7d174bde75e95a442e79a4e1e33
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_sse4_1_double[] =
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_sse4_1_double_h
 +#define nb_kernel_sse4_1_double_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse4_1_double;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_double;
 +
 +
 +nb_kernel_info_t
- kernellist_sse4_1_double_size = sizeof(kernellist_sse4_1_double)/sizeof(kernellist_sse4_1_double[0]);
++    kernellist_sse4_1_double[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_double", "sse4_1_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_double", "sse4_1_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_double", "sse4_1_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_sse4_1_double_size = sizeof(kernellist_sse4_1_double)/sizeof(kernellist_sse4_1_double[0]);
 +
 +#endif
index 6deaef13a177da3ea2d6a94972ea46880fd097da,0000000000000000000000000000000000000000..6c15bd3f8c837d50cf43f6b7350ce3624043e0c4
mode 100644,000000..100644
--- /dev/null
@@@ -1,465 -1,0 +1,465 @@@
- kernellist_sse4_1_single[] =
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifndef nb_kernel_sse4_1_single_h
 +#define nb_kernel_sse4_1_single_h
 +
 +#include "../nb_kernel.h"
 +
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse4_1_single;
 +nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_single;
 +
 +
 +nb_kernel_info_t
- kernellist_sse4_1_single_size = sizeof(kernellist_sse4_1_single)/sizeof(kernellist_sse4_1_single[0]);
++    kernellist_sse4_1_single[] =
 +{
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_single", "sse4_1_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_single", "sse4_1_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
 +    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_single", "sse4_1_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
 +};
 +
 +int
++    kernellist_sse4_1_single_size = sizeof(kernellist_sse4_1_single)/sizeof(kernellist_sse4_1_single[0]); /* number of entries in the kernellist_sse4_1_single array (total size / size of one element) */
 +
 +#endif
index 8d2fdb7f4a66a96c87c9db8052c3ee49cd97adca,0000000000000000000000000000000000000000..250d859a2a681b4a258246accc2f6c45d9d4d267
mode 100644,000000..100644
--- /dev/null
@@@ -1,9705 -1,0 +1,9713 @@@
-     if (!comm->bVacDLBNoLimit && comm->bPMELoadBalDLBLimits)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "pdbio.h"
 +#include "futil.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "gmx_wallcycle.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "gmx_ga2la.h"
 +#include "gmx_sort.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#define DDRANK(dd, rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +/* Cell boundaries, the global charge group division over the nodes and
 + * communication buffers for state scattering and gathering.
 + * NOTE(review): going by the name this is presumably only allocated on
 + * the DD master node -- confirm against the allocation code.
 + */
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int   *ncg;    /* Number of home charge groups for each node */
 +    int   *index;  /* Index of nnodes+1 into cg */
 +    int   *cg;     /* Global charge group index */
 +    int   *nat;    /* Number of home atoms for each node. */
 +    int   *ibuf;   /* Buffer for communication */
 +    rvec  *vbuf;   /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +/* Send/receive counts and charge group indices for one communication
 + * pulse; gmx_domdec_comm_dim_t holds an array of these, one per pulse.
 + */
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int  nsend[DD_MAXIZONE+2];
 +    int  nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    /* NOTE(review): presumably the allocation size of index -- confirm */
 +    int  nalloc;
 +    /* The atom range for non-in-place communication */
 +    int  cell2at0[DD_MAXIZONE];
 +    int  cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +/* The coordinate/force communication setup for one decomposition
 + * dimension (see the cd array in gmx_domdec_comm_t).
 + */
 +typedef struct
 +{
 +    int               np;       /* Number of grid pulses in this dimension */
 +    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 +    int               np_nalloc;
 +    gmx_bool          bInPlace; /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +/* Cell boundary data for dynamic load balancing, kept per dimension and
 + * row (see the root member of gmx_domdec_comm_t).
 + */
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real     *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real     *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 +    real     *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +/* NOTE(review): presumably the maximum number of values stored per load
 + * entry -- confirm where the load buffers are allocated.
 + */
 +#define DD_NLOAD_MAX 9
 +
 +/* Load measurement data used for load balancing.
 + *
 + * Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int    nload;
 +    float *load;
 +    float  sum;
 +    float  max;
 +    float  sum_m;
 +    float  cvol_min;
 +    float  mdf;
 +    float  pme;
 +    int    flags;
 +} gmx_domdec_load_t;
 +
 +/* One entry in the charge group sorting buffers of gmx_domdec_sort_t.
 + * NOTE(review): presumably nsc is the sort key, ind_gl the global and
 + * ind the local charge group index -- confirm against the sorting code.
 + */
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +/* Work buffers for sorting the charge groups (see the sort member of
 + * gmx_domdec_comm_t); the *_nalloc members go with the buffers declared
 + * just before them.
 + */
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int           sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int           sort_new_nalloc;
 +    int          *ibuf;
 +    int           ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +/* A growable array of rvec; nalloc is the number of allocated elements
 + * of v (see vec_rvec_init and vec_rvec_check_alloc).
 + */
 +typedef struct
 +{
 +    rvec *v;
 +    int   nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + * ddnatNR is the number of entries and sizes the nat array
 + * in gmx_domdec_comm_t.
 + */
 +enum {
 +    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 +};
 +
 +/* The values of the DLB option (eDLB in gmx_domdec_comm_t);
 + * edlb_names holds the matching strings in the same order.
 + */
 +enum {
 +    edlbAUTO, edlbNO, edlbYES, edlbNR
 +};
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +/* PME slab data for one dimension; gmx_domdec_comm_t stores two of
 + * these in its ddpme array.
 + */
 +typedef struct
 +{
 +    int      dim;       /* The dimension                                          */
 +    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 +    int      nslab;     /* The number of PME slabs in this dimension              */
 +    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int     *pp_min;    /* The minimum pp node location, size nslab               */
 +    int     *pp_max;    /* The maximum pp node location,size nslab                */
 +    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +/* Zone limits along one decomposition dimension; these are the values
 + * exchanged between neighboring cells by dd_sendrecv_ddzone.
 + */
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communication height for this zone  */
 +    real mch1;    /* The maximum top communication height for this zone     */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +/* Temporary storage for the thread parallel communication setup
 + * (see the dth array in gmx_domdec_comm_t).
 + */
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int             *ibuf;
 +    int              ibuf_nalloc;
 +    vec_rvec_t       vbuf;
 +    int              nsend;
 +    int              nat;
 +    int              nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +/* The domain decomposition communication setup, buffers, load balancing
 + * state and statistics; reached through the comm pointer of gmx_domdec_t.
 + */
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int         npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int         npmenodes;
 +    int         npmenodes_x;
 +    int         npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool    bCartesianPP_PME;
 +    ivec        ntot;
 +    int         cartpmedim;
 +    int        *pmenodes;          /* size npmenodes                         */
 +    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                                    * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int                nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool  bBondComm;
 +    t_blocka *cglink;
 +    char     *bLocalCG;
 +
 +    /* The DLB option */
 +    int      eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +
 +    /* The width of the communicated boundaries */
 +    real     cutoff_mbody;
 +    real     cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec     cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec     cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real     cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* With PME load balancing we set limits on DLB */
 +    gmx_bool bPMELoadBalDLBLimits;
 +    /* DLB needs to take into account that we want to allow this maximum
 +     * cut-off (for PME load balancing), this could limit cell boundaries.
 +     */
 +    real PMELoadBal_max_cutoff;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int                   maxpulse;
 +
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int   moved_nalloc;
 +
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int   nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int                   nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int       *buf_int2;
 +    int        nalloc_int2;
 +    vec_rvec_t vbuf2;
 +
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int    cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int    cgcm_state_nalloc[DIM*2];
 +
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real               *cell_f_row;
 +    real                cell_f0[DIM];
 +    real                cell_f1[DIM];
 +    real                cell_f_max0[DIM];
 +    real                cell_f_min1[DIM];
 +
 +    /* Stuff for load communication */
 +    gmx_bool           bRecordLoad;
 +    gmx_domdec_load_t *load;
 +#ifdef GMX_MPI
 +    MPI_Comm          *mpi_comm_load;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float  cycl[ddCyclNr];
 +    int    cycl_n[ddCyclNr];
 +    float  cycl_max[ddCyclNr];
 +    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise) */
 +    int    eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* How often did we have load measurements */
 +    int    n_load_have;
 +    /* How often have we collected the load measurements */
 +    int    n_load_collect;
 +
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_large_int_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t.
 + * DD_FLAG_NRCG is a mask for the lowest 16 bits; DD_FLAG_FW(d) and
 + * DD_FLAG_BW(d) are single bits from bit 16 upwards, two consecutive
 + * bits (forward, backward) for each value of d.
 + */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 +
 +/* NOTE(review): in the dd_zp* tables below each entry presumably holds
 + * {i-zone, first j-zone, last j-zone + 1} -- confirm where they are used.
 + */
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +/* Evaluates to nc+1+di*2+1+di, where nc is the number of cells along
 + * decomposition dimension index di.
 + * NOTE(review): presumably the number of reals in a cell_f row buffer
 + * (the cell boundaries plus extra entries) -- confirm the layout.
 + */
 +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/* Disabled alternative ordering with x as the fastest running index,
 + * kept for reference only:
 +
 +   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +   static void index2xyz(ivec nc,int ind,ivec xyz)
 +   {
 +   xyz[XX] = ind % nc[XX];
 +   xyz[YY] = (ind / nc[XX]) % nc[YY];
 +   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +   }
 + */
 +
 +/* The linear index of cell i in a grid of n cells, with x running
 + * slowest and z running fastest.
 + * This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid = -1;
 +
 +    ddindex = dd_index(dd->nc, c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd, int i)
 +{
 +    int atnr;
 +
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v, v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd, t_state *state)
 +{
 +    int i;
 +
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not the domain decomposition state");
 +    }
 +
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl, state->cg_gl_nalloc);
 +    }
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 +                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 izone, d, dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg, izone, zones->nizone);
 +    }
 +
 +    *jcg1 = zones->izone[izone].jcg1;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim         = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +/* Communicate the coordinates x of the charge groups in the send lists
 + * to the neighboring cells, for every decomposition dimension and pulse.
 + *
 + * The coordinates are packed into the general communication buffer and
 + * sent in the backward direction. The received coordinates are stored
 + * directly behind the atoms present so far when in-place communication
 + * is possible; otherwise they go through a second buffer and are copied
 + * to the atom ranges cell2at0..cell2at1 of each zone.
 + * For cells with cell index 0 along the dimension the coordinates sent
 + * are shifted by the box vector of that dimension; with screw pbc along
 + * x the y and z coordinates are rotated as well.
 + */
 +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        /* Only the cells with cell index 0 apply the shift */
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]], shift);
 +        }
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            /* Pack the coordinates of all atoms of the cgs to send */
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        copy_rvec(x[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j], shift, buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                /* Receive directly behind the atoms we have so far */
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                /* Copy the received coordinates to the atom range
 +                 * of each zone.
 +                 */
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j], x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        /* The number of zones doubles with every dimension */
 +        nzone += nzone;
 +    }
 +}
 +
 +/* Communicate the forces f on the communicated atoms back to the cells
 + * they came from and add them to the forces there.
 + *
 + * This runs over the dimensions and pulses in the reverse order of
 + * dd_move_x and sends in the forward direction. When fshift is not NULL
 + * the forces received by cells with cell index 0 along the dimension
 + * are also added to the shift force of that dimension. With screw pbc
 + * the y and z components of the received forces are subtracted.
 + */
 +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                  *buf, *sbuf;
 +    ivec                   vis;
 +    int                    is;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            /* No shift forces requested: use the plain summation */
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is              = IVEC2IS(vis);
 +
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                /* Send directly from the end of the force array */
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                /* Gather the forces of each zone in the send buffer */
 +                sbuf = comm->vbuf2.v;
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i], sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is], buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        /* The number of zones halves with every dimension */
 +        nzone /= 2;
 +    }
 +}
 +
 +/* Communicate one real value per atom, stored in v, for the charge
 + * groups in the send lists to the neighboring cells.
 + *
 + * This follows the same communication pattern as dd_move_x, but without
 + * any shifting. The rvec communication buffers are used as flat arrays
 + * of real.
 + */
 +void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *rbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    /* Use the rvec buffer as a flat buffer of reals */
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            /* Pack the values of all atoms of the cgs to send */
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                /* Receive directly behind the atoms we have so far */
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the values */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                /* Copy the received values to the atom range
 +                 * of each zone.
 +                 */
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        /* The number of zones doubles with every dimension */
 +        nzone += nzone;
 +    }
 +}
 +
 +/* Communicate one real value per atom, stored in v, for the
 + * communicated atoms back to the cells they came from and add the
 + * values to those stored there.
 + *
 + * This follows the same communication pattern as dd_move_f, but without
 + * any shift or rotation treatment. The rvec communication buffers are
 + * used as flat arrays of real.
 + */
 +void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *sbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    /* Use the rvec buffer as a flat buffer of reals */
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                /* Send directly from the end of the array */
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                /* Gather the values of each zone in the send buffer */
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the values */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received values */
 +            n = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            }
 +        }
 +        /* The number of zones halves with every dimension */
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 +{
 +    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d, i, j,
 +            zone->min0, zone->max1,
 +            zone->mch0, zone->mch0,
 +            zone->p1_0, zone->p1_1);
 +}
 +
 +
 +/* The maximum number of zones communicated in one dd_sendrecv_ddzone call */
 +#define DDZONECOMM_MAXZONE  5
 +/* The number of rvecs used to pack one gmx_ddzone_t for communication */
 +#define DDZONECOMM_BUFSIZE  3
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind, int direction,
 +                               gmx_ddzone_t *buf_s, int n_s,
 +                               gmx_ddzone_t *buf_r, int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int  i;
 +
 +    for (i = 0; i < n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for (i = 0; i < n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
 +
 +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0, rvec cell_ns_x1)
 +{   /* Exchange cell-boundary information with neighboring domains for the
 +     * decomposition dimensions after the first one.
 +     *
 +     * On return:
 +     *  - the zone records comm->zone_d1[] and comm->zone_d2[][] are filled
 +     *    (entry 0 from this cell's own bounds, the others from received data),
 +     *  - cell_ns_x0 / cell_ns_x1 are widened in place to the min0 / max1
 +     *    extremes over those zone records,
 +     *  - comm->cell_f_max0[d] / comm->cell_f_min1[d] hold the accumulated
 +     *    extremes of the cell fractions for d = 1 .. dd->ndim-1.
 +     *
 +     * Extremes are sent forward with dd_sendrecv_rvec and zone records are
 +     * sent backward with dd_sendrecv_ddzone, one call per pulse.
 +     */
 +    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min; /* NOTE(review): dim1 and k are not referenced in this function */
 +    gmx_ddzone_t      *zp;
 +    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
 +    rvec               extr_s[2], extr_r[2];
 +    rvec               dh;
 +    real               dist_d, c = 0, det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bPBC, bUse;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++) /* Seed entry 0 of each zone array with this cell's own bounds */
 +    {
 +        dim      = dd->dim[d];
 +        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +
 +    for (d = dd->ndim-2; d >= 0; d--) /* Process the dimension indices from the last but one down to 0 */
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store the cell fraction limits of dimension index d+1
 +         * (component 0: cell_f0, components 1 and 2: cell_f1).
 +         */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so they get updated separately from the forward communication.
 +         */
 +        for (d1 = d; d1 < dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0]; /* Append this cell's own zone record */
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for (p = 0; p < npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos; /* Number of zone records exchanged per pulse */
 +        for (p = 0; p < npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for (d1 = d+1; d1 < dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance along d,
 +                     * this avoids a lot of useless atom communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for (i = 0; i < buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1) /* In this case the last record was filled from zone_d1[0], so use the height of index 1 */
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) || /* the final pulse, or ... */
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1)) /* ... without PBC, the pulse for which ci+1+p is the last cell index */
 +            {
 +                /* Store the extremes */
 +                pos = 0;
 +
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for (i = d; i < 2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->ndim >= 2) /* Widen the bounds along dd->dim[1] over both zone_d1 records */
 +    {
 +        dim = dd->dim[1];
 +        for (i = 0; i < 2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3) /* Widen the bounds along dd->dim[2] over all four zone_d2 records */
 +    {
 +        dim = dd->dim[2];
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for (d = 1; d < dd->ndim; d++) /* Publish the accumulated cell fraction extremes */
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state      *state_local)
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
 +    t_block             *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    }
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for (i = 0; i < ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +
 +    buf2[0] = dd->ncg_home;
 +    buf2[1] = dd->nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma   = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd, 2*sizeof(int), buf2, ibuf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ncg[i]     = ma->ibuf[2*i];
 +            ma->nat[i]     = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +
 +        }
 +        /* Make byte counts and indices */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Initial charge group distribution: ");
 +            for (i = 0; i < dd->nnodes; i++)
 +            {
 +                fprintf(debug, " %d", ma->ncg[i]);
 +            }
 +            fprintf(debug, "\n");
 +        }
 +    }
 +
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               dd->ncg_home*sizeof(int), dd->index_gl,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 dd->rank, dd->mpi_comm_all);
 +#endif
 +    }
 +    else
 +    {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++], v[c]);
 +            }
 +        }
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
 +                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++], v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts, int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n;
 +
 +    ma = dd->ma;
 +
 +    /* Make the rvec count and displacment arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for (n = 0; n < dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *rcounts = NULL, *disps = NULL;
 +    int                  n, i, c, a;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd, &rcounts, &disps);
 +
 +        buf = ma->vbuf;
 +    }
 +
 +    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++], v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local, rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    dd_collect_cg(dd, state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd, lv, v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd, lv, v);
 +    }
 +}
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local, t_state *state)
 +{
 +    int est, i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta      = state_local->veta;
 +        state->vol0      = state_local->vol0;
 +        copy_mat(state_local->box, state->box);
 +        copy_mat(state_local->boxv, state->boxv);
 +        copy_mat(state_local->svir_prev, state->svir_prev);
 +        copy_mat(state_local->fvir_prev, state->fvir_prev);
 +        copy_mat(state_local->pres_prev, state->pres_prev);
 +
 +
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    dd_collect_vec(dd, state_local, state_local->x, state->x);
 +                    break;
 +                case estV:
 +                    dd_collect_vec(dd, state_local, state_local->v, state->v);
 +                    break;
 +                case estSDX:
 +                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
 +                    break;
 +                case estLD_RNG:
 +                    if (state->nrngi == 1)
 +                    {
 +                        if (DDMASTER(dd))
 +                        {
 +                            for (i = 0; i < state_local->nrng; i++)
 +                            {
 +                                state->ld_rng[i] = state_local->ld_rng[i];
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
 +                                  state_local->ld_rng, state->ld_rng);
 +                    }
 +                    break;
 +                case estLD_RNGI:
 +                    if (state->nrngi == 1)
 +                    {
 +                        if (DDMASTER(dd))
 +                        {
 +                            state->ld_rngi[0] = state_local->ld_rngi[0];
 +                        }
 +                    }
 +                    else
 +                    {
 +                        dd_gather(dd, sizeof(state->ld_rngi[0]),
 +                                  state_local->ld_rngi, state->ld_rngi);
 +                    }
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    srenew(state->x, state->nalloc);
 +                    break;
 +                case estV:
 +                    srenew(state->v, state->nalloc);
 +                    break;
 +                case estSDX:
 +                    srenew(state->sd_X, state->nalloc);
 +                    break;
 +                case estCGP:
 +                    srenew(state->cg_p, state->nalloc);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No reallocation required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
 +            }
 +        }
 +    }
 +
 +    if (f != NULL)
 +    {
 +        srenew(*f, state->nalloc);
 +    }
 +}
 +
 +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo, fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            srenew(fr->cg_cm, fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups, we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state, f, nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c], buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
 +                              a, ma->nat[n]);
 +                }
 +
 +#ifdef GMX_MPI
 +                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
 +                         DDRANK(dd, n), n, dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c], lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *scounts = NULL, *disps = NULL;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        get_commbuffer_counts(dd, &scounts, &disps);
 +
 +        buf = ma->vbuf;
 +        a   = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c], buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd, cgs, v, lv);
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
 +                                t_state *state, t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta      = state->veta;
 +        state_local->vol0      = state->vol0;
 +        copy_mat(state->box, state_local->box);
 +        copy_mat(state->box_rel, state_local->box_rel);
 +        copy_mat(state->boxv, state_local->boxv);
 +        copy_mat(state->svir_prev, state_local->svir_prev);
 +        copy_mat(state->fvir_prev, state_local->fvir_prev);
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
 +    dd_bcast(dd, sizeof(int), &state_local->fep_state);
 +    dd_bcast(dd, sizeof(real), &state_local->veta);
 +    dd_bcast(dd, sizeof(real), &state_local->vol0);
 +    dd_bcast(dd, sizeof(state_local->box), state_local->box);
 +    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
 +    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
 +    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
 +    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
 +    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, dd->nat_home);
 +    }
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
 +                    break;
 +                case estV:
 +                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
 +                    break;
 +                case estSDX:
 +                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
 +                    break;
 +                case estLD_RNG:
 +                    if (state->nrngi == 1)
 +                    {
 +                        dd_bcastc(dd,
 +                                  state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                                  state->ld_rng, state_local->ld_rng);
 +                    }
 +                    else
 +                    {
 +                        dd_scatter(dd,
 +                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                                   state->ld_rng, state_local->ld_rng);
 +                    }
 +                    break;
 +                case estLD_RNGI:
 +                    if (state->nrngi == 1)
 +                    {
 +                        dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
 +                                  state->ld_rngi, state_local->ld_rngi);
 +                    }
 +                    else
 +                    {
 +                        dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
 +                                   state->ld_rngi, state_local->ld_rngi);
 +                    }
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* Not implemented yet */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c = '?';
 +
 +    switch (dim)
 +    {
 +        case XX: c = 'X'; break;
 +        case YY: c = 'Y'; break;
 +        case ZZ: c = 'Z'; break;
 +        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
 +    }
 +
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
 +                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
 +{   /* Writes the DD cell grid to the file "<fn>_<step>.pdb".
 +     * Every rank passes its cell_x0/cell_x1 bounds to dd_gather(); only the
 +     * DD master allocates the receive buffer, opens the file and writes.
 +     * For each DD cell, 8 corner ATOM records and 12 CONECT records
 +     * (4 per dimension) are written.
 +     */
 +    rvec   grid_s[2], *grid_r = NULL, cx, r;
 +    char   fname[STRLEN], format[STRLEN], buf[22];
 +    FILE  *out;
 +    int    a, i, d, z, y, x;
 +    matrix tric;
 +    real   vol;
 +
 +    copy_rvec(dd->comm->cell_x0, grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1, grid_s[1]);
 +
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r, 2*dd->nnodes); /* two rvecs (x0, x1) per DD node */
 +    }
 +
 +    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
 +
 +    if (DDMASTER(dd))
 +    {   /* Build tric: ones on the diagonal; off the diagonal
 +         * box[i][d]/box[i][i] for dimensions d below ddbox->npbcdim that
 +         * have more than one DD cell, and zero otherwise.
 +         */
 +        for (d = 0; d < DIM; d++)
 +        {
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
 +        sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +        out = gmx_fio_fopen(fname, "w");
 +        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +        a = 1; /* running 1-based atom serial number */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {   /* vol: product of the extents of cell i times nnodes, divided
 +             * by the product of the box diagonal; it is written as the
 +             * last column of each corner record.
 +             */
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for (z = 0; z < 2; z++)
 +            {
 +                for (y = 0; y < 2; y++)
 +                {
 +                    for (x = 0; x < 2; x++)
 +                    {   /* x, y, z each select bound 0 or 1 of cell i */
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric, cx, r); /* presumably r = tric*cx; confirm against mvmul */
 +                        fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
 +                                10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
 +                    }
 +                }
 +            }
 +            for (d = 0; d < DIM; d++)
 +            {
 +                for (x = 0; x < 4; x++)
 +                {   /* y becomes the serial of the first corner of edge x
 +                     * along dimension d; its partner is y + 2^d, matching
 +                     * the x-fastest corner order written above.
 +                     */
 +                    switch (d)
 +                    {
 +                        case 0: y = 1 + i*8 + 2*x; break;
 +                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                        case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
 +                  gmx_mtop_t *mtop, t_commrec *cr,
 +                  int natoms, rvec x[], matrix box)
 +{   /* Writes the first natoms local atoms of this rank to the file
 +     * "<fn>_<step>_n<sim_nodeid>.pdb"; natoms == -1 selects
 +     * dd->comm->nat[ddnatVSITE] atoms. The last column (b) encodes where
 +     * the local atom index falls: the zone index for atoms below
 +     * nat[ddnatZONE], zones.n for atoms below nat[ddnatVSITE],
 +     * and zones.n + 1 otherwise.
 +     */
 +    char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
 +    FILE         *out;
 +    int           i, ii, resnr, c;
 +    char         *atomname, *resname;
 +    real          b;
 +    gmx_domdec_t *dd;
 +
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +
 +    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
 +
 +    sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +    sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
 +
 +    out = gmx_fio_fopen(fname, "w");
 +
 +    fprintf(out, "TITLE     %s\n", title);
 +    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +    for (i = 0; i < natoms; i++)
 +    {
 +        ii = dd->gatindex[i]; /* global index of local atom i */
 +        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {   /* Find the first zone c whose end (in atoms) lies beyond i */
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }   /* Atom names of 4 or more characters use format4; the atom
 +             * number is written modulo 100000, the residue number modulo
 +             * 10000 and the coordinates multiplied by 10.
 +             */
 +        fprintf(out, strlen(atomname) < 4 ? format : format4,
 +                "ATOM", (ii+1)%100000,
 +                atomname, resname, ' ', resnr%10000, ' ',
 +                10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
 +    }
 +    fprintf(out, "TER\n");
 +
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                di;
 +    real               r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for (di = 1; di < dd->ndim; di++)
 +            {
 +                r = min(r, comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r, comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r, comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff, r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
 +{
 +    int nc, ntot;
 +
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord, coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
 +static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add cr->npmenodes/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int  n, i, p0, p1;
 +
 +    snew(pmenodes, cr->npmenodes);
 +    n = 0;
 +    for (i = 0; i < cr->dd->nnodes; i++)
 +    {
 +        p0 = cr_ddindex2pmeindex(cr, i);
 +        p1 = cr_ddindex2pmeindex(cr, i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
 +            }
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec          coords, coords_pme, nc;
 +    int           slab;
 +
 +    dd = cr->dd;
 +    /*
 +       if (dd->comm->bCartesian) {
 +       gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +       dd_coords2pmecoords(dd,coords,coords_pme);
 +       copy_ivec(dd->ntot,nc);
 +       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +
 +       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +       } else {
 +       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +       }
 +     */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 +
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec               coords;
 +    int                ddindex, nodeid = -1;
 +
 +    comm = cr->dd->comm;
 +
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc, coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
 +{   /* Returns the PME node id assigned to sim_nodeid. The result stays -1
 +     * when no branch below assigns one; gmx_pmeonlynode() interprets -1
 +     * as "sim_nodeid is a PME-only node". In the Cartesian PP+PME case
 +     * nothing is assigned when GMX_MPI is not defined.
 +     */
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec               coord, coord_pme;
 +    int                i;
 +    int                pmenode = -1;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {   /* Node ids from dd->nnodes upward are used for the PME nodes */
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +            }
 +        }
 +        else
 +        {   /* Find the first entry of pmenodes[] that is not smaller than
 +             * sim_nodeid. An exact match leaves pmenode at -1; a larger
 +             * entry is the PME node for sim_nodeid. The search has no
 +             * explicit bound on i.
 +             */
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +
 +    return pmenode;
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
 +                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int           x, y, z;
 +    ivec          coord, coord_pme;
 +
 +    dd = cr->dd;
 +
 +    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +
 +    *nmy_ddnodes = 0;
 +    for (x = 0; x < dd->nc[XX]; x++)
 +    {
 +        for (y = 0; y < dd->nc[YY]; y++)
 +        {
 +            for (z = 0; z < dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Receive coordinates from PP nodes:");
 +        for (x = 0; x < *nmy_ddnodes; x++)
 +        {
 +            fprintf(debug, " %d", (*my_ddnodes)[x]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{   /* Returns TRUE unless a following PP node maps to the same PME node
 +     * as this node. The following node is the one with the coordinate
 +     * along comm->cartpmedim increased by one (Cartesian PP+PME case) or
 +     * the node cr->sim_nodeid+1 (all other cases). The result is always
 +     * TRUE when cr->npmenodes is not smaller than the number of DD nodes.
 +     */
 +    gmx_domdec_comm_t *comm;
 +    int                pmenode, coords[DIM], rank;
 +    gmx_bool           bReceive;
 +
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 +                if (dd_simnode2pmenode(cr, rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +
 +    return bReceive;
 +}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for (i = 1; i < zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index, t_state *state)
 +{
 +    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
 +
 +    ind        = state->cg_gl;
 +    dd_cg_gl   = dd->index_gl;
 +    cgindex    = dd->cgindex;
 +    nat        = 0;
 +    cgindex[0] = nat;
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        cgindex[i]  = nat;
 +        cg_gl       = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
 +
 +static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
 +                          t_forcerec *fr, char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int         *cginfo;
 +    int          cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,
 +                            const int *gcgs_index, int cg_start)
 +{   /* Fills dd->gatindex and dd->ga2la for all charge groups from
 +     * cg_start on. Local atom indices are handed out consecutively,
 +     * starting at dd->cgindex[cg_start]. With comm->bCGs set, each charge
 +     * group contributes the atoms gcgs_index[cg_gl] .. gcgs_index[cg_gl+1]-1;
 +     * otherwise the global charge group index itself is used as the
 +     * single atom index. dd->gatindex is grown first when dd->nat_tot
 +     * exceeds its allocation.
 +     */
 +    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
 +    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char        *bLocalCG;
 +    gmx_bool     bCGs;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex, dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +    bCGs       = dd->comm->bCGs;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start; /* only zone 0 may start beyond its range start */
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        cg1    = zone2cg[zone+1];
 +        cg1_p1 = cg0 + zone_ncg1[zone]; /* end of the first zone_ncg1[zone] cgs */
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg >= cg1_p1)
 +            {
 +                /* Signal that this cg is from more than one pulse away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            if (bCGs)
 +            {
 +                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
 +                {
 +                    gatindex[a] = a_gl;
 +                    ga2la_set(dd->ga2la, a_gl, a, zone1);
 +                    a++;
 +                }
 +            }
 +            else
 +            {
 +                gatindex[a] = cg_gl;
 +                ga2la_set(dd->ga2la, cg_gl, a, zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg, i, ngl, nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for (i = 0; i < dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for (i = 0; i < ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys, int ncg_sys,
 +                                    const char *where)
 +{   /* Cross-checks dd->gatindex, dd->ga2la and comm->bLocalCG and reports
 +     * every problem on stderr. Only the problems counted in nerr (a
 +     * ga2la entry pointing at or beyond nat_tot, a ga2la/gatindex
 +     * mismatch, and the errors from check_bLocalCG()) lead to gmx_fatal();
 +     * the other messages are printed without being counted.
 +     */
 +    int   nerr, ngl, i, a, cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {   /* have[global atom] stores the local index + 1 of its first
 +         * occurrence in gatindex; this relies on snew() zero-initializing.
 +         */
 +        snew(have, natoms_sys);
 +        for (a = 0; a < dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have, dd->nat_tot); /* have[local atom] = 1 once ga2la refers to it */
 +
 +    ngl  = 0;
 +    for (i = 0; i < natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la, i, &a, &cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++; /* counts every global atom found in ga2la */
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD node %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank, where, ngl, dd->nat_tot);
 +    }
 +    for (a = 0; a < dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: local atom %d, global %d has no global index\n",
 +                    dd->rank, where, a+1, dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
 +
 +    if (nerr > 0)
 +    {
 +        gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank, where, nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
 +{
 +    int   i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for (i = a_start; i < dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la, dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for (i = cg_start; i < dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
 +/* This function should be used for moving the domain boundaries during DLB,
 + * for obtaining the minimum cell size. It checks the initially set limit
 + * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
 + * and, possibly, a longer cut-off limit set for PME load balancing.
 + * dim_ind indexes comm->cd[], dim indexes comm->cellsize_min[].
 + */
 +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
 +{
 +    real cellsize_min;
 +
 +    cellsize_min = comm->cellsize_min[dim];
 +
-         cellsize_min = max(cellsize_min,
-                            comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
++    if (!comm->bVacDLBNoLimit)
 +    {
++        /* The cut-off might have changed, e.g. by PME load balancing,
++         * from the value used to set comm->cellsize_min, so check it.
++         */
++        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
++
++        if (comm->bPMELoadBalDLBLimits)
++        {
++            /* Check for the cut-off limit set by the PME load balancing */
++            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
++        }
 +    }
 +
 +    return cellsize_min;
 +}
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cg's only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
 +        }
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
 +
 +static gmx_bool check_grid_jump(gmx_large_int_t step,
 +                                gmx_domdec_t   *dd,
 +                                real            cutoff,
 +                                gmx_ddbox_t    *ddbox,
 +                                gmx_bool        bFatal)
 +{   /* Returns TRUE when, for any DD dimension index d >= 1, the cell
 +     * boundary fractions are closer than grid_jump_limit() allows;
 +     * with bFatal set, gmx_fatal() is called for the first violation.
 +     * step is only used in the error message.
 +     */
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim;
 +    real               limit, bfac;
 +    gmx_bool           bInvalid;
 +
 +    bInvalid = FALSE;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim   = dd->dim[d];
 +        limit = grid_jump_limit(comm, cutoff, d);
 +        bfac  = ddbox->box_size[dim]; /* scales the fractions to lengths */
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +                                                              (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            bInvalid = TRUE;
 +
 +            if (bFatal)
 +            {
 +                char buf[22];
 +
 +                /* This error should never be triggered under normal
 +                 * circumstances, but you never know ...
 +                 */
 +                gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
 +                          gmx_step_str(step, buf),
 +                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +            }
 +        }
 +    }
 +
 +    return bInvalid;
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    }
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other soures,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +    }
 +
 +    return load;
 +}
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    comm = dd->comm;
 +
 +    snew(*dim_f, dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for (i = 1; i < dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
 +
 +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 +{   /* Initializes ddpme for PME decomposition index dimind: sets dim,
 +     * dim_match and nslab and returns early when nslab <= 1. Otherwise
 +     * it allocates and fills pp_min/pp_max, the range of DD cell indices
 +     * that map to each PME slab, and sets up slb_dim_f through
 +     * set_slb_pme_dim_f().
 +     */
 +    int  pmeindex, slab, nso, i;
 +    ivec xyz;
 +
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab; /* PME nodes per slab */
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min, ddpme->nslab);
 +    snew(ddpme->pp_max, ddpme->nslab);
 +    for (slab = 0; slab < ddpme->nslab; slab++)
 +    {   /* Start from an inverted range so the min/max below can shrink it */
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ddindex2xyz(dd->nc, i, xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX])
 +        {
 +            pmeindex = ddindex2pmeindex(dd, i);
 +            if (dimind == 0)
 +            {
 +                slab = pmeindex/nso;
 +            }
 +            else
 +            {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
 +{   /* Sets ddpme->maxshift: ns/2 when the PP decomposition does not match
 +     * ddpme->dim, 1 when ns <= 3 or the grid is uniform with ns == nc,
 +     * and otherwise the smallest shift, starting from 1 and kept below
 +     * ns, for which none of the tests on cell_f below still holds.
 +     * cell_f is indexed with the DD cell indices from pp_min/pp_max.
 +     */
 +    gmx_domdec_comm_t *comm;
 +    int                nc, ns, s;
 +    int               *xmin, *xmax;
 +    real               range, pme_boundary;
 +    int                sh;
 +
 +    comm = dd->comm;
 +    nc   = dd->nc[ddpme->dim];
 +    ns   = ddpme->nslab;
 +
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* We need to check for all pme nodes which nodes they
 +         * could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance
 +         * between performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +
 +        sh = 1;
 +        for (s = 0; s < ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +
 +    ddpme->maxshift = sh;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME slab communication range for dim %d is %d\n",
 +                ddpme->dim, ddpme->maxshift);
 +    }
 +}
 +
 +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d, dim;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                      dd->nc[dim], dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +/* Set the DD cell boundaries for a grid without dynamic load balancing
 + * and determine the number of communication pulses npulse[d] (output)
 + * for each of the DIM dimensions.
 + *
 + * A dimension with a single cell, or without comm->slb_frac[d], gets a
 + * uniform grid; otherwise cell j gets a width of box_size[d]*slb_frac[d][j].
 + * With bMaster set, all dd->nc[d]+1 boundaries are stored in
 + * dd->ma->cell_x[d]; otherwise only the bounds of the cell with index
 + * dd->ci[d] are stored in comm->cell_x0[d] and comm->cell_x1[d].
 + * npulse[d] starts at 1 and is incremented while cellsize*npulse[d] is
 + * smaller than comm->cutoff and npulse[d] is smaller than dd->nc[d]-1.
 + * The smallest cell sizes (multiplied by the skew factor) are copied to
 + * comm->cellsize_min only when comm->bDynLoadBal is not set.
 + * Finally set_pme_maxshift is called for each PME decomposition dimension.
 + */
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                                  gmx_bool bMaster, ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, j;
 +    rvec               cellsize_min;
 +    real              *cell_x, cell_dx, cellsize;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d]; /* start value: the whole box */
 +        npulse[d]       = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            if (bMaster)
 +            {
 +                for (j = 0; j < dd->nc[d]+1; j++)
 +                {
 +                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                }
 +            }
 +            else
 +            {
 +                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution we determine
 +             * all cell borders in a loop to obtain identical values
 +             * to the master distribution case and to determine npulse.
 +             */
 +            if (bMaster)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x, dd->nc[d]+1); /* temporary buffer, freed below */
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for (j = 0; j < dd->nc[d]; j++)
 +            {
 +                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize    = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d], cellsize);
 +            }
 +            if (!bMaster)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation is to avoid that a cell would receive
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Double charge groups cause trouble with the global indices.
 +         * NOTE(review): both loops above stop incrementing npulse[d] at
 +         * dd->nc[d]-1, so with dd->nc[d] > 1 the condition
 +         * npulse[d] >= dd->nc[d] looks unreachable here - confirm.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                                 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
 +                                 comm->cutoff,
 +                                 dd->nc[d], dd->nc[d],
 +                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
 +        }
 +    }
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min, comm->cellsize_min);
 +    }
 +
 +    for (d = 0; d < comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd, &comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]] == NULL, ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +/* Rescale the relative sizes of the cells range[0] up to, but not
 + * including, range[1] along DD dimension index d (Cartesian dimension dim)
 + * and store the resulting relative boundaries in root->cell_f.
 + *
 + * The sizes in root->buf_ncd are scaled such that, together with the cells
 + * that are fixed at the limit, they fill
 + * root->cell_f[range[1]] - root->cell_f[range[0]].
 + * A cell that would become smaller than cellsize_limit_f is set to that
 + * limit (without PBC the first and last cell of the dimension use a limit
 + * of 0) and the scaling is repeated until no additional cell hits the limit.
 + * With PBC a fatal error is issued when the last cell of the range ends up
 + * below the, slightly relaxed, limit.
 + * When bUniform is not set, a boundary is not allowed to move further than
 + * halfway into either of the old cells it separates.
 + * For d > 0 the staggering is handled: with bUniform
 + * root->cell_f_max0/cell_f_min1 are set from the new boundaries, otherwise
 + * boundaries outside root->bound_min/bound_max are set to the violated
 + * bound (or to the midpoint when both are violated) and this function
 + * is called recursively for the resulting sub-ranges.
 + * root->bLimited is set when a cell hit the limit or when the range
 + * does not span all ncd cells.
 + */
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                                  int d, int dim, gmx_domdec_root_t *root,
 +                                                  gmx_ddbox_t *ddbox,
 +                                                  gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, i, j, nmin, nmin_old;
 +    gmx_bool           bLimLo, bLimHi;
 +    real              *cell_size;
 +    real               fac, halfway, cellsize_limit_f_i, region_size;
 +    gmx_bool           bPBC, bLastHi = FALSE;
 +    int                nrange[] = {range[0], range[1]};
 +
 +    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
 +    for (i = range[0]; i < range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting the cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i]      = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +
 +    i            = range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get us below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step, buf),
 +                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                  ncd, comm->cellsize_min[dim]);
 +    }
 +
 +    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
 +
 +    if (!bUniform)
 +    {
 +        /* Check if the boundary did not displace more than halfway
 +         * each of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for (i = range[0]+1; i < range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i+1; j < range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
 +                    }
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the previous boundaries */
 +                for (j = i-1; j >= range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
 +    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
 +     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
 +     * for a and b nrange is used */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for (i = range[0]; i < range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for (i = range[0]+1; i < range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]       = range[0];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with a higher index */
 +                    bLastHi   = FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi = TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0] = nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* found last a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0] = nrange[1];
 +                nrange[1] = range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine new relative cell boundaries along DD dimension index d
 + * (Cartesian dimension dim) with dynamic load balancing; this is the
 + * routine called for the root of a row of cells.
 + *
 + * The current boundaries are first saved in root->old_cell_f.
 + * With bUniform all ncd cells get a relative size of 1/ncd. Otherwise,
 + * when dd_load_count(comm) is non-zero, the size of each cell is
 + * multiplied by (1 - sc*imbalance), where imbalance is the deviation of
 + * the cell load from the average load relative to that average and sc is
 + * the underrelaxation factor relax, reduced such that the largest change
 + * does not exceed comm->dlb_scale_lim percent. When neither applies,
 + * root->buf_ncd is not modified here.
 + * For d > 0 without bUniform the staggering bounds root->bound_min and
 + * root->bound_max are set before the limits are enforced over all cells
 + * by dd_cell_sizes_dlb_root_enforce_limits.
 + * After the ncd+1 boundaries, root->cell_f also receives the cell
 + * fractions comm->cell_f0/cell_f1 of the lower DD dimensions and
 + * the PME maxshift value(s).
 + */
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d, int dim, gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform, gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, d1, i, j, pos;
 +    real              *cell_size;
 +    real               load_aver, load_i, imbalance, change, change_max, sc;
 +    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
 +    real               change_limit;
 +    real               relax = 0.5;
 +    gmx_bool           bPBC;
 +    int                range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for (i = 0; i < ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform)
 +    {
 +        for (i = 0; i < ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver  = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change     = -relax*imbalance;
 +            change_max = max(change_max, max(change, -change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using the limited
 +             * underrelaxation factor sc.
 +             */
 +            change       = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +
 +    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for (i = 1; i < ncd; i++)
 +        {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0)
 +            {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0)
 +            {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d, i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]          = ncd; /* together with range[0] = 0: all cells */
 +    root->cell_f[0]   = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     * A violation only results in a warning on stderr.
 +     */
 +    for (i = 0; i < ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim, i, root->cell_f[i], root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step, buf), dim2char(dim), i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for (d1 = 0; d1 < d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift; /* integer stored in the real buffer */
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox, int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim                = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d, int dim, real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d1, dim1, pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
 +              0, comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for (d1 = 0; d1 <= d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd, ddbox, d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform, gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, d1;
 +    gmx_bool           bRowMember, bRowRoot;
 +    real              *cell_f_row;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim        = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot   = TRUE;
 +        for (d1 = d; d1 < dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 > d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
 +                                           ddbox, bDynamicBox, bUniform, step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd, ddbox, d);
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle, ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
 +        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
 +    }
 +
 +    /* Set the dimensions for which no DD is used */
 +    for (dim = 0; dim < DIM; dim++)
 +    {
 +        if (dd->nc[dim] == 1)
 +        {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
 +{
 +    int                    d, np, i;
 +    gmx_domdec_comm_dim_t *cd;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]), np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
 +            }
 +            srenew(cd->ind, np);
 +            for (i = cd->np_nalloc; i < np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               npulse;
 +
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0, comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1, comm->old_cell_x1);
 +
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd, ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
 +        realloc_comm_ind(dd, npulse);
 +    }
 +
 +    if (debug)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
 +                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0, rvec cell_ns_x1,
 +                                  gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim_ind, dim;
 +
 +    comm = dd->comm;
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim &&
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step, buf), dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +    }
 +
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +/* Assign all charge groups in cgs to DD cells, using the cell boundaries
 + * in dd->ma->cell_x.
 + *
 + * For each charge group the center of geometry is computed from pos.
 + * For the dimensions below dd->npbcdim the center, and the positions in
 + * pos of all atoms of the group, are shifted by box vectors until the
 + * coordinate used for the cell lookup lies in [0, box[d][d]); with
 + * dd->bScrewPBC a shift along x also mirrors the y and z coordinates.
 + * Note that pos is therefore modified by this function.
 + * The results are stored in the master struct: ma->ncg and ma->nat hold
 + * the number of charge groups and atoms per node, ma->cg the charge group
 + * indices ordered by node and ma->index the start of each node in ma->cg.
 + * The charge group counts are written to fplog when it is not NULL.
 + */
 +static void distribute_cg(FILE *fplog, gmx_large_int_t step,
 +                          matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                **tmp_ind = NULL, *tmp_nalloc = NULL;
 +    int                  i, icg, j, k, k0, k1, d, npbcdim;
 +    matrix               tcm;
 +    rvec                 box_size, cg_cm;
 +    ivec                 ind;
 +    real                 nrcg, inv_ncg, pos_d;
 +    atom_id             *cgindex;
 +    gmx_bool             bUnbounded, bScrew;
 +
 +    ma = dd->ma;
 +    /* tmp_ind is initialized to NULL above, so this branch is always taken */
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc, dd->nnodes);
 +        snew(tmp_ind, dd->nnodes);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +    }
 +
 +    /* Clear the count */
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +
 +    make_tric_corr_matrix(dd->npbcdim, box, tcm);
 +
 +    cgindex = cgs->index;
 +
 +    /* Compute the center of geometry for all charge groups */
 +    for (icg = 0; icg < cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0], cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cg_cm);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cg_cm, pos[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index.
 +         * The dimensions are handled from high to low, since a shift along
 +         * box vector d can change the components below d as well.
 +         */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordinates for this dimension */
 +                    for (j = d+1; j < DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while (pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while (pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc, ind);
 +        if (ma->ncg[i] == tmp_nalloc[i]) /* the list of node i is full */
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +
 +    k1 = 0; /* from here on used as the running position in ma->cg */
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for (k = 0; k < ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog, "Charge group distribution at step %s:",
 +                gmx_step_str(step, buf));
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            fprintf(fplog, " %d", ma->ncg[i]);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
 +                                t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    ivec                 npulse;
 +    int                  i, cg_gl;
 +    int                 *ibuf, buf2[2] = { 0, 0 };
 +    gmx_bool             bMaster = DDMASTER(dd);
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +
 +        set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
 +
 +        distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
 +
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl, dd->cg_nalloc);
 +        srenew(dd->cgindex, dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int), dd->index_gl);
 +
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for (i = 0; i < dd->ncg_home; i++)
 +    {
 +        cg_gl            = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Home charge groups:\n");
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fprintf(debug, " %d", dd->index_gl[i]);
 +            if (i % 10 == 9)
 +            {
 +                fprintf(debug, "\n");
 +            }
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, int vec,
 +                                   rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for (i = i0; i < i1; i++)
 +                {
 +                    copy_rvec(src[i], src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg        = i1 - i0;
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for (i = i0; i < i1; i++)
 +            {
 +                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg], src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_ind(int ncg, int *move,
 +                       int *index_gl, int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la, char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg, nat, a0, a1, a, a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat      = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for (a = a0; a < a1; a++)
 +            {
 +                a_gl          = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la, a_gl, nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg, int *move,
 +                               int *index_gl, int *cgindex, int *gatindex,
 +                               gmx_ga2la_t ga2la, char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg, a0, a1, a;
 +
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1. fill_grid will change it
 +             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char               buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
 +    }
 +    fprintf(fplog, "distance out of cell %f\n",
 +            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX], cm_old[YY], cm_old[ZZ]);
 +    }
 +    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX], cm_new[YY], cm_new[ZZ]);
 +    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
 +    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim], comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd, step, cg, dim, dir,
 +                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    }
 +    print_cg_move(stderr, dd, step, cg, dim, dir,
 +                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
 +static void rotate_state_atom(t_state *state, int a)
 +{
 +    int est;
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    /* Rotate the complete state; for a rectangular box only */
 +                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                    break;
 +                case estV:
 +                    state->v[a][YY] = -state->v[a][YY];
 +                    state->v[a][ZZ] = -state->v[a][ZZ];
 +                    break;
 +                case estSDX:
 +                    state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                    break;
 +                case estCGP:
 +                    state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* These are distances, so not affected by rotation */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
 +            }
 +        }
 +    }
 +}
 +
 +static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
 +{
 +    if (natoms > comm->moved_nalloc)
 +    {
 +        /* Contents should be preserved here */
 +        comm->moved_nalloc = over_alloc_dd(natoms);
 +        srenew(comm->moved, comm->moved_nalloc);
 +    }
 +
 +    return comm->moved;
 +}
 +
 +static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
 +                         gmx_domdec_t *dd,
 +                         t_state *state,
 +                         ivec tric_dir, matrix tcm,
 +                         rvec cell_x0, rvec cell_x1,
 +                         rvec limitd, rvec limit0, rvec limit1,
 +                         const int *cgindex,
 +                         int cg_start, int cg_end,
 +                         rvec *cg_cm,
 +                         int *move)
 +{
 +    int      npbcdim;
 +    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int      flag;
 +    gmx_bool bScrew;
 +    ivec     dev;
 +    real     inv_ncg, pos_d;
 +    rvec     cm_new;
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (cg = cg_start; cg < cg_end; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0], cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cm_new);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cm_new, state->x[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for (d2 = d+1; d2 < DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_dec(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_inc(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(state->x[k], state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(state->x[k], state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +
 +        copy_rvec(cm_new, cg_cm[cg]);
 +
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc   = -1;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1)
 +                {
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        /* Temporarily store the flag in move */
 +        move[cg] = mc + flag;
 +    }
 +}
 +
 +static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
 +                               gmx_domdec_t *dd, ivec tric_dir,
 +                               t_state *state, rvec **f,
 +                               t_forcerec *fr, t_mdatoms *md,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{
 +    int               *move;
 +    int                npbcdim;
 +    int                ncg[DIM*2], nat[DIM*2];
 +    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int                sbuf[2], rbuf[2];
 +    int                home_pos_cg, home_pos_at, buf_pos;
 +    int                flag;
 +    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
 +    gmx_bool           bScrew;
 +    ivec               dev;
 +    real               inv_ncg, pos_d;
 +    matrix             tcm;
 +    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
 +    atom_id           *cgindex;
 +    cginfo_mb_t       *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int               *moved;
 +    int                nthread, thread;
 +
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +                case estX: /* Always present */ break;
 +                case estV:   bV   = (state->flags & (1<<i)); break;
 +                case estSDX: bSDX = (state->flags & (1<<i)); break;
 +                case estCGP: bCGP = (state->flags & (1<<i)); break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No processing required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int, comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +
 +    /* Clear the count */
 +    for (c = 0; c < dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (d = 0; (d < DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +
 +    make_tric_corr_matrix(npbcdim, state->box, tcm);
 +
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
 +                     cell_x0, cell_x1, limitd, limit0, limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for (cg = 0; cg < dd->ncg_home; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc       = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG;
 +            mc       = mc & DD_FLAG_NRCG;
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +
 +    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for (i = 0; i < dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +
 +    /* Make sure the communication buffers are large enough */
 +    for (mc = 0; mc < dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            /* Recalculating cg_cm might be cheaper than communicating,
 +             * but that could give rise to rounding issues.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, cg_cm, comm, bCompact);
 +            break;
 +        case ecutsVERLET:
 +            /* Without charge groups we send the moved atom coordinates
 +             * over twice. This is so the code below can be used without
 +             * many conditionals for both for with and without charge groups.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, state->x, comm, FALSE);
 +            if (bCompact)
 +            {
 +                home_pos_cg -= *ncg_moved;
 +            }
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            home_pos_cg = 0;
 +    }
 +
 +    vec         = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->x, comm, bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->v, comm, bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->sd_X, comm, bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->cg_p, comm, bCompact);
 +    }
 +
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home, move,
 +                    dd->index_gl, dd->cgindex, dd->gatindex,
 +                    dd->ga2la, comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm, dd->ncg_home);
 +
 +            for (k = 0; k < dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home, move,
 +                           dd->index_gl, dd->cgindex, dd->gatindex,
 +                           dd->ga2la, comm->bLocalCG,
 +                           moved);
 +    }
 +
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
 +        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d, dir, sbuf[0], sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int, comm->nalloc_int);
 +            }
 +
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf, nvr+i);
 +
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for (cg = 0; cg < ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We do a separate check if a charge group didn't move too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog, dd, step, cg, dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                  FALSE, 0,
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for (d3 = dim2+1; d3 < DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check if we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but this check could move over pbc while
 +                             * the first step did not due to different rounding.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl, dd->cg_nalloc);
 +                    srenew(dd->cgindex, dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm;
 +                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++;
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state, f, home_pos_at+nrcg);
 +                }
 +                for (i = 0; i < nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary  */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm, home_pos_cg);
 +
 +        for (i = dd->ncg_home; i < home_pos_cg; i++)
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved, dd->ncg_home-*ncg_moved);
 +
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int         i;
 +    double      sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we half the count
 +         * for the normal loops and again half it for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +
 +    for (i = 0; i < ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i]     = 0;
 +        dd->comm->cycl_n[i]   = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop   = 0;
 +    dd->comm->flop_n = 0;
 +}
 +/* Collect the load measurements of all DD cells and reduce them,
 + * dimension by dimension, towards the DD master.
 + *
 + * The DD dimensions are processed from the last one down to the first.
 + * In the last dimension every rank contributes its own force load
 + * (dd_force_load); in the lower dimensions the participating ranks
 + * forward the reduced result of dimension d+1 (comm->load[d+1]).
 + * The values are packed as floats into sbuf, gathered on rank 0 of
 + * comm->mpi_comm_load[d] and reduced into comm->load[d] by the rank
 + * whose cell index matches dd->master_ci along that dimension.
 + * Finally the DD master adds the result for dimension 0 to its
 + * running totals (load_sum, load_max, load_lim, load_mdf, load_pme).
 + *
 + * The time spent here is accounted to the ewcDDCOMMLOAD wallcycle counter.
 + */
 +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root = NULL;
 +    int                d, dim, cid, i, pos; /* cid is not referenced in this function */
 +    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 +    gmx_bool           bSepPME;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle, ewcDDCOMMLOAD);
 +
 +    comm = dd->comm;
 +
 +    bSepPME = (dd->pme_nodeid >= 0); /* PME load entries are only exchanged when pme_nodeid is set */
 +
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 ||
 +            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            /* Pack the send buffer; the unpacking loop below relies on
 +             * exactly this order of the entries.
 +             */
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                /* Last dimension: our own force load serves as
 +                 * sum, max and (with bGridJump) sum_m entry.
 +                 */
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                /* Lower dimension: forward the reduced load of dimension d+1 */
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags; /* integer flags travel as a float */
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos; /* number of floats contributed per rank */
 +            /* Communicate a row in DD direction d.
 +             * The communicators are setup such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
 +                       load->load, load->nload*sizeof(float), MPI_BYTE,
 +                       0, comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum      = 0;
 +                load->max      = 0;
 +                load->sum_m    = 0;
 +                load->cvol_min = 1;
 +                load->flags    = 0;
 +                load->mdf      = 0;
 +                load->pme      = 0;
 +                pos            = 0;
 +                /* Walk over the gathered entries of all cells in this row,
 +                 * in the same order as they were packed above.
 +                 */
 +                for (i = 0; i < dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max  = max(load->max, load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        /* NOTE(review): root is only assigned above when
 +                         * comm->bDynLoadBal is set; this dereference presumably
 +                         * relies on bGridJump implying bDynLoadBal - confirm.
 +                         */
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum instead of
 +                             * the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m, load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min, load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            /* The flags entry is only sent for the lower
 +                             * dimensions; round the float back to an integer.
 +                             */
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf, load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme, load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    /* sum_m holds a maximum here: scale it by the number of
 +                     * cells and mark dimension d in the flags bitmask.
 +                     */
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        /* Accumulate the totals that are read by the averaged load report */
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    char               buf[STRLEN];
 +    int                npp, npme, nnodes, d, limp;
 +    float              imbal, pme_f_ratio, lossf, lossp = 0;
 +    gmx_bool           bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal  = comm->load_max*npp/comm->load_sum - 1;
 +        lossf  = dd_force_imb_perf_loss(dd);
 +        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "\n");
 +        fprintf(stderr, "%s", buf);
 +        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "%s", buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf), "\n");
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(stderr, "\n");
 +
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n", lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
 +{
 +    int  flags, d;
 +    char buf[22];
 +
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog, " %c", dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog, "  vol min/aver %5.3f%c",
 +                dd_vol_min(dd), flags ? '!' : ' ');
 +    }
 +    fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog, "\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr, "vol %4.2f%c ",
 +                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 +{
 +    MPI_Comm           c_row;
 +    int                dim, i, rank;
 +    ivec               loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool           bPartOfGroup = FALSE;
 +
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc, loc_c);
 +    for (i = 0; i < dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank       = dd_index(dd->nc, loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind], 1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
 +                snew(root->old_cell_f, dd->nc[dim]+1);
 +                snew(root->bCellMin, dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0, dd->nc[dim]);
 +                    snew(root->cell_f_min1, dd->nc[dim]);
 +                    snew(root->bound_min, dd->nc[dim]);
 +                    snew(root->bound_max, dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd, dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
 +
 +static void make_load_communicators(gmx_domdec_t *dd)
 +{
 +#ifdef GMX_MPI
 +    int  dim0, dim1, i, j;
 +    ivec loc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Making load communicators\n");
 +    }
 +
 +    snew(dd->comm->load, dd->ndim);
 +    snew(dd->comm->mpi_comm_load, dd->ndim);
 +
 +    clear_ivec(loc);
 +    make_load_communicator(dd, 0, loc);
 +    if (dd->ndim > 1)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            make_load_communicator(dd, 1, loc);
 +        }
 +    }
 +    if (dd->ndim > 2)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            dim1      = dd->dim[1];
 +            for (j = 0; j < dd->nc[dim1]; j++)
 +            {
 +                loc[dim1] = j;
 +                make_load_communicator(dd, 2, loc);
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished making load communicators\n");
 +    }
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    gmx_bool                bZYX;
 +    int                     d, dim, i, j, m;
 +    ivec                    tmp, s;
 +    int                     nzone, nzonep;
 +    ivec                    dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t     *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
 +        if (debug)
 +        {
 +            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank, dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
 +                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +        case 3:
 +            nzone  = dd_z3n;
 +            nzonep = dd_zp3n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp3[i], dd_zp[i]);
 +            }
 +            break;
 +        case 2:
 +            nzone  = dd_z2n;
 +            nzonep = dd_zp2n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp2[i], dd_zp[i]);
 +            }
 +            break;
 +        case 1:
 +            nzone  = dd_z1n;
 +            nzonep = dd_zp1n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp1[i], dd_zp[i]);
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
 +            nzone  = 0;
 +            nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for (i = 0; i < nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +
 +    zones->n = nzone;
 +    for (i = 0; i < nzone; i++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for (i = 0; i < zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
 +        }
 +        izone     = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                   izone->shift0[d] = 0;
 +                   izone->shift1[d] = 0;
 +                   for(j=izone->j0; j<izone->j1; j++) {
 +                   if (dd->shift[j][d] > dd->shift[i][d])
 +                   izone->shift0[d] = -1;
 +                   if (dd->shift[j][d] < dd->shift[i][d])
 +                   izone->shift1[d] = 1;
 +                   }
 +                 */
 +
 +                int shift_diff;
 +
 +                /* Assume the shift are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for (j = izone->j0; j < izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root, dd->ndim);
 +    }
 +
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +/* Set up the communicator, rank and cell coordinates of the
 + * particle-particle (PP) ranks and determine the DD master rank.
 + *
 + * With comm->bCartesianPP a Cartesian communicator of dd->nc cells,
 + * periodic in all dimensions, replaces cr->mpi_comm_mygroup.
 + * dd->mpi_comm_all and dd->rank are then taken from cr->mpi_comm_mygroup.
 + * Depending on bCartesianPP_PME and bCartesianPP either the
 + * ddindex2ddnodeid table or the ddindex2simnodeid table is built;
 + * without Cartesian communicators the rank itself is used as DD index
 + * and the master is rank 0 at cell 0,0,0.
 + * All of this is compiled only with GMX_MPI; the DD rank and cell
 + * coordinates are reported to fplog and debug in any case.
 + */
 +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank, *buf;
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE; /* the grid is periodic in every dimension */
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
 +
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original cartesian setup for sim,
 +         * and not the one after split, we need to make an index.
 +         * Each rank fills in only its own entry, the sum over all ranks
 +         * then gives the complete table.
 +         */
 +        snew(comm->ddindex2ddnodeid, dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         * Only the master contributes a non-zero value to the sum.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +
 +        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
 +
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes); /* presumably zero-initialized by snew - TODO confirm */
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         * It is located as the DD index that maps to simulation nodeid 0.
 +         */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc, i, dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "The master rank is %d\n", dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->comm->all as DD index */
 +        ddindex2xyz(dd->nc, dd->rank, dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t      *dd;
 +
 +    gmx_domdec_comm_t *comm;
 +    int               *buf;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +#ifdef GMX_MPI
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg, int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  i;
 +
 +    snew(ma, 1);
 +
 +    snew(ma->ncg, dd->nnodes);
 +    snew(ma->index, dd->nnodes+1);
 +    snew(ma->cg, ncg);
 +    snew(ma->nat, dd->nnodes);
 +    snew(ma->ibuf, dd->nnodes*2);
 +    snew(ma->cell_x, DIM);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        snew(ma->cell_x[i], dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf, natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +/* Assigns this rank a PP or PME duty (cr->duty) and splits
 + * cr->mpi_comm_mysim into cr->mpi_comm_mygroup accordingly.
 + *
 + * With comm->bCartesianPP set, first checks whether the PME-only nodes
 + * can be stacked along y or z in one Cartesian communicator; if so
 + * comm->bCartesianPP_PME is set and comm->ntot is extended in that
 + * dimension. Otherwise the node order follows dd_node_order.
 + * reorder is passed on to MPI_Cart_create.
 + * All MPI work is compiled only with GMX_MPI.
 + */
 +static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
 +                               int reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank;
 +    gmx_bool           bDiv[DIM];
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (comm->bCartesianPP)
 +    {
 +        /* Only y and z are candidates; bDiv[XX] is never set nor read */
 +        for (i = 1; i < DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest slab
 +             * of PME only nodes as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            /* Extend the total grid by the number of PME-only layers */
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
 +        }
 +
 +        /* The Cartesian grid is periodic in all dimensions */
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
 +                        &comm_cart);
 +
 +        MPI_Comm_rank(comm_cart, &rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +
 +        /* With this assignment we lose the link to the original communicator
 +         * which will usually be MPI_COMM_WORLD, unless we have multisim.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid     = rank;
 +
 +        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
 +
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +
 +        /* Ranks inside the DD grid along cartpmedim do PP,
 +         * ranks beyond it do PME.
 +         */
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        /* NOTE(review): the npmenodes == 0 test would overwrite DUTY_PP;
 +         * the only caller visible here invokes this function with
 +         * npmenodes > 0 - confirm there are no other callers.
 +         */
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot, dd->ci),
 +                       &cr->mpi_comm_mygroup);
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +            case ddnoPP_PME:
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Order of the nodes: PP first, PME last\n");
 +                }
 +                break;
 +            case ddnoINTERLEAVE:
 +                /* Interleave the PP-only and PME-only nodes,
 +                 * as on clusters with dual-core machines this will double
 +                 * the communication bandwidth of the PME processes
 +                 * and thus speed up the PP <-> PME and inter PME communication.
 +                 */
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Interleaving PP and PME nodes\n");
 +                }
 +                comm->pmenodes = dd_pmenodes(cr);
 +                break;
 +            case ddnoCARTESIAN:
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
 +        }
 +
 +        /* A rank for which no PME node is found (-1) is itself a PME node */
 +        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        /* From here on cr->nodeid is the rank within the own group */
 +        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "This is a %s only node\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
 +
 +/* Sets up the communicators for domain decomposition on this rank.
 + *
 + * Splits off separate PME nodes when cr->npmenodes > 0, builds the PP
 + * communicator on PP ranks (other ranks only receive the
 + * ddindex -> sim nodeid table), determines the PME partner of PP-only
 + * ranks and allocates the master bookkeeping on the DD master.
 + * dd_node_order selects the node ordering (ddno* values).
 + */
 +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                CartReorder;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    /* Start from the PP grid; split_communicator may extend ntot */
 +    copy_ivec(dd->nc, comm->ntot);
 +
 +    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures,
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog, cr, dd_node_order, CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog, cr, CartReorder);
 +    }
 +    else
 +    {
 +        /* Ranks without PP duty only need the index table */
 +        receive_ddindex2simnodeid(cr);
 +    }
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the communication to our PME node */
 +        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid, dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        /* This rank does PME itself: no separate PME partner */
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        /* Sized by the global charge group count and the last index entry */
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
 +{
 +    real  *slb_frac, tot;
 +    int    i, n;
 +    double dbl;
 +
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac, nc);
 +        tot = 0;
 +        for (i = 0; i < nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string, "%lf%n", &dbl, &n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
 +            }
 +            slb_frac[i]  = dbl;
 +            size_string += n;
 +            tot         += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Relative cell sizes:");
 +        }
 +        for (i = 0; i < nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog, " %5.3f", slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\n");
 +        }
 +    }
 +
 +    return slb_frac;
 +}
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int                  n, nmol, ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist             *il;
 +
 +    n     = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +    }
 +
 +    return n;
 +}
 +
 +static int dd_nst_env(FILE *fplog, const char *env_var, int def)
 +{
 +    char *val;
 +    int   nst;
 +
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val, "%d", &nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
 +                    env_var, val, nst);
 +        }
 +    }
 +
 +    return nst;
 +}
 +
 +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\n%s\n", warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\n%s\n", warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
 +                                  t_inputrec *ir, FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int  di, d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for (di = 0; di < dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog, t_commrec *cr,
 +                             const char *dlb_opt, gmx_bool bRecordLoad,
 +                             unsigned long Flags, t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int           eDLB = -1;
 +    char          buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +        case 'a': eDLB = edlbAUTO; break;
 +        case 'n': eDLB = edlbNO;   break;
 +        case 'y': eDLB = edlbYES;  break;
 +        default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
 +            dd_warning(cr, fplog, buf);
 +        }
 +
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +            case edlbNO:
 +                break;
 +            case edlbAUTO:
 +                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                eDLB = edlbNO;
 +                break;
 +            case edlbYES:
 +                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
 +                break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using domain decomposition order z, y, x\n");
 +        }
 +        for (dim = DIM-1; dim >= 0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    snew(comm, 1);
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +    for (i = 0; i < DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for (i = 0; i < ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min, real rconstr,
 +                                        const char *dlb_opt, real dlb_scale,
 +                                        const char *sizex, const char *sizey, const char *sizez,
 +                                        gmx_mtop_t *mtop, t_inputrec *ir,
 +                                        matrix box, rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x, int *npme_y)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                recload;
 +    int                d, i, j;
 +    real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
 +    gmx_bool           bC;
 +    char               buf[STRLEN];
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
 +    }
 +
 +    snew(dd, 1);
 +
 +    dd->comm = init_dd_comm();
 +    comm     = dd->comm;
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +
 +    dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
 +    comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
 +    comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
 +    recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
 +    comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
 +    comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
 +    comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
 +    comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf   = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +
 +    }
 +
 +    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 +
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump              = comm->bDynLoadBal;
 +    comm->bPMELoadBalDLBLimits = FALSE;
 +
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog, "Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort, 1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use sqrt, since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm      = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit     = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
 +                                      Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b), &r_2b, cr);
 +            gmx_bcast(sizeof(r_mb), &r_mb, cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b, r_mb) > comm->cutoff)
 +                {
 +                    r_bonded        = max(r_2b, r_mb);
 +                    r_bonded_limit  = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b, r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog, mtop, ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc, dd->nc);
 +        set_dd_dim(fplog, dd);
 +        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd, ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs, comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
 +                               comm->eDLB != edlbNO, dlb_scale,
 +                               comm->cellsize_limit, comm->cutoff,
 +                               comm->bInterCGBondeds, comm->bInterCGMultiBody);
 +
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB != edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes, limit, buf);
 +        }
 +        set_dd_dim(fplog, dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
 +    }
 +
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x, comm->npmenodes_y, 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +
 +    /* Technically we don't need both of these,
 +     * but it simplifies code not having to recalculate it.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +
 +    snew(comm->slb_frac, DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs                = average_cellsize_min(dd, ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't lose a lot of efficiency
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm, comm->cellsize_limit);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr, dd, ir, fplog);
 +    }
 +
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count        = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +
 +{
 +    int d;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    real               cellsize_min;
 +    int                d, nc, i;
 +    char               buf[STRLEN];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump     = TRUE;
 +
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for (i = 0; i < nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int   ncg, cg;
 +    char *bLocalCG;
 +
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG, ncg);
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite, gmx_constr_t constr,
 +                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bBondComm;
 +    int                d;
 +
 +    dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +/* Write a summary of the domain decomposition settings to fplog.
 + * Nothing is done when fplog is NULL.
 + *
 + * With bDynLoadBal set, the DLB pulse counts, the cell size limit,
 + * the requested dlb_scale and the allowed cell shrink per dimension
 + * are printed; otherwise the initial pulse counts and cell sizes.
 + * When inter charge-group bondeds, virtual site communication or
 + * constraint communication is in use, the maximum allowed distances
 + * for the different interaction types are printed as well.
 + */
 +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal, real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               np;
 +    real               limit, shrink;
 +    char               buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog, "The maximum number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
 +        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
 +        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
 +        /* Only dimensions that are actually decomposed are reported */
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                /* NOTE(review): d >= npbcdim presumably means a non-periodic
 +                 * dimension; with two cells there a shrink of 0 is reported
 +                 * - confirm the reasoning with the DLB cell size code.
 +                 */
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    /* DLB minimum cell size relative to the uniform cell size */
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    else
 +    {
 +        /* np is read below, so this call is expected to fill it.
 +         * NOTE(review): the call presumably also (re)sets the static
 +         * cell sizes stored in dd - confirm in set_dd_cell_sizes_slb.
 +         */
 +        set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
 +        fprintf(fplog, "The initial number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The initial domain decomposition cell size is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog, " %c %.2f nm",
 +                        dim2char(d), dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog, "\n\n");
 +    }
 +
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions", "", comm->cutoff);
 +
 +        /* limit is the distance reported for multi-body bondeds (capped
 +         * by the cut-off), virtual sites and constraints below.
 +         */
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox, ir))
 +            {
 +                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            /* Without DLB use the smallest minimum cell size over x, y, z */
 +            limit = dd->comm->cellsize_min[XX];
 +            for (d = 1; d < DIM; d++)
 +            {
 +                limit = min(limit, dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions", "(-rdd)",
 +                    max(comm->cutoff, comm->cutoff_mbody));
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions", "(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions", "(-rcon)", limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            /* The label is built separately as it contains a number */
 +            sprintf(buf, "atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    buf, "(-rcon)", limit);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +
 +    fflush(fplog);
 +}
 +
 +/* Determine the communication pulse counts and the minimum cell sizes
 + * that apply when dynamic load balancing (DLB) is used.
 + *
 + * Sets comm->cellsize_limit, comm->maxpulse, comm->bVacDLBNoLimit and,
 + * for each DD dimension, cd[d].np_dlb, cd[d].np_nalloc, cd[d].ind and
 + * cellsize_min_dlb. comm->cutoff_mbody is set when it was not positive.
 + * When DLB is already active (comm->bDynLoadBal), the new limits are
 + * applied right away through set_dlb_limits().
 + *
 + * dlb_scale is used to see if fewer pulses suffice; ir supplies the
 + * cut-off radii and the pbc type; ddbox supplies box size and skew factors.
 + */
 +static void set_cell_limits_dlb(gmx_domdec_t      *dd,
 +                                real               dlb_scale,
 +                                const t_inputrec  *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, npulse, npulse_d_max, npulse_d;
 +    gmx_bool           bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    /* A zero rvdw or rcoulomb is treated as having no cut-off */
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we can not miss interactions due to this rounding.
 +         */
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
 +        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim      = dd->dim[d];
 +            /* Pulses needed when cells shrink to dlb_scale times
 +             * the uniform cell size along this dimension.
 +             */
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max, npulse_d);
 +        }
 +        npulse = min(npulse, npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse       = 1;
 +    /* Starts as TRUE only without pbc; cleared below as soon as one
 +     * dimension uses fewer pulses than its number of cells minus one.
 +     */
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        /* Never use more pulses than there are other cells along d */
 +        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        /* maxpulse pulses of cells of this size must span the cut-off */
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            /* With fewer pulses along d the cells need to be larger */
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    /* No multi-body cut-off set yet: derive it from the limits above */
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
 +    }
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX] > 1 &&
 +              dd->nc[YY] > 1 &&
 +              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
 +}
 +
 +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
 +                       t_inputrec *ir, t_forcerec *fr,
 +                       gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                natoms_tot;
 +    real               vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This can not be done in init_domain_decomposition,
 +     * as the numbers of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth, comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        init_ddpme(dd, &comm->ddpme[0], 0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd, &comm->ddpme[1], 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
 +    }
 +
 +    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
 +    }
 +
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +
 +    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
 +}
 +
 +/* Check whether the requested cut-off cutoff_req can be used with the
 + * current domain decomposition, using the box and coordinates in state.
 + *
 + * Returns FALSE when, for a dimension handled under the DLB limits,
 + * more pulses would be needed than cd[d].np_dlb allows, or (with DLB
 + * not set to "no") when the local-limit flags summed by gmx_sumi() are
 + * positive, i.e. a local cell is too small or check_grid_jump() reports
 + * a problem. Returns TRUE otherwise. Nothing in dd->comm is modified
 + * directly by this function.
 + */
 +static gmx_bool test_dd_cutoff(t_commrec *cr,
 +                               t_state *state, t_inputrec *ir,
 +                               real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t   ddbox;
 +    int           d, dim, np;
 +    real          inv_cell_size;
 +    int           LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd, FALSE, cr, ir, state->box,
 +              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
 +
 +    /* Kept as an int rather than a boolean, as it is summed below */
 +    LocallyLimited = 0;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        /* Inverse of the uniform cell size, including safety margins */
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox, ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
 +        /* Number of pulses needed to cover cutoff_req along this dimension */
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                /* NOTE(review): this returns before the gmx_sumi call
 +                 * below; presumably np is identical on all ranks so
 +                 * that all ranks return here together - confirm.
 +                 */
 +                return FALSE;
 +            }
 +
 +            /* If a current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        /* If DLB is not active yet, we don't need to check the grid jumps.
 +         * Actually we shouldn't, because then the grid jump data is not set.
 +         */
 +        if (dd->comm->bDynLoadBal &&
 +            check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
 +        {
 +            LocallyLimited = 1;
 +        }
 +
 +        /* Combine the local flags; presumably a sum over all ranks in cr */
 +        gmx_sumi(1, &LocallyLimited, cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
 +                          real cutoff_req)
 +{
 +    gmx_bool bCutoffAllowed;
 +
 +    bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
 +
 +    if (bCutoffAllowed)
 +    {
 +        cr->dd->comm->cutoff = cutoff_req;
 +    }
 +
 +    return bCutoffAllowed;
 +}
 +
 +void change_dd_dlb_cutoff_limit(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = cr->dd->comm;
 +
 +    /* Turn on the DLB limiting (might have been on already) */
 +    comm->bPMELoadBalDLBLimits = TRUE;
 +
 +    /* Change the cut-off limit */
 +    comm->PMELoadBal_max_cutoff = comm->cutoff;
 +}
 +
 +/* Merge the charge groups received in communication pulse 'pulse' into
 + * the local arrays, keeping the charge groups ordered per cell.
 + *
 + * In the first pass the charge groups already stored for the cells
 + * ncell..2*ncell-1 (boundaries in ncg_cell) are moved up to make room
 + * for the newly received ones, and the send indices already stored for
 + * the pulses 1..pulse are corrected for this shift. In the second pass
 + * the received global indices (recv_i) and centers (recv_vr) are copied
 + * in, cginfo and cgindex are extended using the atom count of each
 + * charge group, and the cell boundaries in ncg_cell are updated.
 + *
 + * NOTE(review): ind->nrecv[ncell] is presumably the total number of
 + * charge groups received over all ncell cells - confirm at the caller.
 + */
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb, int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind, *ind_p;
 +    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
 +    int               shift, shift_at;
 +
 +    ind = &cd->ind[pulse];
 +
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
 +    /* Run backwards over the cells, so data is never overwritten
 +     * before it has been moved.
 +     */
 +    for (cell = ncell-1; cell >= 0; cell--)
 +    {
 +        /* After this, shift counts what was received for the cells before 'cell' */
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0                = ncg_cell[ncell+cell];
 +            cg1                = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for (cg = cg1-1; cg >= cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift]  = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for (p = 1; p <= pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                /* Find the range of send indices that belongs to this cell */
 +                cg0   = 0;
 +                for (c = 0; c < cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for (cg = cg0; cg < cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift    = 0;
 +    shift_at = 0;
 +    /* From here on cg0 is the read position in the receive buffers */
 +    cg0      = 0;
 +    for (cell = 0; cell < ncell; cell++)
 +    {
 +        /* cg1 is the position where the first new cg of this cell goes */
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for (cg = 0; cg < ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl          = index_gl[cg1];
 +            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
 +            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            /* Atom offset to apply to the old cg's of the following cells */
 +            shift_at += nat;
 +        }
 +        shift                 += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone, int cg0, const int *cgindex)
 +{
 +    int cg, zone, p;
 +
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
 +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
 +{
 +    int      i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain.
 + * The first index of c and the index of bc is the DD dimension index
 + * (0, 1 or 2), not the Cartesian dimension.
 + */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication */
 +    real cr0;       /* corner for rounding, along the first DD dimension */
 +    real cr1[4];    /* corners for rounding, along the second DD dimension */
 +    real bc[DIM];   /* corners for bonded communication */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with.
 + *
 + * Fills c from the local cell boundaries in dd->comm and, with
 + * dd->bGridJump set, from the zone_d1/zone_d2 limits.
 + * dim0 is the Cartesian dimension of the first DD dimension. The values
 + * passed for dim1 and dim2 are not used: they are overwritten from
 + * dd->dim[1] and dd->dim[2] before their first use.
 + * With bDistMB set, the corners for the multi-body bonded distance are
 + * set as well; note that bc[1], bc[2] and bcr1 are only assigned when
 + * dd->bGridJump is set too.
 + * Of cr1 only the elements 0 and 3 are assigned here.
 + */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t  *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i, j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            /* With staggered cells, take the neighbor limit into account */
 +            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            /* Start from the local lower boundary for all four corners */
 +            for (j = 0; j < 4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for (i = 0; i < zones->nizone; i++)
 +                {
 +                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
 +                    {
 +                        /* Only the j-zones 4 and up are used; they are
 +                         * stored at index j-4.
 +                         */
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for (i = 0; i < 2; i++)
 +                    {
 +                        for (j = 0; j < 2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bScrew;
 +    gmx_bool           bDistMB_pulse;
 +    int                cg, i;
 +    real               r2, rb2, r, tric_sh;
 +    rvec               rn, rb;
 +    int                dimd;
 +    int                nsend_z, nsend, nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX);
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +    nsend_z = 0;
 +    nsend   = *nsend_ptr;
 +    nat     = *nat_ptr;
 +
 +    for (cg = cg0; cg < cg1; cg++)
 +    {
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for (i = dim0+1; i < DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2      = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for (i = 1; i <= dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh   = 0;
 +                for (i = dim1+1; i < DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh  = 0;
 +            for (i = dim+1; i < DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb);
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink, index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index, ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf, *ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend]    = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf, nsend+1);
 +
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +/* Set up the halo communication of the domain decomposition.
 + *
 + * For every decomposition dimension and every pulse this routine
 + *  - selects, per zone, the charge groups to send (get_zone_pulse_cgs),
 + *    splitting the work over comm->nth threads and merging the results,
 + *  - exchanges the send/receive counts, the global charge-group indices
 + *    and the cg_cm positions through the dd_sendrecv_* routines,
 + *  - extends index_gl, cgindex, fr->cginfo and zones->cg_range with
 + *    the received charge groups.
 + * At the end dd->index_gl, dd->cgindex, dd->ncg_tot, dd->nat_tot and
 + * comm->nat[] are updated.
 + *
 + * fr, state and f are handed to dd_check_alloc_ncg before cg_cm is received;
 + * cg_cm is re-read after that call, presumably because it may reallocate.
 + */
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box, gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr, t_state *state, rvec **f)
 +{
 +    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
 +    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
 +    int                    c, i, j, cg, cg_gl, nrcg;
 +    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_zones_t    *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    cginfo_mb_t           *cginfo_mb;
 +    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
 +    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
 +    dd_corners_t           corners;
 +    ivec                   tric_dist;
 +    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
 +    real                   skew_fac2_d, skew_fac_01;
 +    rvec                   sf2_round;
 +    int                    nsend, nat;
 +    int                    th;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Setting up DD communication\n");
 +    }
 +
 +    comm  = dd->comm;
 +
 +    /* Select the array that holds the charge-group positions
 +     * for the active cut-off scheme.
 +     */
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            cg_cm = fr->cg_cm;
 +            break;
 +        case ecutsVERLET:
 +            cg_cm = state->x;
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            cg_cm = NULL;
 +    }
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances:
 +         * the flag is set when this or any earlier DD dimension is triclinic.
 +         */
 +        tric_dist[dim_ind] = 0;
 +        for (i = 0; i <= dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    /* Squared cut-offs, compared against squared distances below */
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +
 +    /* dim1 and dim2 are -1 when there are fewer than 2 or 3 DD dimensions */
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
 +
 +    /* Triclinic stuff */
 +    normal      = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +
 +    /* Work on local copies of the pointers; index_gl and cgindex may be
 +     * reallocated below and are stored back in dd at the end.
 +     */
 +    zone_cg_range = zones->cg_range;
 +    index_gl      = dd->index_gl;
 +    cgindex       = dd->cgindex;
 +    cginfo_mb     = fr->cginfo_mb;
 +
 +    /* Zone 0 is the home zone and spans the home charge groups */
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +
 +    nat_tot = dd->nat_home;
 +    /* nzone is the number of zones present so far;
 +     * it is doubled at the end of each pass over a DD dimension.
 +     */
 +    nzone   = 1;
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd  = &comm->cd[dim_ind];
 +
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d         = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        /* Assume in-place receiving until a pulse > 0 proves otherwise */
 +        cd->bInPlace = TRUE;
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind   = &cd->ind[p];
 +            /* nsend and nat accumulate over all zones of this pulse */
 +            nsend = 0;
 +            nat   = 0;
 +            for (zone = 0; zone < nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    /* NOTE(review): dimd indexes tric_dir[] and v[] directly
 +                     * but is also used as an index into dd->dim[] below;
 +                     * confirm that mixing the two index spaces is intended.
 +                     */
 +                    for (dimd = 0; dimd < dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for (i = dd->dim[dimd]+1; i < DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permute the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +                /* Each thread handles one contiguous slice of [cg0,cg1) */
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for (th = 0; th < comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int             **ibuf_p, *ibuf_nalloc_p;
 +                    vec_rvec_t       *vbuf_p;
 +                    int              *nsend_p, *nat_p;
 +                    int              *nsend_zone_p;
 +                    int               cg0_th, cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        /* The temp counters start from zero for every zone */
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
 +                        /* Divide the cg range evenly over the threads */
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
 +                                       index_gl, cgindex,
 +                                       dim, dim_ind, dim0, dim1, dim2,
 +                                       r_comm2, r_bcomm2,
 +                                       box, tric_dist,
 +                                       normal, skew_fac2_d, skew_fac_01,
 +                                       v_d, v_0, v_1, &corners, sf2_round,
 +                                       bDistBonded, bBondComm,
 +                                       bDist2B, bDistMB,
 +                                       cg_cm, fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p, ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p, nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers */
 +                for (th = 1; th < comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    /* NOTE(review): this i shadows the function-scope i */
 +                    int                   i, ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    /* Grow the three shared buffers to hold this thread's data */
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index, ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int, comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
 +                    }
 +
 +                    for (i = 0; i < dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend]    = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for (zone = nzone_send; zone < nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            /* The two entries after the per-zone counts hold the totals:
 +             * the number of cg's and the number of atoms.
 +             */
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for (zone = 0; zone < nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2, comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2, i);
 +                }
 +            }
 +
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl, dd->cg_nalloc);
 +                srenew(cgindex, dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
 +            /* Re-read the pointer after the allocation check above */
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                /* For pulses after the first only the last zone is handled */
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl              = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
 +                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    /* zone is incremented first, so the entry written
 +                     * is the end of the zone that was just filled.
 +                     */
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone, cd, p, zone_cg_range,
 +                                 index_gl, recv_i, cg_cm, recv_vr,
 +                                 cgindex, fr->cginfo_mb, fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
 +        }
 +        /* Each DD dimension doubles the number of zones */
 +        nzone += nzone;
 +    }
 +    /* Store the possibly reallocated arrays back in dd */
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +
 +    dd->ncg_tot          = zone_cg_range[zones->n];
 +    dd->nat_tot          = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for (i = ddnatZONE; i < ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was already done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
 +                      NULL, comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished setting up DD communication, zones:");
 +        for (c = 0; c < zones->n; c++)
 +        {
 +            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +
 +    for (c = 0; c < zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +/* Set the limits and bounding boxes of the zones zone_start up to,
 + * but not including, zone_end.
 + *
 + * Fills zones->size[z].x0/x1 (zone limits) and bb_x0/bb_x1 (bounding box
 + * with the triclinic couplings applied). When zone_start is 0 the
 + * charge-group density of zone 0, zones->dens_zone0, is updated as well.
 + *
 + * NOTE(review): the locals zj0, zj1, size_j and add_tric are declared
 + * but not used in this function.
 + */
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box, const gmx_ddbox_t *ddbox,
 +                           int zone_start, int zone_end)
 +{
 +    gmx_domdec_comm_t  *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool            bDistMB;
 +    int                 z, zi, zj0, zj1, d, dim;
 +    real                rcs, rcmbs;
 +    int                 i, j;
 +    real                size_j, add_tric;
 +    real                vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0, zones->size[z].x0);
 +        copy_rvec(comm->cell_x1, zones->size[z].x1);
 +    }
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        /* Note that this loop runs over all zones,
 +         * not only over zone_start to zone_end.
 +         */
 +        for (z = 0; z < zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        /* The cut-offs along dim, divided by the skew factor
 +         * when this dimension is triclinic.
 +         */
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                /* dim already holds dd->dim[d]; this does not change it */
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit, is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for (zi = 0; zi < zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone than strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for (zi = 0; zi < zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Initialization only required to keep the compiler happy */
 +        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
 +        int  nc, c;
 +
 +        /* To determine the bounding box for a zone we need to find
 +         * the extreme corners of 4, 2 or 1 corners.
 +         */
 +        /* NOTE(review): assumes npbcdim >= 1; with npbcdim == 0
 +         * the shift count would be negative - confirm with the callers.
 +         */
 +        nc = 1 << (ddbox->npbcdim - 1);
 +
 +        for (c = 0; c < nc; c++)
 +        {
 +            /* Set up a zone corner at x=0, ignoring triclinic couplings.
 +             * Bit 0 of c selects the lower/upper y limit, bit 1 that of z.
 +             */
 +            corner[XX] = 0;
 +            if ((c & 1) == 0)
 +            {
 +                corner[YY] = zones->size[z].x0[YY];
 +            }
 +            else
 +            {
 +                corner[YY] = zones->size[z].x1[YY];
 +            }
 +            if ((c & 2) == 0)
 +            {
 +                corner[ZZ] = zones->size[z].x0[ZZ];
 +            }
 +            else
 +            {
 +                corner[ZZ] = zones->size[z].x1[ZZ];
 +            }
 +            if (dd->ndim == 1 && box[ZZ][YY] != 0)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
 +                 * the triclinic box, but triclinic x-y and rectangular y-z.
 +                 * Shift y back, so it will later end up at 0.
 +                 */
 +                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
 +            }
 +            /* Apply the triclinic couplings */
 +            for (i = YY; i < ddbox->npbcdim; i++)
 +            {
 +                for (j = XX; j < i; j++)
 +                {
 +                    corner[j] += corner[i]*box[i][j]/box[i][i];
 +                }
 +            }
 +            /* Track the component-wise extremes over all corners */
 +            if (c == 0)
 +            {
 +                copy_rvec(corner, corner_min);
 +                copy_rvec(corner, corner_max);
 +            }
 +            else
 +            {
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    corner_min[i] = min(corner_min[i], corner[i]);
 +                    corner_max[i] = max(corner_max[i], corner[i]);
 +                }
 +            }
 +        }
 +        /* Copy the extreme corners without offset along x */
 +        for (i = 0; i < DIM; i++)
 +        {
 +            zones->size[z].bb_x0[i] = corner_min[i];
 +            zones->size[z].bb_x1[i] = corner_max[i];
 +        }
 +        /* Add the offset along x */
 +        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
 +        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        /* Density of zone 0: the number of cg's in zone 0
 +         * divided by the product of the zone extents.
 +         */
 +        vol = 1;
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX], zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY], zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
 +            fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
 +
 +static int comp_cgsort(const void *a, const void *b)
 +{
 +    int           comp;
 +
 +    gmx_cgsort_t *cga, *cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +
 +    return comp;
 +}
 +
 +static void order_int_cg(int n, const gmx_cgsort_t *sort,
 +                         int *a, int *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n, const gmx_cgsort_t *sort,
 +                         rvec *v, rvec *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind], buf[i]);
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(buf[i], v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
 +                           rvec *v, rvec *buf)
 +{
 +    int a, atot, cg, cg0, cg1, i;
 +
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop of the atoms within a cg */
 +        order_vec_cg(ncg, sort, v, buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for (i = cg0; i < cg1; i++)
 +        {
 +            copy_rvec(v[i], buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +
 +    /* Copy back to the original array */
 +    for (a = 0; a < atot; a++)
 +    {
 +        copy_rvec(buf[a], v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
 +                         int nsort_new, gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1, i2, i_new;
 +
 +    /* The new indices are not very ordered, so we qsort them */
 +    qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
 +
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1    = 0;
 +    i2    = 0;
 +    i_new = 0;
 +    while (i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
 +    int                sort_last, sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are completely ordered. So we can sort efficiently by sorting
 +         * the charge groups that did move into the stationary list.
 +         */
 +        ncg_new   = 0;
 +        nsort2    = 0;
 +        nsort_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new, sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2, nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort  = sort->sort;
 +        ncg_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int           ncg_new, i, *a, na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
 +
 +    ncg_new = 0;
 +    for (i = 0; i < na; i++)
 +    {
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
 +
 +/* Sort the home charge groups and reorder all data that is stored
 + * per home charge group or per home atom accordingly.
 + *
 + * The order is obtained from dd_sort_order() with the group cut-off
 + * scheme and from dd_sort_order_nbnxn() with the Verlet scheme.
 + * dd->ncg_home is set to the count returned by that call, which drops
 + * the charge groups that are no longer home here.
 + * Reordered are: the distributed state vectors that are present in
 + * state->flags (x, v, sd_X, cg_p), cgcm (group scheme only),
 + * dd->index_gl and fr->cginfo. dd->cgindex is rebuilt and dd->nat_home
 + * is set. Finally the grid order (Verlet) or the ns grid cell indices
 + * (group) are updated to match the new order.
 + *
 + * ncg_home_old is passed on to dd_sort_order(); a negative value
 + * requests a full sort. ePBC is not used in this function.
 + */
 +static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
 +                          rvec *cgcm, t_forcerec *fr, t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int               *cgindex;
 +    int                ncg_new, i, *ibuf, cgsize;
 +    rvec              *vbuf;
 +
 +    sort = dd->comm->sort;
 +
 +    /* Make sure the sort arrays can hold all current home charge groups */
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort, sort->sort_nalloc);
 +        srenew(sort->sort2, sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    /* Determine the new order; ncg_new is the new home charge group count */
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
 +            break;
 +        case ecutsVERLET:
 +            ncg_new = dd_sort_order_nbnxn(dd, fr);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +
 +    /* A NULL cgindex makes order_vec_atom treat each cg as a single atom */
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug, "Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +
 +    /* Reorder the state */
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
 +                    break;
 +                case estV:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
 +                    break;
 +                case estSDX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
 +                    break;
 +                case estCGP:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No ordering required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                    break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
 +    }
 +
 +    /* One extra element is needed, as ibuf is also used to build
 +     * the cg index, which has ncg_home+1 entries.
 +     */
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf, sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        /* Accumulate the sizes of the cgs in their new order in ibuf,
 +         * as the old dd->cgindex is still read while doing so.
 +         */
 +        ibuf[0] = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        /* One atom per charge group: the cg index is the identity */
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +    double             av;
 +
 +    comm = cr->dd->comm;
 +
 +    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch (ddnat)
 +        {
 +            case ddnatZONE:
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                        2, av);
 +                break;
 +            case ddnatVSITE:
 +                if (cr->dd->vsite_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
 +                            av);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (cr->dd->constraint_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                            1 + ir->nLincsIter, av);
 +                }
 +                break;
 +            default:
 +                gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog, "\n");
 +
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog, cr->dd);
 +    }
 +}
 +
 +/* (Re)partition the system over the domain decomposition nodes at MD
 + * step "step" and rebuild all local data that depends on the partitioning.
 + *
 + * The main stages, in order, are:
 + *  - decide whether the box changed and whether dynamic load balancing
 + *    (DLB) should be done or turned on, collecting and printing the load
 + *    when required;
 + *  - obtain the home charge groups through one of three paths:
 + *    with bMasterState the state in state_global is distributed,
 + *    with a local state whose ddp_count differs from dd->ddp_count
 + *    the indices are rebuilt from state_local,
 + *    otherwise the charge groups are redistributed between the nodes;
 + *  - set the cell sizes and the neighbor search grid and, when sorting
 + *    is active, sort the local state with dd_sort_state();
 + *  - set up the communication, the indices, the local topology in
 + *    top_local and the special atom communication for vsites and
 + *    constraints;
 + *  - update the force record ranges, mdatoms and the local shell,
 + *    implicit solvent, constraint, pull and rotation data, and send the
 + *    charges to the PME only node when this node does not do PME;
 + *  - accumulate the DD statistics, store the partitioning step and
 + *    increase the partitioning counter dd->ddp_count, which is also
 + *    stored in state_local->ddp_count.
 + *
 + * state_local and *f may be reallocated when more atoms have to be stored.
 + * fplog is passed on to the routines that write to the log file,
 + * bVerbose enables printing of the load through dd_print_load_verbose().
 + */
 +void dd_partition_system(FILE                *fplog,
 +                         gmx_large_int_t      step,
 +                         t_commrec           *cr,
 +                         gmx_bool             bMasterState,
 +                         int                  nstglobalcomm,
 +                         t_state             *state_global,
 +                         gmx_mtop_t          *top_global,
 +                         t_inputrec          *ir,
 +                         t_state             *state_local,
 +                         rvec               **f,
 +                         t_mdatoms           *mdatoms,
 +                         gmx_localtop_t      *top_local,
 +                         t_forcerec          *fr,
 +                         gmx_vsite_t         *vsite,
 +                         gmx_shellfc_t        shellfc,
 +                         gmx_constr_t         constr,
 +                         t_nrnb              *nrnb,
 +                         gmx_wallcycle_t      wcycle,
 +                         gmx_bool             bVerbose)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t        ddbox = {0};
 +    t_block           *cgs_gl;
 +    gmx_large_int_t    step_pcoupl;
 +    rvec               cell_ns_x0, cell_ns_x1;
 +    int                i, j, n, cg0 = 0, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
 +    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
 +    gmx_bool           bRedist, bSortCG, bResortAll;
 +    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
 +    real               grid_density;
 +    char               sbuf[22];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1 pressure coupling happens
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occurred.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balancing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n         = max(100, nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +
 +        /* Print load every nstlog, first and last step to the log file */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd, wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog, dd, step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB)
 +            {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "step %s, imb loss %f\n",
 +                                gmx_step_str(step, sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                /* bTurnOnDLB is only set on the master above;
 +                 * the other nodes get their value through this broadcast.
 +                 */
 +                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog, cr, step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
 +                  TRUE, cgs_gl, state_global->x, &ddbox);
 +
 +        get_cg_distribution(fplog, step, dd, cgs_gl,
 +                            state_global->box, &ddbox, state_global->x);
 +
 +        dd_distribute_state(dd, cgs_gl,
 +                            state_global, state_local, f);
 +
 +        dd_make_local_cgs(dd, &top_local->cgs);
 +
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +
 +        cg0 = 0;
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        /* The local state does not belong to the current partitioning,
 +         * so the indices have to be rebuilt from the local state.
 +         */
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
 +        }
 +
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
 +        }
 +
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +
 +        /* Build the new indices */
 +        rebuild_cgindex(dd, cgs_gl->index, state_local);
 +        make_dd_indices(dd, cgs_gl->index, 0);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  TRUE, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0, ddbox.box0    );
 +            copy_rvec(comm->box_size, ddbox.box_size);
 +        }
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist     = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0, comm->box0    );
 +    copy_rvec(ddbox.box_size, comm->box_size);
 +
 +    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
 +                      step, wcycle);
 +
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
 +    }
 +
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    /* Store the home cg count from before the redistribution,
 +     * it is passed to dd_sort_state below.
 +     */
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
 +                           state_local, f, fr, mdatoms,
 +                           !bSortCG, nrnb, &cg0, &ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
 +    }
 +
 +    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
 +                          dd, &ddbox,
 +                          &comm->cell_x0, &comm->cell_x1,
 +                          dd->ncg_home, fr->cg_cm,
 +                          cell_ns_x0, cell_ns_x1, &grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
 +    }
 +
 +    /* Store the old grid dimensions, so we can check below
 +     * whether the grid changed.
 +     */
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            copy_ivec(fr->ns.grid->n, ncells_old);
 +            grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC,
 +                       state_local->box, cell_ns_x0, cell_ns_x1,
 +                       fr->rlistlong, grid_density);
 +            break;
 +        case ecutsVERLET:
 +            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir, comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +            case ecutsVERLET:
 +                set_zones_size(dd, state_local->box, &ddbox, 0, 1);
 +
 +                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
 +                                  0,
 +                                  comm->zones.size[0].bb_x0,
 +                                  comm->zones.size[0].bb_x1,
 +                                  0, dd->ncg_home,
 +                                  comm->zones.dens_zone0,
 +                                  fr->cginfo,
 +                                  state_local->x,
 +                                  ncg_moved, bRedist ? comm->moved : NULL,
 +                                  fr->nbv->grp[eintLocal].kernel_type,
 +                                  fr->nbv->grp[eintLocal].nbat);
 +
 +                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
 +                break;
 +            case ecutsGROUP:
 +                fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home,
 +                          0, dd->ncg_home, fr->cg_cm);
 +
 +                copy_ivec(fr->ns.grid->n, ncells_new);
 +                break;
 +            default:
 +                gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +
 +        /* Check if we can use the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step, sbuf), dd->ncg_home);
 +        }
 +        /* A negative old home cg count requests a full sort */
 +        dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        cg0 = 0;
 +        ga2la_clear(dd->ga2la);
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /* Set up the communication and communicate the coordinates */
 +    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
 +
 +    /* Set the indices */
 +    make_dd_indices(dd, cgs_gl->index, cg0);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* With sorting, the size of zone 0 has already been set above */
 +        set_zones_size(dd, state_local->box, &ddbox,
 +                       bSortCG ? 1 : 0, comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /*
 +       write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +     */
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
 +
 +    /* Extract a local topology from the global topology */
 +    for (i = 0; i < dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
 +                      comm->cellsize_min, np,
 +                      fr,
 +                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite, top_global, top_local);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
 +
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for (i = ddnatZONE+1; i < ddnatNR; i++)
 +    {
 +        switch (i)
 +        {
 +            case ddnatVSITE:
 +                if (vsite && vsite->n_intercg_vsite)
 +                {
 +                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (dd->bInterCGcons || dd->bInterCGsettles)
 +                {
 +                    /* Only for inter-cg constraints we need special code */
 +                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
 +                                                  constr, ir->nProjOrder,
 +                                                  top_local->idef.il);
 +                }
 +                break;
 +            default:
 +                gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications and checking.
 +     */
 +    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
 +                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
 +
 +    /* We make all the mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global, ir,
 +             comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd, mdatoms, top_local);
 +
 +    if (vsite != NULL)
 +    {
 +        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
 +        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
 +    }
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr, mdatoms, shellfc);
 +    }
 +
 +    if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr, fr->born, ir->gb_algorithm);
 +    }
 +
 +    init_bonded_thread_force_reduction(fr, &top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges to our PME only node */
 +        gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
 +                       mdatoms->chargeA, mdatoms->chargeB,
 +                       dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
 +    }
 +
 +    if (constr)
 +    {
 +        set_constraints(constr, top_local, ir, mdatoms, cr);
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
 +    }
 +
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd, ir->rot);
 +    }
 +
 +
 +    add_dd_statistics(dd);
 +
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd, state_local->box, state_local->x);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
 +
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd, state_local->box, state_local->x);
 +        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
 +                     -1, state_local->x, state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index 7680d93a42d42d7c17570931f747eb9058889f23,0000000000000000000000000000000000000000..794841cac9f51ca254340ad97233ca7cd6693485
mode 100644,000000..100644
--- /dev/null
@@@ -1,2958 -1,0 +1,2958 @@@
-         if(fp)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "tables.h"
 +#include "nonbonded.h"
 +#include "invblock.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "copyrite.h"
 +#include "mtop_util.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_consts.h"
 +#include "statutil.h"
 +#include "gmx_omp_nthreads.h"
 +#include "gmx_detect_hardware.h"
 +
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +#endif
 +
 +#include "types/nbnxn_cuda_types_ext.h"
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +
 +/* Allocate a single t_forcerec with snew() and return it to the caller.
 + * No fields are set here beyond whatever snew() itself does.
 + */
 +t_forcerec *mk_forcerec(void)
 +{
 +    t_forcerec *fr;
 +
 +    snew(fr, 1);
 +
 +    return fr;
 +}
 +
 +#ifdef DEBUG
 +static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
 +{
 +    int i, j;
 +
 +    for (i = 0; (i < atnr); i++)
 +    {
 +        for (j = 0; (j < atnr); j++)
 +        {
 +            fprintf(fp, "%2d - %2d", i, j);
 +            if (bBHAM)
 +            {
 +                fprintf(fp, "  a=%10g, b=%10g, c=%10g\n", BHAMA(nbfp, atnr, i, j),
 +                        BHAMB(nbfp, atnr, i, j), BHAMC(nbfp, atnr, i, j)/6.0);
 +            }
 +            else
 +            {
 +                fprintf(fp, "  c6=%10g, c12=%10g\n", C6(nbfp, atnr, i, j)/6.0,
 +                        C12(nbfp, atnr, i, j)/12.0);
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
 +{
 +    real *nbfp;
 +    int   i, j, k, atnr;
 +
 +    atnr = idef->atnr;
 +    if (bBHAM)
 +    {
 +        snew(nbfp, 3*atnr*atnr);
 +        for (i = k = 0; (i < atnr); i++)
 +        {
 +            for (j = 0; (j < atnr); j++, k++)
 +            {
 +                BHAMA(nbfp, atnr, i, j) = idef->iparams[k].bham.a;
 +                BHAMB(nbfp, atnr, i, j) = idef->iparams[k].bham.b;
 +                /* nbfp now includes the 6.0 derivative prefactor */
 +                BHAMC(nbfp, atnr, i, j) = idef->iparams[k].bham.c*6.0;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        snew(nbfp, 2*atnr*atnr);
 +        for (i = k = 0; (i < atnr); i++)
 +        {
 +            for (j = 0; (j < atnr); j++, k++)
 +            {
 +                /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                C6(nbfp, atnr, i, j)   = idef->iparams[k].lj.c6*6.0;
 +                C12(nbfp, atnr, i, j)  = idef->iparams[k].lj.c12*12.0;
 +            }
 +        }
 +    }
 +
 +    return nbfp;
 +}
 +
 +/* This routine sets fr->solvent_opt to the most common solvent in the
 + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
 + * the fr->solvent_type array with the correct type (or esolNO).
 + *
 + * Charge groups that fulfill the conditions but are not identical to the
 + * most common one will be marked as esolNO in the solvent_type array.
 + *
 + * TIP3p is identical to SPC for these purposes, so we call it
 + * SPC in the arrays (Apologies to Bill Jorgensen ;-)
 + *
 + * NOTE: QM particles should not become an optimized solvent,
 + * not even if there is only one charge group in the QM region.
 + */
 +
 +/* One candidate solvent type found while scanning the charge groups. */
 +typedef struct
 +{
 +    int    model;       /* solvent model, set to esolSPC or esolTIP4P        */
 +    int    count;       /* accumulated molecule count matching this entry    */
 +    int    vdwtype[4];  /* atom type per atom (3 entries used for SPC, 4 for TIP4P) */
 +    real   charge[4];   /* charge per atom, same layout as vdwtype           */
 +} solvent_parameters_t;
 +
 +/* Decide whether charge group cg0 of molecule type molt qualifies for the
 + * solvent-optimized loops and, if so, which solvent parameter set it uses.
 + *
 + * On return *cg_sp is -1 when the charge group does not qualify, otherwise
 + * the index of its entry in the solvent parameter list. A matching existing
 + * entry gets nmol added to its count; a new qualifying type is appended
 + * (the list is grown with srenew, *n_solvent_parameters is incremented and
 + * *solvent_parameters_p is updated).
 + *
 + * qm_grpnr may be NULL; when set it is indexed with the molecule-local atom
 + * index and compared against qm_grps->nr - 1 to detect QM atoms.
 + * cginfo is the already computed charge group info (exclusion flags).
 + * fr supplies ntype, bBHAM and nbfp for the VdW parameter test.
 + *
 + * NOTE(review): the local 'excl' is declared but never used here.
 + */
 +static void
 +check_solvent_cg(const gmx_moltype_t    *molt,
 +                 int                     cg0,
 +                 int                     nmol,
 +                 const unsigned char    *qm_grpnr,
 +                 const t_grps           *qm_grps,
 +                 t_forcerec   *          fr,
 +                 int                    *n_solvent_parameters,
 +                 solvent_parameters_t  **solvent_parameters_p,
 +                 int                     cginfo,
 +                 int                    *cg_sp)
 +{
 +    const t_blocka     *  excl;
 +    t_atom               *atom;
 +    int                   j, k;
 +    int                   j0, j1, nj;
 +    gmx_bool              perturbed;
 +    gmx_bool              has_vdw[4];
 +    gmx_bool              match;
 +    real                  tmp_charge[4];
 +    int                   tmp_vdwtype[4];
 +    int                   tjA;
 +    gmx_bool              qm;
 +    solvent_parameters_t *solvent_parameters;
 +
 +    /* We use a list with parameters for each solvent type.
 +     * Every time we discover a new molecule that fulfills the basic
 +     * conditions for a solvent we compare with the previous entries
 +     * in these lists. If the parameters are the same we just increment
 +     * the counter for that type, and otherwise we create a new type
 +     * based on the current molecule.
 +     *
 +     * Once we've finished going through all molecules we check which
 +     * solvent is most common, and mark all those molecules while we
 +     * clear the flag on all others.
 +     */
 +
 +    solvent_parameters = *solvent_parameters_p;
 +
 +    /* Mark the cg first as non optimized */
 +    *cg_sp = -1;
 +
 +    /* Check if this cg has no exclusions with atoms in other charge groups
 +     * and all atoms inside the charge group excluded.
 +     * We only have 3 or 4 atom solvent loops.
 +     */
 +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
 +        !GET_CGINFO_EXCL_INTRA(cginfo))
 +    {
 +        return;
 +    }
 +
 +    /* Get the atom index range [j0, j1) of this charge group */
 +    j0     = molt->cgs.index[cg0];
 +    j1     = molt->cgs.index[cg0+1];
 +
 +    /* Number of atoms in this charge group */
 +    nj     = j1 - j0;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Moltype '%s': there are %d atoms in this charge group\n",
 +                *molt->name, nj);
 +    }
 +
 +    /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
 +     * otherwise skip it.
 +     */
 +    if (nj < 3 || nj > 4)
 +    {
 +        return;
 +    }
 +
 +    /* Check if we are doing QM on this group */
 +    qm = FALSE;
 +    if (qm_grpnr != NULL)
 +    {
 +        for (j = j0; j < j1 && !qm; j++)
 +        {
 +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
 +        }
 +    }
 +    /* Cannot use solvent optimization with QM */
 +    if (qm)
 +    {
 +        return;
 +    }
 +
 +    atom = molt->atoms.atom;
 +
 +    /* Still looks like a solvent, time to check parameters */
 +
 +    /* If it is perturbed (free energy) we can't use the solvent loops,
 +     * so then we just skip to the next molecule.
 +     */
 +    perturbed = FALSE;
 +
 +    for (j = j0; j < j1 && !perturbed; j++)
 +    {
 +        perturbed = PERTURBED(atom[j]);
 +    }
 +
 +    if (perturbed)
 +    {
 +        return;
 +    }
 +
 +    /* Now it's only a question if the VdW and charge parameters
 +     * are OK. Before doing the check we compare and see if they are
 +     * identical to a possible previous solvent type.
 +     * First we assign the current types and charges.
 +     */
 +    for (j = 0; j < nj; j++)
 +    {
 +        tmp_vdwtype[j] = atom[j0+j].type;
 +        tmp_charge[j]  = atom[j0+j].q;
 +    }
 +
 +    /* Does it match any previous solvent type? */
 +    for (k = 0; k < *n_solvent_parameters; k++)
 +    {
 +        match = TRUE;
 +
 +
 +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
 +        if ( (solvent_parameters[k].model == esolSPC   && nj != 3)  ||
 +             (solvent_parameters[k].model == esolTIP4P && nj != 4) )
 +        {
 +            match = FALSE;
 +        }
 +
 +        /* Check that types & charges match for all atoms in molecule */
 +        for (j = 0; j < nj && match == TRUE; j++)
 +        {
 +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
 +            {
 +                match = FALSE;
 +            }
 +            if (tmp_charge[j] != solvent_parameters[k].charge[j])
 +            {
 +                match = FALSE;
 +            }
 +        }
 +        if (match == TRUE)
 +        {
 +            /* Congratulations! We have a matched solvent.
 +             * Flag it with this type for later processing.
 +             */
 +            *cg_sp = k;
 +            solvent_parameters[k].count += nmol;
 +
 +            /* We are done with this charge group */
 +            return;
 +        }
 +    }
 +
 +    /* If we get here, we have a tentative new solvent type.
 +     * Before we add it we must check that it fulfills the requirements
 +     * of the solvent optimized loops. First determine which atoms have
 +     * VdW interactions.
 +     */
 +    for (j = 0; j < nj; j++)
 +    {
 +        has_vdw[j] = FALSE;
 +        tjA        = tmp_vdwtype[j];
 +
 +        /* Go through all other types and see if any have non-zero
 +         * VdW parameters when combined with this one.
 +         */
 +        for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
 +        {
 +            /* We already checked that the atoms weren't perturbed,
 +             * so we only need to check state A now.
 +             */
 +            if (fr->bBHAM)
 +            {
 +                has_vdw[j] = (has_vdw[j] ||
 +                              (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
 +                              (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
 +                              (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
 +            }
 +            else
 +            {
 +                /* Standard LJ */
 +                has_vdw[j] = (has_vdw[j] ||
 +                              (C6(fr->nbfp, fr->ntype, tjA, k)  != 0.0) ||
 +                              (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
 +            }
 +        }
 +    }
 +
 +    /* Now we know all we need to make the final check and assignment. */
 +    if (nj == 3)
 +    {
 +        /* So, is it an SPC?
 +         * For this we require that the first two atoms have charge,
 +         * the charges on atom 2 & 3 should be the same, and only
 +         * atom 1 might have VdW.
 +         */
 +        if (has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            tmp_charge[0]  != 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1])
 +        {
 +            srenew(solvent_parameters, *n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for (k = 0; k < 3; k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +    else if (nj == 4)
 +    {
 +        /* Or could it be a TIP4P?
 +         * For this we require that atoms 2,3,4 have charge, but not atom 1.
 +         * Only atom 1 might have VdW.
 +         */
 +        if (has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            has_vdw[3] == FALSE &&
 +            tmp_charge[0]  == 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1] &&
 +            tmp_charge[3]  != 0)
 +        {
 +            srenew(solvent_parameters, *n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for (k = 0; k < 4; k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +
 +    /* The list may have been reallocated above, hand the pointer back */
 +    *solvent_parameters_p = solvent_parameters;
 +}
 +
 +static void
 +check_solvent(FILE  *                fp,
 +              const gmx_mtop_t  *    mtop,
 +              t_forcerec  *          fr,
 +              cginfo_mb_t           *cginfo_mb)
 +{
 +    const t_block     *   cgs;
 +    const t_block     *   mols;
 +    const gmx_moltype_t  *molt;
 +    int                   mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
 +    int                   n_solvent_parameters;
 +    solvent_parameters_t *solvent_parameters;
 +    int                 **cg_sp;
 +    int                   bestsp, bestsol;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Going to determine what solvent types we have.\n");
 +    }
 +
 +    mols = &mtop->mols;
 +
 +    n_solvent_parameters = 0;
 +    solvent_parameters   = NULL;
 +    /* Allocate temporary array for solvent type */
 +    snew(cg_sp, mtop->nmolblock);
 +
 +    cg_offset = 0;
 +    at_offset = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +        cgs  = &molt->cgs;
 +        /* Here we have to loop over all individual molecules
 +         * because we need to check for QMMM particles.
 +         */
 +        snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
 +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
 +        nmol    = mtop->molblock[mb].nmol/nmol_ch;
 +        for (mol = 0; mol < nmol_ch; mol++)
 +        {
 +            cgm = mol*cgs->nr;
 +            am  = mol*cgs->index[cgs->nr];
 +            for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
 +            {
 +                check_solvent_cg(molt, cg_mol, nmol,
 +                                 mtop->groups.grpnr[egcQMMM] ?
 +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
 +                                 &mtop->groups.grps[egcQMMM],
 +                                 fr,
 +                                 &n_solvent_parameters, &solvent_parameters,
 +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
 +                                 &cg_sp[mb][cgm+cg_mol]);
 +            }
 +        }
 +        cg_offset += cgs->nr;
 +        at_offset += cgs->index[cgs->nr];
 +    }
 +
 +    /* Puh! We finished going through all charge groups.
 +     * Now find the most common solvent model.
 +     */
 +
 +    /* Most common solvent this far */
 +    bestsp = -2;
 +    for (i = 0; i < n_solvent_parameters; i++)
 +    {
 +        if (bestsp == -2 ||
 +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
 +        {
 +            bestsp = i;
 +        }
 +    }
 +
 +    if (bestsp >= 0)
 +    {
 +        bestsol = solvent_parameters[bestsp].model;
 +    }
 +    else
 +    {
 +        bestsol = esolNO;
 +    }
 +
 +#ifdef DISABLE_WATER_NLIST
 +    bestsol = esolNO;
 +#endif
 +
 +    fr->nWatMol = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        cgs  = &mtop->moltype[mtop->molblock[mb].type].cgs;
 +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
 +        for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
 +        {
 +            if (cg_sp[mb][i] == bestsp)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
 +                fr->nWatMol += nmol;
 +            }
 +            else
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
 +            }
 +        }
 +        sfree(cg_sp[mb]);
 +    }
 +    sfree(cg_sp);
 +
 +    if (bestsol != esolNO && fp != NULL)
 +    {
 +        fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
 +                esol_names[bestsol],
 +                solvent_parameters[bestsp].count);
 +    }
 +
 +    sfree(solvent_parameters);
 +    fr->solvent_opt = bestsol;
 +}
 +
 +/* Per-atom constraint classification used while building cginfo:
 + * not constrained, part of a constraint interaction other than SETTLE,
 + * or part of a SETTLE interaction.
 + */
 +enum {
 +    acNONE = 0, acCONSTRAINT, acSETTLE
 +};
 +
 +/* Build the charge group info (cginfo) for every molecule block of mtop.
 + *
 + * For each charge group the energy group id, intra/inter exclusion flags,
 + * constraint/SETTLE flags, VdW/charge presence flags and the number of atoms
 + * are stored. If energy group and QMMM group assignment are identical for
 + * all molecules of a block, only one molecule's worth of entries is stored
 + * (cg_mod = #cgs per molecule); otherwise one entry per charge group of
 + * every molecule.
 + *
 + * After the flags are set, check_solvent() selects the solvent optimization;
 + * it is switched off again when the environment variable GMX_NO_SOLV_OPT is
 + * set or bNoSolvOpt is TRUE.
 + *
 + * fplog      log file, may be NULL
 + * mtop       global topology
 + * fr         force record; ntype, bBHAM and nbfp are read, solvent_opt
 + *            (and, via check_solvent, nWatMol) are written
 + * bNoSolvOpt force solvent optimization off
 + * bExcl_IntraCGAll_InterCGNone
 + *            output: TRUE only when every charge group excludes all of its
 + *            intra-group pairs and has no exclusions with other groups
 + *
 + * Returns the array of mtop->nmolblock cginfo_mb_t entries, allocated with
 + * snew(). Calls gmx_fatal() when a charge group exceeds
 + * MAX_CHARGEGROUP_SIZE atoms.
 + *
 + * NOTE(review): ncg_tot is computed but not used further in this function.
 + */
 +static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
 +                                   t_forcerec *fr, gmx_bool bNoSolvOpt,
 +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
 +{
 +    const t_block        *cgs;
 +    const t_blocka       *excl;
 +    const gmx_moltype_t  *molt;
 +    const gmx_molblock_t *molb;
 +    cginfo_mb_t          *cginfo_mb;
 +    gmx_bool             *type_VDW;
 +    int                  *cginfo;
 +    int                   cg_offset, a_offset, cgm, am;
 +    int                   mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
 +    int                  *a_con;
 +    int                   ftype;
 +    int                   ia;
 +    gmx_bool              bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
 +
 +    ncg_tot = ncg_mtop(mtop);
 +    snew(cginfo_mb, mtop->nmolblock);
 +
 +    /* type_VDW[t] is TRUE when atom type t has a non-zero C6 or C12 with
 +     * any type; with Buckingham every type is marked as having VdW.
 +     */
 +    snew(type_VDW, fr->ntype);
 +    for (ai = 0; ai < fr->ntype; ai++)
 +    {
 +        type_VDW[ai] = FALSE;
 +        for (j = 0; j < fr->ntype; j++)
 +        {
 +            type_VDW[ai] = type_VDW[ai] ||
 +                fr->bBHAM ||
 +                C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
 +                C12(fr->nbfp, fr->ntype, ai, j) != 0;
 +        }
 +    }
 +
 +    *bExcl_IntraCGAll_InterCGNone = TRUE;
 +
 +    /* Scratch exclusion mask, grown on demand to the largest charge group */
 +    excl_nalloc = 10;
 +    snew(bExcl, excl_nalloc);
 +    cg_offset = 0;
 +    a_offset  = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        cgs  = &molt->cgs;
 +        excl = &molt->excls;
 +
 +        /* Check if the cginfo is identical for all molecules in this block.
 +         * If so, we only need an array of the size of one molecule.
 +         * Otherwise we make an array of #mol times #cgs per molecule.
 +         */
 +        bId = TRUE;
 +        am  = 0;
 +        for (m = 0; m < molb->nmol; m++)
 +        {
 +            am = m*cgs->index[cgs->nr];
 +            for (cg = 0; cg < cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                /* Compare molecule m against the first molecule of the block */
 +                if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
 +                    ggrpnr(&mtop->groups, egcENER, a_offset   +a0))
 +                {
 +                    bId = FALSE;
 +                }
 +                if (mtop->groups.grpnr[egcQMMM] != NULL)
 +                {
 +                    for (ai = a0; ai < a1; ai++)
 +                    {
 +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
 +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
 +                        {
 +                            bId = FALSE;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        cginfo_mb[mb].cg_start = cg_offset;
 +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
 +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
 +        snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
 +        cginfo = cginfo_mb[mb].cginfo;
 +
 +        /* Set constraints flags for constrained atoms */
 +        snew(a_con, molt->atoms.nr);
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
 +            {
 +                int nral;
 +
 +                nral = NRAL(ftype);
 +                /* Each ilist entry is one parameter index followed by nral atoms */
 +                for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
 +                {
 +                    int a;
 +
 +                    for (a = 0; a < nral; a++)
 +                    {
 +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
 +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
 +                    }
 +                }
 +            }
 +        }
 +
 +        for (m = 0; m < (bId ? 1 : molb->nmol); m++)
 +        {
 +            cgm = m*cgs->nr;
 +            am  = m*cgs->index[cgs->nr];
 +            for (cg = 0; cg < cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +
 +                /* Store the energy group in cginfo */
 +                gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
 +                SET_CGINFO_GID(cginfo[cgm+cg], gid);
 +
 +                /* Check the intra/inter charge group exclusions */
 +                if (a1-a0 > excl_nalloc)
 +                {
 +                    excl_nalloc = a1 - a0;
 +                    srenew(bExcl, excl_nalloc);
 +                }
 +                /* bExclIntraAll: all intra cg interactions excluded
 +                 * bExclInter:    any inter cg interactions excluded
 +                 */
 +                bExclIntraAll = TRUE;
 +                bExclInter    = FALSE;
 +                bHaveVDW      = FALSE;
 +                bHaveQ        = FALSE;
 +                for (ai = a0; ai < a1; ai++)
 +                {
 +                    /* Check VDW and electrostatic interactions */
 +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
 +                                            type_VDW[molt->atoms.atom[ai].typeB]);
 +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
 +                                            molt->atoms.atom[ai].qB != 0);
 +
 +                    /* Clear the exclusion list for atom ai */
 +                    for (aj = a0; aj < a1; aj++)
 +                    {
 +                        bExcl[aj-a0] = FALSE;
 +                    }
 +                    /* Loop over all the exclusions of atom ai */
 +                    for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
 +                    {
 +                        aj = excl->a[j];
 +                        if (aj < a0 || aj >= a1)
 +                        {
 +                            bExclInter = TRUE;
 +                        }
 +                        else
 +                        {
 +                            bExcl[aj-a0] = TRUE;
 +                        }
 +                    }
 +                    /* Check if ai excludes a0 to a1 */
 +                    for (aj = a0; aj < a1; aj++)
 +                    {
 +                        if (!bExcl[aj-a0])
 +                        {
 +                            bExclIntraAll = FALSE;
 +                        }
 +                    }
 +
 +                    /* Propagate the atom's constraint class to its charge group */
 +                    switch (a_con[ai])
 +                    {
 +                        case acCONSTRAINT:
 +                            SET_CGINFO_CONSTR(cginfo[cgm+cg]);
 +                            break;
 +                        case acSETTLE:
 +                            SET_CGINFO_SETTLE(cginfo[cgm+cg]);
 +                            break;
 +                        default:
 +                            break;
 +                    }
 +                }
 +                if (bExclIntraAll)
 +                {
 +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
 +                }
 +                if (bExclInter)
 +                {
 +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
 +                }
 +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
 +                {
 +                    /* The size in cginfo is currently only read with DD */
 +                    gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
 +                }
 +                if (bHaveVDW)
 +                {
 +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
 +                }
 +                if (bHaveQ)
 +                {
 +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
 +                }
 +                /* Store the charge group size */
 +                SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
 +
 +                if (!bExclIntraAll || bExclInter)
 +                {
 +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
 +                }
 +            }
 +        }
 +
 +        sfree(a_con);
 +
 +        /* Advance the global offsets past all molecules of this block */
 +        cg_offset += molb->nmol*cgs->nr;
 +        a_offset  += molb->nmol*cgs->index[cgs->nr];
 +    }
 +    sfree(bExcl);
 +
 +    /* the solvent optimizer is called after the QM is initialized,
 +     * because we don't want to have the QM subsystem become an
 +     * optimized solvent
 +     */
 +
 +    check_solvent(fplog, mtop, fr, cginfo_mb);
 +
 +    if (getenv("GMX_NO_SOLV_OPT"))
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
 +                    "Disabling all solvent optimization\n");
 +        }
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (bNoSolvOpt)
 +    {
 +        fr->solvent_opt = esolNO;
 +    }
 +    /* With optimization off, clear the per charge group solvent marks too */
 +    if (!fr->solvent_opt)
 +    {
 +        for (mb = 0; mb < mtop->nmolblock; mb++)
 +        {
 +            for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
 +            }
 +        }
 +    }
 +
 +    return cginfo_mb;
 +}
 +
 +static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
 +{
 +    int  ncg, mb, cg;
 +    int *cginfo;
 +
 +    ncg = cgi_mb[nmb-1].cg_end;
 +    snew(cginfo, ncg);
 +    mb = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        while (cg >= cgi_mb[mb].cg_end)
 +        {
 +            mb++;
 +        }
 +        cginfo[cg] =
 +            cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
 +    }
 +
 +    return cginfo;
 +}
 +
 +static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
 +{
 +    double         qsum, q2sum, q;
 +    int            mb, nmol, i;
 +    const t_atoms *atoms;
 +
 +    qsum  = 0;
 +    q2sum = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        nmol  = mtop->molblock[mb].nmol;
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +        for (i = 0; i < atoms->nr; i++)
 +        {
 +            q      = atoms->atom[i].q;
 +            qsum  += nmol*q;
 +            q2sum += nmol*q*q;
 +        }
 +    }
 +    fr->qsum[0]  = qsum;
 +    fr->q2sum[0] = q2sum;
 +    if (fr->efep != efepNO)
 +    {
 +        qsum  = 0;
 +        q2sum = 0;
 +        for (mb = 0; mb < mtop->nmolblock; mb++)
 +        {
 +            nmol  = mtop->molblock[mb].nmol;
 +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +            for (i = 0; i < atoms->nr; i++)
 +            {
 +                q      = atoms->atom[i].qB;
 +                qsum  += nmol*q;
 +                q2sum += nmol*q*q;
 +            }
 +            fr->qsum[1]  = qsum;
 +            fr->q2sum[1] = q2sum;
 +        }
 +    }
 +    else
 +    {
 +        fr->qsum[1]  = fr->qsum[0];
 +        fr->q2sum[1] = fr->q2sum[0];
 +    }
 +    if (log)
 +    {
 +        if (fr->efep == efepNO)
 +        {
 +            fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
 +        }
 +        else
 +        {
 +            fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
 +                    fr->qsum[0], fr->qsum[1]);
 +        }
 +    }
 +}
 +
 +void update_forcerec(FILE *log, t_forcerec *fr, matrix box)
 +{
 +    if (fr->eeltype == eelGRF)
 +    {
 +        calc_rffac(NULL, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
 +                   fr->rcoulomb, fr->temp, fr->zsquare, box,
 +                   &fr->kappa, &fr->k_rf, &fr->c_rf);
 +    }
 +}
 +
 +void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
 +{
 +    const t_atoms  *atoms, *atoms_tpi;
 +    const t_blocka *excl;
 +    int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
 +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
 +    long long int   npair, npair_ij, tmpi, tmpj;
 +#else
 +    double          npair, npair_ij, tmpi, tmpj;
 +#endif
 +    double          csix, ctwelve;
 +    int             ntp, *typecount;
 +    gmx_bool        bBHAM;
 +    real           *nbfp;
 +
 +    ntp   = fr->ntype;
 +    bBHAM = fr->bBHAM;
 +    nbfp  = fr->nbfp;
 +
 +    for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
 +    {
 +        csix    = 0;
 +        ctwelve = 0;
 +        npair   = 0;
 +        nexcl   = 0;
 +        if (!fr->n_tpi)
 +        {
 +            /* Count the types so we avoid natoms^2 operations */
 +            snew(typecount, ntp);
 +            for (mb = 0; mb < mtop->nmolblock; mb++)
 +            {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for (i = 0; i < atoms->nr; i++)
 +                {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    typecount[tpi] += nmol;
 +                }
 +            }
 +            for (tpi = 0; tpi < ntp; tpi++)
 +            {
 +                for (tpj = tpi; tpj < ntp; tpj++)
 +                {
 +                    tmpi = typecount[tpi];
 +                    tmpj = typecount[tpj];
 +                    if (tpi != tpj)
 +                    {
 +                        npair_ij = tmpi*tmpj;
 +                    }
 +                    else
 +                    {
 +                        npair_ij = tmpi*(tmpi - 1)/2;
 +                    }
 +                    if (bBHAM)
 +                    {
 +                        /* nbfp now includes the 6.0 derivative prefactor */
 +                        csix    += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
 +                    }
 +                    else
 +                    {
 +                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                        csix    += npair_ij*   C6(nbfp, ntp, tpi, tpj)/6.0;
 +                        ctwelve += npair_ij*  C12(nbfp, ntp, tpi, tpj)/12.0;
 +                    }
 +                    npair += npair_ij;
 +                }
 +            }
 +            sfree(typecount);
 +            /* Subtract the excluded pairs.
 +             * The main reason for subtracting exclusions is that in some cases
 +             * some combinations might never occur and the parameters could have
 +             * any value. These unused values should not influence the dispersion
 +             * correction.
 +             */
 +            for (mb = 0; mb < mtop->nmolblock; mb++)
 +            {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
 +                for (i = 0; (i < atoms->nr); i++)
 +                {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    j1  = excl->index[i];
 +                    j2  = excl->index[i+1];
 +                    for (j = j1; j < j2; j++)
 +                    {
 +                        k = excl->a[j];
 +                        if (k > i)
 +                        {
 +                            if (q == 0)
 +                            {
 +                                tpj = atoms->atom[k].type;
 +                            }
 +                            else
 +                            {
 +                                tpj = atoms->atom[k].typeB;
 +                            }
 +                            if (bBHAM)
 +                            {
 +                                /* nbfp now includes the 6.0 derivative prefactor */
 +                                csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
 +                            }
 +                            else
 +                            {
 +                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                                csix    -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
 +                                ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
 +                            }
 +                            nexcl += nmol;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Only correct for the interaction of the test particle
 +             * with the rest of the system.
 +             */
 +            atoms_tpi =
 +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
 +
 +            npair = 0;
 +            for (mb = 0; mb < mtop->nmolblock; mb++)
 +            {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for (j = 0; j < atoms->nr; j++)
 +                {
 +                    nmolc = nmol;
 +                    /* Remove the interaction of the test charge group
 +                     * with itself.
 +                     */
 +                    if (mb == mtop->nmolblock-1)
 +                    {
 +                        nmolc--;
 +
 +                        if (mb == 0 && nmol == 1)
 +                        {
 +                            gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
 +                        }
 +                    }
 +                    if (q == 0)
 +                    {
 +                        tpj = atoms->atom[j].type;
 +                    }
 +                    else
 +                    {
 +                        tpj = atoms->atom[j].typeB;
 +                    }
 +                    for (i = 0; i < fr->n_tpi; i++)
 +                    {
 +                        if (q == 0)
 +                        {
 +                            tpi = atoms_tpi->atom[i].type;
 +                        }
 +                        else
 +                        {
 +                            tpi = atoms_tpi->atom[i].typeB;
 +                        }
 +                        if (bBHAM)
 +                        {
 +                            /* nbfp now includes the 6.0 derivative prefactor */
 +                            csix    += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
 +                        }
 +                        else
 +                        {
 +                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                            csix    += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
 +                            ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
 +                        }
 +                        npair += nmolc;
 +                    }
 +                }
 +            }
 +        }
 +        if (npair - nexcl <= 0 && fplog)
 +        {
 +            fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
 +            csix     = 0;
 +            ctwelve  = 0;
 +        }
 +        else
 +        {
 +            csix    /= npair - nexcl;
 +            ctwelve /= npair - nexcl;
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Counted %d exclusions\n", nexcl);
 +            fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
 +            fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
 +        }
 +        fr->avcsix[q]    = csix;
 +        fr->avctwelve[q] = ctwelve;
 +    }
 +    if (fplog != NULL)
 +    {
 +        if (fr->eDispCorr == edispcAllEner ||
 +            fr->eDispCorr == edispcAllEnerPres)
 +        {
 +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                    fr->avcsix[0], fr->avctwelve[0]);
 +        }
 +        else
 +        {
 +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
 +        }
 +    }
 +}
 +
 +
 +static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
 +                           const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *at1, *at2;
 +    int            mt1, mt2, i, j, tpi, tpj, ntypes;
 +    real           b, bmin;
 +    real          *nbfp;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
 +    }
 +    nbfp   = fr->nbfp;
 +    ntypes = fr->ntype;
 +
 +    bmin           = -1;
 +    fr->bham_b_max = 0;
 +    for (mt1 = 0; mt1 < mtop->nmoltype; mt1++)
 +    {
 +        at1 = &mtop->moltype[mt1].atoms;
 +        for (i = 0; (i < at1->nr); i++)
 +        {
 +            tpi = at1->atom[i].type;
 +            if (tpi >= ntypes)
 +            {
 +                gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", i, tpi, ntypes);
 +            }
 +
 +            for (mt2 = mt1; mt2 < mtop->nmoltype; mt2++)
 +            {
 +                at2 = &mtop->moltype[mt2].atoms;
 +                for (j = 0; (j < at2->nr); j++)
 +                {
 +                    tpj = at2->atom[j].type;
 +                    if (tpj >= ntypes)
 +                    {
 +                        gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", j, tpj, ntypes);
 +                    }
 +                    b = BHAMB(nbfp, ntypes, tpi, tpj);
 +                    if (b > fr->bham_b_max)
 +                    {
 +                        fr->bham_b_max = b;
 +                    }
 +                    if ((b < bmin) || (bmin == -1))
 +                    {
 +                        bmin = b;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
 +                bmin, fr->bham_b_max);
 +    }
 +}
 +
 +static void make_nbf_tables(FILE *fp, const output_env_t oenv,
 +                            t_forcerec *fr, real rtab,
 +                            const t_commrec *cr,
 +                            const char *tabfn, char *eg1, char *eg2,
 +                            t_nblists *nbl)
 +{   /* Fills nbl->table_elec_vdw through make_tables() using the table file
 +     * name derived from tabfn (optionally extended with the energy group
 +     * names eg1 and eg2), then copies its contents into the separate
 +     * nbl->table_elec and nbl->table_vdw tables.
 +     * Returns without touching nbl when tabfn is NULL.
 +     */
 +    char buf[STRLEN];
 +    int  i, j;
 +
 +    if (tabfn == NULL)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
 +        }
 +        return;
 +    }
 +
 +    sprintf(buf, "%s", tabfn);
 +    if (eg1 && eg2)
 +    {
 +        /* Append the two energy group names: the write starts
 +         * strlen(extension)+1 characters before the end of tabfn, so the
 +         * existing ".ext" tail is overwritten by "_eg1_eg2.ext".
 +         * NOTE(review): this assumes tabfn ends in '.' followed by
 +         * ftp2ext(efXVG) and that the result fits in STRLEN; neither is
 +         * checked here.
 +         */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
 +                eg1, eg2, ftp2ext(efXVG));
 +    }
 +    nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
 +    /* Copy the contents of the table to separate coulomb and LJ tables too,
 +     * to improve cache performance.
 +     */
 +    /* For performance reasons we want the table data to be aligned;
 +     * 32 is passed to snew_aligned below. The pointers could be freed
 +     * but currently aren't.
 +     */
 +    nbl->table_elec.interaction   = GMX_TABLE_INTERACTION_ELEC;
 +    nbl->table_elec.format        = nbl->table_elec_vdw.format;
 +    nbl->table_elec.r             = nbl->table_elec_vdw.r;
 +    nbl->table_elec.n             = nbl->table_elec_vdw.n;
 +    nbl->table_elec.scale         = nbl->table_elec_vdw.scale;
 +    nbl->table_elec.scale_exp     = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_elec.formatsize    = nbl->table_elec_vdw.formatsize;
 +    nbl->table_elec.ninteractions = 1;
 +    nbl->table_elec.stride        = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
 +    snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
 +
 +    nbl->table_vdw.interaction   = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
 +    nbl->table_vdw.format        = nbl->table_elec_vdw.format;
 +    nbl->table_vdw.r             = nbl->table_elec_vdw.r;
 +    nbl->table_vdw.n             = nbl->table_elec_vdw.n;
 +    nbl->table_vdw.scale         = nbl->table_elec_vdw.scale;
 +    nbl->table_vdw.scale_exp     = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_vdw.formatsize    = nbl->table_elec_vdw.formatsize;
 +    nbl->table_vdw.ninteractions = 2;
 +    nbl->table_vdw.stride        = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
 +    snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
 +
 +    for (i = 0; i <= nbl->table_elec_vdw.n; i++) /* n+1 table points */
 +    {
 +        for (j = 0; j < 4; j++) /* entries 0..3 of each 12-entry point go to the elec table */
 +        {
 +            nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
 +        }
 +        for (j = 0; j < 8; j++) /* entries 4..11 go to the vdw table; NOTE(review): the 4/8/12 literals presume formatsize == 4 - confirm */
 +        {
 +            nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
 +        }
 +    }
 +}
 +
 +static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
 +                         int *ncount, int **count)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_ilist       *il;
 +    int                  mt, ftype, stride, i, j, tabnr;
 +
 +    for (mt = 0; mt < mtop->nmoltype; mt++)
 +    {
 +        molt = &mtop->moltype[mt];
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if (ftype == ftype1 || ftype == ftype2)
 +            {
 +                il     = &molt->ilist[ftype];
 +                stride = 1 + NRAL(ftype);
 +                for (i = 0; i < il->nr; i += stride)
 +                {
 +                    tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
 +                    if (tabnr < 0)
 +                    {
 +                        gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", tabnr);
 +                    }
 +                    if (tabnr >= *ncount)
 +                    {
 +                        srenew(*count, tabnr+1);
 +                        for (j = *ncount; j < tabnr+1; j++)
 +                        {
 +                            (*count)[j] = 0;
 +                        }
 +                        *ncount = tabnr+1;
 +                    }
 +                    (*count)[tabnr]++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static bondedtable_t *make_bonded_tables(FILE *fplog,
 +                                         int ftype1, int ftype2,
 +                                         const gmx_mtop_t *mtop,
 +                                         const char *basefn, const char *tabext)
 +{
 +    int            i, ncount, *count;
 +    char           tabfn[STRLEN];
 +    bondedtable_t *tab;
 +
 +    tab = NULL;
 +
 +    ncount = 0;
 +    count  = NULL;
 +    count_tables(ftype1, ftype2, mtop, &ncount, &count);
 +
 +    if (ncount > 0)
 +    {
 +        snew(tab, ncount);
 +        for (i = 0; i < ncount; i++)
 +        {
 +            if (count[i] > 0)
 +            {
 +                sprintf(tabfn, "%s", basefn);
 +                sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
 +                        tabext, i, ftp2ext(efXVG));
 +                tab[i] = make_bonded_table(fplog, tabfn, NRAL(ftype1)-2);
 +            }
 +        }
 +        sfree(count);
 +    }
 +
 +    return tab;
 +}
 +
 +void forcerec_set_ranges(t_forcerec *fr,
 +                         int ncg_home, int ncg_force,
 +                         int natoms_force,
 +                         int natoms_force_constr, int natoms_f_novirsum)
 +{
 +    fr->cg0 = 0;
 +    fr->hcg = ncg_home;
 +
 +    /* fr->ncg_force is unused in the standard code,
 +     * but it can be useful for modified code dealing with charge groups.
 +     */
 +    fr->ncg_force           = ncg_force;
 +    fr->natoms_force        = natoms_force;
 +    fr->natoms_force_constr = natoms_force_constr;
 +
 +    if (fr->natoms_force_constr > fr->nalloc_force)
 +    {
 +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
 +
 +        if (fr->bTwinRange)
 +        {
 +            srenew(fr->f_twin, fr->nalloc_force);
 +        }
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        fr->f_novirsum_n = natoms_f_novirsum;
 +        if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
 +        {
 +            fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
 +            srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
 +        }
 +    }
 +    else
 +    {
 +        fr->f_novirsum_n = 0;
 +    }
 +}
 +
 +static real cutoff_inf(real cutoff)
 +{
 +    if (cutoff == 0)
 +    {
 +        cutoff = GMX_CUTOFF_INF;
 +    }
 +
 +    return cutoff;
 +}
 +
 +static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
 +                                  t_forcerec *fr, const t_inputrec *ir,
 +                                  const char *tabfn, const gmx_mtop_t *mtop,
 +                                  matrix     box)
 +{   /* Allocates fr->atf_tabs with one entry per thermoforce group
 +     * (ir->adress->n_tf_grps) and fills each entry with make_atf_table(),
 +     * using a file name built from tabfn and the energy group name.
 +     * A NULL tabfn is a fatal error.
 +     */
 +    char buf[STRLEN];
 +    int  i, j;
 +
 +    if (tabfn == NULL)
 +    {
 +        gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
 +        return; /* presumably never reached if gmx_fatal does not return - confirm */
 +    }
 +
 +    snew(fr->atf_tabs, ir->adress->n_tf_grps);
 +
 +    sprintf(buf, "%s", tabfn);
 +    for (i = 0; i < ir->adress->n_tf_grps; i++)
 +    {
 +        j = ir->adress->tf_table_index[i]; /* get energy group index */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s", /* overwrites the ".ext" tail of the copied tabfn; NOTE(review): assumes tabfn ends in '.' + ftp2ext(efXVG), not checked */
 +                *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]), ftp2ext(efXVG));
 -            fprintf(fp,"loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[i], buf);
++        if (fp) /* only log when a log file is available */
 +        {
++            fprintf(fp, "loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[i], buf);
 +        }
 +        fr->atf_tabs[i] = make_atf_table(fp, oenv, fr, buf, box);
 +    }
 +
 +}
 +
 +gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
 +                          gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
 +{
 +    gmx_bool bAllvsAll;
 +
 +    bAllvsAll =
 +        (
 +            ir->rlist == 0            &&
 +            ir->rcoulomb == 0         &&
 +            ir->rvdw == 0             &&
 +            ir->ePBC == epbcNONE      &&
 +            ir->vdwtype == evdwCUT    &&
 +            ir->coulombtype == eelCUT &&
 +            ir->efep == efepNO        &&
 +            (ir->implicit_solvent == eisNO ||
 +             (ir->implicit_solvent == eisGBSA && (ir->gb_algorithm == egbSTILL ||
 +                                                  ir->gb_algorithm == egbHCT   ||
 +                                                  ir->gb_algorithm == egbOBC))) &&
 +            getenv("GMX_NO_ALLVSALL") == NULL
 +        );
 +
 +    if (bAllvsAll && ir->opts.ngener > 1)
 +    {
 +        const char *note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
 +
 +        if (bPrintNote)
 +        {
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr, "\n%s\n", note);
 +            }
 +            if (fp != NULL)
 +            {
 +                fprintf(fp, "\n%s\n", note);
 +            }
 +        }
 +        bAllvsAll = FALSE;
 +    }
 +
 +    if (bAllvsAll && fp && MASTER(cr))
 +    {
 +        fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
 +    }
 +
 +    return bAllvsAll;
 +}
 +
 +
 +static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
 +{
 +    int t, i;
 +
 +    /* These thread local data structures are used for bondeds only */
 +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
 +
 +    if (fr->nthreads > 1)
 +    {
 +        snew(fr->f_t, fr->nthreads);
 +        /* Thread 0 uses the global force and energy arrays */
 +        for (t = 1; t < fr->nthreads; t++)
 +        {
 +            fr->f_t[t].f        = NULL;
 +            fr->f_t[t].f_nalloc = 0;
 +            snew(fr->f_t[t].fshift, SHIFTS);
 +            fr->f_t[t].grpp.nener = nenergrp*nenergrp;
 +            for (i = 0; i < egNR; i++)
 +            {
 +                snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void pick_nbnxn_kernel_cpu(FILE             *fp,
 +                                  const t_commrec  *cr,
 +                                  const gmx_cpuid_t cpuid_info,
 +                                  const t_inputrec *ir,
 +                                  int              *kernel_type,
 +                                  int              *ewald_excl)
 +{   /* Selects the CPU nbnxn kernel type and the Ewald exclusion treatment.
 +     * The defaults are the plain-C 4x4 kernel with tabulated exclusion
 +     * correction; when GMX_NBNXN_SIMD is defined they are replaced based
 +     * on the compiled-in SIMD support, the Coulomb type in ir and
 +     * environment variable overrides. Later assignments win, so the
 +     * *_2XNN and *_ANALYTICAL environment variables take precedence when
 +     * both variables of a pair are set.
 +     * fp, cr and cpuid_info are not referenced in this body.
 +     */
 +    *kernel_type = nbnxnk4x4_PlainC;
 +    *ewald_excl  = ewaldexclTable;
 +
 +#ifdef GMX_NBNXN_SIMD
 +    {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +        *kernel_type = nbnxnk4xN_SIMD_4xN;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +        /* We expect the 2xNN kernels to be faster in most cases */
 +        *kernel_type = nbnxnk4xN_SIMD_2xNN;
 +#endif
 +
 +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
 +        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
 +        {
 +            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
 +             * 10% with HT, 50% without HT, but extra zeros interactions
 +             * can compensate. As we currently don't detect the actual use
 +             * of HT, switch to 4x8 to avoid a potential performance hit.
 +             */
 +            *kernel_type = nbnxnk4xN_SIMD_4xN;
 +        }
 +#endif
 +        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL) /* user override: force 4xN */
 +        {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +            *kernel_type = nbnxnk4xN_SIMD_4xN;
 +#else
 +            gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
 +#endif
 +        }
 +        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL) /* user override: force 2x(N+N) */
 +        {
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +            *kernel_type = nbnxnk4xN_SIMD_2xNN;
 +#else
 +            gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
 +#endif
 +        }
 +
 +        /* Analytical Ewald exclusion correction is only an option in the
 +         * x86 SIMD kernel. This is faster in single precision
 +         * on Bulldozer and slightly faster on Sandy Bridge.
 +         */
 +#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
 +        *ewald_excl = ewaldexclAnalytical;
 +#endif
 +        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL) /* user override: tabulated correction */
 +        {
 +            *ewald_excl = ewaldexclTable;
 +        }
 +        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL) /* user override: analytical correction */
 +        {
 +            *ewald_excl = ewaldexclAnalytical;
 +        }
 +
 +    }
 +#endif /* GMX_NBNXN_SIMD */
 +}
 +
 +
 +const char *lookup_nbnxn_kernel_name(int kernel_type)
 +{
 +    const char *returnvalue = NULL;
 +    switch (kernel_type)
 +    {
 +        case nbnxnkNotSet: returnvalue     = "not set"; break;
 +        case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
 +#ifndef GMX_NBNXN_SIMD
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "not available"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
 +#else
 +#ifdef GMX_X86_SSE2
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +            /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
 +             * on compiler flags. As we use nearly identical intrinsics, using an AVX
 +             * compiler flag without an AVX macro effectively results in AVX kernels.
 +             * For gcc we check for __AVX__
 +             * At least a check for icc should be added (if there is a macro)
 +             */
 +#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
 +#ifndef GMX_X86_SSE4_1
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE2"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
 +#else
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE4.1"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
 +#endif
 +#else
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-128"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
 +#endif
 +#endif
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-256"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
 +#endif
 +#else   /* not GMX_X86_SSE2 */
 +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SIMD"; break;
 +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
 +#endif
 +#endif
 +        case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
 +        case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
 +
 +        case nbnxnkNR:
 +        default:
 +            gmx_fatal(FARGS, "Illegal kernel type selected");
 +            returnvalue = NULL;
 +            break;
 +    }
 +    return returnvalue;
 +};
 +
 +static void pick_nbnxn_kernel(FILE                *fp,
 +                              const t_commrec     *cr,
 +                              const gmx_hw_info_t *hwinfo,
 +                              gmx_bool             use_cpu_acceleration,
 +                              gmx_bool             bUseGPU,
 +                              gmx_bool             bEmulateGPU,
 +                              const t_inputrec    *ir,
 +                              int                 *kernel_type,
 +                              int                 *ewald_excl,
 +                              gmx_bool             bDoNonbonded)
 +{
 +    assert(kernel_type);
 +
 +    *kernel_type = nbnxnkNotSet;
 +    *ewald_excl  = ewaldexclTable;
 +
 +    if (bEmulateGPU)
 +    {
 +        *kernel_type = nbnxnk8x8x8_PlainC;
 +
 +        if (bDoNonbonded)
 +        {
 +            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
 +        }
 +    }
 +    else if (bUseGPU)
 +    {
 +        *kernel_type = nbnxnk8x8x8_CUDA;
 +    }
 +
 +    if (*kernel_type == nbnxnkNotSet)
 +    {
 +        if (use_cpu_acceleration)
 +        {
 +            pick_nbnxn_kernel_cpu(fp, cr, hwinfo->cpuid_info, ir,
 +                                  kernel_type, ewald_excl);
 +        }
 +        else
 +        {
 +            *kernel_type = nbnxnk4x4_PlainC;
 +        }
 +    }
 +
 +    if (bDoNonbonded && fp != NULL)
 +    {
 +        fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
 +                lookup_nbnxn_kernel_name(*kernel_type),
 +                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
 +                nbnxn_kernel_to_cj_size(*kernel_type));
 +    }
 +}
 +
 +static void pick_nbnxn_resources(FILE                *fp,
 +                                 const t_commrec     *cr,
 +                                 const gmx_hw_info_t *hwinfo,
 +                                 gmx_bool             bDoNonbonded,
 +                                 gmx_bool            *bUseGPU,
 +                                 gmx_bool            *bEmulateGPU)
 +{
 +    gmx_bool bEmulateGPUEnvVarSet;
 +    char     gpu_err_str[STRLEN];
 +
 +    *bUseGPU = FALSE;
 +
 +    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
 +     * GPUs (currently) only handle non-bonded calculations, we will
 +     * automatically switch to emulation if non-bonded calculations are
 +     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
 +     * way to turn off GPU initialization, data movement, and cleanup.
 +     *
 +     * GPU emulation can be useful to assess the performance one can expect by
 +     * adding GPU(s) to the machine. The conditional below allows this even
 +     * if mdrun is compiled without GPU acceleration support.
 +     * Note that you should freezing the system as otherwise it will explode.
 +     */
 +    *bEmulateGPU = (bEmulateGPUEnvVarSet ||
 +                    (!bDoNonbonded && hwinfo->bCanUseGPU));
 +
 +    /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
 +     */
 +    if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
 +    {
 +        /* Each PP node will use the intra-node id-th device from the
 +         * list of detected/selected GPUs. */
 +        if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
 +        {
 +            /* At this point the init should never fail as we made sure that
 +             * we have all the GPUs we need. If it still does, we'll bail. */
 +            gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
 +                      cr->nodeid,
 +                      get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
 +                      gpu_err_str);
 +        }
 +
 +        /* Here we actually turn on hardware GPU acceleration */
 +        *bUseGPU = TRUE;
 +    }
 +}
 +
 +gmx_bool uses_simple_tables(int                 cutoff_scheme,
 +                            nonbonded_verlet_t *nbv,
 +                            int                 group)
 +{
 +    gmx_bool bUsesSimpleTables = TRUE;
 +    int      grp_index;
 +
 +    switch (cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            bUsesSimpleTables = TRUE;
 +            break;
 +        case ecutsVERLET:
 +            assert(NULL != nbv && NULL != nbv->grp);
 +            grp_index         = (group < 0) ? 0 : (nbv->ngrp - 1);
 +            bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +    }
 +    return bUsesSimpleTables;
 +}
 +
 +static void init_ewald_f_table(interaction_const_t *ic,
 +                               gmx_bool             bUsesSimpleTables,
 +                               real                 rtab)
 +{
 +    real maxr;
 +
 +    if (bUsesSimpleTables)
 +    {
 +        /* With a spacing of 0.0005 we are at the force summation accuracy
 +         * for the SSE kernels for "normal" atomistic simulations.
 +         */
 +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
 +                                                   ic->rcoulomb);
 +
 +        maxr           = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
 +        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
 +    }
 +    else
 +    {
 +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
 +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
 +    }
 +
 +    sfree_aligned(ic->tabq_coul_FDV0);
 +    sfree_aligned(ic->tabq_coul_F);
 +    sfree_aligned(ic->tabq_coul_V);
 +
 +    /* Create the original table data in FDV0 */
 +    snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
 +    snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
 +    snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
 +    table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
 +                                ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
 +}
 +
 +void init_interaction_const_tables(FILE                *fp,
 +                                   interaction_const_t *ic,
 +                                   gmx_bool             bUsesSimpleTables,
 +                                   real                 rtab)
 +{
 +    real spacing;
 +
 +    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
 +    {
 +        init_ewald_f_table(ic, bUsesSimpleTables, rtab);
 +
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
 +                    1/ic->tabq_scale, ic->tabq_size);
 +        }
 +    }
 +}
 +
 +void init_interaction_const(FILE                 *fp,
 +                            interaction_const_t **interaction_const,
 +                            const t_forcerec     *fr,
 +                            real                  rtab)
 +{
 +    interaction_const_t *ic;
 +    gmx_bool             bUsesSimpleTables = TRUE;
 +
 +    snew(ic, 1);
 +
 +    /* Just allocate something so we can free it */
 +    snew_aligned(ic->tabq_coul_FDV0, 16, 32);
 +    snew_aligned(ic->tabq_coul_F, 16, 32);
 +    snew_aligned(ic->tabq_coul_V, 16, 32);
 +
 +    ic->rlist       = fr->rlist;
 +    ic->rlistlong   = fr->rlistlong;
 +
 +    /* Lennard-Jones */
 +    ic->rvdw        = fr->rvdw;
 +    if (fr->vdw_modifier == eintmodPOTSHIFT)
 +    {
 +        ic->sh_invrc6 = pow(ic->rvdw, -6.0);
 +    }
 +    else
 +    {
 +        ic->sh_invrc6 = 0;
 +    }
 +
 +    /* Electrostatics */
 +    ic->eeltype     = fr->eeltype;
 +    ic->rcoulomb    = fr->rcoulomb;
 +    ic->epsilon_r   = fr->epsilon_r;
 +    ic->epsfac      = fr->epsfac;
 +
 +    /* Ewald */
 +    ic->ewaldcoeff  = fr->ewaldcoeff;
 +    if (fr->coulomb_modifier == eintmodPOTSHIFT)
 +    {
 +        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
 +    }
 +    else
 +    {
 +        ic->sh_ewald = 0;
 +    }
 +
 +    /* Reaction-field */
 +    if (EEL_RF(ic->eeltype))
 +    {
 +        ic->epsilon_rf = fr->epsilon_rf;
 +        ic->k_rf       = fr->k_rf;
 +        ic->c_rf       = fr->c_rf;
 +    }
 +    else
 +    {
 +        /* For plain cut-off we might use the reaction-field kernels */
 +        ic->epsilon_rf = ic->epsilon_r;
 +        ic->k_rf       = 0;
 +        if (fr->coulomb_modifier == eintmodPOTSHIFT)
 +        {
 +            ic->c_rf   = 1/ic->rcoulomb;
 +        }
 +        else
 +        {
 +            ic->c_rf   = 0;
 +        }
 +    }
 +
 +    if (fp != NULL)
 +    {
 +        fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
 +                sqr(ic->sh_invrc6), ic->sh_invrc6);
 +        if (ic->eeltype == eelCUT)
 +        {
 +            fprintf(fp, ", Coulomb %.3f", ic->c_rf);
 +        }
 +        else if (EEL_PME(ic->eeltype))
 +        {
 +            fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
 +        }
 +        fprintf(fp, "\n");
 +    }
 +
 +    *interaction_const = ic;
 +
 +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv);
 +    }
 +
 +    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
 +    init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
 +}
 +
 +static void init_nb_verlet(FILE                *fp,
 +                           nonbonded_verlet_t **nb_verlet,
 +                           const t_inputrec    *ir,
 +                           const t_forcerec    *fr,
 +                           const t_commrec     *cr,
 +                           const char          *nbpu_opt)
 +{
 +    nonbonded_verlet_t *nbv;
 +    int                 i;
 +    char               *env;
 +    gmx_bool            bEmulateGPU, bHybridGPURun = FALSE;
 +
 +    nbnxn_alloc_t      *nb_alloc;
 +    nbnxn_free_t       *nb_free;
 +
 +    snew(nbv, 1);
 +
 +    pick_nbnxn_resources(fp, cr, fr->hwinfo,
 +                         fr->bNonbonded,
 +                         &nbv->bUseGPU,
 +                         &bEmulateGPU);
 +
 +    nbv->nbs = NULL;
 +
 +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
 +    for (i = 0; i < nbv->ngrp; i++)
 +    {
 +        nbv->grp[i].nbl_lists.nnbl = 0;
 +        nbv->grp[i].nbat           = NULL;
 +        nbv->grp[i].kernel_type    = nbnxnkNotSet;
 +
 +        if (i == 0) /* local */
 +        {
 +            pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                              nbv->bUseGPU, bEmulateGPU,
 +                              ir,
 +                              &nbv->grp[i].kernel_type,
 +                              &nbv->grp[i].ewald_excl,
 +                              fr->bNonbonded);
 +        }
 +        else /* non-local */
 +        {
 +            if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
 +            {
 +                /* Use GPU for local, select a CPU kernel for non-local */
 +                pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                                  FALSE, FALSE,
 +                                  ir,
 +                                  &nbv->grp[i].kernel_type,
 +                                  &nbv->grp[i].ewald_excl,
 +                                  fr->bNonbonded);
 +
 +                bHybridGPURun = TRUE;
 +            }
 +            else
 +            {
 +                /* Use the same kernel for local and non-local interactions */
 +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
 +                nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
 +            }
 +        }
 +    }
 +
 +    if (nbv->bUseGPU)
 +    {
 +        /* init the NxN GPU data; the last argument tells whether we'll have
 +         * both local and non-local NB calculation on GPU */
 +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
 +                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
 +                        (nbv->ngrp > 1) && !bHybridGPURun);
 +
 +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
 +        {
 +            char *end;
 +
 +            nbv->min_ci_balanced = strtol(env, &end, 10);
 +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
 +            {
 +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
 +            }
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +        else
 +        {
 +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nbv->min_ci_balanced = 0;
 +    }
 +
 +    *nb_verlet = nbv;
 +
 +    nbnxn_init_search(&nbv->nbs,
 +                      DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
 +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
 +                      gmx_omp_nthreads_get(emntNonbonded));
 +
 +    for (i = 0; i < nbv->ngrp; i++)
 +    {
 +        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 +        {
 +            nb_alloc = &pmalloc;
 +            nb_free  = &pfree;
 +        }
 +        else
 +        {
 +            nb_alloc = NULL;
 +            nb_free  = NULL;
 +        }
 +
 +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                /* 8x8x8 "non-simple" lists are ATM always combined */
 +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                nb_alloc, nb_free);
 +
 +        if (i == 0 ||
 +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
 +        {
 +            snew(nbv->grp[i].nbat, 1);
 +            nbnxn_atomdata_init(fp,
 +                                nbv->grp[i].nbat,
 +                                nbv->grp[i].kernel_type,
 +                                fr->ntype, fr->nbfp,
 +                                ir->opts.ngener,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
 +                                nb_alloc, nb_free);
 +        }
 +        else
 +        {
 +            nbv->grp[i].nbat = nbv->grp[0].nbat;
 +        }
 +    }
 +}
 +
 +void init_forcerec(FILE              *fp,
 +                   const output_env_t oenv,
 +                   t_forcerec        *fr,
 +                   t_fcdata          *fcd,
 +                   const t_inputrec  *ir,
 +                   const gmx_mtop_t  *mtop,
 +                   const t_commrec   *cr,
 +                   matrix             box,
 +                   gmx_bool           bMolEpot,
 +                   const char        *tabfn,
 +                   const char        *tabafn,
 +                   const char        *tabpfn,
 +                   const char        *tabbfn,
 +                   const char        *nbpu_opt,
 +                   gmx_bool           bNoSolvOpt,
 +                   real               print_force)
 +{
 +    int            i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
 +    real           rtab;
 +    char          *env;
 +    double         dbl;
 +    rvec           box_size;
 +    const t_block *cgs;
 +    gmx_bool       bGenericKernelOnly;
 +    gmx_bool       bTab, bSep14tab, bNormalnblists;
 +    t_nblists     *nbl;
 +    int           *nm_ind, egp_flags;
 +
 +    if (fr->hwinfo == NULL)
 +    {
 +        /* Detect hardware, gather information.
 +         * In mdrun, hwinfo has already been set before calling init_forcerec.
 +         * Here we ignore GPUs, as tools will not use them anyhow.
 +         */
 +        snew(fr->hwinfo, 1);
 +        gmx_detect_hardware(fp, fr->hwinfo, cr,
 +                            FALSE, FALSE, NULL);
 +    }
 +
 +    /* By default we turn acceleration on, but it might be turned off further down... */
 +    fr->use_cpu_acceleration = TRUE;
 +
 +    fr->bDomDec = DOMAINDECOMP(cr);
 +
 +    natoms = mtop->natoms;
 +
 +    if (check_box(ir->ePBC, box))
 +    {
 +        gmx_fatal(FARGS, check_box(ir->ePBC, box));
 +    }
 +
 +    /* Test particle insertion ? */
 +    if (EI_TPI(ir->eI))
 +    {
 +        /* Set to the size of the molecule to be inserted (the last one) */
 +        /* Because of old style topologies, we have to use the last cg
 +         * instead of the last molecule type.
 +         */
 +        cgs       = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
 +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
 +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
 +        {
 +            gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
 +        }
 +    }
 +    else
 +    {
 +        fr->n_tpi = 0;
 +    }
 +
 +    /* Copy AdResS parameters */
 +    if (ir->bAdress)
 +    {
 +        fr->adress_type           = ir->adress->type;
 +        fr->adress_const_wf       = ir->adress->const_wf;
 +        fr->adress_ex_width       = ir->adress->ex_width;
 +        fr->adress_hy_width       = ir->adress->hy_width;
 +        fr->adress_icor           = ir->adress->icor;
 +        fr->adress_site           = ir->adress->site;
 +        fr->adress_ex_forcecap    = ir->adress->ex_forcecap;
 +        fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
 +
 +
 +        snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
 +        for (i = 0; i < ir->adress->n_energy_grps; i++)
 +        {
 +            fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
 +        }
 +
 +        fr->n_adress_tf_grps = ir->adress->n_tf_grps;
 +        snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
 +        for (i = 0; i < fr->n_adress_tf_grps; i++)
 +        {
 +            fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
 +        }
 +        copy_rvec(ir->adress->refs, fr->adress_refs);
 +    }
 +    else
 +    {
 +        fr->adress_type           = eAdressOff;
 +        fr->adress_do_hybridpairs = FALSE;
 +    }
 +
 +    /* Copy the user determined parameters */
 +    fr->userint1  = ir->userint1;
 +    fr->userint2  = ir->userint2;
 +    fr->userint3  = ir->userint3;
 +    fr->userint4  = ir->userint4;
 +    fr->userreal1 = ir->userreal1;
 +    fr->userreal2 = ir->userreal2;
 +    fr->userreal3 = ir->userreal3;
 +    fr->userreal4 = ir->userreal4;
 +
 +    /* Shell stuff */
 +    fr->fc_stepsize = ir->fc_stepsize;
 +
 +    /* Free energy */
 +    fr->efep        = ir->efep;
 +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
 +    if (ir->fepvals->bScCoul)
 +    {
 +        fr->sc_alphacoul  = ir->fepvals->sc_alpha;
 +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
 +    }
 +    else
 +    {
 +        fr->sc_alphacoul  = 0;
 +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
 +    }
 +    fr->sc_power      = ir->fepvals->sc_power;
 +    fr->sc_r_power    = ir->fepvals->sc_r_power;
 +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
 +
 +    env = getenv("GMX_SCSIGMA_MIN");
 +    if (env != NULL)
 +    {
 +        dbl = 0;
 +        sscanf(env, "%lf", &dbl);
 +        fr->sc_sigma6_min = pow(dbl, 6);
 +        if (fp)
 +        {
 +            fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
 +        }
 +    }
 +
 +    fr->bNonbonded = TRUE;
 +    if (getenv("GMX_NO_NONBONDED") != NULL)
 +    {
 +        /* turn off non-bonded calculations */
 +        fr->bNonbonded = FALSE;
 +        md_print_warn(cr, fp,
 +                      "Found environment variable GMX_NO_NONBONDED.\n"
 +                      "Disabling nonbonded calculations.\n");
 +    }
 +
 +    bGenericKernelOnly = FALSE;
 +
 +    /* We now check in the NS code whether a particular combination of interactions
 +     * can be used with water optimization, and disable it if that is not the case.
 +     */
 +
 +    if (getenv("GMX_NB_GENERIC") != NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "Found environment variable GMX_NB_GENERIC.\n"
 +                    "Disabling all interaction-specific nonbonded kernels, will only\n"
 +                    "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
 +        }
 +        bGenericKernelOnly = TRUE;
 +    }
 +
 +    if (bGenericKernelOnly == TRUE)
 +    {
 +        bNoSolvOpt         = TRUE;
 +    }
 +
 +    if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
 +    {
 +        fr->use_cpu_acceleration = FALSE;
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
 +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
 +        }
 +    }
 +
 +    fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +
 +    /* Check if we can/should do all-vs-all kernels */
 +    fr->bAllvsAll       = can_use_allvsall(ir, mtop, FALSE, NULL, NULL);
 +    fr->AllvsAll_work   = NULL;
 +    fr->AllvsAll_workgb = NULL;
 +
 +
 +    /* Neighbour searching stuff */
 +    fr->cutoff_scheme = ir->cutoff_scheme;
 +    fr->bGrid         = (ir->ns_type == ensGRID);
 +    fr->ePBC          = ir->ePBC;
 +
 +    /* Determine if we will do PBC for distances in bonded interactions */
 +    if (fr->ePBC == epbcNONE)
 +    {
 +        fr->bMolPBC = FALSE;
 +    }
 +    else
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            /* The group cut-off scheme and SHAKE assume charge groups
 +             * are whole, but not using molpbc is faster in most cases.
 +             */
 +            if (fr->cutoff_scheme == ecutsGROUP ||
 +                (ir->eConstrAlg == econtSHAKE &&
 +                 (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
 +                  gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
 +            {
 +                fr->bMolPBC = ir->bPeriodicMols;
 +            }
 +            else
 +            {
 +                fr->bMolPBC = TRUE;
 +                if (getenv("GMX_USE_GRAPH") != NULL)
 +                {
 +                    fr->bMolPBC = FALSE;
 +                    if (fp)
 +                    {
 +                        fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
 +        }
 +    }
 +    fr->bGB = (ir->implicit_solvent == eisGBSA);
 +
 +    fr->rc_scaling = ir->refcoord_scaling;
 +    copy_rvec(ir->posres_com, fr->posres_com);
 +    copy_rvec(ir->posres_comB, fr->posres_comB);
 +    fr->rlist      = cutoff_inf(ir->rlist);
 +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
 +    fr->eeltype    = ir->coulombtype;
 +    fr->vdwtype    = ir->vdwtype;
 +
 +    fr->coulomb_modifier = ir->coulomb_modifier;
 +    fr->vdw_modifier     = ir->vdw_modifier;
 +
 +    /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
 +    switch (fr->eeltype)
 +    {
 +        case eelCUT:
 +            fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
 +            break;
 +
 +        case eelRF:
 +        case eelGRF:
 +        case eelRF_NEC:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            break;
 +
 +        case eelRF_ZERO:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            fr->coulomb_modifier          = eintmodEXACTCUTOFF;
 +            break;
 +
 +        case eelSWITCH:
 +        case eelSHIFT:
 +        case eelUSER:
 +        case eelENCADSHIFT:
 +        case eelPMESWITCH:
 +        case eelPMEUSER:
 +        case eelPMEUSERSWITCH:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            break;
 +
 +        case eelPME:
 +        case eelEWALD:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
 +            break;
 +    }
 +
 +    /* Vdw: Translate from mdp settings to kernel format */
 +    switch (fr->vdwtype)
 +    {
 +        case evdwCUT:
 +            if (fr->bBHAM)
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
 +            }
 +            else
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
 +            }
 +            break;
 +
 +        case evdwSWITCH:
 +        case evdwSHIFT:
 +        case evdwUSER:
 +        case evdwENCADSHIFT:
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
 +            break;
 +    }
 +
 +    /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
 +    fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
 +    fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
 +
 +    fr->bTwinRange = fr->rlistlong > fr->rlist;
 +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
 +
 +    fr->reppow     = mtop->ffparams.reppow;
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
 +                          !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
 +        /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
 +        fr->bcoultab   = !(fr->eeltype == eelCUT ||
 +                           fr->eeltype == eelEWALD ||
 +                           fr->eeltype == eelPME ||
 +                           fr->eeltype == eelRF ||
 +                           fr->eeltype == eelRF_ZERO);
 +
 +        /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
 +         * going to be faster to tabulate the interaction than calling the generic kernel.
 +         */
 +        if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
 +        {
 +            if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +        else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
 +                 ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
 +                   fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
 +                   (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
 +        {
 +            if (fr->rcoulomb != fr->rvdw)
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +
 +        if (getenv("GMX_REQUIRE_TABLES"))
 +        {
 +            fr->bvdwtab  = TRUE;
 +            fr->bcoultab = TRUE;
 +        }
 +
 +        if (fp)
 +        {
 +            fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
 +            fprintf(fp, "Table routines are used for vdw:     %s\n", bool_names[fr->bvdwtab ]);
 +        }
 +
 +        if (fr->bvdwtab == TRUE)
 +        {
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            fr->nbkernel_vdw_modifier    = eintmodNONE;
 +        }
 +        if (fr->bcoultab == TRUE)
 +        {
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            fr->nbkernel_elec_modifier    = eintmodNONE;
 +        }
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
 +        {
 +            gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
 +        }
 +        fr->bvdwtab  = FALSE;
 +        fr->bcoultab = FALSE;
 +    }
 +
 +    /* Tables are used for direct ewald sum */
 +    if (fr->bEwald)
 +    {
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (fp)
 +            {
 +                fprintf(fp, "Will do PME sum in reciprocal space.\n");
 +            }
 +            if (ir->coulombtype == eelP3M_AD)
 +            {
 +                please_cite(fp, "Hockney1988");
 +                please_cite(fp, "Ballenegger2012");
 +            }
 +            else
 +            {
 +                please_cite(fp, "Essmann95a");
 +            }
 +
 +            if (ir->ewald_geometry == eewg3DC)
 +            {
 +                if (fp)
 +                {
 +                    fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
 +                }
 +                please_cite(fp, "In-Chul99a");
 +            }
 +        }
 +        fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
 +        init_ewald_tab(&(fr->ewald_table), cr, ir, fp);
 +        if (fp)
 +        {
 +            fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
 +                    1/fr->ewaldcoeff);
 +        }
 +    }
 +
 +    /* Electrostatics */
 +    fr->epsilon_r       = ir->epsilon_r;
 +    fr->epsilon_rf      = ir->epsilon_rf;
 +    fr->fudgeQQ         = mtop->ffparams.fudgeQQ;
 +    fr->rcoulomb_switch = ir->rcoulomb_switch;
 +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
 +
 +    /* Parameters for generalized RF */
 +    fr->zsquare = 0.0;
 +    fr->temp    = 0.0;
 +
 +    if (fr->eeltype == eelGRF)
 +    {
 +        init_generalized_rf(fp, mtop, ir, fr);
 +    }
 +    else if (fr->eeltype == eelSHIFT)
 +    {
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            box_size[m] = box[m][m];
 +        }
 +
 +        if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
 +        {
 +            set_shift_consts(fp, fr->rcoulomb_switch, fr->rcoulomb, box_size, fr);
 +        }
 +    }
 +
 +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
 +                       gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
 +                       gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
 +                       IR_ELEC_FIELD(*ir) ||
 +                       (fr->adress_icor != eAdressICOff)
 +                       );
 +
 +    if (fr->cutoff_scheme == ecutsGROUP &&
 +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
 +    {
 +        /* Count the total number of charge groups */
 +        fr->cg_nalloc = ncg_mtop(mtop);
 +        srenew(fr->cg_cm, fr->cg_nalloc);
 +    }
 +    if (fr->shift_vec == NULL)
 +    {
 +        snew(fr->shift_vec, SHIFTS);
 +    }
 +
 +    if (fr->fshift == NULL)
 +    {
 +        snew(fr->fshift, SHIFTS);
 +    }
 +
 +    if (fr->nbfp == NULL)
 +    {
 +        fr->ntype = mtop->ffparams.atnr;
 +        fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
 +    }
 +
 +    /* Copy the energy group exclusions */
 +    fr->egp_flags = ir->opts.egp_flags;
 +
 +    /* Van der Waals stuff */
 +    fr->rvdw        = cutoff_inf(ir->rvdw);
 +    fr->rvdw_switch = ir->rvdw_switch;
 +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
 +    {
 +        if (fr->rvdw_switch >= fr->rvdw)
 +        {
 +            gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
 +                      fr->rvdw_switch, fr->rvdw);
 +        }
 +        if (fp)
 +        {
 +            fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
 +                    (fr->eeltype == eelSWITCH) ? "switched" : "shifted",
 +                    fr->rvdw_switch, fr->rvdw);
 +        }
 +    }
 +
 +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
 +    {
 +        gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
 +    }
 +
 +    if (fp)
 +    {
 +        fprintf(fp, "Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
 +                fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
 +    }
 +
 +    fr->eDispCorr = ir->eDispCorr;
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        set_avcsixtwelve(fp, fr, mtop);
 +    }
 +
 +    if (fr->bBHAM)
 +    {
 +        set_bham_b_max(fp, fr, mtop);
 +    }
 +
 +    fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 +
 +    /* Copy the GBSA data (radius, volume and surftens for each
 +     * atomtype) from the topology atomtype section to forcerec.
 +     */
 +    snew(fr->atype_radius, fr->ntype);
 +    snew(fr->atype_vol, fr->ntype);
 +    snew(fr->atype_surftens, fr->ntype);
 +    snew(fr->atype_gb_radius, fr->ntype);
 +    snew(fr->atype_S_hct, fr->ntype);
 +
 +    if (mtop->atomtypes.nr > 0)
 +    {
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_radius[i] = mtop->atomtypes.radius[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
 +        }
 +        for (i = 0; i < fr->ntype; i++)
 +        {
 +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
 +        }
 +    }
 +
 +    /* Generate the GB table if needed */
 +    if (fr->bGB)
 +    {
 +#ifdef GMX_DOUBLE
 +        fr->gbtabscale = 2000;
 +#else
 +        fr->gbtabscale = 500;
 +#endif
 +
 +        fr->gbtabr = 100;
 +        fr->gbtab  = make_gb_table(fp, oenv, fr, tabpfn, fr->gbtabscale);
 +
 +        init_gb(&fr->born, cr, fr, ir, mtop, ir->rgbradii, ir->gb_algorithm);
 +
 +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            make_local_gb(cr, fr->born, ir->gb_algorithm);
 +        }
 +    }
 +
 +    /* Set the charge scaling */
 +    if (fr->epsilon_r != 0)
 +    {
 +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
 +    }
 +    else
 +    {
 +        /* eps = 0 is infinite dieletric: no coulomb interactions */
 +        fr->epsfac = 0;
 +    }
 +
 +    /* Reaction field constants */
 +    if (EEL_RF(fr->eeltype))
 +    {
 +        calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
 +                   fr->rcoulomb, fr->temp, fr->zsquare, box,
 +                   &fr->kappa, &fr->k_rf, &fr->c_rf);
 +    }
 +
 +    set_chargesum(fp, fr, mtop);
 +
 +    /* if we are using LR electrostatics, and they are tabulated,
 +     * the tables will contain modified coulomb interactions.
 +     * Since we want to use the non-shifted ones for 1-4
 +     * coulombic interactions, we must have an extra set of tables.
 +     */
 +
 +    /* Construct tables.
 +     * A little unnecessary to make both vdw and coul tables sometimes,
 +     * but what the heck... */
 +
 +    bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
 +
 +    bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
 +                  fr->bBHAM || fr->bEwald) &&
 +                 (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
 +                  gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
 +                  gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
 +
 +    negp_pp   = ir->opts.ngener - ir->nwall;
 +    negptable = 0;
 +    if (!bTab)
 +    {
 +        bNormalnblists = TRUE;
 +        fr->nnblists   = 1;
 +    }
 +    else
 +    {
 +        bNormalnblists = (ir->eDispCorr != edispcNO);
 +        for (egi = 0; egi < negp_pp; egi++)
 +        {
 +            for (egj = egi; egj < negp_pp; egj++)
 +            {
 +                egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
 +                if (!(egp_flags & EGP_EXCL))
 +                {
 +                    if (egp_flags & EGP_TABLE)
 +                    {
 +                        negptable++;
 +                    }
 +                    else
 +                    {
 +                        bNormalnblists = TRUE;
 +                    }
 +                }
 +            }
 +        }
 +        if (bNormalnblists)
 +        {
 +            fr->nnblists = negptable + 1;
 +        }
 +        else
 +        {
 +            fr->nnblists = negptable;
 +        }
 +        if (fr->nnblists > 1)
 +        {
 +            snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
 +        }
 +    }
 +
 +    if (ir->adress)
 +    {
 +        fr->nnblists *= 2;
 +    }
 +
 +    snew(fr->nblists, fr->nnblists);
 +
 +    /* This code automatically gives table length tabext without cut-off's,
 +     * in that case grompp should already have checked that we do not need
 +     * normal tables and we only generate tables for 1-4 interactions.
 +     */
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (bTab)
 +    {
 +        /* make tables for ordinary interactions */
 +        if (bNormalnblists)
 +        {
 +            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
 +            if (ir->adress)
 +            {
 +                make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
 +            }
 +            if (!bSep14tab)
 +            {
 +                fr->tab14 = fr->nblists[0].table_elec_vdw;
 +            }
 +            m = 1;
 +        }
 +        else
 +        {
 +            m = 0;
 +        }
 +        if (negptable > 0)
 +        {
 +            /* Read the special tables for certain energy group pairs */
 +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
 +            for (egi = 0; egi < negp_pp; egi++)
 +            {
 +                for (egj = egi; egj < negp_pp; egj++)
 +                {
 +                    egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
 +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
 +                    {
 +                        nbl = &(fr->nblists[m]);
 +                        if (fr->nnblists > 1)
 +                        {
 +                            fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
 +                        }
 +                        /* Read the table file with the two energy groups names appended */
 +                        make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[m]);
 +                        if (ir->adress)
 +                        {
 +                            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
 +                                            *mtop->groups.grpname[nm_ind[egi]],
 +                                            *mtop->groups.grpname[nm_ind[egj]],
 +                                            &fr->nblists[fr->nnblists/2+m]);
 +                        }
 +                        m++;
 +                    }
 +                    else if (fr->nnblists > 1)
 +                    {
 +                        fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (bSep14tab)
 +    {
 +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
 +        fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
 +                                GMX_MAKETABLES_14ONLY);
 +    }
 +
 +    /* Read AdResS Thermo Force table if needed */
 +    if (fr->adress_icor == eAdressICThermoForce)
 +    {
 +        /* old todo replace */
 +
 +        if (ir->adress->n_tf_grps > 0)
 +        {
 +            make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
 +
 +        }
 +        else
 +        {
 +            /* load the default table */
 +            snew(fr->atf_tabs, 1);
 +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
 +        }
 +    }
 +
 +    /* Wall stuff */
 +    fr->nwall = ir->nwall;
 +    if (ir->nwall && ir->wall_type == ewtTABLE)
 +    {
 +        make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
 +    }
 +
 +    if (fcd && tabbfn)
 +    {
 +        fcd->bondtab  = make_bonded_tables(fp,
 +                                           F_TABBONDS, F_TABBONDSNC,
 +                                           mtop, tabbfn, "b");
 +        fcd->angletab = make_bonded_tables(fp,
 +                                           F_TABANGLES, -1,
 +                                           mtop, tabbfn, "a");
 +        fcd->dihtab   = make_bonded_tables(fp,
 +                                           F_TABDIHS, -1,
 +                                           mtop, tabbfn, "d");
 +    }
 +    else
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
 +        }
 +    }
 +
 +    /* QM/MM initialization if requested
 +     */
 +    if (ir->bQMMM)
 +    {
 +        fprintf(stderr, "QM/MM calculation requested.\n");
 +    }
 +
 +    fr->bQMMM      = ir->bQMMM;
 +    fr->qr         = mk_QMMMrec();
 +
 +    /* Set all the static charge group info */
 +    fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
 +                                   &fr->bExcl_IntraCGAll_InterCGNone);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        fr->cginfo = NULL;
 +    }
 +    else
 +    {
 +        fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
 +    }
 +
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* When using particle decomposition, the effect of the second argument,
 +         * which sets fr->hcg, is corrected later in do_md and init_em.
 +         */
 +        forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
 +                            mtop->natoms, mtop->natoms, mtop->natoms);
 +    }
 +
 +    fr->print_force = print_force;
 +
 +
 +    /* coarse load balancing vars */
 +    fr->t_fnbf    = 0.;
 +    fr->t_wait    = 0.;
 +    fr->timesteps = 0;
 +
 +    /* Initialize neighbor search */
 +    init_ns(fp, cr, &fr->ns, fr, mtop, box);
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        gmx_nonbonded_setup(fp, fr, bGenericKernelOnly);
 +        /*
 +           if (ir->bAdress)
 +            {
 +                gmx_setup_adress_kernels(fp,bGenericKernelOnly);
 +            }
 +         */
 +    }
 +
 +    /* Initialize the thread working data for bonded interactions */
 +    init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
 +
 +    snew(fr->excl_load, fr->nthreads+1);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
 +        }
 +
 +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
 +    }
 +
 +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
 +    init_interaction_const(fp, &fr->ic, fr, rtab);
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        calc_enervirdiff(fp, ir->eDispCorr, fr);
 +    }
 +}
 +
 +#define pr_real(fp, r) fprintf(fp, "%s: %e\n",#r, r)
 +#define pr_int(fp, i)  fprintf((fp), "%s: %d\n",#i, i)
 +#define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[b])
 +
 +void pr_forcerec(FILE *fp, t_forcerec *fr, t_commrec *cr)
 +{
 +    int i;
 +
 +    pr_real(fp, fr->rlist);
 +    pr_real(fp, fr->rcoulomb);
 +    pr_real(fp, fr->fudgeQQ);
 +    pr_bool(fp, fr->bGrid);
 +    pr_bool(fp, fr->bTwinRange);
 +    /*pr_int(fp,fr->cg0);
 +       pr_int(fp,fr->hcg);*/
 +    for (i = 0; i < fr->nnblists; i++)
 +    {
 +        pr_int(fp, fr->nblists[i].table_elec_vdw.n);
 +    }
 +    pr_real(fp, fr->rcoulomb_switch);
 +    pr_real(fp, fr->rcoulomb);
 +
 +    fflush(fp);
 +}
 +
 +void forcerec_set_excl_load(t_forcerec *fr,
 +                            const gmx_localtop_t *top, const t_commrec *cr)
 +{
 +    const int *ind, *a;
 +    int        t, i, j, ntot, n, ntarget;
 +
 +    if (cr != NULL && PARTDECOMP(cr))
 +    {
 +        /* No OpenMP with particle decomposition */
 +        pd_at_range(cr,
 +                    &fr->excl_load[0],
 +                    &fr->excl_load[1]);
 +
 +        return;
 +    }
 +
 +    ind = top->excls.index;
 +    a   = top->excls.a;
 +
 +    ntot = 0;
 +    for (i = 0; i < top->excls.nr; i++)
 +    {
 +        for (j = ind[i]; j < ind[i+1]; j++)
 +        {
 +            if (a[j] > i)
 +            {
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    fr->excl_load[0] = 0;
 +    n                = 0;
 +    i                = 0;
 +    for (t = 1; t <= fr->nthreads; t++)
 +    {
 +        ntarget = (ntot*t)/fr->nthreads;
 +        while (i < top->excls.nr && n < ntarget)
 +        {
 +            for (j = ind[i]; j < ind[i+1]; j++)
 +            {
 +                if (a[j] > i)
 +                {
 +                    n++;
 +                }
 +            }
 +            i++;
 +        }
 +        fr->excl_load[t] = i;
 +    }
 +}
index eb60d3fe0565402b1207eb1c231f804ca32ca7a6,0000000000000000000000000000000000000000..241611c7ce09fd2b49bc1e7126ced73e8740baf0
mode 100644,000000..100644
--- /dev/null
@@@ -1,851 -1,0 +1,866 @@@
-                          int nnodes_tot, int nnodes, int nthreads,
 +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "gmx_wallcycle.h"
 +#include "gmx_cyclecounter.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "md_logging.h"
 +#include "string2.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +/* DEBUG_WCYCLE adds consistency checking for the counters.
 + * It checks if you stop a counter different from the last
 + * one that was opened and if you do nest too deep.
 + */
 +/* #define DEBUG_WCYCLE */
 +
 +typedef struct /* one cycle counter */
 +{
 +    int          n;     /* number of counted start/stop intervals */
 +    gmx_cycles_t c;     /* cycles accumulated over those intervals */
 +    gmx_cycles_t start; /* cycle reading stored by the most recent start call */
 +    gmx_cycles_t last;  /* not referenced in the visible code -- TODO confirm use */
 +} wallcc_t;
 +
 +typedef struct gmx_wallcycle /* state of the wall-cycle accounting for one rank */
 +{
 +    wallcc_t        *wcc; /* main counters, ewcNR entries */
 +    /* variables for testing/debugging */
 +    gmx_bool         wc_barrier; /* TRUE: call MPI_Barrier before each start/stop */
 +    wallcc_t        *wcc_all;    /* ewcNR*ewcNR counter-pair table, NULL unless GMX_CYCLE_ALL is set */
 +    int              wc_depth;   /* nesting depth, only maintained when wcc_all != NULL */
 +#ifdef DEBUG_WCYCLE
 +#define DEPTH_MAX 6
 +    int               counterlist[DEPTH_MAX]; /* stack of currently open counters */
 +    int               count_depth;            /* number of entries in counterlist */
 +#endif
 +    int               ewc_prev;       /* counter recorded by wallcycle_all_start, -1 initially */
 +    gmx_cycles_t      cycle_prev;     /* cycle reading recorded together with ewc_prev */
 +    gmx_large_int_t   reset_counters; /* value passed as resetstep; see the get/set accessors */
 +#ifdef GMX_MPI
 +    MPI_Comm          mpi_comm_mygroup; /* communicator used for the optional barrier */
 +#endif
 +    int               nthreads_pp;  /* thread count used to scale non-PME counters */
 +    int               nthreads_pme; /* thread count used to scale PME counters */
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    wallcc_t         *wcsc; /* sub-counters, ewcsNR entries */
 +#endif
 +    double           *cycles_sum; /* ewcNR+ewcsNR summed cycles, allocated by wallcycle_sum */
 +} gmx_wallcycle_t_t;
 +
 +/* Each name should not exceed 19 characters */
 +static const char *wcn[ewcNR] = /* display names of the main counters, indexed by counter id */
 +{
 +    "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
 +    "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 +    "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
 +    "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve",
 +    "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
 +    "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies",
 +    "Enforced rotation", "Add rot. forces", "Test"
 +};
 +
 +static const char *wcsn[ewcsNR] = /* display names of the sub-counters, indexed by sub-counter id */
 +{
 +    "DD redist.", "DD NS grid + sort", "DD setup comm.",
 +    "DD make top.", "DD make constr.", "DD top. other",
 +    "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 +    "Bonded F", "Nonbonded F", "Ewald F correction",
 +    "NB X buffer ops.", "NB F buffer ops."
 +};
 +
 +gmx_bool wallcycle_have_counter(void)
 +{
 +    return gmx_cycles_have_counter();
 +}
 +
 +gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec *cr,
 +                               int nthreads_pp, int nthreads_pme)
 +{
 +    gmx_wallcycle_t wc;
 +
 +
 +    if (!wallcycle_have_counter())
 +    {
 +        return NULL;
 +    }
 +
 +    snew(wc, 1);
 +
 +    wc->wc_barrier          = FALSE;
 +    wc->wcc_all             = NULL;
 +    wc->wc_depth            = 0;
 +    wc->ewc_prev            = -1;
 +    wc->reset_counters      = resetstep;
 +    wc->nthreads_pp         = nthreads_pp;
 +    wc->nthreads_pme        = nthreads_pme;
 +    wc->cycles_sum          = NULL;
 +
 +#ifdef GMX_MPI
 +    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 +        }
 +        wc->wc_barrier       = TRUE;
 +        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 +    }
 +#endif
 +
 +    snew(wc->wcc, ewcNR);
 +    if (getenv("GMX_CYCLE_ALL") != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\nWill time all the code during the run\n\n");
 +        }
 +        snew(wc->wcc_all, ewcNR*ewcNR);
 +    }
 +
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    snew(wc->wcsc, ewcsNR);
 +#endif
 +
 +#ifdef DEBUG_WCYCLE
 +    wc->count_depth = 0;
 +#endif
 +
 +    return wc;
 +}
 +
 +void wallcycle_destroy(gmx_wallcycle_t wc)
 +{
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    if (wc->wcc != NULL)
 +    {
 +        sfree(wc->wcc);
 +    }
 +    if (wc->wcc_all != NULL)
 +    {
 +        sfree(wc->wcc_all);
 +    }
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    if (wc->wcsc != NULL)
 +    {
 +        sfree(wc->wcsc);
 +    }
 +#endif
 +    sfree(wc);
 +}
 +
 +static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 +{
 +    wc->ewc_prev   = ewc;
 +    wc->cycle_prev = cycle;
 +}
 +
 +static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 +{
 +    wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 +    wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 +}
 +
 +
 +#ifdef DEBUG_WCYCLE
 +static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 +{
 +    /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 +
 +    if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 +    {
 +        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
 +                  wc->count_depth);
 +    }
 +    wc->counterlist[wc->count_depth] = ewc;
 +    wc->count_depth++;
 +}
 +
 +static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 +{
 +    wc->count_depth--;
 +
 +    /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 +
 +    if (wc->count_depth < 0)
 +    {
 +        gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 +    }
 +    if (wc->counterlist[wc->count_depth] != ewc)
 +    {
 +        gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 +                  wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 +    }
 +}
 +#endif
 +
 +void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 +{
 +    gmx_cycles_t cycle;
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +#ifdef GMX_MPI
 +    if (wc->wc_barrier)
 +    {
 +        MPI_Barrier(wc->mpi_comm_mygroup);
 +    }
 +#endif
 +
 +#ifdef DEBUG_WCYCLE
 +    debug_start_check(wc, ewc);
 +#endif
 +
 +    cycle              = gmx_cycles_read();
 +    wc->wcc[ewc].start = cycle;
 +    if (wc->wcc_all != NULL)
 +    {
 +        wc->wc_depth++;
 +        if (ewc == ewcRUN)
 +        {
 +            wallcycle_all_start(wc, ewc, cycle);
 +        }
 +        else if (wc->wc_depth == 3)
 +        {
 +            wallcycle_all_stop(wc, ewc, cycle);
 +        }
 +    }
 +}
 +
 +void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 +{
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    wallcycle_start(wc, ewc);
 +    wc->wcc[ewc].n--;
 +}
 +
 +double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 +{
 +    gmx_cycles_t cycle, last;
 +
 +    if (wc == NULL)
 +    {
 +        return 0;
 +    }
 +
 +#ifdef GMX_MPI
 +    if (wc->wc_barrier)
 +    {
 +        MPI_Barrier(wc->mpi_comm_mygroup);
 +    }
 +#endif
 +
 +#ifdef DEBUG_WCYCLE
 +    debug_stop_check(wc, ewc);
 +#endif
 +
 +    cycle           = gmx_cycles_read();
 +    last            = cycle - wc->wcc[ewc].start;
 +    wc->wcc[ewc].c += last;
 +    wc->wcc[ewc].n++;
 +    if (wc->wcc_all)
 +    {
 +        wc->wc_depth--;
 +        if (ewc == ewcRUN)
 +        {
 +            wallcycle_all_stop(wc, ewc, cycle);
 +        }
 +        else if (wc->wc_depth == 2)
 +        {
 +            wallcycle_all_start(wc, ewc, cycle);
 +        }
 +    }
 +
 +    return last;
 +}
 +
 +void wallcycle_reset_all(gmx_wallcycle_t wc)
 +{
 +    int i;
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    for (i = 0; i < ewcNR; i++)
 +    {
 +        wc->wcc[i].n = 0;
 +        wc->wcc[i].c = 0;
 +    }
 +    if (wc->wcc_all)
 +    {
 +        for (i = 0; i < ewcNR*ewcNR; i++)
 +        {
 +            wc->wcc_all[i].n = 0;
 +            wc->wcc_all[i].c = 0;
 +        }
 +    }
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    for (i = 0; i < ewcsNR; i++)
 +    {
 +        wc->wcsc[i].n = 0;
 +        wc->wcsc[i].c = 0;
 +    }
 +#endif
 +}
 +
 +static gmx_bool is_pme_counter(int ewc)
 +{
 +    return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 +}
 +
 +static gmx_bool is_pme_subcounter(int ewc)
 +{
 +    return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 +}
 +
 +void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 +{   /* Scale, correct and (in MPI builds with more than one node) reduce
 +     * the cycle counts. The resulting per-counter cycles are stored in
 +     * wc->cycles_sum: the ewcNR main counters first, then the sub-counters
 +     * when compiled in. Note that wcc[].c is modified in place and that
 +     * cycles_sum is allocated on every call. A NULL handle is ignored.
 +     */
 +    wallcc_t *wcc;
 +    double   *cycles;
 +    double    cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all;
 +    int       i, j;
 +    int       nsum;
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    snew(wc->cycles_sum, ewcNR+ewcsNR);
 +    cycles = wc->cycles_sum;
 +
 +    wcc = wc->wcc;
 +
 +    for (i = 0; i < ewcNR; i++) /* multiply each count by the thread count of its task */
 +    {
 +        if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
 +        {
 +            wcc[i].c *= wc->nthreads_pme;
 +
 +            if (wc->wcc_all)
 +            {
 +                for (j = 0; j < ewcNR; j++)
 +                {
 +                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
 +                }
 +            }
 +        }
 +        else
 +        {
 +            wcc[i].c *= wc->nthreads_pp;
 +
 +            if (wc->wcc_all)
 +            {
 +                for (j = 0; j < ewcNR; j++)
 +                {
 +                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
 +                }
 +            }
 +        }
 +    }
 +
 +    if (wcc[ewcDDCOMMLOAD].n > 0) /* subtract these counters from the ones they are charged against */
 +    {
 +        wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c;
 +    }
 +    if (wcc[ewcDDCOMMBOUND].n > 0)
 +    {
 +        wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c;
 +    }
 +    if (wcc[ewcPME_FFTCOMM].n > 0)
 +    {
 +        wcc[ewcPME_FFT].c -= wcc[ewcPME_FFTCOMM].c;
 +    }
 +
 +    if (cr->npmenodes == 0)
 +    {
 +        /* All nodes do PME (or no PME at all) */
 +        if (wcc[ewcPMEMESH].n > 0)
 +        {
 +            wcc[ewcFORCE].c -= wcc[ewcPMEMESH].c;
 +        }
 +    }
 +    else
 +    {
 +        /* There are PME-only nodes */
 +        if (wcc[ewcPMEMESH].n > 0)
 +        {
 +            /* This must be a PME only node, calculate the Wait + Comm. time */
 +            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 +        }
 +    }
 +
 +    /* Store the cycles in a double buffer for summing */
 +    for (i = 0; i < ewcNR; i++)
 +    {
 +        cycles_n[i] = (double)wcc[i].n;
 +        cycles[i]   = (double)wcc[i].c;
 +    }
 +    nsum = ewcNR;
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    for (i = 0; i < ewcsNR; i++)
 +    {
 +        wc->wcsc[i].c    *= wc->nthreads_pp;
 +        cycles_n[ewcNR+i] = (double)wc->wcsc[i].n;
 +        cycles[ewcNR+i]   = (double)wc->wcsc[i].c;
 +    }
 +    nsum += ewcsNR;
 +#endif
 +
 +#ifdef GMX_MPI
 +    if (cr->nnodes > 1)
 +    {
 +        MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
 +                      cr->mpi_comm_mysim); /* call counts: maximum over the ranks */
 +        for (i = 0; i < ewcNR; i++)
 +        {
 +            wcc[i].n = (int)(buf[i] + 0.5);
 +        }
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +        for (i = 0; i < ewcsNR; i++)
 +        {
 +            wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5);
 +        }
 +#endif
 +
 +        MPI_Allreduce(cycles, buf, nsum, MPI_DOUBLE, MPI_SUM,
 +                      cr->mpi_comm_mysim); /* cycles: sum over the ranks */
 +        for (i = 0; i < nsum; i++)
 +        {
 +            cycles[i] = buf[i];
 +        }
 +
 +        if (wc->wcc_all != NULL)
 +        {
 +            snew(cyc_all, ewcNR*ewcNR); /* temporary double copies for the reduction */
 +            snew(buf_all, ewcNR*ewcNR);
 +            for (i = 0; i < ewcNR*ewcNR; i++)
 +            {
 +                cyc_all[i] = wc->wcc_all[i].c;
 +            }
 +            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
 +                          cr->mpi_comm_mysim);
 +            for (i = 0; i < ewcNR*ewcNR; i++)
 +            {
 +                wc->wcc_all[i].c = buf_all[i];
 +            }
 +            sfree(buf_all);
 +            sfree(cyc_all);
 +        }
 +    }
 +#endif
 +}
 +
 +static void print_cycles(FILE *fplog, double c2t, const char *name,
-         wallt = c*c2t*nnodes_tot/(double)nnodes;
++                         int nthreads_tot,
++                         int nnodes, int nthreads,
 +                         int n, double c, double tot)
 +{   /* Print one row of the cycle accounting table to fplog: name, node
 +     * count, thread count, call count, wall time, giga-cycles and the
 +     * percentage of tot. Nothing is printed when c <= 0; the count and
 +     * thread columns are left blank when n <= 0.
 +     */
 +    char   num[11];
 +    char   thstr[6];
 +    double wallt;
 +
 +    if (c > 0)
 +    {
 +        if (n > 0)
 +        {
 +            snprintf(num, sizeof(num), "%10d", n);
 +            if (nthreads < 0)
 +            {
 +                snprintf(thstr, sizeof(thstr), "N/A");
 +            }
 +            else
 +            {
 +                snprintf(thstr, sizeof(thstr), "%4d", nthreads);
 +            }
 +        }
 +        else
 +        {
 +            sprintf(num, "          ");
 +            sprintf(thstr, "    ");
 +        }
-     int         i, j, npp, nth_pp, nth_pme;
++        /* Convert the cycle count to wallclock time for this task */
++        if (nthreads > 0)
++        {
++            /* Cycle count has been multiplied by the thread count,
++             * correct for the number of threads used.
++             */
++            wallt = c*c2t*nthreads_tot/(double)(nnodes*nthreads);
++        }
++        else
++        {
++            /* nthreads=-1 signals total run time, no correction required */
++            wallt = c*c2t;
++        }
 +        fprintf(fplog, " %-19s %4d %4s %10s  %10.3f %12.3f   %5.1f\n",
 +                name, nnodes, thstr, num, wallt, c*1e-9, 100*c/tot); /* c*1e-9: cycles to giga-cycles */
 +    }
 +}
 +
 +static void print_gputimes(FILE *fplog, const char *name,
 +                           int n, double t, double tot_t)
 +{
 +    char num[11];
 +    char avg_perf[11];
 +
 +    if (n > 0)
 +    {
 +        snprintf(num, sizeof(num), "%10d", n);
 +        snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 +    }
 +    else
 +    {
 +        sprintf(num, "          ");
 +        sprintf(avg_perf, "          ");
 +    }
 +    if (t != tot_t)
 +    {
 +        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 +                name, num, t/1000, avg_perf, 100 * t/tot_t);
 +    }
 +    else
 +    {
 +        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 +                name, "", t/1000, avg_perf, 100.0);
 +    }
 +}
 +
 +void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
 +                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
 +{
 +    double     *cycles;
 +    double      c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
-             print_cycles(fplog, c2t, wcn[i], nnodes,
++    int         i, j, npp, nth_pp, nth_pme, nth_tot;
 +    char        buf[STRLEN];
 +    const char *hline = "-----------------------------------------------------------------------------";
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    nth_pp  = wc->nthreads_pp;
 +    nth_pme = wc->nthreads_pme;
 +
 +    cycles = wc->cycles_sum;
 +
 +    if (npme > 0)
 +    {
 +        npp = nnodes - npme;
 +    }
 +    else
 +    {
 +        npp  = nnodes;
 +        npme = nnodes;
 +    }
++    nth_tot = npp*nth_pp + npme*nth_pme;
++
 +    tot = cycles[ewcRUN];
 +
 +    /* Conversion factor from cycles to seconds */
 +    if (tot > 0)
 +    {
 +        c2t = realtime/tot;
 +    }
 +    else
 +    {
 +        c2t = 0;
 +    }
 +
 +    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 +
 +    fprintf(fplog, " Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n", '%');
 +    fprintf(fplog, "%s\n", hline);
 +    sum = 0;
 +    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 +    {
 +        if (!is_pme_subcounter(i))
 +        {
-                 print_cycles(fplog, c2t, buf, nnodes,
++            print_cycles(fplog, c2t, wcn[i], nth_tot,
 +                         is_pme_counter(i) ? npme : npp,
 +                         is_pme_counter(i) ? nth_pme : nth_pp,
 +                         wc->wcc[i].n, cycles[i], tot);
 +            sum += cycles[i];
 +        }
 +    }
 +    if (wc->wcc_all != NULL)
 +    {
 +        for (i = 0; i < ewcNR; i++)
 +        {
 +            for (j = 0; j < ewcNR; j++)
 +            {
 +                snprintf(buf, 9, "%-9s", wcn[i]);
 +                buf[9] = ' ';
 +                snprintf(buf+10, 9, "%-9s", wcn[j]);
 +                buf[19] = '\0';
-     print_cycles(fplog, c2t, "Rest", npp, npp, -1, 0, tot-sum, tot);
++                print_cycles(fplog, c2t, buf, nth_tot,
 +                             is_pme_counter(i) ? npme : npp,
 +                             is_pme_counter(i) ? nth_pme : nth_pp,
 +                             wc->wcc_all[i*ewcNR+j].n,
 +                             wc->wcc_all[i*ewcNR+j].c,
 +                             tot);
 +            }
 +        }
 +    }
-     print_cycles(fplog, c2t, "Total", nnodes, nnodes, -1, 0, tot, tot);
++    print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
 +    fprintf(fplog, "%s\n", hline);
-                 print_cycles(fplog, c2t, wcn[i], nnodes,
++    print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
 +    fprintf(fplog, "%s\n", hline);
 +
 +    if (wc->wcc[ewcPMEMESH].n > 0)
 +    {
 +        fprintf(fplog, "%s\n", hline);
 +        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 +        {
 +            if (is_pme_subcounter(i))
 +            {
-         print_cycles(fplog, c2t, wcsn[i], nnodes, npp, nth_pp,
++                print_cycles(fplog, c2t, wcn[i], nth_tot,
 +                             is_pme_counter(i) ? npme : npp,
 +                             is_pme_counter(i) ? nth_pme : nth_pp,
 +                             wc->wcc[i].n, cycles[i], tot);
 +            }
 +        }
 +        fprintf(fplog, "%s\n", hline);
 +    }
 +
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    fprintf(fplog, "%s\n", hline);
 +    for (i = 0; i < ewcsNR; i++)
 +    {
++        print_cycles(fplog, c2t, wcsn[i], nth_tot, npp, nth_pp,
 +                     wc->wcsc[i].n, cycles[ewcNR+i], tot);
 +    }
 +    fprintf(fplog, "%s\n", hline);
 +#endif
 +
 +    /* print GPU timing summary */
 +    if (gpu_t)
 +    {
 +        const char *k_log_str[2][2] = {
 +            {"Nonbonded F kernel", "Nonbonded F+ene k."},
 +            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
 +        };
 +
 +        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;
 +
 +        /* add up the kernel timings */
 +        tot_k = 0.0;
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                tot_k += gpu_t->ktime[i][j].t;
 +            }
 +        }
 +        tot_gpu += tot_k;
 +
 +        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 +        if (wc->wcc[ewcPMEMESH].n > 0)
 +        {
 +            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 +        }
 +        tot_cpu_overlap *= c2t * 1000; /* convert s to ms */
 +
 +        fprintf(fplog, "\n GPU timings\n%s\n", hline);
 +        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 +        fprintf(fplog, "%s\n", hline);
 +        print_gputimes(fplog, "Pair list H2D",
 +                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
 +        print_gputimes(fplog, "X / q H2D",
 +                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);
 +
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                if (gpu_t->ktime[i][j].c)
 +                {
 +                    print_gputimes(fplog, k_log_str[i][j],
 +                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
 +                }
 +            }
 +        }
 +
 +        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
 +        fprintf(fplog, "%s\n", hline);
 +        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
 +        fprintf(fplog, "%s\n", hline);
 +
 +        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 +        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
 +                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 +                gpu_cpu_ratio);
 +
 +        /* only print notes related to CPU-GPU load balance with PME */
 +        if (wc->wcc[ewcPMEMESH].n > 0)
 +        {
 +            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
 +
 +            /* print note if the imbalance is high with PME case in which
 +             * CPU-GPU load balancing is possible */
 +            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
 +            {
 +                /* Only the sim master calls this function, so always print to stderr */
 +                if (gpu_cpu_ratio < 0.75)
 +                {
 +                    if (npp > 1)
 +                    {
 +                        /* The user could have used -notunepme,
 +                         * but we currently can't check that here.
 +                         */
 +                        md_print_warn(NULL, fplog,
 +                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 +                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
 +                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
 +                    }
 +                    else
 +                    {
 +                        /* We should not end up here, unless the box is
 +                         * too small for increasing the cut-off for PME tuning.
 +                         */
 +                        md_print_warn(NULL, fplog,
 +                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 +                                      "      performance loss.");
 +                    }
 +                }
 +                if (gpu_cpu_ratio > 1.2)
 +                {
 +                    md_print_warn(NULL, fplog,
 +                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
 +                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
 +                }
 +            }
 +        }
 +    }
 +
 +    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
 +        (cycles[ewcDOMDEC] > tot*0.1 ||
 +         cycles[ewcNS] > tot*0.1))
 +    {
 +        /* Only the sim master calls this function, so always print to stderr */
 +        if (wc->wcc[ewcDOMDEC].n == 0)
 +        {
 +            md_print_warn(NULL, fplog,
 +                          "NOTE: %d %% of the run time was spent in pair search,\n"
 +                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
 +                          (int)(100*cycles[ewcNS]/tot+0.5));
 +        }
 +        else
 +        {
 +            md_print_warn(NULL, fplog,
 +                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
 +                          "      %d %% of the run time was spent in pair search,\n"
 +                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
 +                          (int)(100*cycles[ewcDOMDEC]/tot+0.5),
 +                          (int)(100*cycles[ewcNS]/tot+0.5));
 +        }
 +    }
 +
 +    if (cycles[ewcMoveE] > tot*0.05)
 +    {
 +        /* Only the sim master calls this function, so always print to stderr */
 +        md_print_warn(NULL, fplog,
 +                      "NOTE: %d %% of the run time was spent communicating energies,\n"
 +                      "      you might want to use the -gcom option of mdrun\n",
 +                      (int)(100*cycles[ewcMoveE]/tot+0.5));
 +    }
 +}
 +
 +extern gmx_large_int_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
 +{
 +    if (wc == NULL)
 +    {
 +        return -1;
 +    }
 +
 +    return wc->reset_counters;
 +}
 +
 +extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_large_int_t reset_counters)
 +{
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    wc->reset_counters = reset_counters;
 +}
 +
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +
 +void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
 +{
 +    if (wc != NULL)
 +    {
 +        wc->wcsc[ewcs].start = gmx_cycles_read();
 +    }
 +}
 +
 +void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
 +{
 +    if (wc != NULL)
 +    {
 +        wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
 +        wc->wcsc[ewcs].n++;
 +    }
 +}
 +
 +#endif /* GMX_CYCLE_SUBCOUNTERS */
index da84f681dd110ab00462ca52f6e36e848ae713a6,0000000000000000000000000000000000000000..c01ffb2b4eedb800fdba45d4e423e596fd6fe085
mode 100644,000000..100644
--- /dev/null
@@@ -1,1348 -1,0 +1,1348 @@@
-     default:
-         gmx_incons("Unsupported nbnxn_atomdata_t format");
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "gmx_omp_nthreads.h"
 +
 +/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned.
 + * The pointer returned by save_malloc_aligned is stored in *ptr.
 + * nbytes is passed together with a constant 1, presumably as element count
 + * and element size - confirm against the save_malloc_aligned declaration.
 + */
 +void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
 +{
 +    *ptr = save_malloc_aligned("ptr", __FILE__, __LINE__, nbytes, 1, NBNXN_MEM_ALIGN);
 +}
 +
 +/* Free function for memory allocated with nbnxn_alloc_aligned;
 + * forwards ptr unchanged to sfree_aligned.
 + */
 +void nbnxn_free_aligned(void *ptr)
 +{
 +    sfree_aligned(ptr);
 +}
 +
 +/* Reallocation wrapper function for nbnxn data structures */
 +void nbnxn_realloc_void(void **ptr,
 +                        int nbytes_copy, int nbytes_new,
 +                        nbnxn_alloc_t *ma,
 +                        nbnxn_free_t  *mf)
 +{
 +    void *ptr_new;
 +
 +    ma(&ptr_new, nbytes_new);
 +
 +    if (nbytes_new > 0 && ptr_new == NULL)
 +    {
 +        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
 +    }
 +
 +    if (nbytes_copy > 0)
 +    {
 +        if (nbytes_new < nbytes_copy)
 +        {
 +            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
 +        }
 +        memcpy(ptr_new, *ptr, nbytes_copy);
 +    }
 +    if (*ptr != NULL)
 +    {
 +        mf(*ptr);
 +    }
 +    *ptr = ptr_new;
 +}
 +
 +/* Reallocate the nbnxn_atomdata_t for a size of n atoms.
 + * Every array is resized through nbnxn_realloc_void with nbat->alloc and
 + * nbat->free, copying the data for the current nbat->natoms atoms.
 + * q is only resized when XFormat is not nbatXYZQ and energrp only when
 + * there is more than one energy group.
 + * On return nbat->nalloc is n; nbat->natoms is not modified here.
 + */
 +void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
 +{
 +    int t;
 +
 +    nbnxn_realloc_void((void **)&nbat->type,
 +                       nbat->natoms*sizeof(*nbat->type),
 +                       n*sizeof(*nbat->type),
 +                       nbat->alloc, nbat->free);
 +    nbnxn_realloc_void((void **)&nbat->lj_comb,
 +                       nbat->natoms*2*sizeof(*nbat->lj_comb),
 +                       n*2*sizeof(*nbat->lj_comb),
 +                       nbat->alloc, nbat->free);
 +    if (nbat->XFormat != nbatXYZQ)
 +    {
 +        nbnxn_realloc_void((void **)&nbat->q,
 +                           nbat->natoms*sizeof(*nbat->q),
 +                           n*sizeof(*nbat->q),
 +                           nbat->alloc, nbat->free);
 +    }
 +    if (nbat->nenergrp > 1)
 +    {
 +        nbnxn_realloc_void((void **)&nbat->energrp,
 +                           nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
 +                           n/nbat->na_c*sizeof(*nbat->energrp),
 +                           nbat->alloc, nbat->free);
 +    }
 +    nbnxn_realloc_void((void **)&nbat->x,
 +                       nbat->natoms*nbat->xstride*sizeof(*nbat->x),
 +                       n*nbat->xstride*sizeof(*nbat->x),
 +                       nbat->alloc, nbat->free);
 +    for (t = 0; t < nbat->nout; t++)
 +    {
 +        /* Allocate one element extra for possible signaling with CUDA.
 +         * NOTE(review): the size passed below is n*fstride elements,
 +         * without an extra element - confirm which of the two is intended.
 +         */
 +        nbnxn_realloc_void((void **)&nbat->out[t].f,
 +                           nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
 +                           n*nbat->fstride*sizeof(*nbat->out[t].f),
 +                           nbat->alloc, nbat->free);
 +    }
 +    nbat->nalloc = n;
 +}
 +
 +/* Initializes an nbnxn_atomdata_output_t data structure */
 +static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
 +                                       int nb_kernel_type,
 +                                       int nenergrp, int stride,
 +                                       nbnxn_alloc_t *ma)
 +{
 +    int cj_size;
 +
 +    out->f = NULL;
 +    ma((void **)&out->fshift, SHIFTS*DIM*sizeof(*out->fshift));
 +    out->nV = nenergrp*nenergrp;
 +    ma((void **)&out->Vvdw, out->nV*sizeof(*out->Vvdw));
 +    ma((void **)&out->Vc, out->nV*sizeof(*out->Vc  ));
 +
 +    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
 +        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
 +    {
 +        cj_size  = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +        out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
 +        ma((void **)&out->VSvdw, out->nVS*sizeof(*out->VSvdw));
 +        ma((void **)&out->VSc, out->nVS*sizeof(*out->VSc  ));
 +    }
 +    else
 +    {
 +        out->nVS = 0;
 +    }
 +}
 +
 +static void copy_int_to_nbat_int(const int *a, int na, int na_round,
 +                                 const int *in, int fill, int *innb)
 +{
 +    int i, j;
 +
 +    j = 0;
 +    for (i = 0; i < na; i++)
 +    {
 +        innb[j++] = in[a[i]];
 +    }
 +    /* Complete the partially filled last cell with fill */
 +    for (; i < na_round; i++)
 +    {
 +        innb[j++] = fill;
 +    }
 +}
 +
 +/* Sets the x, y and z entries of na consecutive atoms in xnb to zero,
 + * starting at atom index a0, for the layout given by nbatFormat.
 + * For nbatXYZQ only the first DIM entries of each atom are cleared.
 + * Formats without a case in the switch are ignored without an error.
 + */
 +static void clear_nbat_real(int na, int nbatFormat, real *xnb, int a0)
 +{
 +    int a, d, j, c;
 +
 +    switch (nbatFormat)
 +    {
 +        case nbatXYZ:
 +            for (a = 0; a < na; a++)
 +            {
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    xnb[(a0+a)*STRIDE_XYZ+d] = 0;
 +                }
 +            }
 +            break;
 +        case nbatXYZQ:
 +            for (a = 0; a < na; a++)
 +            {
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    xnb[(a0+a)*STRIDE_XYZQ+d] = 0;
 +                }
 +            }
 +            break;
 +        case nbatX4:
 +            j = X4_IND_A(a0);
 +            c = a0 & (PACK_X4-1); /* position of a0 within its pack */
 +            for (a = 0; a < na; a++)
 +            {
 +                xnb[j+XX*PACK_X4] = 0;
 +                xnb[j+YY*PACK_X4] = 0;
 +                xnb[j+ZZ*PACK_X4] = 0;
 +                j++;
 +                c++;
 +                if (c == PACK_X4)
 +                {
 +                    /* End of a pack: skip the remaining DIM-1 rows */
 +                    j += (DIM-1)*PACK_X4;
 +                    c  = 0;
 +                }
 +            }
 +            break;
 +        case nbatX8:
 +            j = X8_IND_A(a0);
 +            c = a0 & (PACK_X8-1); /* position of a0 within its pack */
 +            for (a = 0; a < na; a++)
 +            {
 +                xnb[j+XX*PACK_X8] = 0;
 +                xnb[j+YY*PACK_X8] = 0;
 +                xnb[j+ZZ*PACK_X8] = 0;
 +                j++;
 +                c++;
 +                if (c == PACK_X8)
 +                {
 +                    /* End of a pack: skip the remaining DIM-1 rows */
 +                    j += (DIM-1)*PACK_X8;
 +                    c  = 0;
 +                }
 +            }
 +            break;
 +    }
 +}
 +
 +/* Copies the coordinates of the na atoms listed in a from x into xnb,
 + * starting at atom index a0 and using the layout given by nbatFormat.
 + * The entries from na up to na_round are set to filler coordinates that
 + * are computed from cx, cy, cz and the entry index.
 + * For nbatXYZQ the fourth element of each atom is skipped, not written.
 + * An unsupported nbatFormat is reported through gmx_incons.
 + */
 +void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
 +                            rvec *x, int nbatFormat, real *xnb, int a0,
 +                            int cx, int cy, int cz)
 +{
 +    int i, j, c;
 +
 +/* We might need to place filler particles to fill up the cell to na_round.
 + * The coefficients (LJ and q) for such particles are zero.
 + * But we might still get NaN as 0*NaN when distances are too small.
 + * We hope that -107 nm is far enough away from zero
 + * to avoid accidental short distances to particles shifted down for pbc.
 + */
 +#define NBAT_FAR_AWAY 107
 +
 +    switch (nbatFormat)
 +    {
 +        case nbatXYZ:
 +            j = a0*STRIDE_XYZ;
 +            for (i = 0; i < na; i++)
 +            {
 +                xnb[j++] = x[a[i]][XX];
 +                xnb[j++] = x[a[i]][YY];
 +                xnb[j++] = x[a[i]][ZZ];
 +            }
 +            /* Complete the partially filled last cell with particles far apart.
 +             * The z coordinate also depends on the entry index i,
 +             * so the filler particles do not coincide with each other.
 +             */
 +            for (; i < na_round; i++)
 +            {
 +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
 +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
 +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
 +            }
 +            break;
 +        case nbatXYZQ:
 +            j = a0*STRIDE_XYZQ;
 +            for (i = 0; i < na; i++)
 +            {
 +                xnb[j++] = x[a[i]][XX];
 +                xnb[j++] = x[a[i]][YY];
 +                xnb[j++] = x[a[i]][ZZ];
 +                j++;
 +            }
 +            /* Complete the partially filled last cell with particles far apart */
 +            for (; i < na_round; i++)
 +            {
 +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
 +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
 +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
 +                j++;
 +            }
 +            break;
 +        case nbatX4:
 +            j = X4_IND_A(a0);
 +            c = a0 & (PACK_X4-1);
 +            for (i = 0; i < na; i++)
 +            {
 +                xnb[j+XX*PACK_X4] = x[a[i]][XX];
 +                xnb[j+YY*PACK_X4] = x[a[i]][YY];
 +                xnb[j+ZZ*PACK_X4] = x[a[i]][ZZ];
 +                j++;
 +                c++;
 +                if (c == PACK_X4)
 +                {
 +                    j += (DIM-1)*PACK_X4;
 +                    c  = 0;
 +                }
 +            }
 +            /* Complete the partially filled last cell with particles far apart */
 +            for (; i < na_round; i++)
 +            {
 +                xnb[j+XX*PACK_X4] = -NBAT_FAR_AWAY*(1 + cx);
 +                xnb[j+YY*PACK_X4] = -NBAT_FAR_AWAY*(1 + cy);
 +                xnb[j+ZZ*PACK_X4] = -NBAT_FAR_AWAY*(1 + cz + i);
 +                j++;
 +                c++;
 +                if (c == PACK_X4)
 +                {
 +                    j += (DIM-1)*PACK_X4;
 +                    c  = 0;
 +                }
 +            }
 +            break;
 +        case nbatX8:
 +            j = X8_IND_A(a0);
 +            c = a0 & (PACK_X8 - 1);
 +            for (i = 0; i < na; i++)
 +            {
 +                xnb[j+XX*PACK_X8] = x[a[i]][XX];
 +                xnb[j+YY*PACK_X8] = x[a[i]][YY];
 +                xnb[j+ZZ*PACK_X8] = x[a[i]][ZZ];
 +                j++;
 +                c++;
 +                if (c == PACK_X8)
 +                {
 +                    j += (DIM-1)*PACK_X8;
 +                    c  = 0;
 +                }
 +            }
 +            /* Complete the partially filled last cell with particles far apart */
 +            for (; i < na_round; i++)
 +            {
 +                xnb[j+XX*PACK_X8] = -NBAT_FAR_AWAY*(1 + cx);
 +                xnb[j+YY*PACK_X8] = -NBAT_FAR_AWAY*(1 + cy);
 +                xnb[j+ZZ*PACK_X8] = -NBAT_FAR_AWAY*(1 + cz + i);
 +                j++;
 +                c++;
 +                if (c == PACK_X8)
 +                {
 +                    j += (DIM-1)*PACK_X8;
 +                    c  = 0;
 +                }
 +            }
 +            break;
 +        default:
 +            gmx_incons("Unsupported nbnxn_atomdata_t format");
 +    }
 +}
 +
 +/* Determines the combination rule (or none) to be used, stores it,
 + * and sets the LJ parameters required with the rule.
 + */
 +static void set_combination_rule_data(nbnxn_atomdata_t *nbat)
 +{
 +    int  nt, i, j;
 +    real c6, c12;
 +
 +    nt = nbat->ntype;
 +
 +    switch (nbat->comb_rule)
 +    {
 +        case  ljcrGEOM:
 +            nbat->comb_rule = ljcrGEOM;
 +
 +            for (i = 0; i < nt; i++)
 +            {
 +                /* Copy the diagonal from the nbfp matrix */
 +                nbat->nbfp_comb[i*2  ] = sqrt(nbat->nbfp[(i*nt+i)*2  ]);
 +                nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
 +            }
 +            break;
 +        case ljcrLB:
 +            for (i = 0; i < nt; i++)
 +            {
 +                /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
 +                c6  = nbat->nbfp[(i*nt+i)*2  ];
 +                c12 = nbat->nbfp[(i*nt+i)*2+1];
 +                if (c6 > 0 && c12 > 0)
 +                {
 +                    /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
 +                     * so we get 6*C6 and 12*C12 after combining.
 +                     */
 +                    nbat->nbfp_comb[i*2  ] = 0.5*pow(c12/c6, 1.0/6.0);
 +                    nbat->nbfp_comb[i*2+1] = sqrt(c6*c6/c12);
 +                }
 +                else
 +                {
 +                    nbat->nbfp_comb[i*2  ] = 0;
 +                    nbat->nbfp_comb[i*2+1] = 0;
 +                }
 +            }
 +            break;
 +        case ljcrNONE:
 +            /* nbfp_s4 stores two parameters using a stride of 4,
 +             * because this would suit x86 SIMD single-precision
 +             * quad-load intrinsics. There's a slight inefficiency in
 +             * allocating and initializing nbfp_s4 when it might not
 +             * be used, but introducing the conditional code is not
 +             * really worth it. */
 +            nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
 +            for (i = 0; i < nt; i++)
 +            {
 +                for (j = 0; j < nt; j++)
 +                {
 +                    nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
 +                    nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
 +                    nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
 +                    nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
 +                }
 +            }
 +            break;
 +        default:
 +            gmx_incons("Unknown combination rule");
 +            break;
 +    }
 +}
 +
 +/* Initializes an nbnxn_atomdata_t data structure */
 +void nbnxn_atomdata_init(FILE *fp,
 +                         nbnxn_atomdata_t *nbat,
 +                         int nb_kernel_type,
 +                         int ntype, const real *nbfp,
 +                         int n_energygroups,
 +                         int nout,
 +                         nbnxn_alloc_t *alloc,
 +                         nbnxn_free_t  *free)
 +{
 +    int      i, j;
 +    real     c6, c12, tol;
 +    char    *ptr;
 +    gmx_bool simple, bCombGeom, bCombLB;
 +
 +    if (alloc == NULL)
 +    {
 +        nbat->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbat->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbat->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbat->free = free;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
 +    }
 +    nbat->ntype = ntype + 1;
 +    nbat->alloc((void **)&nbat->nbfp,
 +                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
 +    nbat->alloc((void **)&nbat->nbfp_comb, nbat->ntype*2*sizeof(*nbat->nbfp_comb));
 +
 +    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
 +     * force-field floating point parameters.
 +     */
 +    tol = 1e-5;
 +    ptr = getenv("GMX_LJCOMB_TOL");
 +    if (ptr != NULL)
 +    {
 +        double dbl;
 +
 +        sscanf(ptr, "%lf", &dbl);
 +        tol = dbl;
 +    }
 +    bCombGeom = TRUE;
 +    bCombLB   = TRUE;
 +
 +    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
 +     * to check for the LB rule.
 +     */
 +    for (i = 0; i < ntype; i++)
 +    {
 +        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
 +        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
 +        if (c6 > 0 && c12 > 0)
 +        {
 +            nbat->nbfp_comb[i*2  ] = pow(c12/c6, 1.0/6.0);
 +            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
 +        }
 +        else if (c6 == 0 && c12 == 0)
 +        {
 +            nbat->nbfp_comb[i*2  ] = 0;
 +            nbat->nbfp_comb[i*2+1] = 0;
 +        }
 +        else
 +        {
 +            /* Can not use LB rule with only dispersion or repulsion */
 +            bCombLB = FALSE;
 +        }
 +    }
 +
 +    for (i = 0; i < nbat->ntype; i++)
 +    {
 +        for (j = 0; j < nbat->ntype; j++)
 +        {
 +            if (i < ntype && j < ntype)
 +            {
 +                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
 +                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
 +                 */
 +                c6  = nbfp[(i*ntype+j)*2  ];
 +                c12 = nbfp[(i*ntype+j)*2+1];
 +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
 +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;
 +
 +                /* Compare 6*C6 and 12*C12 for geometric cobination rule */
 +                bCombGeom = bCombGeom &&
 +                    gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ], tol) &&
 +                    gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol);
 +
 +                /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
 +                c6     /= 6.0;
 +                c12    /= 12.0;
 +                bCombLB = bCombLB &&
 +                    ((c6 == 0 && c12 == 0 &&
 +                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
 +                     (c6 > 0 && c12 > 0 &&
 +                      gmx_within_tol(pow(c12/c6, 1.0/6.0), 0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) &&
 +                      gmx_within_tol(0.25*c6*c6/c12, sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol)));
 +            }
 +            else
 +            {
 +                /* Add zero parameters for the additional dummy atom type */
 +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
 +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
 +            }
 +        }
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "Combination rules: geometric %d Lorentz-Berthelot %d\n",
 +                bCombGeom, bCombLB);
 +    }
 +
 +    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    if (simple)
 +    {
 +        /* We prefer the geometic combination rule,
 +         * as that gives a slightly faster kernel than the LB rule.
 +         */
 +        if (bCombGeom)
 +        {
 +            nbat->comb_rule = ljcrGEOM;
 +        }
 +        else if (bCombLB)
 +        {
 +            nbat->comb_rule = ljcrLB;
 +        }
 +        else
 +        {
 +            nbat->comb_rule = ljcrNONE;
 +
 +            nbat->free(nbat->nbfp_comb);
 +        }
 +
 +        if (fp)
 +        {
 +            if (nbat->comb_rule == ljcrNONE)
 +            {
 +                fprintf(fp, "Using full Lennard-Jones parameter combination matrix\n\n");
 +            }
 +            else
 +            {
 +                fprintf(fp, "Using %s Lennard-Jones combination rule\n\n",
 +                        nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
 +            }
 +        }
 +
 +        set_combination_rule_data(nbat);
 +    }
 +    else
 +    {
 +        nbat->comb_rule = ljcrNONE;
 +
 +        nbat->free(nbat->nbfp_comb);
 +    }
 +
 +    nbat->natoms  = 0;
 +    nbat->type    = NULL;
 +    nbat->lj_comb = NULL;
 +    if (simple)
 +    {
 +        int pack_x;
 +
 +        switch (nb_kernel_type)
 +        {
 +            case nbnxnk4xN_SIMD_4xN:
 +            case nbnxnk4xN_SIMD_2xNN:
 +                pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
 +                             nbnxn_kernel_to_cj_size(nb_kernel_type));
 +                switch (pack_x)
 +                {
 +                    case 4:
 +                        nbat->XFormat = nbatX4;
 +                        break;
 +                    case 8:
 +                        nbat->XFormat = nbatX8;
 +                        break;
 +                    default:
 +                        gmx_incons("Unsupported packing width");
 +                }
 +                break;
 +            default:
 +                nbat->XFormat = nbatXYZ;
 +                break;
 +        }
 +
 +        nbat->FFormat = nbat->XFormat;
 +    }
 +    else
 +    {
 +        nbat->XFormat = nbatXYZQ;
 +        nbat->FFormat = nbatXYZ;
 +    }
 +    nbat->q        = NULL;
 +    nbat->nenergrp = n_energygroups;
 +    if (!simple)
 +    {
 +        /* Energy groups not supported yet for super-sub lists */
 +        if (n_energygroups > 1 && fp != NULL)
 +        {
 +            fprintf(fp, "\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
 +        }
 +        nbat->nenergrp = 1;
 +    }
 +    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
 +    if (nbat->nenergrp > 64)
 +    {
 +        gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
 +    }
 +    nbat->neg_2log = 1;
 +    while (nbat->nenergrp > (1<<nbat->neg_2log))
 +    {
 +        nbat->neg_2log++;
 +    }
 +    nbat->energrp = NULL;
 +    nbat->alloc((void **)&nbat->shift_vec, SHIFTS*sizeof(*nbat->shift_vec));
 +    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 +    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 +    nbat->x       = NULL;
 +
 +#ifdef GMX_NBNXN_SIMD
 +    if (simple)
 +    {
 +        /* Set the diagonal cluster pair exclusion mask setup data.
 +         * In the kernel we check 0 < j - i to generate the masks.
 +         * Here we store j - i for generating the mask for the first i,
 +         * we substract 0.5 to avoid rounding issues.
 +         * In the kernel we can subtract 1 to generate the subsequent mask.
 +         */
 +        const int simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
 +        int       simd_4xn_diag_size, j;
 +
 +        simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
 +        snew_aligned(nbat->simd_4xn_diag, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
 +        for (j = 0; j < simd_4xn_diag_size; j++)
 +        {
 +            nbat->simd_4xn_diag[j] = j - 0.5;
 +        }
 +
 +        snew_aligned(nbat->simd_2xnn_diag, simd_width, NBNXN_MEM_ALIGN);
 +        for (j = 0; j < simd_width/2; j++)
 +        {
 +            /* The j-cluster size is half the SIMD width */
 +            nbat->simd_2xnn_diag[j]              = j - 0.5;
 +            /* The next half of the SIMD width is for i + 1 */
 +            nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
 +        }
 +    }
 +#endif
 +
 +    /* Initialize the output data structures */
 +    nbat->nout    = nout;
 +    snew(nbat->out, nbat->nout);
 +    nbat->nalloc  = 0;
 +    for (i = 0; i < nbat->nout; i++)
 +    {
 +        nbnxn_atomdata_output_init(&nbat->out[i],
 +                                   nb_kernel_type,
 +                                   nbat->nenergrp, 1<<nbat->neg_2log,
 +                                   nbat->alloc);
 +    }
 +    nbat->buffer_flags.flag        = NULL;
 +    nbat->buffer_flags.flag_nalloc = 0;
 +}
 +
 +static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
 +                                       const int *type, int na,
 +                                       real *ljparam_at)
 +{
 +    int is, k, i;
 +
 +    /* The LJ params follow the combination rule:
 +     * copy the params for the type array to the atom array.
 +     */
 +    for (is = 0; is < na; is += PACK_X4)
 +    {
 +        for (k = 0; k < PACK_X4; k++)
 +        {
 +            i = is + k;
 +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
 +            ljparam_at[is*2+PACK_X4+k] = ljparam_type[type[i]*2+1];
 +        }
 +    }
 +}
 +
 +static void copy_lj_to_nbat_lj_comb_x8(const real *ljparam_type,
 +                                       const int *type, int na,
 +                                       real *ljparam_at)
 +{
 +    int is, k, i;
 +
 +    /* The LJ params follow the combination rule:
 +     * copy the params for the type array to the atom array.
 +     */
 +    for (is = 0; is < na; is += PACK_X8)
 +    {
 +        for (k = 0; k < PACK_X8; k++)
 +        {
 +            i = is + k;
 +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
 +            ljparam_at[is*2+PACK_X8+k] = ljparam_type[type[i]*2+1];
 +        }
 +    }
 +}
 +
 +/* Sets the atom type and LJ data in nbnxn_atomdata_t */
 +static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
 +                                         int                  ngrid,
 +                                         const nbnxn_search_t nbs,
 +                                         const int           *type)
 +{
 +    int                 g, i, ncz, ash;
 +    const nbnxn_grid_t *grid;
 +
 +    for (g = 0; g < ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for (i = 0; i < grid->ncx*grid->ncy; i++)
 +        {
 +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
 +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
 +
 +            copy_int_to_nbat_int(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
 +                                 type, nbat->ntype-1, nbat->type+ash);
 +
 +            if (nbat->comb_rule != ljcrNONE)
 +            {
 +                if (nbat->XFormat == nbatX4)
 +                {
 +                    copy_lj_to_nbat_lj_comb_x4(nbat->nbfp_comb,
 +                                               nbat->type+ash, ncz*grid->na_sc,
 +                                               nbat->lj_comb+ash*2);
 +                }
 +                else if (nbat->XFormat == nbatX8)
 +                {
 +                    copy_lj_to_nbat_lj_comb_x8(nbat->nbfp_comb,
 +                                               nbat->type+ash, ncz*grid->na_sc,
 +                                               nbat->lj_comb+ash*2);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Sets the charges in nbnxn_atomdata_t *nbat */
 +static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
 +                                       int                  ngrid,
 +                                       const nbnxn_search_t nbs,
 +                                       const real          *charge)
 +{
 +    int                 g, cxy, ncz, ash, na, na_round, i, j;
 +    real               *q;
 +    const nbnxn_grid_t *grid;
 +
 +    for (g = 0; g < ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for (cxy = 0; cxy < grid->ncx*grid->ncy; cxy++)
 +        {
 +            ash      = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +            na       = grid->cxy_na[cxy];
 +            na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
 +
 +            if (nbat->XFormat == nbatXYZQ)
 +            {
 +                q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1;
 +                for (i = 0; i < na; i++)
 +                {
 +                    *q = charge[nbs->a[ash+i]];
 +                    q += STRIDE_XYZQ;
 +                }
 +                /* Complete the partially filled last cell with zeros */
 +                for (; i < na_round; i++)
 +                {
 +                    *q = 0;
 +                    q += STRIDE_XYZQ;
 +                }
 +            }
 +            else
 +            {
 +                q = nbat->q + ash;
 +                for (i = 0; i < na; i++)
 +                {
 +                    *q = charge[nbs->a[ash+i]];
 +                    q++;
 +                }
 +                /* Complete the partially filled last cell with zeros */
 +                for (; i < na_round; i++)
 +                {
 +                    *q = 0;
 +                    q++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Copies the energy group indices to a reordered and packed array */
 +static void copy_egp_to_nbat_egps(const int *a, int na, int na_round,
 +                                  int na_c, int bit_shift,
 +                                  const int *in, int *innb)
 +{
 +    int i, j, sa, at;
 +    int comb;
 +
 +    j = 0;
 +    for (i = 0; i < na; i += na_c)
 +    {
 +        /* Store na_c energy group numbers into one int */
 +        comb = 0;
 +        for (sa = 0; sa < na_c; sa++)
 +        {
 +            at = a[i+sa];
 +            if (at >= 0)
 +            {
 +                comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
 +            }
 +        }
 +        innb[j++] = comb;
 +    }
 +    /* Complete the partially filled last cell with fill */
 +    for (; i < na_round; i += na_c)
 +    {
 +        innb[j++] = 0;
 +    }
 +}
 +
 +/* Set the energy group indices for atoms in nbnxn_atomdata_t */
 +static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
 +                                            int                  ngrid,
 +                                            const nbnxn_search_t nbs,
 +                                            const int           *atinfo)
 +{
 +    int                 g, i, ncz, ash;
 +    const nbnxn_grid_t *grid;
 +
 +    for (g = 0; g < ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for (i = 0; i < grid->ncx*grid->ncy; i++)
 +        {
 +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
 +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
 +
 +            copy_egp_to_nbat_egps(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
 +                                  nbat->na_c, nbat->neg_2log,
 +                                  atinfo, nbat->energrp+(ash>>grid->na_c_2log));
 +        }
 +    }
 +}
 +
 +/* Sets all required atom parameter data in nbnxn_atomdata_t */
 +void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
 +                        int                  locality,
 +                        const nbnxn_search_t nbs,
 +                        const t_mdatoms     *mdatoms,
 +                        const int           *atinfo)
 +{
 +    int ngrid;
 +
 +    if (locality == eatLocal)
 +    {
 +        ngrid = 1;
 +    }
 +    else
 +    {
 +        ngrid = nbs->ngrid;
 +    }
 +
 +    nbnxn_atomdata_set_atomtypes(nbat, ngrid, nbs, mdatoms->typeA);
 +
 +    nbnxn_atomdata_set_charges(nbat, ngrid, nbs, mdatoms->chargeA);
 +
 +    if (nbat->nenergrp > 1)
 +    {
 +        nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo);
 +    }
 +}
 +
 +/* Copies the shift vector array to nbnxn_atomdata_t */
 +void nbnxn_atomdata_copy_shiftvec(gmx_bool          bDynamicBox,
 +                                  rvec             *shift_vec,
 +                                  nbnxn_atomdata_t *nbat)
 +{
 +    int i;
 +
 +    nbat->bDynamicBox = bDynamicBox;
 +    for (i = 0; i < SHIFTS; i++)
 +    {
 +        copy_rvec(shift_vec[i], nbat->shift_vec[i]);
 +    }
 +}
 +
 +/* Copies (and reorders) the coordinates to nbnxn_atomdata_t
 + *
 + * nbs        search data providing the grids and the atom index array nbs->a
 + * locality   eatAll, eatLocal or eatNonlocal; selects which grids are copied
 + * FillLocal  when set, nbat->natoms_local is updated and the columns of
 + *            grid 0 are processed over their full length (cells times na_sc)
 + * x          source coordinates
 + * nbat       destination; nbat->x is written in the layout nbat->XFormat
 + */
 +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
 +                                     int                  locality,
 +                                     gmx_bool             FillLocal,
 +                                     rvec                *x,
 +                                     nbnxn_atomdata_t    *nbat)
 +{
 +    int g0 = 0, g1 = 0;
 +    int nth, th;
 +
 +    /* Translate the locality into the half-open grid range [g0, g1).
 +     * Any other locality value leaves the range empty (g0 == g1 == 0).
 +     */
 +    switch (locality)
 +    {
 +        case eatAll:
 +            g0 = 0;
 +            g1 = nbs->ngrid;
 +            break;
 +        case eatLocal:
 +            g0 = 0;
 +            g1 = 1;
 +            break;
 +        case eatNonlocal:
 +            g0 = 1;
 +            g1 = nbs->ngrid;
 +            break;
 +    }
 +
 +    if (FillLocal)
 +    {
 +        /* Local atom count including padding: all cells of grid 0 times
 +         * the number of atoms per cell.
 +         */
 +        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
 +    }
 +
 +    nth = gmx_omp_nthreads_get(emntPairsearch);
 +
 +    /* One loop iteration per thread; each iteration handles its own slice
 +     * of the columns of every selected grid.
 +     */
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +    for (th = 0; th < nth; th++)
 +    {
 +        int g;
 +
 +        for (g = g0; g < g1; g++)
 +        {
 +            const nbnxn_grid_t *grid;
 +            int                 cxy0, cxy1, cxy;
 +
 +            grid = &nbs->grid[g];
 +
 +            /* Column range [cxy0, cxy1) of this thread; cxy1 of thread th
 +             * equals cxy0 of thread th+1, so the ranges tile all
 +             * ncx*ncy columns without overlap.
 +             */
 +            cxy0 = (grid->ncx*grid->ncy* th   +nth-1)/nth;
 +            cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth;
 +
 +            for (cxy = cxy0; cxy < cxy1; cxy++)
 +            {
 +                int na, ash, na_fill;
 +
 +                /* na: atom count stored for this column;
 +                 * ash: offset of the column's first entry, used both into
 +                 * nbs->a and as the destination offset in nbat->x.
 +                 */
 +                na  = grid->cxy_na[cxy];
 +                ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +                if (g == 0 && FillLocal)
 +                {
 +                    /* Cover the whole column: number of cells times na_sc */
 +                    na_fill =
 +                        (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
 +                }
 +                else
 +                {
 +                    /* We fill only the real particle locations.
 +                     * We assume the filling entries at the end have been
 +                     * properly set before during ns.
 +                     */
 +                    na_fill = na;
 +                }
 +                /* NOTE(review): presumably the entries between na and
 +                 * na_fill are set from the trailing 0, 0, 0 arguments -
 +                 * confirm in copy_rvec_to_nbat_real.
 +                 */
 +                copy_rvec_to_nbat_real(nbs->a+ash, na, na_fill, x,
 +                                       nbat->XFormat, nbat->x, ash,
 +                                       0, 0, 0);
 +            }
 +        }
 +    }
 +}
 +
 +static void
 +nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
 +                           int i0, int i1)
 +{
 +    int i;
 +
 +    for (i = i0; i < i1; i++)
 +    {
 +        dest[i] = 0;
 +    }
 +}
 +
 +static void
 +nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
 +                            gmx_bool bDestSet,
 +                            real ** gmx_restrict src,
 +                            int nsrc,
 +                            int i0, int i1)
 +{
 +    int i, s;
 +
 +    if (bDestSet)
 +    {
 +        /* The destination buffer contains data, add to it */
 +        for (i = i0; i < i1; i++)
 +        {
 +            for (s = 0; s < nsrc; s++)
 +            {
 +                dest[i] += src[s][i];
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* The destination buffer is unitialized, set it first */
 +        for (i = i0; i < i1; i++)
 +        {
 +            dest[i] = src[0][i];
 +            for (s = 1; s < nsrc; s++)
 +            {
 +                dest[i] += src[s][i];
 +            }
 +        }
 +    }
 +}
 +
 +/* SIMD variant of nbnxn_atomdata_reduce_reals: sums the nsrc buffers in src
 + * over [i0, i1) into dest, adding to dest when bDestSet is set and
 + * overwriting it (starting from src[0]) otherwise.
 + * The body is only compiled with GMX_NBNXN_SIMD defined; without it the
 + * function does nothing.
 + * NOTE(review): elements are processed GMX_SIMD_WIDTH_HERE at a time with no
 + * scalar remainder loop, so i0 and i1 presumably have to be multiples of the
 + * SIMD width (and the buffers suitably aligned for gmx_load_pr/gmx_store_pr)
 + * - confirm against the callers.
 + */
 +static void
 +nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
 +                                 gmx_bool bDestSet,
 +                                 real ** gmx_restrict src,
 +                                 int nsrc,
 +                                 int i0, int i1)
 +{
 +#ifdef GMX_NBNXN_SIMD
 +/* The SIMD width here is actually independent of that in the kernels,
 + * but we use the same width for simplicity (usually optimal anyhow).
 + */
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#define GMX_MM128_HERE
 +#endif
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#define GMX_MM256_HERE
 +#endif
 +/* Included inside the function so that the gmx_*_pr macros match the width
 + * selected just above.
 + */
 +#include "gmx_simd_macros.h"
 +
 +    int       i, s;
 +    gmx_mm_pr dest_SSE, src_SSE;
 +
 +    if (bDestSet)
 +    {
 +        /* dest holds data: load it and add every source on top */
 +        for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
 +        {
 +            dest_SSE = gmx_load_pr(dest+i);
 +            for (s = 0; s < nsrc; s++)
 +            {
 +                src_SSE  = gmx_load_pr(src[s]+i);
 +                dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
 +            }
 +            gmx_store_pr(dest+i, dest_SSE);
 +        }
 +    }
 +    else
 +    {
 +        /* dest is not set: start from src[0] and add the remaining sources */
 +        for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
 +        {
 +            dest_SSE = gmx_load_pr(src[0]+i);
 +            for (s = 1; s < nsrc; s++)
 +            {
 +                src_SSE  = gmx_load_pr(src[s]+i);
 +                dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
 +            }
 +            gmx_store_pr(dest+i, dest_SSE);
 +        }
 +    }
 +
 +#undef GMX_MM128_HERE
 +#undef GMX_MM256_HERE
 +#endif
 +}
 +
 +/* Add part of the force array(s) from nbnxn_atomdata_t to f
 + *
 + * nbs     search data; nbs->cell[a] gives the nbat index of atom a
 + * nbat    atom data; FFormat selects the force buffer layout
 + * out     array of output buffers whose f arrays are read
 + * nfa     number of output buffers in out to sum over
 + * a0, a1  half-open range [a0, a1) of atoms in f that is updated
 + * f       force array that the nbat forces are added to
 + */
 +static void
 +nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
 +                                    const nbnxn_atomdata_t *nbat,
 +                                    nbnxn_atomdata_output_t *out,
 +                                    int nfa,
 +                                    int a0, int a1,
 +                                    rvec *f)
 +{
 +    int         a, i, fa;
 +    const int  *cell;
 +    const real *fnb;
 +
 +    cell = nbs->cell;
 +
 +    /* The position of atom a in the nbat force buffer depends on the buffer
 +     * layout; every layout has a single-buffer path (nfa == 1) and a path
 +     * that sums over all nfa buffers.
 +     */
 +    switch (nbat->FFormat)
 +    {
 +        case nbatXYZ:
 +        case nbatXYZQ:
 +            /* The components of one atom are adjacent, fstride reals per atom */
 +            if (nfa == 1)
 +            {
 +                fnb = out[0].f;
 +
 +                for (a = a0; a < a1; a++)
 +                {
 +                    i = cell[a]*nbat->fstride;
 +
 +                    f[a][XX] += fnb[i];
 +                    f[a][YY] += fnb[i+1];
 +                    f[a][ZZ] += fnb[i+2];
 +                }
 +            }
 +            else
 +            {
 +                for (a = a0; a < a1; a++)
 +                {
 +                    i = cell[a]*nbat->fstride;
 +
 +                    for (fa = 0; fa < nfa; fa++)
 +                    {
 +                        f[a][XX] += out[fa].f[i];
 +                        f[a][YY] += out[fa].f[i+1];
 +                        f[a][ZZ] += out[fa].f[i+2];
 +                    }
 +                }
 +            }
 +            break;
 +        case nbatX4:
 +            /* The components of one atom are PACK_X4 reals apart */
 +            if (nfa == 1)
 +            {
 +                fnb = out[0].f;
 +
 +                for (a = a0; a < a1; a++)
 +                {
 +                    i = X4_IND_A(cell[a]);
 +
 +                    f[a][XX] += fnb[i+XX*PACK_X4];
 +                    f[a][YY] += fnb[i+YY*PACK_X4];
 +                    f[a][ZZ] += fnb[i+ZZ*PACK_X4];
 +                }
 +            }
 +            else
 +            {
 +                for (a = a0; a < a1; a++)
 +                {
 +                    i = X4_IND_A(cell[a]);
 +
 +                    for (fa = 0; fa < nfa; fa++)
 +                    {
 +                        f[a][XX] += out[fa].f[i+XX*PACK_X4];
 +                        f[a][YY] += out[fa].f[i+YY*PACK_X4];
 +                        f[a][ZZ] += out[fa].f[i+ZZ*PACK_X4];
 +                    }
 +                }
 +            }
 +            break;
 +        case nbatX8:
 +            /* The components of one atom are PACK_X8 reals apart */
 +            if (nfa == 1)
 +            {
 +                fnb = out[0].f;
 +
 +                for (a = a0; a < a1; a++)
 +                {
 +                    i = X8_IND_A(cell[a]);
 +
 +                    f[a][XX] += fnb[i+XX*PACK_X8];
 +                    f[a][YY] += fnb[i+YY*PACK_X8];
 +                    f[a][ZZ] += fnb[i+ZZ*PACK_X8];
 +                }
 +            }
 +            else
 +            {
 +                for (a = a0; a < a1; a++)
 +                {
 +                    i = X8_IND_A(cell[a]);
 +
 +                    for (fa = 0; fa < nfa; fa++)
 +                    {
 +                        f[a][XX] += out[fa].f[i+XX*PACK_X8];
 +                        f[a][YY] += out[fa].f[i+YY*PACK_X8];
 +                        f[a][ZZ] += out[fa].f[i+ZZ*PACK_X8];
 +                    }
 +                }
 +            }
 +            break;
++        default:
++            gmx_incons("Unsupported nbnxn_atomdata_t format");
 +    }
 +}
 +
 +/* Add the force array(s) from nbnxn_atomdata_t to f
 + *
 + * nbs       search data (atom-to-cell mapping, atom counts, cycle counters)
 + * locality  eatAll, eatLocal or eatNonlocal; selects the atom range of f
 + * nbat      atom data holding the output force buffer(s)
 + * f         force array that the nbat forces are added to
 + *
 + * With more than one output buffer (nbat->nout > 1) the buffers are first
 + * summed into buffer 0, which is only supported for locality eatAll.
 + */
 +void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t    nbs,
 +                                    int                     locality,
 +                                    const nbnxn_atomdata_t *nbat,
 +                                    rvec                   *f)
 +{
 +    int a0 = 0, na = 0;
 +    int nth, th;
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCreducef]);
 +
 +    /* Select the first atom a0 and the atom count na for this locality.
 +     * NOTE(review): natoms_nonlocal is used as the total for eatAll and has
 +     * natoms_local subtracted for eatNonlocal, so it presumably counts
 +     * local plus non-local atoms - confirm where it is set.
 +     */
 +    switch (locality)
 +    {
 +        case eatAll:
 +            a0 = 0;
 +            na = nbs->natoms_nonlocal;
 +            break;
 +        case eatLocal:
 +            a0 = 0;
 +            na = nbs->natoms_local;
 +            break;
 +        case eatNonlocal:
 +            a0 = nbs->natoms_local;
 +            na = nbs->natoms_nonlocal - nbs->natoms_local;
 +            break;
 +    }
 +
 +    nth = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (nbat->nout > 1)
 +    {
 +        if (locality != eatAll)
 +        {
 +            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
 +        }
 +
 +        /* Reduce the force thread output buffers into buffer 0, before adding
 +         * them to the, differently ordered, "real" force buffer.
 +         */
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +        for (th = 0; th < nth; th++)
 +        {
 +            const nbnxn_buffer_flags_t *flags;
 +            int   b0, b1, b;
 +            int   i0, i1;
 +            int   nfptr;
 +            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
 +            int   out;
 +
 +            flags = &nbat->buffer_flags;
 +
 +            /* Calculate the cell-block range for our thread */
 +            b0 = (flags->nflag* th   )/nth;
 +            b1 = (flags->nflag*(th+1))/nth;
 +
 +            for (b = b0; b < b1; b++)
 +            {
 +                /* Range of reals in the force buffers covered by block b */
 +                i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
 +                i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
 +
 +                /* Collect the buffers 1..nout-1 whose bit is set in the
 +                 * flag word of this block; only those are summed.
 +                 */
 +                nfptr = 0;
 +                for (out = 1; out < nbat->nout; out++)
 +                {
 +                    if (flags->flag[b] & (1U<<out))
 +                    {
 +                        fptr[nfptr++] = nbat->out[out].f;
 +                    }
 +                }
 +                if (nfptr > 0)
 +                {
 +                    /* Bit 0 tells whether buffer 0 already holds data for
 +                     * this block (add to it) or must be overwritten.
 +                     */
 +#ifdef GMX_NBNXN_SIMD
 +                    nbnxn_atomdata_reduce_reals_simd
 +#else
 +                    nbnxn_atomdata_reduce_reals
 +#endif
 +                        (nbat->out[0].f,
 +                        flags->flag[b] & (1U<<0),
 +                        fptr, nfptr,
 +                        i0, i1);
 +                }
 +                else if (!(flags->flag[b] & (1U<<0)))
 +                {
 +                    /* No buffer has its bit set for this block: zero
 +                     * buffer 0 so the addition below reads zeros.
 +                     */
 +                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
 +                                               i0, i1);
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Add buffer 0 (nfa is passed as 1) to f; each thread handles a
 +     * contiguous part of the atom range [a0, a0+na).
 +     */
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +    for (th = 0; th < nth; th++)
 +    {
 +        nbnxn_atomdata_add_nbat_f_to_f_part(nbs, nbat,
 +                                            nbat->out,
 +                                            1,
 +                                            a0+((th+0)*na)/nth,
 +                                            a0+((th+1)*na)/nth,
 +                                            f);
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
 +}
 +
 +/* Adds the shift forces from nbnxn_atomdata_t to fshift */
 +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
 +                                              rvec                   *fshift)
 +{
 +    const nbnxn_atomdata_output_t *out;
 +    int  th;
 +    int  s;
 +    rvec sum;
 +
 +    out = nbat->out;
 +
 +    for (s = 0; s < SHIFTS; s++)
 +    {
 +        clear_rvec(sum);
 +        for (th = 0; th < nbat->nout; th++)
 +        {
 +            sum[XX] += out[th].fshift[s*DIM+XX];
 +            sum[YY] += out[th].fshift[s*DIM+YY];
 +            sum[ZZ] += out[th].fshift[s*DIM+ZZ];
 +        }
 +        rvec_inc(fshift[s], sum);
 +    }
 +}
index 1788eee771ab5c50af3e8dd2333b63b1174a31fc,0000000000000000000000000000000000000000..4acca9c3b1de7199554bef079d7e87a90e66b4f3
mode 100644,000000..100644
--- /dev/null
@@@ -1,675 -1,0 +1,676 @@@
-                          float *e_lj, float *e_el, rvec *fshift)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdlib.h>
 +#include <assert.h>
 +
 +#if defined(_MSVC)
 +#include <limits>
 +#endif
 +
 +#include <cuda.h>
 +
 +#include "types/simple.h" 
 +#include "types/nbnxn_pairlist.h"
 +#include "types/nb_verlet.h"
 +#include "types/ishift.h"
 +#include "types/force_flags.h"
 +#include "../nbnxn_consts.h"
 +
 +#ifdef TMPI_ATOMICS
 +#include "thread_mpi/atomic.h"
 +#endif
 +
 +#include "nbnxn_cuda_types.h"
 +#include "../../gmxlib/cuda_tools/cudautils.cuh"
 +#include "nbnxn_cuda.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +
 +/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
 +texture<float, 1, cudaReadModeElementType> tex_nbfp;
 +
 +/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
 +texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
 +
 +/* Convenience defines */
 +#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
 +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
 +
 +/***** The kernels come here *****/
 +#include "nbnxn_cuda_kernel_utils.cuh"
 +
 +/* Generate all combinations of kernels through multiple inclusion:
 +   F, F + E, F + prune, F + E + prune.
 +   Each inclusion of nbnxn_cuda_kernels.cuh below sees a different
 +   combination of the CALC_ENERGIES and PRUNE_NBL defines. */
 +/** Force only **/
 +#include "nbnxn_cuda_kernels.cuh"
 +/** Force & energy **/
 +#define CALC_ENERGIES
 +#include "nbnxn_cuda_kernels.cuh"
 +#undef CALC_ENERGIES
 +
 +/*** Pair-list pruning kernels ***/
 +/** Force only **/
 +/* PRUNE_NBL stays defined for both inclusions that follow */
 +#define PRUNE_NBL
 +#include "nbnxn_cuda_kernels.cuh"
 +/** Force & energy **/
 +#define CALC_ENERGIES
 +#include "nbnxn_cuda_kernels.cuh"
 +#undef CALC_ENERGIES
 +#undef PRUNE_NBL
 +
 +/*! Nonbonded kernel function pointer type
 + *
 + * The launch in nbnxn_cuda_launch_kernel passes, in this order, the atom
 + * data, the nonbonded parameters and the pair list (all three by value),
 + * followed by the flag requesting the shift force calculation.
 + */
 +typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 +                                     const cu_nbparam_t,
 +                                     const cu_plist_t,
 +                                     bool);
 +
 +/*********************************/
 +
 +/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes
 + *
 + * Each flag is true when the corresponding environment variable is set (to
 + * any value). The variables are read once, when these statics are
 + * initialized.
 + */
 +static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 +static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 +static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 +
 +
 +/* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 + * and corresponds to having the exponent set to the maximum (127 -- single
 + * precision) and the mantissa to 0.
 + *
 + * NOTE(review): 0x7FU << 23 equals 0x3F800000. Read as an IEEE-754 single
 + * this has a biased exponent field of 127 and is the encoding of 1.0f; an
 + * all-ones exponent field would be 0xFFU << 23. Confirm that a force
 + * component of exactly 1.0f in the polled word cannot stall the wait.
 + */
 +static unsigned int poll_wait_pattern = (0x7FU << 23);
 +
 +/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 +static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 +{
 +    int max_grid_x_size;
 +
 +    assert(dinfo);
 +
 +    max_grid_x_size = dinfo->prop.maxGridSize[0];
 +
 +    /* do we exceed the grid x dimension limit? */
 +    if (nwork_units > max_grid_x_size)
 +    {
 +        gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
 +                  "The number of nonbonded work units (=number of super-clusters) exceeds the"
 +                  "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 +    }
 +
 +    return nwork_units;
 +}
 +
 +
 +/* Constant arrays listing all kernel function pointers and enabling selection
 +   of a kernel in an elegant manner.
 +   Both tables are indexed as [electrostatics type][energy][prune]. */
 +
 +static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 +static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 +
 +/* Default kernels
 + * NOTE(review): the row order (ewald, ewald twin, rf, cutoff) presumably has
 + * to match the order of the electrostatics type enumeration that ends in
 + * eelCuNR - confirm against its declaration.
 + */
 +static const nbnxn_cu_kfunc_ptr_t
 +nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 +{
 +    { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
 +      { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
 +    { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
 +      { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
 +    { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
 +      { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
 +    { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
 +      { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
 +};
 +
 +/* Legacy kernels; same layout and row order as the default table */
 +static const nbnxn_cu_kfunc_ptr_t
 +nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 +{
 +    { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
 +      { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
 +    { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
 +      { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
 +    { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
 +      { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
 +    { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
 +      { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
 +};
 +
 +/*! Return a pointer to the kernel version to be executed at the current step. */
 +static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 +                                                       bool bDoEne, bool bDoPrune)
 +{
 +    assert(kver < eNbnxnCuKNR);
 +    assert(eeltype < eelCuNR);
 +
 +    if (NBNXN_KVER_LEGACY(kver))
 +    {
 +        return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 +    }
 +    else
 +    {
 +        return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 +    }
 +}
 +
 +/*! Calculates the amount of shared memory required for kernel version in use. */
 +static inline int calc_shmem_required(int kver)
 +{
 +    int shmem;
 +
 +    /* size of shmem (force-buffers/xq/atom type preloading) */
 +    if (NBNXN_KVER_LEGACY(kver))
 +    {
 +        /* i-atom x+q in shared memory */
 +        shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 +        /* force reduction buffers in shared memory */
 +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 +    }
 +    else
 +    {
 +        /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 +        /* i-atom x+q in shared memory */
 +        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 +        /* cj in shared memory, for both warps separately */
 +        shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 +#ifdef IATYPE_SHMEM
 +        /* i-atom types in shared memory */
 +        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 +#endif
 +#if __CUDA_ARCH__ < 300
 +        /* force reduction buffers in shared memory */
 +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 +#endif
 +    }
 +
 +    return shmem;
 +}
 +
 +/*! Launches the host-to-device x/q copy and the nonbonded kernel for the
 +   interaction locality iloc in the stream of that locality. Nothing is
 +   launched when the pair list of this locality holds no work.
 +
 +   As we execute nonbonded workload in separate streams, before launching
 +   the kernel we need to make sure that the following operations have completed:
 +   - atomdata allocation and related H2D transfers (every nstlist step);
 +   - pair list H2D transfer (every nstlist step);
 +   - shift vector H2D transfer (every nstlist step);
 +   - force (+shift force and energy) output clearing (every step).
 +
 +   These operations are issued in the local stream at the beginning of the step
 +   and therefore always complete before the local kernel launch. The non-local
 +   kernel is launched after the local on the same device/context, so this is
 +   inherently scheduled after the operations in the local stream (including the
 +   above "misc_ops").
 +   However, for the sake of having a future-proof implementation, we use the
 +   misc_ops_done event to record the point in time when the above  operations
 +   are finished and synchronize with this event in the non-local stream.
 +
 +   \param[in] cu_nb   CUDA nonbonded data (device buffers, streams, timers)
 +   \param[in] nbatom  host atom data; nbatom->x is the source of the x/q copy
 +   \param[in] flags   force flags; only GMX_FORCE_VIRIAL is tested here
 +   \param[in] iloc    interaction locality, used to index streams and pair lists
 +*/
 +void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 +                              const nbnxn_atomdata_t *nbatom,
 +                              int flags,
 +                              int iloc)
 +{
 +    cudaError_t stat;
 +    int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 +    /* CUDA kernel launch-related stuff */
 +    int  shmem, nblock;
 +    dim3 dim_block, dim_grid;
 +    nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 +
 +    cu_atomdata_t   *adat   = cu_nb->atdat;
 +    cu_nbparam_t    *nbp    = cu_nb->nbparam;
 +    cu_plist_t      *plist  = cu_nb->plist[iloc];
 +    cu_timers_t     *t      = cu_nb->timers;
 +    cudaStream_t    stream  = cu_nb->stream[iloc];
 +
 +    /* NOTE(review): the energy flag is derived from GMX_FORCE_VIRIAL, the
 +       same bit as the shift-force flag - confirm this is intended rather
 +       than a dedicated energy flag. */
 +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +    bool bDoTime     = cu_nb->bDoTime;
 +
 +    /* turn energy calculation always on/off (for debugging/testing only) */
 +    bCalcEner = (bCalcEner || always_ener) && !never_ener;
 +
 +    /* don't launch the kernel if there is no work to do */
 +    if (plist->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_I(iloc))
 +    {
 +        adat_begin  = 0;
 +        adat_len    = adat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_begin  = adat->natoms_local;
 +        adat_len    = adat->natoms - adat->natoms_local;
 +    }
 +
 +    /* When we get here all misc operations issued in the local stream are done,
 +       so we record that in the local stream and wait for it in the nonlocal one. */
 +    if (cu_nb->bUseTwoStreams)
 +    {
 +        if (iloc == eintLocal)
 +        {
 +            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 +            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 +        }
 +        else
 +        {
 +            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 +            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 +        }
 +    }
 +
 +    /* beginning of timed HtoD section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* HtoD x, q: the host buffer is indexed in reals with 4 entries per atom,
 +       the device buffer in elements of adat->xq */
 +    cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 +                      adat_len * sizeof(*adat->xq), stream); 
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* beginning of timed nonbonded calculation section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* get the pointer to the kernel flavor we need to use */
 +    nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 +                                    plist->bDoPrune || always_prune);
 +
 +    /* kernel launch config: one CL_SIZE x CL_SIZE thread block per
 +       super-cluster (plist->nsci blocks in x) */
 +    nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 +    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 +    dim_grid  = dim3(nblock, 1, 1);
 +    shmem     = calc_shmem_required(cu_nb->kernel_ver);
 +
 +    if (debug)
 +    {
 +        /* NOTE(review): the first value printed after "#Super-clusters/clusters"
 +           is nsci*NCL_PER_SUPERCL and the second NCL_PER_SUPERCL - confirm the
 +           label matches what is meant to be reported. */
 +        fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 +                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 +                dim_block.x, dim_block.y, dim_block.z,
 +                dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 +                NCL_PER_SUPERCL, plist->na_c);
 +    }
 +
 +    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 +    CU_LAUNCH_ERR("k_calc_nb");
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 +                               const nbnxn_atomdata_t *nbatom,
 +                               int flags,
 +                               int aloc)
 +{
 +    cudaError_t stat;
 +    int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 +    int iloc = -1;
 +
 +    /* determine interaction locality from atom locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        iloc = eintLocal;
 +    }
 +    else if (NONLOCAL_A(aloc))
 +    {
 +        iloc = eintNonlocal;
 +    }
 +    else
 +    {
 +        char stmp[STRLEN];
 +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 +        gmx_incons(stmp);
 +    }
 +
 +    cu_atomdata_t   *adat   = cu_nb->atdat;
 +    cu_timers_t     *t      = cu_nb->timers;
 +    bool            bDoTime = cu_nb->bDoTime;
 +    cudaStream_t    stream  = cu_nb->stream[iloc];
 +
 +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +
 +    /* don't launch copy-back if there was no work to do */
 +    if (cu_nb->plist[iloc]->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        adat_begin  = 0;
 +        adat_len    = adat->natoms_local;
 +        adat_end    = cu_nb->atdat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_begin  = adat->natoms_local;
 +        adat_len    = adat->natoms - adat->natoms_local;
 +        adat_end    = cu_nb->atdat->natoms;
 +    }
 +
 +    /* beginning of timed D2H section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    if (!cu_nb->bUseStreamSync)
 +    {
 +        /* For safety reasons set a few (5%) forces to NaN. This way even if the
 +           polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 +        for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 +        {
 +#ifdef NAN
 +            nbatom->out[0].f[i] = NAN;
 +#else
 +#  ifdef _MSVC
 +            if (numeric_limits<float>::has_quiet_NaN)
 +            {
 +                nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 +            }
 +            else
 +#  endif
 +            {
 +                nbatom->out[0].f[i] = GMX_REAL_MAX;
 +            }
 +#endif
 +        }
 +
 +        /* Set the last four bytes of the force array to a bit pattern
 +           which can't be the result of the force calculation:
 +           max exponent (127) and zero mantissa. */
 +        *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 +    }
 +
 +    /* With DD the local D2H transfer can only start after the non-local 
 +       has been launched. */
 +    if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 +    {
 +        stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 +        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 +    }
 +
 +    /* DtoH f */
 +    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, 
 +                      (adat_len)*sizeof(*adat->f), stream);
 +
 +    /* After the non-local D2H is launched the nonlocal_done event can be
 +       recorded which signals that the local D2H can proceed. This event is not
 +       placed after the non-local kernel because we first need the non-local
 +       data back first. */
 +    if (iloc == eintNonlocal)
 +    {
 +        stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 +        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 +    }
 +
 +    /* only transfer energies in the local stream */
 +    if (LOCAL_I(iloc))
 +    {
 +        /* DtoH fshift */
 +        if (bCalcFshift)
 +        {
 +            cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 +                              SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 +        }
 +
 +        /* DtoH energies */
 +        if (bCalcEner)
 +        {
 +            cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 +                              sizeof(*cu_nb->nbst.e_lj), stream);
 +            cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 +                              sizeof(*cu_nb->nbst.e_el), stream);
 +        }
 +    }
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +/* Atomic compare-exchange operation on unsigned values. It is used in
 + * polling wait for the GPU.
 + */
 +static inline bool atomic_cas(volatile unsigned int *ptr,
 +                              unsigned int oldval,
 +                              unsigned int newval)
 +{
 +    assert(ptr);
 +
 +#ifdef TMPI_ATOMICS
 +    return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 +#else
 +    gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 +    return true;
 +#endif
 +}
 +
 +void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 +                         const nbnxn_atomdata_t *nbatom,
 +                         int flags, int aloc,
++                         real *e_lj, real *e_el, rvec *fshift)
 +{
++    /* NOTE:  only implemented for single-precision at this time */
 +    cudaError_t stat;
 +    int i, adat_end, iloc = -1;
 +    volatile unsigned int *poll_word;
 +
 +    /* determine interaction locality from atom locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        iloc = eintLocal;
 +    }
 +    else if (NONLOCAL_A(aloc))
 +    {
 +        iloc = eintNonlocal;
 +    }
 +    else
 +    {
 +        char stmp[STRLEN];
 +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 +        gmx_incons(stmp);
 +    }
 +
 +    cu_plist_t      *plist   = cu_nb->plist[iloc];
 +    cu_timers_t     *timers  = cu_nb->timers;
 +    wallclock_gpu_t *timings = cu_nb->timings;
 +    nb_staging      nbst     = cu_nb->nbst;
 +
 +    bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +
 +    /* turn energy calculation always on/off (for debugging/testing only) */
 +    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
 +
 +    /* don't launch wait/update timers & counters if there was no work to do
 +
 +       NOTE: if timing with multiple GPUs (streams) becomes possible, the
 +       counters could end up being inconsistent due to not being incremented
 +       on some of the nodes! */
 +    if (cu_nb->plist[iloc]->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        adat_end = cu_nb->atdat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_end = cu_nb->atdat->natoms;
 +    }
 +
 +    if (cu_nb->bUseStreamSync)
 +    {
 +        stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 +        CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 +    }
 +    else 
 +    {
 +        /* Busy-wait on the signal pattern stored in the last element
 +         * of the l/nl float vector. This pattern corresponds to a floating
 +         * point number which can't be the result of the force calculation
 +         * (maximum, 127 exponent and 0 mantissa).
 +         * The polling uses atomic compare-exchange.
 +         */
 +        poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 +        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 +    }
 +
 +    /* timing data accumulation */
 +    if (cu_nb->bDoTime)
 +    {
 +        /* only increase counter once (at local F wait) */
 +        if (LOCAL_I(iloc))
 +        {
 +            timings->nb_c++;
 +            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 +        }
 +
 +        /* kernel timings */
 +        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 +            cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 +
 +        /* X/q H2D and F D2H timings */
 +        timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 +                                                 timers->stop_nb_h2d[iloc]);
 +        timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 +                                                 timers->stop_nb_d2h[iloc]);
 +
 +        /* only count atdat and pair-list H2D at pair-search step */
 +        if (plist->bDoPrune)
 +        {
 +            /* atdat transfer timing (add only once, at local F wait) */
 +            if (LOCAL_A(aloc))
 +            {
 +                timings->pl_h2d_c++;
 +                timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 +                                                         timers->stop_atdat);
 +            }
 +
 +            timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 +                                                     timers->stop_pl_h2d[iloc]);
 +        }
 +    }
 +
 +    /* add up energies and shift forces (only once at local F wait) */
 +    if (LOCAL_I(iloc))
 +    {
 +        if (bCalcEner)
 +        {
 +            *e_lj += *nbst.e_lj;
 +            *e_el += *nbst.e_el;
 +        }
 +
 +        if (bCalcFshift)
 +        {
 +            for (i = 0; i < SHIFTS; i++)
 +            {
 +                fshift[i][0] += nbst.fshift[i].x;
 +                fshift[i][1] += nbst.fshift[i].y;
 +                fshift[i][2] += nbst.fshift[i].z;
 +            }
 +        }
 +    }
 +
 +    /* turn off pruning (doesn't matter if this is pair-search step or not) */
 +    plist->bDoPrune = false;
 +}
 +
 +/*! Return the reference to the nbfp texture. */
 +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 +{
 +    return tex_nbfp;
 +}
 +
 +/*! Return the reference to the coulomb_tab. */
 +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 +{
 +    return tex_coulomb_tab;
 +}
 +
 +/*! Set up the cache configuration for the non-bonded kernels.
 + */
 +void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 +{
 +    cudaError_t stat;
 +
 +    for (int i = 0; i < eelCuNR; i++)
 +        for (int j = 0; j < nEnergyKernelTypes; j++)
 +            for (int k = 0; k < nPruneKernelTypes; k++)
 +            {
 +                /* Legacy kernel 16/48 kB Shared/L1 */
 +                stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 +                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 +
 +                if (devinfo->prop.major >= 3)
 +                {
 +                    /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 +                }
 +                else
 +                {
 +                    /* On Fermi prefer L1 gives 2% higher performance */
 +                    /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 +                }
 +                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 +            }
 +}
index f88d0ba43121da7bff96150ebeec98af6561bbee,0000000000000000000000000000000000000000..319178e066611a71cc6d1a844d8e5fee978515c0
mode 100644,000000..100644
--- /dev/null
@@@ -1,262 -1,0 +1,262 @@@
-     int                     cj_ind;    /* The current cj_ind index for the current list     */
-     int                     cj4_init;  /* The first unitialized cj4 block                   */
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nbnxn_internal_h
 +#define _nbnxn_internal_h
 +
 +#include "typedefs.h"
 +#include "domdec.h"
 +#include "gmx_cyclecounter.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +#ifdef GMX_X86_SSE2
 +/* Use 4-way SIMD for the bounding box calculations, which are always single precision */
 +#define NBNXN_SEARCH_BB_SSE
 +#endif
 +
 +
 +#ifdef GMX_NBNXN_SIMD
 +/* Memory alignment in bytes as required by SIMD aligned loads/stores */
 +#define NBNXN_MEM_ALIGN  (GMX_NBNXN_SIMD_BITWIDTH/8)
 +#else
 +/* No alignment required, but set it so we can call the same routines */
 +#define NBNXN_MEM_ALIGN  32
 +#endif
 +
 +
 +/* A pair-search grid struct for one domain decomposition zone */
 +typedef struct {
 +    rvec     c0;               /* The lower corner of the (local) grid        */
 +    rvec     c1;               /* The upper corner of the (local) grid        */
 +    real     atom_density;     /* The atom number density for the local grid  */
 +
 +    gmx_bool bSimple;          /* Is this grid simple or super/sub            */
 +    int      na_c;             /* Number of atoms per cluster                 */
 +    int      na_cj;            /* Number of atoms for list j-clusters         */
 +    int      na_sc;            /* Number of atoms per super-cluster           */
 +    int      na_c_2log;        /* 2log of na_c                                */
 +
 +    int      ncx;              /* Number of (super-)cells along x             */
 +    int      ncy;              /* Number of (super-)cells along y             */
 +    int      nc;               /* Total number of (super-)cells               */
 +
 +    real     sx;               /* x-size of a (super-)cell                    */
 +    real     sy;               /* y-size of a (super-)cell                    */
 +    real     inv_sx;           /* 1/sx                                        */
 +    real     inv_sy;           /* 1/sy                                        */
 +
 +    int      cell0;            /* Index in nbs->cell corresponding to cell 0  */
 +
 +    int     *cxy_na;           /* The number of atoms for each column in x,y  */
 +    int     *cxy_ind;          /* Grid (super)cell index, offset from cell0   */
 +    int      cxy_nalloc;       /* Allocation size for cxy_na and cxy_ind      */
 +
 +    int     *nsubc;            /* The number of sub cells for each super cell */
 +    float   *bbcz;             /* Bounding boxes in z for the super cells     */
 +    float   *bb;               /* 3D bounding boxes for the sub cells         */
 +    float   *bbj;              /* 3D j-b.boxes for SSE-double or AVX-single   */
 +    int     *flags;            /* Flag for the super cells                    */
 +    int      nc_nalloc;        /* Allocation size for the pointers above      */
 +
 +    float   *bbcz_simple;      /* bbcz for simple grid converted from super   */
 +    float   *bb_simple;        /* bb for simple grid converted from super     */
 +    int     *flags_simple;     /* flags for simple grid converted from super  */
 +    int      nc_nalloc_simple; /* Allocation size for the pointers above   */
 +
 +    int      nsubc_tot;        /* Total number of subcell, used for printing  */
 +} nbnxn_grid_t;
 +
 +#ifdef GMX_NBNXN_SIMD
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#define GMX_MM128_HERE
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#define GMX_MM256_HERE
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 +#endif
 +#endif
 +#include "gmx_simd_macros.h"
 +
 +typedef struct nbnxn_x_ci_simd_4xn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
 +    gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
 +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
 +    gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
 +} nbnxn_x_ci_simd_4xn_t;
 +
 +typedef struct nbnxn_x_ci_simd_2xnn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
 +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
 +} nbnxn_x_ci_simd_2xnn_t;
 +
 +#endif
 +
 +/* Working data for the actual i-supercell during pair search */
 +typedef struct nbnxn_list_work {
 +    gmx_cache_protect_t     cp0;   /* Protect cache between threads               */
 +
 +    float                  *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
 +    real                   *x_ci;  /* The coordinates, pbc shifted, for each atom       */
 +#ifdef GMX_NBNXN_SIMD
 +    nbnxn_x_ci_simd_4xn_t  *x_ci_simd_4xn;
 +    nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
 +#endif
-     float                  *d2;        /* Bounding box distance work array                  */
++    int                     cj_ind;          /* The current cj_ind index for the current list     */
++    int                     cj4_init;        /* The first uninitialized cj4 block                 */
 +
-     nbnxn_cj_t             *cj;        /* The j-cell list                                   */
-     int                     cj_nalloc; /* Allocation size of cj                             */
++    float                  *d2;              /* Bounding box distance work array                  */
 +
-     int                     ncj_noq;   /* Nr. of cluster pairs without Coul for flop count  */
-     int                     ncj_hlj;   /* Nr. of cluster pairs with 1/2 LJ for flop count   */
++    nbnxn_cj_t             *cj;              /* The j-cell list                                   */
++    int                     cj_nalloc;       /* Allocation size of cj                             */
 +
-     gmx_cache_protect_t     cp1;       /* Protect cache between threads               */
++    int                     ncj_noq;         /* Nr. of cluster pairs without Coul for flop count  */
++    int                     ncj_hlj;         /* Nr. of cluster pairs with 1/2 LJ for flop count   */
 +
 +    int                    *sort;            /* Sort index                    */
 +    int                     sort_nalloc;     /* Allocation size of sort       */
 +
 +    nbnxn_sci_t            *sci_sort;        /* Second sci array, for sorting */
 +    int                     sci_sort_nalloc; /* Allocation size of sci_sort   */
 +
++    gmx_cache_protect_t     cp1;             /* Protect cache between threads               */
 +} nbnxn_list_work_t;
 +
 +/* Function type for setting the i-atom coordinate working data */
 +typedef void
 +    gmx_icell_set_x_t (int ci,
 +                       real shx, real shy, real shz,
 +                       int na_c,
 +                       int stride, const real *x,
 +                       nbnxn_list_work_t *work);
 +
 +static gmx_icell_set_x_t icell_set_x_simple;
 +#ifdef GMX_NBNXN_SIMD
 +static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
 +static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
 +#endif
 +static gmx_icell_set_x_t icell_set_x_supersub;
 +#ifdef NBNXN_SEARCH_SSE
 +static gmx_icell_set_x_t icell_set_x_supersub_sse8;
 +#endif
 +
 +#undef GMX_MM128_HERE
 +#undef GMX_MM256_HERE
 +
 +/* Local cycle count struct for profiling */
 +typedef struct {
 +    int          count;
 +    gmx_cycles_t c;
 +    gmx_cycles_t start;
 +} nbnxn_cycle_t;
 +
 +/* Local cycle count enum for profiling */
 +enum {
 +    enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr
 +};
 +
 +/* Thread-local work struct, contains part of nbnxn_grid_t */
 +typedef struct {
 +    gmx_cache_protect_t  cp0;
 +
 +    int                 *cxy_na;
 +    int                  cxy_na_nalloc;
 +
 +    int                 *sort_work;
 +    int                  sort_work_nalloc;
 +
 +    nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
 +
 +    int                  ndistc;       /* Number of distance checks for flop counting */
 +
 +    nbnxn_cycle_t        cc[enbsCCnr];
 +
 +    gmx_cache_protect_t  cp1;
 +} nbnxn_search_work_t;
 +
 +/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
 +typedef struct nbnxn_search {
 +    int                 ePBC;            /* PBC type enum                              */
 +    matrix              box;             /* The periodic unit-cell                     */
 +
 +    gmx_bool            DomDec;          /* Are we doing domain decomposition?         */
 +    ivec                dd_dim;          /* Are we doing DD in x,y,z?                  */
 +    gmx_domdec_zones_t *zones;           /* The domain decomposition zones        */
 +
 +    int                 ngrid;           /* The number of grids, equal to #DD-zones    */
 +    nbnxn_grid_t       *grid;            /* Array of grids, size ngrid                 */
 +    int                *cell;            /* Actual allocated cell array for all grids  */
 +    int                 cell_nalloc;     /* Allocation size of cell                    */
 +    int                *a;               /* Atom index for grid, the inverse of cell   */
 +    int                 a_nalloc;        /* Allocation size of a                       */
 +
 +    int                 natoms_local;    /* The local atoms run from 0 to natoms_local */
 +    int                 natoms_nonlocal; /* The non-local atoms run from natoms_local
 +                                          * to natoms_nonlocal */
 +
 +    gmx_bool             print_cycles;
 +    int                  search_count;
 +    nbnxn_cycle_t        cc[enbsCCnr];
 +
 +    gmx_icell_set_x_t   *icell_set_x; /* Function for setting i-coords    */
 +
 +    int                  nthread_max; /* Maximum number of threads for pair-search  */
 +    nbnxn_search_work_t *work;        /* Work array, size nthread_max          */
 +} nbnxn_search_t_t;
 +
 +
 +static void nbs_cycle_start(nbnxn_cycle_t *cc)
 +{
 +    cc->start = gmx_cycles_read();
 +}
 +
 +static void nbs_cycle_stop(nbnxn_cycle_t *cc)
 +{
 +    cc->c += gmx_cycles_read() - cc->start;
 +    cc->count++;
 +}
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 7829fcdb95fbacac51fa409f868bb00a265dfa8f,0000000000000000000000000000000000000000..fb4aaf8da208c6591e70e6fd2691549ce670dd22
mode 100644,000000..100644
--- /dev/null
@@@ -1,5127 -1,0 +1,5147 @@@
-  * invh is the inverse hole spacing.
-  * nsort, the theortical hole limit, is only used for debugging.
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "gmx_cyclecounter.h"
 +#include "gmxfio.h"
 +#include "gmx_omp_nthreads.h"
 +#include "nrnb.h"
 +
 +
 +/* Pair search box lower and upper corner in x,y,z.
 + * Store this in 4 instead of 3 reals, which is useful with SSE.
 + * To avoid complicating the code we also use 4 without SSE.
 + */
 +#define NNBSBB_C         4
 +#define NNBSBB_B         (2*NNBSBB_C)
 +/* Pair search box lower and upper bound in z only. */
 +#define NNBSBB_D         2
 +/* Pair search box lower and upper corner x,y,z indices */
 +#define BBL_X  0
 +#define BBL_Y  1
 +#define BBL_Z  2
 +#define BBU_X  4
 +#define BBU_Y  5
 +#define BBU_Z  6
 +
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* We use SSE or AVX-128bit for bounding box calculations */
 +
 +#ifndef GMX_DOUBLE
 +/* Single precision BBs + coordinates, we can also load coordinates using SSE */
 +#define NBNXN_SEARCH_SSE_SINGLE
 +#endif
 +
 +/* Include basic SSE2 stuff */
 +#include <emmintrin.h>
 +
 +#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
 +/* Store bounding boxes with x, y and z coordinates in packs of 4 */
 +#define NBNXN_PBB_SSE
 +#endif
 +
 +/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
 + * Here AVX-256 turns out to be slightly slower than AVX-128.
 + */
 +#define STRIDE_PBB        4
 +#define STRIDE_PBB_2LOG   2
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef GMX_NBNXN_SIMD
 +
 +/* The functions below are macros as they are performance sensitive */
 +
 +/* 4x4 list, pack=4: no complex conversion required */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J4(ci)   (ci)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
 +
 +/* 4x2 list, pack=4: j-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J2(ci)  ((ci)<<1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
 +
 +/* 4x8 list, pack=8: i-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J8(ci)  ((ci)>>1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
 +#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
 +
 +/* The j-cluster size is matched to the SIMD width */
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#ifdef GMX_DOUBLE
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
 +#else
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
 +#endif
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#ifdef GMX_DOUBLE
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
 +#else
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
 +/* Half SIMD width j-cluster size */
 +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
 +#endif
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_WIDTH"
 +#endif
 +#endif
 +
 +#endif /* GMX_NBNXN_SIMD */
 +
 +
 +/* Interaction masks for 4xN atom interactions.
 + * Bit i*CJ_SIZE + j tells if atom i and j interact.
 + */
 +/* The all-interaction mask is the same for all kernels */
 +#define NBNXN_INT_MASK_ALL        0xffffffff
 +/* 4x4 kernel diagonal mask */
 +#define NBNXN_INT_MASK_DIAG       0x08ce
 +/* 4x2 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
 +#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
 +/* 4x8 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
 +#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
 +
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Store bounding box corners as quadruplets: xxxxyyyyzzzz */
 +#define NBNXN_BBXXXX
 +/* Size of bounding box corners quadruplet */
 +#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
 +#endif
 +
 +/* We shift the i-particles backward for PBC.
 + * This leads to more conditionals than shifting forward.
 + * We do this to get more balanced pair lists.
 + */
 +#define NBNXN_SHIFT_BACKWARD
 +
 +
 +/* This define is a lazy way to avoid interdependence of the grid
 + * and searching data structures.
 + */
 +#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
 +
 +
 +static void nbs_cycle_clear(nbnxn_cycle_t *cc)
 +{
 +    int i;
 +
 +    for (i = 0; i < enbsCCnr; i++)
 +    {
 +        cc[i].count = 0;
 +        cc[i].c     = 0;
 +    }
 +}
 +
 +static double Mcyc_av(const nbnxn_cycle_t *cc)
 +{
 +    return (double)cc->c*1e-6/cc->count;
 +}
 +
 +static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
 +{
 +    int n;
 +    int t;
 +
 +    fprintf(fp, "\n");
 +    fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
 +            nbs->cc[enbsCCgrid].count,
 +            Mcyc_av(&nbs->cc[enbsCCgrid]),
 +            Mcyc_av(&nbs->cc[enbsCCsearch]),
 +            Mcyc_av(&nbs->cc[enbsCCreducef]));
 +
 +    if (nbs->nthread_max > 1)
 +    {
 +        if (nbs->cc[enbsCCcombine].count > 0)
 +        {
 +            fprintf(fp, " comb %5.2f",
 +                    Mcyc_av(&nbs->cc[enbsCCcombine]));
 +        }
 +        fprintf(fp, " s. th");
 +        for (t = 0; t < nbs->nthread_max; t++)
 +        {
 +            fprintf(fp, " %4.1f",
 +                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
 +        }
 +    }
 +    fprintf(fp, "\n");
 +}
 +
 +static void nbnxn_grid_init(nbnxn_grid_t * grid)
 +{
 +    grid->cxy_na      = NULL;
 +    grid->cxy_ind     = NULL;
 +    grid->cxy_nalloc  = 0;
 +    grid->bb          = NULL;
 +    grid->bbj         = NULL;
 +    grid->nc_nalloc   = 0;
 +}
 +
 +static int get_2log(int n)
 +{
 +    int log2;
 +
 +    log2 = 0;
 +    while ((1<<log2) < n)
 +    {
 +        log2++;
 +    }
 +    if ((1<<log2) != n)
 +    {
 +        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
 +    }
 +
 +    return log2;
 +}
 +
 +static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 +{
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk4x4_PlainC:
 +        case nbnxnk4xN_SIMD_4xN:
 +        case nbnxnk4xN_SIMD_2xNN:
 +            return NBNXN_CPU_CLUSTER_I_SIZE;
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            /* The cluster size for super/sub lists is only set here.
 +             * Any value should work for the pair-search and atomdata code.
 +             * The kernels, of course, might require a particular value.
 +             */
 +            return NBNXN_GPU_CLUSTER_SIZE;
 +        default:
 +            gmx_incons("unknown kernel type");
 +    }
 +
 +    return 0;
 +}
 +
 +int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 +{
 +    int nbnxn_simd_width = 0;
 +    int cj_size          = 0;
 +
 +#ifdef GMX_NBNXN_SIMD
 +    nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
 +#endif
 +
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk4x4_PlainC:
 +            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
 +            break;
 +        case nbnxnk4xN_SIMD_4xN:
 +            cj_size = nbnxn_simd_width;
 +            break;
 +        case nbnxnk4xN_SIMD_2xNN:
 +            cj_size = nbnxn_simd_width/2;
 +            break;
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +            break;
 +        default:
 +            gmx_incons("unknown kernel type");
 +    }
 +
 +    return cj_size;
 +}
 +
 +static int ci_to_cj(int na_cj_2log, int ci)
 +{
 +    switch (na_cj_2log)
 +    {
 +        case 2: return ci;     break;
 +        case 1: return (ci<<1); break;
 +        case 3: return (ci>>1); break;
 +    }
 +
 +    return 0;
 +}
 +
 +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 +{
 +    if (nb_kernel_type == nbnxnkNotSet)
 +    {
 +        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
 +    }
 +
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            return FALSE;
 +
 +        case nbnxnk4x4_PlainC:
 +        case nbnxnk4xN_SIMD_4xN:
 +        case nbnxnk4xN_SIMD_2xNN:
 +            return TRUE;
 +
 +        default:
 +            gmx_incons("Invalid nonbonded kernel type passed!");
 +            return FALSE;
 +    }
 +}
 +
 +void nbnxn_init_search(nbnxn_search_t    * nbs_ptr,
 +                       ivec               *n_dd_cells,
 +                       gmx_domdec_zones_t *zones,
 +                       int                 nthread_max)
 +{
 +    nbnxn_search_t nbs;
 +    int            d, g, t;
 +
 +    snew(nbs, 1);
 +    *nbs_ptr = nbs;
 +
 +    nbs->DomDec = (n_dd_cells != NULL);
 +
 +    clear_ivec(nbs->dd_dim);
 +    nbs->ngrid = 1;
 +    if (nbs->DomDec)
 +    {
 +        nbs->zones = zones;
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if ((*n_dd_cells)[d] > 1)
 +            {
 +                nbs->dd_dim[d] = 1;
 +                /* Each grid matches a DD zone */
 +                nbs->ngrid *= 2;
 +            }
 +        }
 +    }
 +
 +    snew(nbs->grid, nbs->ngrid);
 +    for (g = 0; g < nbs->ngrid; g++)
 +    {
 +        nbnxn_grid_init(&nbs->grid[g]);
 +    }
 +    nbs->cell        = NULL;
 +    nbs->cell_nalloc = 0;
 +    nbs->a           = NULL;
 +    nbs->a_nalloc    = 0;
 +
 +    nbs->nthread_max = nthread_max;
 +
 +    /* Initialize the work data structures for each thread */
 +    snew(nbs->work, nbs->nthread_max);
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        nbs->work[t].cxy_na           = NULL;
 +        nbs->work[t].cxy_na_nalloc    = 0;
 +        nbs->work[t].sort_work        = NULL;
 +        nbs->work[t].sort_work_nalloc = 0;
 +    }
 +
 +    /* Initialize detailed nbsearch cycle counting */
 +    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
 +    nbs->search_count = 0;
 +    nbs_cycle_clear(nbs->cc);
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        nbs_cycle_clear(nbs->work[t].cc);
 +    }
 +}
 +
 +static real grid_atom_density(int n, rvec corner0, rvec corner1)
 +{
 +    rvec size;
 +
 +    rvec_sub(corner1, corner0, size);
 +
 +    return n/(size[XX]*size[YY]*size[ZZ]);
 +}
 +
 +static int set_grid_size_xy(const nbnxn_search_t nbs,
 +                            nbnxn_grid_t *grid,
 +                            int dd_zone,
 +                            int n, rvec corner0, rvec corner1,
 +                            real atom_density,
 +                            int XFormat)
 +{
 +    rvec size;
 +    int  na_c;
 +    real adens, tlen, tlen_x, tlen_y, nc_max;
 +    int  t;
 +
 +    rvec_sub(corner1, corner0, size);
 +
 +    if (n > grid->na_sc)
 +    {
 +        /* target cell length */
 +        if (grid->bSimple)
 +        {
 +            /* To minimize the zero interactions, we should make
 +             * the largest of the i/j cell cubic.
 +             */
 +            na_c = max(grid->na_c, grid->na_cj);
 +
 +            /* Approximately cubic cells */
 +            tlen   = pow(na_c/atom_density, 1.0/3.0);
 +            tlen_x = tlen;
 +            tlen_y = tlen;
 +        }
 +        else
 +        {
 +            /* Approximately cubic sub cells */
 +            tlen   = pow(grid->na_c/atom_density, 1.0/3.0);
 +            tlen_x = tlen*GPU_NSUBCELL_X;
 +            tlen_y = tlen*GPU_NSUBCELL_Y;
 +        }
 +        /* We round ncx and ncy down, because we get fewer cell pairs
 +         * in the pair list when the fixed cell dimensions (x,y) are
 +         * larger than the variable one (z) than the other way around.
 +         */
 +        grid->ncx = max(1, (int)(size[XX]/tlen_x));
 +        grid->ncy = max(1, (int)(size[YY]/tlen_y));
 +    }
 +    else
 +    {
 +        grid->ncx = 1;
 +        grid->ncy = 1;
 +    }
 +
 +    grid->sx     = size[XX]/grid->ncx;
 +    grid->sy     = size[YY]/grid->ncy;
 +    grid->inv_sx = 1/grid->sx;
 +    grid->inv_sy = 1/grid->sy;
 +
 +    if (dd_zone > 0)
 +    {
 +        /* This is a non-home zone, add an extra row of cells
 +         * for particles communicated for bonded interactions.
 +         * These can be beyond the cut-off. It doesn't matter where
 +         * they end up on the grid, but for performance it's better
 +         * if they don't end up in cells that can be within cut-off range.
 +         */
 +        grid->ncx++;
 +        grid->ncy++;
 +    }
 +
 +    /* We need one additional cell entry for particles moved by DD */
 +    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
 +    {
 +        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +        srenew(grid->cxy_na, grid->cxy_nalloc);
 +        srenew(grid->cxy_ind, grid->cxy_nalloc+1);
 +    }
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
 +        {
 +            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
 +        }
 +    }
 +
 +    /* Worst case scenario of 1 atom in each last cell */
 +    if (grid->na_cj <= grid->na_c)
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
 +    }
 +    else
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
 +    }
 +
 +    if (nc_max > grid->nc_nalloc)
 +    {
 +        int bb_nalloc;
 +
 +        grid->nc_nalloc = over_alloc_large(nc_max);
 +        srenew(grid->nsubc, grid->nc_nalloc);
 +        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
 +#ifdef NBNXN_PBB_SSE
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
 +#else
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
 +#endif
 +        sfree_aligned(grid->bb);
 +        /* This snew also zeros the contents, this avoid possible
 +         * floating exceptions in SSE with the unused bb elements.
 +         */
 +        snew_aligned(grid->bb, bb_nalloc, 16);
 +
 +        if (grid->bSimple)
 +        {
 +            if (grid->na_cj == grid->na_c)
 +            {
 +                grid->bbj = grid->bb;
 +            }
 +            else
 +            {
 +                sfree_aligned(grid->bbj);
 +                snew_aligned(grid->bbj, bb_nalloc*grid->na_c/grid->na_cj, 16);
 +            }
 +        }
 +
 +        srenew(grid->flags, grid->nc_nalloc);
 +    }
 +
 +    copy_rvec(corner0, grid->c0);
 +    copy_rvec(corner1, grid->c1);
 +
 +    return nc_max;
 +}
 +
 +/* We need to sort particles in grid columns on z-coordinate.
 + * As particles are very often distributed homogeneously, we use a sorting
 + * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 + * by a factor, cast to an int and try to store in that hole. If the hole
 + * is full, we move this or another particle. A second pass is needed to make
 + * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 + * 4 is the optimal value for homogeneous particle distribution and allows
 + * for an O(#particles) sort up to distributions where all particles are
 + * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 + * as it can be expensive to detect inhomogeneous particle distributions.
 + * SGSF is the maximum ratio of holes used: in the worst case all particles
 + * end up in the last hole and we need #particles extra holes at the end.
 + */
 +#define SORT_GRID_OVERSIZE 4
 +#define SGSF (SORT_GRID_OVERSIZE + 1)
 +
 +/* Sort particle index a on coordinates x along dim.
 + * Backwards tells if we want decreasing instead of increasing coordinates.
 + * h0 is the minimum of the coordinate range.
-                        real h0, real invh, int nsort, int *sort)
++ * invh is the 1/length of the sorting range.
++ * n_per_h (>=n) is the expected average number of particles per 1/invh
 + * sort is the sorting work array.
++ * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
++ * or easier, allocate at least n*SGSF elements.
 + * NOTE(review): since n_per_h >= n, "n*SGSF" above presumably means
 + * n_per_h*SGSF elements - confirm against the callers' allocation.
 + * A slot of sort is treated as empty when its value is negative; every slot
 + * that gets filled here is reset to -1 in the second pass, so the array can
 + * be reused by the next call without re-initialization.
 + * On return a[0..n-1] holds the same indices in sorted order.
 + * gmx_incons() is called when fewer than n indices are recovered.
 + */
 +static void sort_atoms(int dim, gmx_bool Backwards,
 +                       int *a, int n, rvec *x,
-     int i, c;
++                       real h0, real invh, int n_per_h,
++                       int *sort)
 +{
- #ifdef DEBUG_NBNXN_GRIDDING
-         if (zi < 0 || zi >= nsort)
++    int nsort, i, c;
 +    int zi, zim, zi_min, zi_max;
 +    int cp, tmp;
 +
 +    if (n <= 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
++#ifndef NDEBUG
++    if (n > n_per_h)
++    {
++        gmx_incons("n > n_per_h");
++    }
++#endif
++
++    /* Transform the inverse range height into the inverse hole height */
++    invh *= n_per_h*SORT_GRID_OVERSIZE;
++
++    /* Set nsort to the maximum possible number of holes used.
++     * In worst case all n elements end up in the last bin.
++     */
++    nsort = n_per_h*SORT_GRID_OVERSIZE + n;
++
 +    /* Determine the index range used, so we can limit it for the second pass.
 +     * Note that only the Backwards second pass below uses zi_min/zi_max;
 +     * the forward pass scans all nsort holes.
 +     */
 +    zi_min = INT_MAX;
 +    zi_max = -1;
 +
 +    /* Sort the particles using a simple index sort */
 +    for (i = 0; i < n; i++)
 +    {
 +        /* The cast takes care of floating-point rounding effects below zero.
 +         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 +         * times the box height out of the box.
 +         */
 +        zi = (int)((x[a[i]][dim] - h0)*invh);
 +
-             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
-                       a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi, nsort);
++#ifndef NDEBUG
++        /* As we can have rounding effect, we use > instead of >= here */
++        if (zi < 0 || zi > n_per_h*SORT_GRID_OVERSIZE)
 +        {
-                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
-                    ncz*grid->na_sc*SGSF, sort_work);
++            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
++                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
++                      n_per_h, SORT_GRID_OVERSIZE);
 +        }
 +#endif
 +
 +        /* Ideally this particle should go in sort cell zi,
 +         * but that might already be in use,
 +         * in that case find the first empty cell higher up
 +         */
 +        if (sort[zi] < 0)
 +        {
 +            sort[zi] = a[i];
 +            zi_min   = min(zi_min, zi);
 +            zi_max   = max(zi_max, zi);
 +        }
 +        else
 +        {
 +            /* We have multiple atoms in the same sorting slot.
 +             * Sort on real z for minimal bounding box size.
 +             * There is an extra check for identical z to ensure
 +             * well-defined output order, independent of input order
 +             * to ensure binary reproducibility after restarts.
 +             */
 +            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 +                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
 +                                       a[i] > sort[zi])))
 +            {
 +                zi++;
 +            }
 +
 +            if (sort[zi] >= 0)
 +            {
 +                /* Shift all elements by one slot until we find an empty slot */
 +                cp  = sort[zi];
 +                zim = zi + 1;
 +                while (sort[zim] >= 0)
 +                {
 +                    tmp       = sort[zim];
 +                    sort[zim] = cp;
 +                    cp        = tmp;
 +                    zim++;
 +                }
 +                sort[zim] = cp;
 +                zi_max    = max(zi_max, zim);
 +            }
 +            sort[zi] = a[i];
 +            zi_max   = max(zi_max, zi);
 +        }
 +    }
 +
 +    c = 0;
 +    if (!Backwards)
 +    {
 +        for (zi = 0; zi < nsort; zi++)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++]   = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (zi = zi_max; zi >= zi_min; zi--)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++]   = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    if (c < n)
 +    {
 +        gmx_incons("Lost particles while sorting");
 +    }
 +}
 +
 +#ifdef GMX_DOUBLE /* R2F_D / R2F_U: convert a bound x to float.
 +                   * Here the value is first scaled by (1 -/+ GMX_FLOAT_EPS),
 +                   * with the sign of the correction chosen from the sign of x,
 +                   * so that R2F_D moves x downwards and R2F_U moves x upwards
 +                   * before the cast to float.
 +                   */
 +#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 +#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 +#else /* without GMX_DOUBLE both macros pass x through unchanged */
 +#define R2F_D(x) (x)
 +#define R2F_U(x) (x)
 +#endif /* GMX_DOUBLE */
 +
 +/* Coordinate order x,y,z, bb order xyz0.
 + * Computes the axis-aligned bounding box of na atoms. The coordinates of
 + * atom j are read from x[j*stride+XX], x[j*stride+YY] and x[j*stride+ZZ].
 + * The lower bounds are stored in bb[BBL_X], bb[BBL_Y], bb[BBL_Z] through
 + * R2F_D and the upper bounds in bb[BBU_X], bb[BBU_Y], bb[BBU_Z] through R2F_U.
 + * The first atom is read unconditionally, so na >= 1 is assumed.
 + */
 +static void calc_bounding_box(int na, int stride, const real *x, float *bb)
 +{
 +    int  i, j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    i  = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[i+XX]);
 +        xh = max(xh, x[i+XX]);
 +        yl = min(yl, x[i+YY]);
 +        yh = max(yh, x[i+YY]);
 +        zl = min(zl, x[i+ZZ]);
 +        zh = max(zh, x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0.
 + * Computes the bounding box of na atoms stored in packed layout, where
 + * component d of atom j is read from x[j + d*PACK_X4].
 + * Lower bounds go to bb[BBL_*] through R2F_D, upper bounds to bb[BBU_*]
 + * through R2F_U. The first atom is read unconditionally (na >= 1 assumed).
 + */
 +static void calc_bounding_box_x_x4(int na, const real *x, float *bb)
 +{
 +    int  j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    xl = x[XX*PACK_X4];
 +    xh = x[XX*PACK_X4];
 +    yl = x[YY*PACK_X4];
 +    yh = x[YY*PACK_X4];
 +    zl = x[ZZ*PACK_X4];
 +    zh = x[ZZ*PACK_X4];
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[j+XX*PACK_X4]);
 +        xh = max(xh, x[j+XX*PACK_X4]);
 +        yl = min(yl, x[j+YY*PACK_X4]);
 +        yh = max(yh, x[j+YY*PACK_X4]);
 +        zl = min(zl, x[j+ZZ*PACK_X4]);
 +        zh = max(zh, x[j+ZZ*PACK_X4]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0.
 + * Same as calc_bounding_box_x_x4, but for the layout where component d
 + * of atom j is read from x[j + d*PACK_X8].
 + * Lower bounds go to bb[BBL_*] through R2F_D, upper bounds to bb[BBU_*]
 + * through R2F_U. The first atom is read unconditionally (na >= 1 assumed).
 + */
 +static void calc_bounding_box_x_x8(int na, const real *x, float *bb)
 +{
 +    int  j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    xl = x[XX*PACK_X8];
 +    xh = x[XX*PACK_X8];
 +    yl = x[YY*PACK_X8];
 +    yh = x[YY*PACK_X8];
 +    zl = x[ZZ*PACK_X8];
 +    zh = x[ZZ*PACK_X8];
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[j+XX*PACK_X8]);
 +        xh = max(xh, x[j+XX*PACK_X8]);
 +        yl = min(yl, x[j+YY*PACK_X8]);
 +        yh = max(yh, x[j+YY*PACK_X8]);
 +        zl = min(zl, x[j+ZZ*PACK_X8]);
 +        zh = max(zh, x[j+ZZ*PACK_X8]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* Packed coordinates, bb order xyz0.
 + * Computes two half bounding boxes and their union for one x4-packed cell:
 + * bbj receives the box of the first min(na,2) atoms, bbj+NNBSBB_B the box
 + * of up to 2 atoms starting at x+(PACK_X4>>1), and bb receives the
 + * component-wise min of the two lower bounds and max of the two upper bounds.
 + * The lower and upper bounds of a box are accessed as groups of 4 floats at
 + * offsets 0 and NNBSBB_C using _mm_load_ps/_mm_store_ps, so bb and bbj are
 + * presumably required to be 16-byte aligned - confirm against the allocation.
 + */
 +static void calc_bounding_box_x_x4_halves(int na, const real *x,
 +                                          float *bb, float *bbj)
 +{
 +    calc_bounding_box_x_x4(min(na, 2), x, bbj);
 +
 +    if (na > 2)
 +    {
 +        calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+NNBSBB_B);
 +    }
 +    else
 +    {
 +        /* Set the "empty" bounding box to the same as the first one,
 +         * so we don't need to treat special cases in the rest of the code.
 +         */
 +        _mm_store_ps(bbj+NNBSBB_B, _mm_load_ps(bbj));
 +        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C, _mm_load_ps(bbj+NNBSBB_C));
 +    }
 +
 +    _mm_store_ps(bb, _mm_min_ps(_mm_load_ps(bbj),
 +                                _mm_load_ps(bbj+NNBSBB_B)));
 +    _mm_store_ps(bb+NNBSBB_C, _mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
 +                                         _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
 +}
 +
 +/* Coordinate order xyz, bb order xxxxyyyyzzzz.
 + * Computes the bounding box of na atoms read from x[j*stride+XX/YY/ZZ],
 + * exactly like calc_bounding_box, but stores the six bounds with a spacing
 + * of STRIDE_PBB floats: lower x,y,z at bb[0..2*STRIDE_PBB] and upper x,y,z
 + * at bb[3*STRIDE_PBB..5*STRIDE_PBB].
 + * The first atom is read unconditionally (na >= 1 assumed).
 + */
 +static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 +{
 +    int  i, j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    i  = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[i+XX]);
 +        xh = max(xh, x[i+XX]);
 +        yl = min(yl, x[i+YY]);
 +        yh = max(yh, x[i+YY]);
 +        zl = min(zl, x[i+ZZ]);
 +        zh = max(zh, x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[0*STRIDE_PBB] = R2F_D(xl);
 +    bb[1*STRIDE_PBB] = R2F_D(yl);
 +    bb[2*STRIDE_PBB] = R2F_D(zl);
 +    bb[3*STRIDE_PBB] = R2F_U(xh);
 +    bb[4*STRIDE_PBB] = R2F_U(yh);
 +    bb[5*STRIDE_PBB] = R2F_U(zh);
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +
 +/* Coordinate order xyz?, bb order xyz0.
 + * Computes the bounding box of na atoms with SSE: 4 floats are loaded per
 + * atom from x+i*NNBSBB_C and reduced with packed min/max, so the 4th ("?")
 + * element takes part in the reduction as well.
 + * The 4 lower bounds are stored at bb and the 4 upper bounds at bb+4.
 + * _mm_load_ps/_mm_store_ps are used, so x and bb are presumably required
 + * to be 16-byte aligned - confirm against the callers.
 + */
 +static void calc_bounding_box_sse(int na, const float *x, float *bb)
 +{
 +    __m128 bb_0_SSE, bb_1_SSE;
 +    __m128 x_SSE;
 +
 +    int    i;
 +
 +    bb_0_SSE = _mm_load_ps(x);
 +    bb_1_SSE = bb_0_SSE;
 +
 +    for (i = 1; i < na; i++)
 +    {
 +        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
 +        bb_0_SSE = _mm_min_ps(bb_0_SSE, x_SSE);
 +        bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE);
 +    }
 +
 +    _mm_store_ps(bb, bb_0_SSE);
 +    _mm_store_ps(bb+4, bb_1_SSE);
 +}
 +
 +/* Coordinate order xyz?, bb order xxxxyyyyzzzz.
 + * Computes the bounding box with calc_bounding_box_sse into the scratch
 + * buffer bb_work (xyz0 order) and then copies the six bounds to bb with
 + * a spacing of STRIDE_PBB elements.
 + * NOTE(review): bb is declared as real* here, while the caller in fill_cell
 + * passes a float*; this presumably relies on real being float whenever
 + * NBNXN_SEARCH_SSE_SINGLE is defined - confirm.
 + */
 +static void calc_bounding_box_xxxx_sse(int na, const float *x,
 +                                       float *bb_work,
 +                                       real *bb)
 +{
 +    calc_bounding_box_sse(na, x, bb_work);
 +
 +    bb[0*STRIDE_PBB] = bb_work[BBL_X];
 +    bb[1*STRIDE_PBB] = bb_work[BBL_Y];
 +    bb[2*STRIDE_PBB] = bb_work[BBL_Z];
 +    bb[3*STRIDE_PBB] = bb_work[BBU_X];
 +    bb[4*STRIDE_PBB] = bb_work[BBU_Y];
 +    bb[5*STRIDE_PBB] = bb_work[BBU_Z];
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE_SINGLE */
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* Combines pairs of consecutive bounding boxes.
 + * For every grid column, each pair of consecutive boxes in bb is merged
 + * (min of the lower bounds, max of the upper bounds) and the result is
 + * written to grid->bbj. When a column holds an odd number of boxes, the
 + * last one is copied to grid->bbj unchanged.
 + * The box count per column is derived as (cxy_na+3)>>2, i.e. this code
 + * presumably expects 4 atoms per input bounding box - confirm.
 + */
 +static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const float *bb)
 +{
 +    int    i, j, sc2, nc2, c2;
 +    __m128 min_SSE, max_SSE;
 +
 +    for (i = 0; i < grid->ncx*grid->ncy; i++)
 +    {
 +        /* Starting bb in a column is expected to be 2-aligned */
 +        sc2 = grid->cxy_ind[i]>>1;
 +        /* For odd numbers skip the last bb here */
 +        nc2 = (grid->cxy_na[i]+3)>>(2+1);
 +        for (c2 = sc2; c2 < sc2+nc2; c2++)
 +        {
 +            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
 +            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
 +            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C, min_SSE);
 +            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C, max_SSE);
 +        }
 +        if (((grid->cxy_na[i]+3)>>2) & 1)
 +        {
 +            /* Copy the last bb for odd bb count in this column;
 +             * c2 equals sc2+nc2 here, the index just past the merged pairs.
 +             */
 +            for (j = 0; j < NNBSBB_C; j++)
 +            {
 +                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
 +                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
 +            }
 +        }
 +    }
 +}
 +
 +#endif
 +
 +
 +/* Prints the average bb size, used for debug output.
 + * Averages (upper - lower) per dimension over the grid->nc boxes in
 + * grid->bb and prints to fp: the box-derived cell size per dimension,
 + * the average bb size, and the ratio of the two.
 + * NOTE(review): the average divides by grid->nc without a check;
 + * presumably this is only called for non-empty grids - confirm.
 + */
 +static void print_bbsizes_simple(FILE                *fp,
 +                                 const nbnxn_search_t nbs,
 +                                 const nbnxn_grid_t  *grid)
 +{
 +    int  c, d;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    for (c = 0; c < grid->nc; c++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
 +        }
 +    }
 +    dsvmul(1.0/grid->nc, ba, ba);
 +
 +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/grid->ncx,
 +            nbs->box[YY][YY]/grid->ncy,
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
 +            ba[XX], ba[YY], ba[ZZ],
 +            ba[XX]*grid->ncx/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Prints the average bb size, used for debug output.
 + * Sums (upper - lower) per dimension over the sub-cell bounding boxes of
 + * all grid->nc cells, divides by the total sub-cell count ns and prints
 + * to fp: the box-derived sub-cell size, the average bb size and their ratio.
 + * With NBNXN_BBXXXX the boxes are read in the xxxxyyyyzzzz layout.
 + * NOTE(review): in the NBNXN_BBXXXX branch every group of STRIDE_PBB boxes
 + * is summed in full, while ns only counts grid->nsubc[c]; when nsubc is not
 + * a multiple of STRIDE_PBB the unused entries presumably contribute to the
 + * sum as well - confirm whether they are guaranteed to be zero.
 + */
 +static void print_bbsizes_supersub(FILE                *fp,
 +                                   const nbnxn_search_t nbs,
 +                                   const nbnxn_grid_t  *grid)
 +{
 +    int  ns, c, s;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    ns = 0;
 +    for (c = 0; c < grid->nc; c++)
 +    {
 +#ifdef NBNXN_BBXXXX
 +        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
 +        {
 +            int cs_w, i, d;
 +
 +            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
 +            for (i = 0; i < STRIDE_PBB; i++)
 +            {
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    ba[d] +=
 +                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
 +                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
 +                }
 +            }
 +        }
 +#else
 +        for (s = 0; s < grid->nsubc[c]; s++)
 +        {
 +            int cs, d;
 +
 +            cs = c*GPU_NSUBCELL + s;
 +            for (d = 0; d < DIM; d++)
 +            {
 +                ba[d] +=
 +                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
 +                    grid->bb[cs*NNBSBB_B         +d];
 +            }
 +        }
 +#endif
 +        ns += grid->nsubc[c];
 +    }
 +    dsvmul(1.0/ns, ba, ba);
 +
 +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
 +            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
 +            ba[XX], ba[YY], ba[ZZ],
 +            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 + * Also sets interaction flags.
 + *
 + * The atom range [a0,a1) of order is processed in chunks of na_c entries;
 + * subc counts the chunks. For each chunk:
 + *  - NBNXN_CI_DO_LJ(subc) is set in *flags when at least one atom has
 + *    GET_CGINFO_HAS_VDW set in atinfo;
 + *  - NBNXN_CI_HALF_LJ(subc) is set when at most half of na_c atoms have VDW,
 + *    and in that case order is rewritten with the VDW atoms first, but only
 + *    if the last VDW atom currently sits in the second half of the chunk;
 + *  - NBNXN_CI_DO_COUL(subc) is set when at least one atom has
 + *    GET_CGINFO_HAS_Q set.
 + * *flags is cleared on entry. nbat is not referenced in the body.
 + * NOTE(review): the reordering writes to order[a0+...] rather than
 + * order[s+...]; the two only agree for the first chunk, so this presumably
 + * relies on callers passing a range of at most na_c atoms - confirm.
 + */
 +void sort_on_lj(nbnxn_atomdata_t *nbat, int na_c,
 +                int a0, int a1, const int *atinfo,
 +                int *order,
 +                int *flags)
 +{
 +    int      subc, s, a, n1, n2, a_lj_max, i, j;
 +    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    gmx_bool haveQ;
 +
 +    *flags = 0;
 +
 +    subc = 0;
 +    for (s = a0; s < a1; s += na_c)
 +    {
 +        /* Make lists for this (sub-)cell on atoms with and without LJ */
 +        n1       = 0;
 +        n2       = 0;
 +        haveQ    = FALSE;
 +        a_lj_max = -1;
 +        for (a = s; a < min(s+na_c, a1); a++)
 +        {
 +            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 +
 +            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 +            {
 +                sort1[n1++] = order[a];
 +                a_lj_max    = a;
 +            }
 +            else
 +            {
 +                sort2[n2++] = order[a];
 +            }
 +        }
 +
 +        /* If we don't have atom with LJ, there's nothing to sort */
 +        if (n1 > 0)
 +        {
 +            *flags |= NBNXN_CI_DO_LJ(subc);
 +
 +            if (2*n1 <= na_c)
 +            {
 +                /* Only sort when strictly necessary.
 +                 * Ordering particles can lead to less accurate summation
 +                 * due to rounding, both for LJ and Coulomb interactions.
 +                 */
 +                if (2*(a_lj_max - s) >= na_c)
 +                {
 +                    for (i = 0; i < n1; i++)
 +                    {
 +                        order[a0+i] = sort1[i];
 +                    }
 +                    for (j = 0; j < n2; j++)
 +                    {
 +                        order[a0+n1+j] = sort2[j];
 +                    }
 +                }
 +
 +                *flags |= NBNXN_CI_HALF_LJ(subc);
 +            }
 +        }
 +        if (haveQ)
 +        {
 +            *flags |= NBNXN_CI_DO_COUL(subc);
 +        }
 +        subc++;
 +    }
 +}
 +
 +/* Fill a pair search cell with atoms.
 + * Potentially sorts atoms and sets the interaction flags.
 + *
 + * Handles the atoms with sorted indices nbs->a[a0..a1-1]:
 + *  1. for simple grids, calls sort_on_lj, which may reorder nbs->a in
 + *     [a0,a1) and sets the flags entry of this cell;
 + *  2. records the grid position of each atom in nbs->cell;
 + *  3. copies the coordinates from x into nbat->x via copy_rvec_to_nbat_real
 + *     (sx, sy, sz are forwarded to that routine unchanged);
 + *  4. computes the bounding box of the cell and stores it in grid->bb, in
 + *     a layout selected by nbat->XFormat, grid->bSimple and the
 + *     NBNXN_* compile-time switches.
 + * bb_work is only used as scratch space in the NBNXN_SEARCH_SSE_SINGLE
 + * path with XFormat nbatXYZQ; other paths do not touch it.
 + */
 +void fill_cell(const nbnxn_search_t nbs,
 +               nbnxn_grid_t *grid,
 +               nbnxn_atomdata_t *nbat,
 +               int a0, int a1,
 +               const int *atinfo,
 +               rvec *x,
 +               int sx, int sy, int sz,
 +               float *bb_work)
 +{
 +    int     na, a;
 +    size_t  offset;
 +    float  *bb_ptr;
 +
 +    na = a1 - a0;
 +
 +    if (grid->bSimple)
 +    {
 +        sort_on_lj(nbat, grid->na_c, a0, a1, atinfo, nbs->a,
 +                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
 +    }
 +
 +    /* Now we have sorted the atoms, set the cell indices */
 +    for (a = a0; a < a1; a++)
 +    {
 +        nbs->cell[nbs->a[a]] = a;
 +    }
 +
 +    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
 +                           nbat->XFormat, nbat->x, a0,
 +                           sx, sy, sz);
 +
 +    if (nbat->XFormat == nbatX4)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
 +        if (2*grid->na_cj == grid->na_c)
 +        {
 +            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
 +                                          grid->bbj+offset*2);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
 +        }
 +    }
 +    else if (nbat->XFormat == nbatX8)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
 +    }
 +#ifdef NBNXN_BBXXXX
 +    else if (!grid->bSimple)
 +    {
 +        /* Store the bounding boxes in a format convenient
 +         * for SSE calculations: xxxxyyyyzzzz...
 +         * The first term below selects the group of STRIDE_PBB boxes,
 +         * the second term the position of this box within the group.
 +         */
 +        bb_ptr =
 +            grid->bb +
 +            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 +            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +        if (nbat->XFormat == nbatXYZQ)
 +        {
 +            calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
 +                                       bb_work, bb_ptr);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
 +                                   bb_ptr);
 +        }
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx, sy, sz,
 +                    bb_ptr[0*STRIDE_PBB], bb_ptr[3*STRIDE_PBB],
 +                    bb_ptr[1*STRIDE_PBB], bb_ptr[4*STRIDE_PBB],
 +                    bb_ptr[2*STRIDE_PBB], bb_ptr[5*STRIDE_PBB]);
 +        }
 +    }
 +#endif
 +    else
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +
 +        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
 +                          bb_ptr);
 +
 +        if (gmx_debug_at)
 +        {
 +            int bbo;
 +            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
 +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx, sy, sz,
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column.
 + * Simple-grid variant: for every column cxy in [cxy_start,cxy_end) the
 + * atoms of that column are sorted on z with sort_atoms, the ncz cells of
 + * the column are filled with fill_cell (na_sc atoms per cell, the last
 + * one possibly partial or empty), the z-bounds of each cell are copied to
 + * grid->bbcz, and the unused tail of nbs->a for the column is set to -1.
 + * a0 and a1 are only used for the debug print.
 + * sort_work is the work array passed on to sort_atoms.
 + */
 +static void sort_columns_simple(const nbnxn_search_t nbs,
 +                                int dd_zone,
 +                                nbnxn_grid_t *grid,
 +                                int a0, int a1,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                nbnxn_atomdata_t *nbat,
 +                                int cxy_start, int cxy_end,
 +                                int *sort_work)
 +{
 +    int  cxy;
 +    int  cx, cy, cz, ncz, cfilled, c;
 +    int  na, ash, ind, a;
 +    int  na_c, ash_c;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0, cxy_start, cxy_end, a0, a1);
 +    }
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ, FALSE,
 +                   nbs->a+ash, na, x,
 +                   grid->c0[ZZ],
-                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
-                    ncz*grid->na_sc*SGSF, sort_work);
++                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
++                   sort_work);
 +
 +        /* Fill the ncz cells in this column */
 +        cfilled = grid->cxy_ind[cxy];
 +        for (cz = 0; cz < ncz; cz++)
 +        {
 +            c  = grid->cxy_ind[cxy] + cz;
 +
 +            ash_c = ash + cz*grid->na_sc;
 +            na_c  = min(grid->na_sc, na-(ash_c-ash));
 +
 +            fill_cell(nbs, grid, nbat,
 +                      ash_c, ash_c+na_c, atinfo, x,
 +                      grid->na_sc*cx + (dd_zone >> 2),
 +                      grid->na_sc*cy + (dd_zone & 3),
 +                      grid->na_sc*cz,
 +                      NULL);
 +
 +            /* This copy to bbcz is not really necessary.
 +             * But it allows to use the same grid search code
 +             * for the simple and supersub cell setups.
 +             * For an empty cell the bounds of the last filled cell
 +             * in this column (cfilled) are used instead.
 +             */
 +            if (na_c > 0)
 +            {
 +                cfilled = c;
 +            }
 +            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
 +            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for (ind = na; ind < ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column.
 + * Super/sub-cell variant: for every column cxy in [cxy_start,cxy_end) the
 + * atoms are first sorted on z, then each z-slab of subdiv_z atoms is sorted
 + * on y (when GPU_NSUBCELL_Y > 1) and each y-slab of subdiv_y atoms on x
 + * (when GPU_NSUBCELL_X > 1), alternating the sort direction between
 + * neighboring slabs. Each resulting chunk of grid->na_c atoms is passed to
 + * fill_cell. Per super cell, grid->nsubc and the z-bounds in grid->bbcz are
 + * set. Finally the unused tail of nbs->a for the column is set to -1.
 + * a0 and a1 are only used for the debug print.
 + * sort_work is the work array passed on to sort_atoms.
 + */
 +static void sort_columns_supersub(const nbnxn_search_t nbs,
 +                                  int dd_zone,
 +                                  nbnxn_grid_t *grid,
 +                                  int a0, int a1,
 +                                  const int *atinfo,
 +                                  rvec *x,
 +                                  nbnxn_atomdata_t *nbat,
 +                                  int cxy_start, int cxy_end,
 +                                  int *sort_work)
 +{
 +    int  cxy;
 +    int  cx, cy, cz = -1, c = -1, ncz;
 +    int  na, ash, na_c, ind, a;
 +    int  subdiv_z, sub_z, na_z, ash_z;
 +    int  subdiv_y, sub_y, na_y, ash_y;
 +    int  subdiv_x, sub_x, na_x, ash_x;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    float bb_work_array[NNBSBB_B+3], *bb_work_align;
 +
 +    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0, cxy_start, cxy_end, a0, a1);
 +    }
 +
 +    subdiv_x = grid->na_c;
 +    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
 +    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ, FALSE,
 +                   nbs->a+ash, na, x,
 +                   grid->c0[ZZ],
-                        subdiv_y*SORT_GRID_OVERSIZE*grid->inv_sy,
-                        subdiv_y*SGSF, sort_work);
++                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
++                   sort_work);
 +
 +        /* This loop goes over the supercells and subcells along z at once */
 +        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
 +        {
 +            ash_z = ash + sub_z*subdiv_z;
 +            na_z  = min(subdiv_z, na-(ash_z-ash));
 +
 +            /* We have already sorted on z */
 +
 +            if (sub_z % GPU_NSUBCELL_Z == 0)
 +            {
 +                cz = sub_z/GPU_NSUBCELL_Z;
 +                c  = grid->cxy_ind[cxy] + cz;
 +
 +                /* The number of atoms in this supercell */
 +                na_c = min(grid->na_sc, na-(ash_z-ash));
 +
 +                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
 +
 +                /* Store the z-boundaries of the super cell */
 +                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
 +                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
 +            }
 +
 +#if GPU_NSUBCELL_Y > 1
 +            /* Sort the atoms along y */
 +            sort_atoms(YY, (sub_z & 1),
 +                       nbs->a+ash_z, na_z, x,
 +                       grid->c0[YY]+cy*grid->sy,
-                            subdiv_x*SORT_GRID_OVERSIZE*grid->inv_sx,
-                            subdiv_x*SGSF, sort_work);
++                       grid->inv_sy, subdiv_z,
++                       sort_work);
 +#endif
 +
 +            for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
 +            {
 +                ash_y = ash_z + sub_y*subdiv_y;
 +                na_y  = min(subdiv_y, na-(ash_y-ash));
 +
 +#if GPU_NSUBCELL_X > 1
 +                /* Sort the atoms along x */
 +                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
 +                           nbs->a+ash_y, na_y, x,
 +                           grid->c0[XX]+cx*grid->sx,
- #ifdef DEBUG_NBNXN_GRIDDING
-                 if (cx < 0 || cx >= grid->ncx ||
-                     cy < 0 || cy >= grid->ncy)
++                           grid->inv_sx, subdiv_y,
++                           sort_work);
 +#endif
 +
 +                for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
 +                {
 +                    ash_x = ash_y + sub_x*subdiv_x;
 +                    na_x  = min(subdiv_x, na-(ash_x-ash));
 +
 +                    fill_cell(nbs, grid, nbat,
 +                              ash_x, ash_x+na_x, atinfo, x,
 +                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
 +                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
 +                              grid->na_c*sub_z,
 +                              bb_work_align);
 +                }
 +            }
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for (ind = na; ind < ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid column atoms should go.
 + *
 + * Thread number "thread" out of "nthread" bins its own contiguous share
 + * of the atom range [a0,a1). For every atom in that share the combined
 + * x/y column index is written to cell[] and counted in cxy_na[], which
 + * is zeroed here first. cxy_na has grid->ncx*grid->ncy+1 entries; the
 + * last one counts home-zone atoms for which move[i] is negative.
 + */
 +static void calc_column_indices(nbnxn_grid_t *grid,
 +                                int a0, int a1,
 +                                rvec *x,
 +                                int dd_zone, const int *move,
 +                                int thread, int nthread,
 +                                int *cell,
 +                                int *cxy_na)
 +{
 +    int  n0, n1, i;
 +    int  cx, cy;
 +
 +    /* We add one extra cell for particles which moved during DD */
 +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
 +    {
 +        cxy_na[i] = 0;
 +    }
 +    /* The share [n0,n1) of the atom range that this thread handles */
 +    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
 +    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
 +    if (dd_zone == 0)
 +    {
 +        /* Home zone */
 +        for (i = n0; i < n1; i++)
 +        {
 +            if (move == NULL || move[i] >= 0)
 +            {
 +                /* We need to be careful with rounding,
 +                 * particles might be a few bits outside the local zone.
 +                 * The int cast takes care of the lower bound,
 +                 * we will explicitly take care of the upper bound.
 +                 */
 +                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +                /* The check below accepts cx == ncx and cy == ncy, these are clamped further down */
-                 subc_in_range_sse8
++#ifndef NDEBUG
++                if (cx < 0 || cx > grid->ncx ||
++                    cy < 0 || cy > grid->ncy)
 +                {
 +                    gmx_fatal(FARGS,
 +                              "grid cell cx %d cy %d out of range (max %d %d)\n"
 +                              "atom %f %f %f, grid->c0 %f %f",
 +                              cx, cy, grid->ncx, grid->ncy,
 +                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
 +                }
 +#endif
 +                /* Take care of potential rounding issues */
 +                cx = min(cx, grid->ncx - 1);
 +                cy = min(cy, grid->ncy - 1);
 +
 +                /* For the moment cell will contain only the, grid local,
 +                 * x and y indices, not z.
 +                 */
 +                cell[i] = cx*grid->ncy + cy;
 +            }
 +            else
 +            {
 +                /* Put this moved particle after the end of the grid,
 +                 * so we can process it later without using conditionals.
 +                 */
 +                cell[i] = grid->ncx*grid->ncy;
 +            }
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +    else
 +    {
 +        /* Non-home zone */
 +        for (i = n0; i < n1; i++)
 +        {
 +            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +
 +            /* For non-home zones there could be particles outside
 +             * the non-bonded cut-off range, which have been communicated
 +             * for bonded interactions only. For the result it doesn't
 +             * matter where these end up on the grid. For performance
 +             * we put them in an extra row at the border.
 +             */
 +            cx = max(cx, 0);
 +            cx = min(cx, grid->ncx - 1);
 +            cy = max(cy, 0);
 +            cy = min(cy, grid->ncy - 1);
 +
 +            /* For the moment cell will contain only the, grid local,
 +             * x and y indices, not z.
 +             */
 +            cell[i] = cx*grid->ncy + cy;
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid cells the atoms should go */
 +static void calc_cell_indices(const nbnxn_search_t nbs,
 +                              int dd_zone,
 +                              nbnxn_grid_t *grid,
 +                              int a0, int a1,
 +                              const int *atinfo,
 +                              rvec *x,
 +                              const int *move,
 +                              nbnxn_atomdata_t *nbat)
 +{
 +    int   n0, n1, i;
 +    int   cx, cy, cxy, ncz_max, ncz;
 +    int   nthread, thread;
 +    int  *cxy_na, cxy_na_i;
 +
 +    nthread = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
 +                            nbs->cell, nbs->work[thread].cxy_na);
 +    }
 +
 +    /* Make the cell index as a function of x and y */
 +    ncz_max          = 0;
 +    ncz              = 0;
 +    grid->cxy_ind[0] = 0;
 +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
 +    {
 +        /* We set ncz_max at the beginning of the loop iso at the end
 +         * to skip i=grid->ncx*grid->ncy which are moved particles
 +         * that do not need to be ordered on the grid.
 +         */
 +        if (ncz > ncz_max)
 +        {
 +            ncz_max = ncz;
 +        }
 +        cxy_na_i = nbs->work[0].cxy_na[i];
 +        for (thread = 1; thread < nthread; thread++)
 +        {
 +            cxy_na_i += nbs->work[thread].cxy_na[i];
 +        }
 +        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
 +        if (nbat->XFormat == nbatX8)
 +        {
 +            /* Make the number of cell a multiple of 2 */
 +            ncz = (ncz + 1) & ~1;
 +        }
 +        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
 +        /* Clear cxy_na, so we can reuse the array below */
 +        grid->cxy_na[i] = 0;
 +    }
 +    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
 +
 +    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
 +                grid->na_sc, grid->na_c, grid->nc,
 +                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
 +                ncz_max);
 +        if (gmx_debug_at)
 +        {
 +            i = 0;
 +            for (cy = 0; cy < grid->ncy; cy++)
 +            {
 +                for (cx = 0; cx < grid->ncx; cx++)
 +                {
 +                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
 +                    i++;
 +                }
 +                fprintf(debug, "\n");
 +            }
 +        }
 +    }
 +
 +    /* Make sure the work array for sorting is large enough */
 +    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
 +    {
 +        for (thread = 0; thread < nbs->nthread_max; thread++)
 +        {
 +            nbs->work[thread].sort_work_nalloc =
 +                over_alloc_large(ncz_max*grid->na_sc*SGSF);
 +            srenew(nbs->work[thread].sort_work,
 +                   nbs->work[thread].sort_work_nalloc);
 +            /* When not in use, all elements should be -1 */
 +            for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
 +            {
 +                nbs->work[thread].sort_work[i] = -1;
 +            }
 +        }
 +    }
 +
 +    /* Now we know the dimensions we can fill the grid.
 +     * This is the first, unsorted fill. We sort the columns after this.
 +     */
 +    for (i = a0; i < a1; i++)
 +    {
 +        /* At this point nbs->cell contains the local grid x,y indices */
 +        cxy = nbs->cell[i];
 +        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
 +    }
 +
 +    if (dd_zone == 0)
 +    {
 +        /* Set the cell indices for the moved particles */
 +        n0 = grid->nc*grid->na_sc;
 +        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
 +        if (dd_zone == 0)
 +        {
 +            for (i = n0; i < n1; i++)
 +            {
 +                nbs->cell[nbs->a[i]] = i;
 +            }
 +        }
 +    }
 +
 +    /* Sort the super-cell columns along z into the sub-cells. */
 +#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
 +    for (thread = 0; thread < nbs->nthread_max; thread++)
 +    {
 +        if (grid->bSimple)
 +        {
 +            sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
 +                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                nbs->work[thread].sort_work);
 +        }
 +        else
 +        {
 +            sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
 +                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                  nbs->work[thread].sort_work);
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid, grid->bb);
 +    }
 +#endif
 +
 +    if (!grid->bSimple)
 +    {
 +        grid->nsubc_tot = 0;
 +        for (i = 0; i < grid->nc; i++)
 +        {
 +            grid->nsubc_tot += grid->nsubc[i];
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (grid->bSimple)
 +        {
 +            print_bbsizes_simple(debug, nbs, grid);
 +        }
 +        else
 +        {
 +            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
 +                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
 +
 +            print_bbsizes_supersub(debug, nbs, grid);
 +        }
 +    }
 +}
 +
 +static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
 +                              int                   natoms)
 +{
 +    int b;
 +
 +    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
 +    if (flags->nflag > flags->flag_nalloc)
 +    {
 +        flags->flag_nalloc = over_alloc_large(flags->nflag);
 +        srenew(flags->flag, flags->flag_nalloc);
 +    }
 +    for (b = 0; b < flags->nflag; b++)
 +    {
 +        flags->flag[b] = 0;
 +    }
 +}
 +
 +/* Sets up a grid and puts the atoms on the grid.
 + * This function only operates on one domain of the domain decompostion.
 + * Note that without domain decomposition there is only one domain.
 + */
 +void nbnxn_put_on_grid(nbnxn_search_t nbs,
 +                       int ePBC, matrix box,
 +                       int dd_zone,
 +                       rvec corner0, rvec corner1,
 +                       int a0, int a1,
 +                       real atom_density,
 +                       const int *atinfo,
 +                       rvec *x,
 +                       int nmoved, int *move,
 +                       int nb_kernel_type,
 +                       nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    int           n;
 +    int           nc_max_grid, nc_max;
 +
 +    grid = &nbs->grid[dd_zone];
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCgrid]);
 +
 +    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
 +    grid->na_c_2log = get_2log(grid->na_c);
 +
 +    nbat->na_c = grid->na_c;
 +
 +    if (dd_zone == 0)
 +    {
 +        grid->cell0 = 0;
 +    }
 +    else
 +    {
 +        grid->cell0 =
 +            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
 +            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
 +    }
 +
 +    n = a1 - a0;
 +
 +    if (dd_zone == 0)
 +    {
 +        nbs->ePBC = ePBC;
 +        copy_mat(box, nbs->box);
 +
 +        if (atom_density >= 0)
 +        {
 +            grid->atom_density = atom_density;
 +        }
 +        else
 +        {
 +            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
 +        }
 +
 +        grid->cell0 = 0;
 +
 +        nbs->natoms_local    = a1 - nmoved;
 +        /* We assume that nbnxn_put_on_grid is called first
 +         * for the local atoms (dd_zone=0).
 +         */
 +        nbs->natoms_nonlocal = a1 - nmoved;
 +    }
 +    else
 +    {
 +        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);
 +    }
 +
 +    nc_max_grid = set_grid_size_xy(nbs, grid,
 +                                   dd_zone, n-nmoved, corner0, corner1,
 +                                   nbs->grid[0].atom_density,
 +                                   nbat->XFormat);
 +
 +    nc_max = grid->cell0 + nc_max_grid;
 +
 +    if (a1 > nbs->cell_nalloc)
 +    {
 +        nbs->cell_nalloc = over_alloc_large(a1);
 +        srenew(nbs->cell, nbs->cell_nalloc);
 +    }
 +
 +    /* To avoid conditionals we store the moved particles at the end of a,
 +     * make sure we have enough space.
 +     */
 +    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
 +    {
 +        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
 +        srenew(nbs->a, nbs->a_nalloc);
 +    }
 +
 +    /* We need padding up to a multiple of the buffer flag size: simply add */
 +    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
 +    {
 +        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
 +    }
 +
 +    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
 +
 +    if (dd_zone == 0)
 +    {
 +        nbat->natoms_local = nbat->natoms;
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
 +}
 +
 +/* Calls nbnxn_put_on_grid for all non-local domains */
 +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t            nbs,
 +                                const gmx_domdec_zones_t *zones,
 +                                const int                *atinfo,
 +                                rvec                     *x,
 +                                int                       nb_kernel_type,
 +                                nbnxn_atomdata_t         *nbat)
 +{
 +    int  zone, d;
 +    rvec c0, c1;
 +
 +    for (zone = 1; zone < zones->n; zone++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            c0[d] = zones->size[zone].bb_x0[d];
 +            c1[d] = zones->size[zone].bb_x1[d];
 +        }
 +
 +        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
 +                          zone, c0, c1,
 +                          zones->cg_range[zone],
 +                          zones->cg_range[zone+1],
 +                          -1,
 +                          atinfo,
 +                          x,
 +                          0, NULL,
 +                          nb_kernel_type,
 +                          nbat);
 +    }
 +}
 +
 +/* Add simple grid type information to the local super/sub grid.
 + *
 + * Works on grid 0 only and calls gmx_incons when that grid is a simple grid.
 + * For each cluster of NBNXN_CPU_CLUSTER_I_SIZE atoms a bounding box is
 + * computed from nbat->x and stored in grid->bb_simple, its z-range is
 + * stored in grid->bbcz_simple and an interaction flag in grid->flags_simple.
 + */
 +void nbnxn_grid_add_simple(nbnxn_search_t    nbs,
 +                           nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    float        *bbcz, *bb;
 +    int           ncd, sc;
 +
 +    grid = &nbs->grid[0];
 +
 +    if (grid->bSimple)
 +    {
 +        gmx_incons("nbnxn_grid_simple called with a simple grid");
 +    }
 +    /* Number of clusters of NBNXN_CPU_CLUSTER_I_SIZE atoms in na_sc atoms */
 +    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    if (grid->nc*ncd > grid->nc_nalloc_simple)
 +    {
 +        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
 +        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
 +        srenew(grid->bb_simple, grid->nc_nalloc_simple*NNBSBB_B);
 +        srenew(grid->flags_simple, grid->nc_nalloc_simple);
 +        /* NOTE(review): this tests for any non-zero XFormat; confirm this is the intended set of formats */
 +        if (nbat->XFormat)
 +        {
 +            sfree_aligned(grid->bbj);
 +            snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
 +        }
 +    }
 +
 +    bbcz = grid->bbcz_simple;
 +    bb   = grid->bb_simple;
 +
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for (sc = 0; sc < grid->nc; sc++)
 +    {
 +        int c, tx, na;
 +
 +        for (c = 0; c < ncd; c++)
 +        {
 +            tx = sc*ncd + c;
 +            /* Drop trailing atoms that have the last atom type (ntype-1);
 +             * NOTE(review): presumably these are filler atoms - confirm.
 +             */
 +            na = NBNXN_CPU_CLUSTER_I_SIZE;
 +            while (na > 0 &&
 +                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
 +            {
 +                na--;
 +            }
 +
 +            if (na > 0)
 +            {
 +                /* The coordinate layout determines which bounding box routine to use */
 +                switch (nbat->XFormat)
 +                {
 +                    case nbatX4:
 +                        /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
 +                        calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
 +                                               bb+tx*NNBSBB_B);
 +                        break;
 +                    case nbatX8:
 +                        /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
 +                        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
 +                                               bb+tx*NNBSBB_B);
 +                        break;
 +                    default:
 +                        calc_bounding_box(na, nbat->xstride,
 +                                          nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
 +                                          bb+tx*NNBSBB_B);
 +                        break;
 +                }
 +                /* Copy the z-components at offsets 0 and NNBSBB_C of the box just computed */
 +                bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
 +                bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
 +
 +                /* No interaction optimization yet here */
 +                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
 +            }
 +            else
 +            {
 +                grid->flags_simple[tx] = 0;
 +            }
 +        }
 +    }
 +    /* NOTE(review): a simple grid was rejected at the top, so if gmx_incons
 +     * does not return the condition below looks unreachable - confirm intent.
 +     */
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid, grid->bb_simple);
 +    }
 +#endif
 +}
 +
 +void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
 +{
 +    *ncx = nbs->grid[0].ncx;
 +    *ncy = nbs->grid[0].ncy;
 +}
 +
 +void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
 +{
 +    const nbnxn_grid_t *grid;
 +
 +    grid = &nbs->grid[0];
 +
 +    /* Return the atom order for the home cell (index 0) */
 +    *a  = nbs->a;
 +
 +    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
 +}
 +
 +void nbnxn_set_atomorder(nbnxn_search_t nbs)
 +{
 +    nbnxn_grid_t *grid;
 +    int           ao, cx, cy, cxy, cz, j;
 +
 +    /* Set the atom order for the home cell (index 0) */
 +    grid = &nbs->grid[0];
 +
 +    ao = 0;
 +    for (cx = 0; cx < grid->ncx; cx++)
 +    {
 +        for (cy = 0; cy < grid->ncy; cy++)
 +        {
 +            cxy = cx*grid->ncy + cy;
 +            j   = grid->cxy_ind[cxy]*grid->na_sc;
 +            for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
 +            {
 +                nbs->a[j]     = ao;
 +                nbs->cell[ao] = j;
 +                ao++;
 +                j++;
 +            }
 +        }
 +    }
 +}
 +
 +/* Determines the cell range along one dimension that
 + * the bounding box b0 - b1 sees.
 + */
 +static void get_cell_range(real b0, real b1,
 +                           int nc, real c0, real s, real invs,
 +                           real d2, real r2, int *cf, int *cl)
 +{
 +    *cf = max((int)((b0 - c0)*invs), 0);
 +
 +    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
 +    {
 +        (*cf)--;
 +    }
 +
 +    *cl = min((int)((b1 - c0)*invs), nc-1);
 +    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
 +    {
 +        (*cl)++;
 +    }
 +}
 +
 +/* Reference code calculating the distance^2 between two bounding boxes */
 +static float box_dist2(float bx0, float bx1, float by0,
 +                       float by1, float bz0, float bz1,
 +                       const float *bb)
 +{
 +    float d2;
 +    float dl, dh, dm, dm0;
 +
 +    d2 = 0;
 +
 +    dl  = bx0 - bb[BBU_X];
 +    dh  = bb[BBL_X] - bx1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = by0 - bb[BBU_Y];
 +    dh  = bb[BBL_Y] - by1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bz0 - bb[BBU_Z];
 +    dh  = bb[BBL_Z] - bz1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +/* Plain C code calculating the distance^2 between two bounding boxes */
 +static float subc_bb_dist2(int si, const float *bb_i_ci,
 +                           int csj, const float *bb_j_all)
 +{
 +    const float *bb_i, *bb_j;
 +    float        d2;
 +    float        dl, dh, dm, dm0;
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    d2 = 0;
 +
 +    dl  = bb_i[BBL_X] - bb_j[BBU_X];
 +    dh  = bb_j[BBL_X] - bb_i[BBU_X];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
 +    dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
 +    dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* SSE code for bb distance for bb format xyz0.
 + *
 + * For bounding box si of bb_i_ci and bounding box csj of bb_j_all two
 + * vectors are loaded each, at offsets 0 and NNBSBB_C. Per component the
 + * larger of (i0 - j1) and (j0 - i1) is taken and clipped at zero; the
 + * sum of the squares of components 0, 1 and 2 is returned.
 + * The na_c argument is not used.
 + */
 +static float subc_bb_dist2_sse(int na_c,
 +                               int si, const float *bb_i_ci,
 +                               int csj, const float *bb_j_all)
 +{
 +    const float *bb_i, *bb_j;
 +
 +    __m128       bb_i_SSE0, bb_i_SSE1;
 +    __m128       bb_j_SSE0, bb_j_SSE1;
 +    __m128       dl_SSE;
 +    __m128       dh_SSE;
 +    __m128       dm_SSE;
 +    __m128       dm0_SSE;
 +    __m128       d2_SSE;
 +#ifndef GMX_X86_SSE4_1
 +    float        d2_array[7], *d2_align;
 +    /* Round an address inside d2_array down to a multiple of 16 bytes for the aligned store below */
 +    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
 +#else
 +    float d2;
 +#endif
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +    /* _mm_load_ps is an aligned load; NOTE(review): assumes the bb arrays are 16-byte aligned - confirm */
 +    bb_i_SSE0 = _mm_load_ps(bb_i);
 +    bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
 +    bb_j_SSE0 = _mm_load_ps(bb_j);
 +    bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
 +
 +    dl_SSE    = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
 +    dh_SSE    = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
 +
 +    dm_SSE    = _mm_max_ps(dl_SSE, dh_SSE);
 +    dm0_SSE   = _mm_max_ps(dm_SSE, _mm_setzero_ps());
 +#ifndef GMX_X86_SSE4_1
 +    d2_SSE    = _mm_mul_ps(dm0_SSE, dm0_SSE);
 +
 +    _mm_store_ps(d2_align, d2_SSE);
 +    /* Only the first three components are summed, the fourth is ignored */
 +    return d2_align[0] + d2_align[1] + d2_align[2];
 +#else
 +    /* SSE4.1 dot product of components 0,1,2 */
 +    d2_SSE    = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71);
 +
 +    _mm_store_ss(&d2, d2_SSE);
 +
 +    return d2;
 +#endif
 +}
 +
 +/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2.
 + *
 + * This macro does not declare xi_l, yi_l, zi_l, xi_h, yi_h, zi_h, xj_l,
 + * yj_l, zj_l, xj_h, yj_h, zj_h and zero; they have to exist as __m128
 + * variables where the macro is expanded. The six xi/yi/zi variables are
 + * overwritten, the xj/yj/zj variables and zero are only read.
 + * The four results are written with an aligned store to d2+si.
 + */
 +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \
 +    {                                                \
 +        int    shi;                                  \
 +                                                 \
 +        __m128 dx_0, dy_0, dz_0;                       \
 +        __m128 dx_1, dy_1, dz_1;                       \
 +                                                 \
 +        __m128 mx, my, mz;                             \
 +        __m128 m0x, m0y, m0z;                          \
 +                                                 \
 +        __m128 d2x, d2y, d2z;                          \
 +        __m128 d2s, d2t;                              \
 +                                                 \
 +        shi = si*NNBSBB_D*DIM;                       \
 +                                                 \
 +        xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
 +        yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
 +        zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
 +        xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
 +        yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
 +        zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
 +                                                 \
 +        dx_0 = _mm_sub_ps(xi_l, xj_h);                \
 +        dy_0 = _mm_sub_ps(yi_l, yj_h);                \
 +        dz_0 = _mm_sub_ps(zi_l, zj_h);                \
 +                                                 \
 +        dx_1 = _mm_sub_ps(xj_l, xi_h);                \
 +        dy_1 = _mm_sub_ps(yj_l, yi_h);                \
 +        dz_1 = _mm_sub_ps(zj_l, zi_h);                \
 +                                                 \
 +        mx   = _mm_max_ps(dx_0, dx_1);                \
 +        my   = _mm_max_ps(dy_0, dy_1);                \
 +        mz   = _mm_max_ps(dz_0, dz_1);                \
 +                                                 \
 +        m0x  = _mm_max_ps(mx, zero);                  \
 +        m0y  = _mm_max_ps(my, zero);                  \
 +        m0z  = _mm_max_ps(mz, zero);                  \
 +                                                 \
 +        d2x  = _mm_mul_ps(m0x, m0x);                  \
 +        d2y  = _mm_mul_ps(m0y, m0y);                  \
 +        d2z  = _mm_mul_ps(m0z, m0z);                  \
 +                                                 \
 +        d2s  = _mm_add_ps(d2x, d2y);                  \
 +        d2t  = _mm_add_ps(d2s, d2z);                  \
 +                                                 \
 +        _mm_store_ps(d2+si, d2t);                     \
 +    }
 +
 +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz.
 + *
 + * The six values of the single j bounding box bb_j, read at multiples of
 + * STRIDE_PBB, are broadcast over all vector elements. The distances to
 + * the i bounding boxes in bb_i are written to d2 by the
 + * SUBC_BB_DIST2_SSE_XXXX_INNER macro, which uses the locals declared here.
 + * The first group of i boxes is always processed, also when nsi is 0.
 + */
 +static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 +                                   int nsi, const float *bb_i,
 +                                   float *d2)
 +{
 +    __m128 xj_l, yj_l, zj_l;
 +    __m128 xj_h, yj_h, zj_h;
 +    __m128 xi_l, yi_l, zi_l;
 +    __m128 xi_h, yi_h, zi_h;
 +
 +    __m128 zero;
 +
 +    zero = _mm_setzero_ps();
 +
 +    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
 +    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
 +    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
 +    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
 +    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
 +    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
 +
 +    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
 +     * But as we know the number of iterations is 1 or 2, we unroll manually.
 +     */
 +    SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2);
 +    if (STRIDE_PBB < nsi)
 +    {
 +        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2);
 +    }
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +/* Plain C function which determines if any atom pair between two cells
 + * is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_x(int na_c,
 +                                int si, const real *x_i,
 +                                int csj, int stride, const real *x_j,
 +                                real rl2)
 +{
 +    int  i, j, i0, j0;
 +    real d2;
 +
 +    for (i = 0; i < na_c; i++)
 +    {
 +        i0 = (si*na_c + i)*DIM;
 +        for (j = 0; j < na_c; j++)
 +        {
 +            j0 = (csj*na_c + j)*stride;
 +
 +            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
 +                sqr(x_i[i0+1] - x_j[j0+1]) +
 +                sqr(x_i[i0+2] - x_j[j0+2]);
 +
 +            if (d2 < rl2)
 +            {
 +                return TRUE;
 +            }
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +/* SSE function which determines if any atom pair between two cells,
 + * both with 8 atoms, is within distance sqrt(rl2).
 + *
 + * The i-coordinates of cell si are loaded from x_i as two groups of
 + * x, y and z vectors with aligned loads at multiples of STRIDE_PBB.
 + * The j-coordinates of cell csj are read from x_j one atom at a time,
 + * with consecutive atoms "stride" reals apart.
 + * Without NBNXN_SEARCH_SSE_SINGLE this function calls gmx_incons.
 + */
 +static gmx_bool subc_in_range_sse8(int na_c,
 +                                   int si, const real *x_i,
 +                                   int csj, int stride, const real *x_j,
 +                                   real rl2)
 +{
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +    __m128 ix_SSE0, iy_SSE0, iz_SSE0;
 +    __m128 ix_SSE1, iy_SSE1, iz_SSE1;
 +
 +    __m128 rc2_SSE;
 +
 +    int    na_c_sse;
 +    int    j0, j1;
 +
 +    rc2_SSE   = _mm_set1_ps(rl2);
 +    /* Number of vectors of STRIDE_PBB atoms per cluster; the loads below use two of them */
 +    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
 +    ix_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
 +    iy_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
 +    iz_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
 +    ix_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
 +    iy_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
 +    iz_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
 +
 +    /* We loop from the outer to the inner particles to maximize
 +     * the chance that we find a pair in range quickly and return.
 +     * Each iteration handles the two j-atoms j0 and j1; with an odd na_c
 +     * the middle j-atom would not be checked.
 +     */
 +    j0 = csj*na_c;
 +    j1 = j0 + na_c - 1;
 +    while (j0 < j1)
 +    {
 +        __m128 jx0_SSE, jy0_SSE, jz0_SSE;
 +        __m128 jx1_SSE, jy1_SSE, jz1_SSE;
 +
 +        __m128 dx_SSE0, dy_SSE0, dz_SSE0;
 +        __m128 dx_SSE1, dy_SSE1, dz_SSE1;
 +        __m128 dx_SSE2, dy_SSE2, dz_SSE2;
 +        __m128 dx_SSE3, dy_SSE3, dz_SSE3;
 +
 +        __m128 rsq_SSE0;
 +        __m128 rsq_SSE1;
 +        __m128 rsq_SSE2;
 +        __m128 rsq_SSE3;
 +
 +        __m128 wco_SSE0;
 +        __m128 wco_SSE1;
 +        __m128 wco_SSE2;
 +        __m128 wco_SSE3;
 +        __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
 +        /* Broadcast the coordinates of j-atoms j0 and j1 over all vector elements */
 +        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
 +        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
 +        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
 +
 +        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
 +        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
 +        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
 +
 +        /* Calculate distance */
 +        dx_SSE0            = _mm_sub_ps(ix_SSE0, jx0_SSE);
 +        dy_SSE0            = _mm_sub_ps(iy_SSE0, jy0_SSE);
 +        dz_SSE0            = _mm_sub_ps(iz_SSE0, jz0_SSE);
 +        dx_SSE1            = _mm_sub_ps(ix_SSE1, jx0_SSE);
 +        dy_SSE1            = _mm_sub_ps(iy_SSE1, jy0_SSE);
 +        dz_SSE1            = _mm_sub_ps(iz_SSE1, jz0_SSE);
 +        dx_SSE2            = _mm_sub_ps(ix_SSE0, jx1_SSE);
 +        dy_SSE2            = _mm_sub_ps(iy_SSE0, jy1_SSE);
 +        dz_SSE2            = _mm_sub_ps(iz_SSE0, jz1_SSE);
 +        dx_SSE3            = _mm_sub_ps(ix_SSE1, jx1_SSE);
 +        dy_SSE3            = _mm_sub_ps(iy_SSE1, jy1_SSE);
 +        dz_SSE3            = _mm_sub_ps(iz_SSE1, jz1_SSE);
 +
 +        /* rsq = dx*dx+dy*dy+dz*dz */
 +        rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
 +        rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
 +        rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
 +        rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
 +        /* Per-element masks that are set where the squared distance is below rl2 */
 +        wco_SSE0           = _mm_cmplt_ps(rsq_SSE0, rc2_SSE);
 +        wco_SSE1           = _mm_cmplt_ps(rsq_SSE1, rc2_SSE);
 +        wco_SSE2           = _mm_cmplt_ps(rsq_SSE2, rc2_SSE);
 +        wco_SSE3           = _mm_cmplt_ps(rsq_SSE3, rc2_SSE);
 +
 +        wco_any_SSE01      = _mm_or_ps(wco_SSE0, wco_SSE1);
 +        wco_any_SSE23      = _mm_or_ps(wco_SSE2, wco_SSE3);
 +        wco_any_SSE        = _mm_or_ps(wco_any_SSE01, wco_any_SSE23);
 +        /* A non-zero movemask means at least one pair is in range */
 +        if (_mm_movemask_ps(wco_any_SSE))
 +        {
 +            return TRUE;
 +        }
 +
 +        j0++;
 +        j1--;
 +    }
 +    return FALSE;
 +
 +#else
 +    /* No SSE */
 +    gmx_incons("SSE function called without SSE support");
 +
 +    return TRUE;
 +#endif
 +}
 +
 +/* Returns the j sub-cell for index cj_ind */
 +static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
 +}
 +
 +/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
 +}
 +
 +/* Ensures there is enough space for extra extra exclusion masks */
 +static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
 +{
 +    if (nbl->nexcl+extra > nbl->excl_nalloc)
 +    {
 +        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
 +        nbnxn_realloc_void((void **)&nbl->excl,
 +                           nbl->nexcl*sizeof(*nbl->excl),
 +                           nbl->excl_nalloc*sizeof(*nbl->excl),
 +                           nbl->alloc, nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-cells in the list */
 +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
 +                                            int               ncell)
 +{
 +    int cj_max;
 +
 +    cj_max = nbl->ncj + ncell;
 +
 +    if (cj_max > nbl->cj_nalloc)
 +    {
 +        nbl->cj_nalloc = over_alloc_small(cj_max);
 +        nbnxn_realloc_void((void **)&nbl->cj,
 +                           nbl->ncj*sizeof(*nbl->cj),
 +                           nbl->cj_nalloc*sizeof(*nbl->cj),
 +                           nbl->alloc, nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space in the list for the j-subcells of
 + * nsupercell extra super-cells, counted from nbl->work->cj_ind.
 + * Grows nbl->cj4 when needed; the byte count of the first
 + * work->cj4_init entries is what is passed to nbnxn_realloc_void()
 + * as the first size argument.
 + * All cj4 entries from work->cj4_init up to the new maximum get
 + * imask and excl_ind set to zero for every warp, after which
 + * work->cj4_init is set to that maximum.
 + * The NWARP and WARP_SIZE macros defined inside this function are not
 + * undefined again and are used by other functions further down.
 + */
 +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
 +                                              int               nsupercell)
 +{
 +    int ncj4_max, j4, j, w, t; /* NOTE(review): j and t are unused here */
 +
 +#define NWARP       2
 +#define WARP_SIZE  32
 +
 +    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
 +    /* We can store 4 j-subcell - i-supercell pairs in one struct.
 +     * Since the shift rounds down, NBNXN_GPU_JGROUP_SIZE - 1 is added
 +     * first so that a partially filled last struct is counted as well.
 +     */
 +    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +
 +    if (ncj4_max > nbl->cj4_nalloc)
 +    {
 +        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
 +        nbnxn_realloc_void((void **)&nbl->cj4,
 +                           nbl->work->cj4_init*sizeof(*nbl->cj4),
 +                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
 +                           nbl->alloc, nbl->free);
 +    }
 +
 +    if (ncj4_max > nbl->work->cj4_init)
 +    {
 +        for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
 +        {
 +            /* No i-subcells and no excl's in the list initially */
 +            for (w = 0; w < NWARP; w++)
 +            {
 +                nbl->cj4[j4].imei[w].imask    = 0U;
 +                nbl->cj4[j4].imei[w].excl_ind = 0;
 +
 +            }
 +        }
 +        nbl->work->cj4_init = ncj4_max;
 +    }
 +}
 +
 +/* Sets all exclusion masks of one GPU warp entry to "no exclusions":
 + * each of the WARP_SIZE elements of excl->pair is set to
 + * NBNXN_INT_MASK_ALL.
 + */
 +static void set_no_excls(nbnxn_excl_t *excl)
 +{
 +    int t;
 +
 +    for (t = 0; t < WARP_SIZE; t++)
 +    {
 +        /* Turn all interaction bits on */
 +        excl->pair[t] = NBNXN_INT_MASK_ALL;
 +    }
 +}
 +
 +/* Initializes a single nbnxn_pairlist_t data structure.
 + * alloc and free select the memory functions stored in the list; passing
 + * NULL for either selects nbnxn_alloc_aligned / nbnxn_free_aligned.
 + * All counts, the ci, cj and cj4 pointers and their allocation sizes
 + * are reset to zero/NULL.
 + * For non-simple lists one exclusion entry (index 0) is allocated and set
 + * to "no exclusions"; low_get_nbl_exclusions() treats excl_ind == 0 as
 + * "no exclusion entry assigned yet".
 + * The work struct and its bb_ci, x_ci and d2 buffers are allocated here.
 + * NOTE(review): nsci, sci and sci_nalloc are not assigned in this function;
 + * presumably the caller passes zero-initialized memory - confirm.
 + */
 +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 +                                gmx_bool          bSimple,
 +                                nbnxn_alloc_t    *alloc,
 +                                nbnxn_free_t     *free)
 +{
 +    if (alloc == NULL)
 +    {
 +        nbl->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbl->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbl->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbl->free = free;
 +    }
 +
 +    nbl->bSimple     = bSimple;
 +    nbl->na_sc       = 0;
 +    nbl->na_ci       = 0;
 +    nbl->na_cj       = 0;
 +    nbl->nci         = 0;
 +    nbl->ci          = NULL;
 +    nbl->ci_nalloc   = 0;
 +    nbl->ncj         = 0;
 +    nbl->cj          = NULL;
 +    nbl->cj_nalloc   = 0;
 +    nbl->ncj4        = 0;
 +    /* No cj4 entries are allocated here; the array is grown on demand
 +     * by check_subcell_list_space_supersub().
 +     */
 +    nbl->cj4_nalloc  = 0;
 +    nbl->cj4         = NULL;
 +    nbl->nci_tot     = 0;
 +
 +    if (!nbl->bSimple)
 +    {
 +        nbl->excl        = NULL;
 +        nbl->excl_nalloc = 0;
 +        nbl->nexcl       = 0;
 +        check_excl_space(nbl, 1);
 +        nbl->nexcl       = 1;
 +        set_no_excls(&nbl->excl[0]);
 +    }
 +
 +    snew(nbl->work, 1);
 +#ifdef NBNXN_BBXXXX
 +    snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
 +#else
 +    snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL*NNBSBB_B, NBNXN_MEM_ALIGN);
 +#endif
 +    snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
 +#ifdef GMX_NBNXN_SIMD
 +    snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
 +    snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
 +#endif
 +    snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
 +
 +    nbl->work->sort            = NULL;
 +    nbl->work->sort_nalloc     = 0;
 +    nbl->work->sci_sort        = NULL;
 +    nbl->work->sci_sort_nalloc = 0;
 +}
 +/* Initializes a set of pair lists, one list per non-bonded OpenMP thread.
 + * The number of lists is taken from gmx_omp_nthreads_get(emntNonbonded).
 + * Without combined lists, a thread count above
 + * NBNXN_BUFFERFLAG_MAX_THREADS is a fatal error.
 + * alloc and free are only passed on to list 0; all other lists are
 + * initialized with NULL, i.e. the default allocation functions.
 + */
 +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 +                             gmx_bool bSimple, gmx_bool bCombined,
 +                             nbnxn_alloc_t *alloc,
 +                             nbnxn_free_t  *free)
 +{
 +    int i;
 +
 +    nbl_list->bSimple   = bSimple;
 +    nbl_list->bCombined = bCombined;
 +
 +    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (!nbl_list->bCombined &&
 +        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 +    {
 +        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
 +                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
 +    }
 +
 +    snew(nbl_list->nbl, nbl_list->nnbl);
 +    /* Execute in order to avoid memory interleaving between threads */
 +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 +    for (i = 0; i < nbl_list->nnbl; i++)
 +    {
 +        /* Allocate the nblist data structure locally on each thread
 +         * to optimize memory access for NUMA architectures.
 +         */
 +        snew(nbl_list->nbl[i], 1);
 +
 +        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
 +        if (i == 0)
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
 +        }
 +        else
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL);
 +        }
 +    }
 +}
 +
 +/* Prints statistics of a simple pair list to fp, used for debug output.
 + * Reported are the i- and j-entry counts, list size ratios relative to
 + * grid 0 of nbs and the list radius rl, the number of j-entries per
 + * shift index, and a count of excluded cell pairs.
 + * For the excluded count only the leading run of j-entries of each
 + * i-entry with a mask different from NBNXN_INT_MASK_ALL is counted;
 + * counting stops at the first j-entry with all interactions on.
 + */
 +static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
 +                                           const nbnxn_search_t nbs, real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int                 cs[SHIFTS];
 +    int                 s, i, j;
 +    int                 npexcl;
 +
 +    /* This code only produces correct statistics with domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp, "nbl nci %d ncj %d\n",
 +            nbl->nci, nbl->ncj);
 +    fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
 +
 +    fprintf(fp, "nbl average j cell list length %.1f\n",
 +            0.25*nbl->ncj/(double)nbl->nci);
 +
 +    for (s = 0; s < SHIFTS; s++)
 +    {
 +        cs[s] = 0;
 +    }
 +    npexcl = 0;
 +    for (i = 0; i < nbl->nci; i++)
 +    {
 +        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
 +            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
 +
 +        j = nbl->ci[i].cj_ind_start;
 +        while (j < nbl->ci[i].cj_ind_end &&
 +               nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            npexcl++;
 +            j++;
 +        }
 +    }
 +    fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
 +            nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
 +    for (s = 0; s < SHIFTS; s++)
 +    {
 +        if (cs[s] > 0)
 +        {
 +            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
 +        }
 +    }
 +}
 +
 +/* Prints statistics of a super/sub-cell pair list to fp, used for debug
 + * output.
 + * Reported are the list entry counts, list size ratios relative to
 + * grid 0 of nbs and the list radius rl, average list lengths, and a
 + * histogram c[b] of the number of j-entries that have b i-subcells
 + * flagged in the imei[0] interaction mask of their cj4 group.
 + */
 +static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
 +                                             const nbnxn_search_t nbs, real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int                 i, j4, j, si, b;
 +    int                 c[GPU_NSUBCELL+1];
 +
 +    /* This code only produces correct statistics with domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
 +            nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
 +    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
 +
 +    fprintf(fp, "nbl average j super cell list length %.1f\n",
 +            0.25*nbl->ncj4/(double)nbl->nsci);
 +    fprintf(fp, "nbl average i sub cell list length %.1f\n",
 +            nbl->nci_tot/((double)nbl->ncj4));
 +
 +    for (si = 0; si <= GPU_NSUBCELL; si++)
 +    {
 +        c[si] = 0;
 +    }
 +    for (i = 0; i < nbl->nsci; i++)
 +    {
 +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                b = 0;
 +                for (si = 0; si < GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        b++;
 +                    }
 +                }
 +                c[b]++;
 +            }
 +        }
 +    }
 +    for (b = 0; b <= GPU_NSUBCELL; b++)
 +    {
 +        fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
 +                b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
 +    }
 +}
 +
 +/* Returns, through *excl, a pointer to the exclusion mask for
 + * cj4-unit cj4, warp warp.
 + * An excl_ind of 0 means no exclusion entry has been assigned yet; in that
 + * case the next free entry nbl->nexcl is assigned, nexcl is incremented
 + * and the new entry is set to "no exclusions".
 + * No space check is done here: the caller has to make sure that
 + * nbl->excl can hold one more entry (see check_excl_space()).
 + */
 +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
 +                                   int warp, nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* No exclusions set, make a new list entry */
 +        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
 +        nbl->nexcl++;
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +        set_no_excls(*excl);
 +    }
 +    else
 +    {
 +        /* We already have some exclusions, new ones can be added to the list */
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +    }
 +}
 +
 +/* Returns, through *excl, a pointer to the exclusion mask for
 + * cj4-unit cj4, warp warp; allocates extra memory, if necessary.
 + * Space for one more entry is only checked when this warp has no
 + * exclusion entry yet (excl_ind == 0). Since that check can reallocate
 + * nbl->excl, previously returned pointers may no longer be valid.
 + */
 +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
 +                                 int warp, nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* We need to make a new list entry, check if we have space */
 +        check_excl_space(nbl, 1);
 +    }
 +    low_get_nbl_exclusions(nbl, cj4, warp, excl);
 +}
 +
 +/* Returns pointers to the exclusion masks for cj4-unit cj4 for both warps
 + * (warp 0 in *excl_w0, warp 1 in *excl_w1); allocates extra memory,
 + * if necessary.
 + * Space for two entries is requested up front, whether or not the warps
 + * already have an entry, so that both returned pointers refer to the
 + * same allocation of nbl->excl.
 + */
 +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
 +                                 nbnxn_excl_t **excl_w0,
 +                                 nbnxn_excl_t **excl_w1)
 +{
 +    /* Check for space we might need */
 +    check_excl_space(nbl, 2);
 +
 +    low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
 +    low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
 +}
 +
 +/* Sets the self exclusions i=j and pair exclusions i>j for the
 + * sub-cell pair (si, j-entry sj_offset) of cj4-unit cj4_ind.
 + * For every atom pair with ei >= ej the bit sj_offset*GPU_NSUBCELL + si
 + * is cleared in the exclusion mask of the warp that ej belongs to.
 + */
 +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
 +                                               int cj4_ind, int sj_offset,
 +                                               int si)
 +{
 +    nbnxn_excl_t *excl[2];
 +    int           ei, ej, w;
 +
 +    /* Here we only set the self and double pair exclusions */
 +
 +    get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
 +
 +    /* Only minor < major bits set */
 +    for (ej = 0; ej < nbl->na_ci; ej++)
 +    {
 +        w = (ej>>2); /* NOTE(review): indexes excl[2], so presumably assumes na_ci <= 8 - confirm */
 +        for (ei = ej; ei < nbl->na_ci; ei++)
 +        {
 +            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
 +                ~(1U << (sj_offset*GPU_NSUBCELL + si));
 +        }
 +    }
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for plain C lists:
 + * NBNXN_INT_MASK_DIAG when rdiag is set and ci equals cj,
 + * NBNXN_INT_MASK_ALL otherwise.
 + */
 +static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
 +{
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists.
 + * Without GMX_DOUBLE the diagonal mask is returned when rdiag is set and
 + * ci == cj. With GMX_DOUBLE two j-clusters, cj == ci*2 and cj == ci*2+1,
 + * are matched and each gets its own diagonal mask.
 + * In all other cases NBNXN_INT_MASK_ALL is returned.
 + */
 +static unsigned int get_imask_x86_simd128(gmx_bool rdiag, int ci, int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#else              /* cj-size = 2 */
 +    return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
 +            (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
 +             NBNXN_INT_MASK_ALL));
 +#endif
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists.
 + * Without GMX_DOUBLE two i-clusters, ci == cj*2 and ci == cj*2+1, are
 + * matched against one j-cluster and each gets its own diagonal mask.
 + * With GMX_DOUBLE the diagonal mask is returned when rdiag is set and
 + * ci == cj.
 + * In all other cases NBNXN_INT_MASK_ALL is returned.
 + */
 +static unsigned int get_imask_x86_simd256(gmx_bool rdiag, int ci, int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 8 */
 +    return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
 +            (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
 +             NBNXN_INT_MASK_ALL));
 +#else              /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#endif
 +}
 +/* Select the interaction mask functions used by the SIMD search code,
 + * based on GMX_NBNXN_SIMD_BITWIDTH:
 + * 128: the 4xn name maps to the SIMD128 function (no 2xnn name is defined),
 + * 256: the 4xn name maps to the SIMD256 function and the 2xnn name to the
 + *      SIMD128 function.
 + * Any other width is a compile-time error.
 + */
 +#ifdef GMX_NBNXN_SIMD
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#define get_imask_x86_simd_4xn  get_imask_x86_simd128
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#define get_imask_x86_simd_4xn  get_imask_x86_simd256
 +#define get_imask_x86_simd_2xnn get_imask_x86_simd128
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 +#endif
 +#endif
 +#endif
 +
 +/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
 + * Checks bounding box distances and possibly atom pair distances.
 + *
 + * The j-range is first trimmed from the front (cjf is increased) and then
 + * from the back (cjl is decreased) until a cell in range is found. A cell
 + * is in range when its squared bounding box distance is below rbb2, or
 + * below rl2 with at least one squared atom pair distance below rl2.
 + * All cells cjf..cjl that remain, including untested cells in between,
 + * are appended to nbl->cj with their interaction mask, and cj_ind_end of
 + * the i-entry nbl->ci[nbl->nci] is set to the new nbl->ncj.
 + * No space check is done here for nbl->cj.
 + * *ndistc is increased by 2 for each bounding box check and by
 + * NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE for each atom pair
 + * check.
 + */
 +static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
 +                                     nbnxn_pairlist_t *nbl,
 +                                     int ci, int cjf, int cjl,
 +                                     gmx_bool remove_sub_diag,
 +                                     const real *x_j,
 +                                     real rl2, float rbb2,
 +                                     int *ndistc)
 +{
 +    const nbnxn_list_work_t *work; /* NOTE(review): assigned below but not read */
 +
 +    const float             *bb_ci;
 +    const real              *x_ci;
 +
 +    gmx_bool                 InRange;
 +    real                     d2;
 +    int                      cjf_gl, cjl_gl, cj;
 +
 +    work = nbl->work;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +        d2       = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i, j;
 +
 +            cjf_gl = gridj->cell0 + cjf;
 +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return;
 +    }
 +
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +        d2       = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i, j;
 +
 +            cjl_gl = gridj->cell0 + cjl;
 +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for (cj = cjf; cj <= cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask */
 +            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
 +
 +#ifdef GMX_NBNXN_SIMD_4XN
 +#include "nbnxn_search_simd_4xn.h"
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +#include "nbnxn_search_simd_2xnn.h"
 +#endif
 +
 +/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
 + * Checks bounding box distances and possibly atom pair distances.
 + *
 + * For each j-subcell of scj a candidate j-entry is written at position
 + * nbl->work->cj_ind and a mask of the i-subcells within range is built.
 + * When sci_equals_scj is set, only i-subcells 0..cjo are considered.
 + * With PRUNE_LIST_CPU_ONE (defined inside this function), a j-entry with
 + * exactly one i-subcell in range, at a bounding box distance of at least
 + * rbb2, is additionally checked on atom pair distances and dropped when
 + * no atom pair is in range.
 + * Only when at least one i-subcell remains is the entry kept: the mask is
 + * or-ed into all NWARP masks of the cj4 group, cj_ind and nci_tot are
 + * increased and cj4_ind_end of nbl->sci[nbl->nsci] is updated. Otherwise
 + * cj_ind is not advanced and the slot is overwritten by the next
 + * j-subcell.
 + * No space check is done here for nbl->cj4.
 + */
 +static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 +                                       const nbnxn_grid_t *gridi,
 +                                       const nbnxn_grid_t *gridj,
 +                                       nbnxn_pairlist_t *nbl,
 +                                       int sci, int scj,
 +                                       gmx_bool sci_equals_scj,
 +                                       int stride, const real *x,
 +                                       real rl2, float rbb2,
 +                                       int *ndistc)
 +{
 +    int          na_c;
 +    int          npair;
 +    int          cjo, ci1, ci, cj, cj_gl;
 +    int          cj4_ind, cj_offset;
 +    unsigned     imask;
 +    nbnxn_cj4_t *cj4;
 +    const float *bb_ci;
 +    const real  *x_ci;
 +    float       *d2l, d2;
 +    int          w;
 +#define PRUNE_LIST_CPU_ONE
 +#ifdef PRUNE_LIST_CPU_ONE
 +    int  ci_last = -1;
 +#endif
 +
 +    d2l = nbl->work->d2;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    na_c = gridj->na_c;
 +
 +    for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
 +    {
 +        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
 +        cj4       = &nbl->cj4[cj4_ind];
 +
 +        cj = scj*GPU_NSUBCELL + cjo;
 +
 +        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
 +
 +        /* Initialize this j-subcell i-subcell list */
 +        cj4->cj[cj_offset] = cj_gl;
 +        imask              = 0;
 +
 +        if (sci_equals_scj)
 +        {
 +            ci1 = cjo + 1;
 +        }
 +        else
 +        {
 +            ci1 = gridi->nsubc[sci];
 +        }
 +
 +#ifdef NBNXN_BBXXXX
 +        /* Determine all ci1 bb distances in one call with SSE */
 +        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
 +                               ci1, bb_ci, d2l);
 +        *ndistc += na_c*2;
 +#endif
 +
 +        npair = 0;
 +        /* We use a fixed upper-bound instead of ci1 to help optimization */
 +        for (ci = 0; ci < GPU_NSUBCELL; ci++)
 +        {
 +            if (ci == ci1)
 +            {
 +                break;
 +            }
 +
 +#ifndef NBNXN_BBXXXX
 +            /* Determine the bb distance between ci and cj */
 +            d2l[ci]  = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
 +            *ndistc += 2;
 +#endif
 +            d2 = d2l[ci];
 +
 +#ifdef PRUNE_LIST_CPU_ALL
 +            /* Check if the distance is within the distance where
 +             * we use only the bounding box distance rbb,
 +             * or within the cut-off and there is at least one atom pair
 +             * within the cut-off. This check is very costly.
 +             */
 +            *ndistc += na_c*na_c;
 +            if (d2 < rbb2 ||
 +                (d2 < rl2 &&
 +#ifdef NBNXN_PBB_SSE
-                 subc_in_range_x
++                 subc_in_range_sse8
 +#else
-                     (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
++                 subc_in_range_x
 +#endif
-                 for (si=0; si<GPU_NSUBCELL; si++)
++                     (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
 +#else
 +            /* Check if the distance between the two bounding boxes
 +             * is within the pair-list cut-off.
 +             */
 +            if (d2 < rl2)
 +#endif
 +            {
 +                /* Flag this i-subcell to be taken into account */
 +                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +                ci_last = ci;
 +#endif
 +
 +                npair++;
 +            }
 +        }
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +        /* If we only found 1 pair, check if any atoms are actually
 +         * within the cut-off, so we could get rid of it.
 +         */
 +        if (npair == 1 && d2l[ci_last] >= rbb2)
 +        {
 +            /* Avoid using function pointers here, as it's slower */
 +            if (
 +#ifdef NBNXN_PBB_SSE
 +                !subc_in_range_sse8
 +#else
 +                !subc_in_range_x
 +#endif
 +                    (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
 +            {
 +                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
 +                npair--;
 +            }
 +        }
 +#endif
 +
 +        if (npair > 0)
 +        {
 +            /* We have a useful sj entry, close it now */
 +
 +            /* Set the exclusions for the ci == sj entry.
 +             * Here we don't bother to check if this entry is actually flagged,
 +             * as it will nearly always be in the list.
 +             */
 +            if (sci_equals_scj)
 +            {
 +                set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
 +            }
 +
 +            /* Copy the cluster interaction mask to the list */
 +            for (w = 0; w < NWARP; w++)
 +            {
 +                cj4->imei[w].imask |= imask;
 +            }
 +
 +            nbl->work->cj_ind++;
 +
 +            /* Keep the count */
 +            nbl->nci_tot += npair;
 +
 +            /* Increase the closing index in i super-cell list */
 +            nbl->sci[nbl->nsci].cj4_ind_end =
 +                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        }
 +    }
 +}
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for simple list i-entry nbl_ci.
 + *
 + * For every atom in the i-cell, each excluded atom aj is mapped through
 + * nbs->cell to a grid index and from there to a j-cluster. When that
 + * j-cluster is found in the j-list of this i-entry, the bit of the atom
 + * pair is cleared in the excl mask of the j-entry.
 + * The first ndirect j-entries are consecutive, so their list index is
 + * computed directly; the remaining entries are searched with bisection,
 + * which assumes the cj values in the list are in increasing order.
 + * NOTE(review): the j-range runs up to nbl->ncj-1 and not up to
 + * nbl_ci->cj_ind_end-1, so nbl_ci is presumably expected to be the
 + * most recently added i-entry - confirm against the caller.
 + */
 +static void set_ci_top_excls(const nbnxn_search_t nbs,
 +                             nbnxn_pairlist_t    *nbl,
 +                             gmx_bool             diagRemoved,
 +                             int                  na_ci_2log,
 +                             int                  na_cj_2log,
 +                             const nbnxn_ci_t    *nbl_ci,
 +                             const t_blocka      *excl)
 +{
 +    const int    *cell;
 +    int           ci;
 +    int           cj_ind_first, cj_ind_last;
 +    int           cj_first, cj_last;
 +    int           ndirect;
 +    int           i, ai, aj, si, eind, ge, se;
 +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
 +    int           cj_m;
 +    gmx_bool      Found_si; /* NOTE(review): unused in this function */
 +    int           si_ind;   /* NOTE(review): unused in this function */
 +    nbnxn_excl_t *nbl_excl; /* NOTE(review): unused in this function */
 +    int           inner_i, inner_e;
 +
 +    cell = nbs->cell;
 +
 +    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    ci = nbl_ci->ci;
 +
 +    cj_ind_first = nbl_ci->cj_ind_start;
 +    cj_ind_last  = nbl->ncj - 1;
 +
 +    cj_first = nbl->cj[cj_ind_first].cj;
 +    cj_last  = nbl->cj[cj_ind_last].cj;
 +
 +    /* Determine how many contiguous j-cells we have starting
 +     * from the first i-cell. This number can be used to directly
 +     * calculate j-cell indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    if (na_ci_2log == na_cj_2log)
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    else
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#endif
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for (i = 0; i < nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[ci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_ci_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusion are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= ci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = (ge >> na_cj_2log);
 +
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found    = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl->cj[cj_ind_m].cj;
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - (si << na_ci_2log);
 +                        inner_e = ge - (se << na_cj_2log);
 +
 +                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for i-super-cell entry nbl_sci.
 + *
 + * For every atom in the i-super-cell, each excluded atom aj is mapped
 + * through nbs->cell to a grid index and from there to a j-cluster. When
 + * that j-cluster is found in the j-list of this entry and the sub-cell
 + * pair is flagged in the interaction mask, the bit of the sub-cell pair
 + * is cleared in the exclusion mask element of the atom pair; an exclusion
 + * entry is allocated for the warp when it does not have one yet.
 + * The first ndirect j-entries are consecutive, so their list index is
 + * computed directly; the remaining entries are searched with bisection,
 + * which assumes the j-cluster values in the list are in increasing order.
 + * NOTE(review): the j-range runs up to nbl->work->cj_ind-1, so nbl_sci
 + * is presumably expected to be the most recently added entry - confirm
 + * against the caller.
 + */
 +static void set_sci_top_excls(const nbnxn_search_t nbs,
 +                              nbnxn_pairlist_t    *nbl,
 +                              gmx_bool             diagRemoved,
 +                              int                  na_c_2log,
 +                              const nbnxn_sci_t   *nbl_sci,
 +                              const t_blocka      *excl)
 +{
 +    const int    *cell;
 +    int           na_c;
 +    int           sci;
 +    int           cj_ind_first, cj_ind_last;
 +    int           cj_first, cj_last;
 +    int           ndirect;
 +    int           i, ai, aj, si, eind, ge, se;
 +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
 +    int           cj_m;
 +    gmx_bool      Found_si; /* NOTE(review): unused in this function */
 +    int           si_ind;   /* NOTE(review): unused in this function */
 +    nbnxn_excl_t *nbl_excl;
 +    int           inner_i, inner_e, w;
 +
 +    cell = nbs->cell;
 +
 +    na_c = nbl->na_ci;
 +
 +    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    sci = nbl_sci->sci;
 +
 +    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
 +    cj_ind_last  = nbl->work->cj_ind - 1;
 +
 +    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
 +    cj_last  = nbl_cj(nbl, cj_ind_last);
 +
 +    /* Determine how many contiguous j-clusters we have starting
 +     * from the first i-cluster. This number can be used to directly
 +     * calculate j-cluster indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    while (cj_ind_first + ndirect <= cj_ind_last &&
 +           nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
 +    {
 +        ndirect++;
 +    }
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for (i = 0; i < nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[sci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_c_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusion are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= sci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = ge>>na_c_2log;
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found    = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl_cj(nbl, cj_ind_m);
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - si*na_c;
 +                        inner_e = ge - se*na_c;
 +
 +/* Macro for getting the position of index a within its cj4 group
 + * (applied below to the j-entry index found)
 + */
 +#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
 +/* Macro for converting index a to the index of its cj4 group
 + * (applied below to the j-entry index found)
 + */
 +#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
 +/* Macro for getting the index of atom a within its half of a cluster,
 + * i.e. within a warp (applied below to the excluded atom inner_e)
 + */
 +#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
 +
 +                        if (nbl_imask0(nbl, found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
 +                        {
 +                            w       = (inner_e >> 2);
 +
 +                            get_nbl_exclusions_1(nbl, A2CJ4(found), w, &nbl_excl);
 +
 +                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
 +                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
 +                        }
 +
 +#undef AMODCJ4
 +#undef A2CJ4
 +#undef AMODWI
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Reallocate the simple ci list for at least n entries */
 +static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
 +{
 +    nbl->ci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->ci,
 +                       nbl->nci*sizeof(*nbl->ci),
 +                       nbl->ci_nalloc*sizeof(*nbl->ci),
 +                       nbl->alloc, nbl->free);
 +}
 +
 +/* Reallocate the super-cell sci list for at least n entries */
 +static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
 +{
 +    nbl->sci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->sci,
 +                       nbl->nsci*sizeof(*nbl->sci),
 +                       nbl->sci_nalloc*sizeof(*nbl->sci),
 +                       nbl->alloc, nbl->free);
 +}
 +
 +/* Make a new ci entry at index nbl->nci */
 +static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags,
 +                         nbnxn_list_work_t *work)
 +{
 +    if (nbl->nci + 1 > nbl->ci_nalloc)
 +    {
 +        nb_realloc_ci(nbl, nbl->nci+1);
 +    }
 +    nbl->ci[nbl->nci].ci            = ci;
 +    nbl->ci[nbl->nci].shift         = shift;
 +    /* Store the interaction flags along with the shift */
 +    nbl->ci[nbl->nci].shift        |= flags;
 +    nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
 +    nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
 +}
 +
 +/* Make a new sci entry at index nbl->nsci */
 +static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift, int flags,
 +                          nbnxn_list_work_t *work)
 +{
 +    if (nbl->nsci + 1 > nbl->sci_nalloc)
 +    {
 +        nb_realloc_sci(nbl, nbl->nsci+1);
 +    }
 +    nbl->sci[nbl->nsci].sci           = sci;
 +    nbl->sci[nbl->nsci].shift         = shift;
 +    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
 +    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
 +}
 +
 +/* Sort the simple j-list cj on exclusions.
 + * Entries with exclusions will all be sorted to the beginning of the list.
 + */
 +static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
 +                         nbnxn_list_work_t *work)
 +{
 +    int jnew, j;
 +
 +    if (ncj > work->cj_nalloc)
 +    {
 +        work->cj_nalloc = over_alloc_large(ncj);
 +        srenew(work->cj, work->cj_nalloc);
 +    }
 +
 +    /* Make a list of the j-cells involving exclusions */
 +    jnew = 0;
 +    for (j = 0; j < ncj; j++)
 +    {
 +        if (cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            work->cj[jnew++] = cj[j];
 +        }
 +    }
 +    /* Check if there are exclusions at all or not just the first entry */
 +    if (!((jnew == 0) ||
 +          (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
 +    {
 +        for (j = 0; j < ncj; j++)
 +        {
 +            if (cj[j].excl == NBNXN_INT_MASK_ALL)
 +            {
 +                work->cj[jnew++] = cj[j];
 +            }
 +        }
 +        for (j = 0; j < ncj; j++)
 +        {
 +            cj[j] = work->cj[j];
 +        }
 +    }
 +}
 +
 +/* Close this simple list i entry */
 +static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
 +{
 +    int jlen;
 +
 +    /* All content of the new ci entry have already been filled correctly,
 +     * we only need to increase the count here (for non empty lists).
 +     */
 +    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
 +    if (jlen > 0)
 +    {
 +        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);
 +
 +        /* The counts below are used for non-bonded pair/flop counts
 +         * and should therefore match the available kernel setups.
 +         */
 +        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
 +        {
 +            nbl->work->ncj_noq += jlen;
 +        }
 +        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
 +                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
 +        {
 +            nbl->work->ncj_hlj += jlen;
 +        }
 +
 +        nbl->nci++;
 +    }
 +}
 +
 +/* Split sci entry for load balancing on the GPU.
 + * Splitting ensures we have enough lists to fully utilize the whole GPU.
 + * With progBal we generate progressively smaller lists, which improves
 + * load balancing. As we only know the current count on our own thread,
 + * we will need to estimate the current total amount of i-entries.
 + * As the lists get concatenated later, this estimate depends
 + * both on nthread and our own thread index.
 + */
 +static void split_sci_entry(nbnxn_pairlist_t *nbl,
 +                            int nsp_max_av, gmx_bool progBal, int nc_bal,
 +                            int thread, int nthread)
 +{
 +    int nsci_est;
 +    int nsp_max;
 +    int cj4_start, cj4_end, j4len, cj4;
 +    int sci;
 +    int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
 +    int p;
 +
 +    if (progBal)
 +    {
 +        /* Estimate the total numbers of ci's of the nblist combined
 +         * over all threads using the target number of ci's.
 +         */
 +        nsci_est = nc_bal*thread/nthread + nbl->nsci;
 +
 +        /* The first ci blocks should be larger, to avoid overhead.
 +         * The last ci blocks should be smaller, to improve load balancing.
 +         */
 +        nsp_max = max(1,
 +                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
 +    }
 +    else
 +    {
 +        nsp_max = nsp_max_av;
 +    }
 +
 +    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
 +    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
 +    j4len     = cj4_end - cj4_start;
 +
 +    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
 +    {
 +        /* Remove the last ci entry and process the cj4's again */
 +        nbl->nsci -= 1;
 +
 +        sci        = nbl->nsci;
 +        nsp        = 0;
 +        nsp_sci    = 0;
 +        nsp_cj4_e  = 0;
 +        nsp_cj4    = 0;
 +        for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
 +        {
 +            nsp_cj4_p = nsp_cj4;
 +            /* Count the number of cluster pairs in this cj4 group */
 +            nsp_cj4   = 0;
 +            for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
 +            {
 +                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
 +            }
 +
 +            if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
 +            {
 +                /* Split the list at cj4 */
 +                nbl->sci[sci].cj4_ind_end = cj4;
 +                /* Create a new sci entry */
 +                sci++;
 +                nbl->nsci++;
 +                if (nbl->nsci+1 > nbl->sci_nalloc)
 +                {
 +                    nb_realloc_sci(nbl, nbl->nsci+1);
 +                }
 +                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
 +                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
 +                nbl->sci[sci].cj4_ind_start = cj4;
 +                nsp_sci                     = nsp;
 +                nsp_cj4_e                   = nsp_cj4_p;
 +                nsp                         = 0;
 +            }
 +            nsp += nsp_cj4;
 +        }
 +
 +        /* Put the remaining cj4's in the last sci entry */
 +        nbl->sci[sci].cj4_ind_end = cj4_end;
 +
 +        /* Possibly balance out the last two sci's
 +         * by moving the last cj4 of the second last sci.
 +         */
 +        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
 +        {
 +            nbl->sci[sci-1].cj4_ind_end--;
 +            nbl->sci[sci].cj4_ind_start--;
 +        }
 +
 +        nbl->nsci++;
 +    }
 +}
 +
 +/* Clost this super/sub list i entry */
 +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
 +                                    int nsp_max_av,
 +                                    gmx_bool progBal, int nc_bal,
 +                                    int thread, int nthread)
 +{
 +    int j4len, tlen;
 +    int nb, b;
 +
 +    /* All content of the new ci entry have already been filled correctly,
 +     * we only need to increase the count here (for non empty lists).
 +     */
 +    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
 +    if (j4len > 0)
 +    {
 +        /* We can only have complete blocks of 4 j-entries in a list,
 +         * so round the count up before closing.
 +         */
 +        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +
 +        nbl->nsci++;
 +
 +        if (nsp_max_av > 0)
 +        {
 +            /* Measure the size of the new entry and potentially split it */
 +            split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
 +        }
 +    }
 +}
 +
 +/* Syncs the working array before adding another grid pair to the list */
 +static void sync_work(nbnxn_pairlist_t *nbl)
 +{
 +    if (!nbl->bSimple)
 +    {
 +        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +        nbl->work->cj4_init = nbl->ncj4;
 +    }
 +}
 +
 +/* Clears an nbnxn_pairlist_t data structure */
 +static void clear_pairlist(nbnxn_pairlist_t *nbl)
 +{
 +    nbl->nci           = 0;
 +    nbl->nsci          = 0;
 +    nbl->ncj           = 0;
 +    nbl->ncj4          = 0;
 +    nbl->nci_tot       = 0;
 +    nbl->nexcl         = 1;
 +
 +    nbl->work->ncj_noq = 0;
 +    nbl->work->ncj_hlj = 0;
 +}
 +
 +/* Sets a simple list i-cell bounding box, including PBC shift */
 +static void set_icell_bb_simple(const float *bb, int ci,
 +                                real shx, real shy, real shz,
 +                                float *bb_ci)
 +{
 +    int ia;
 +
 +    ia           = ci*NNBSBB_B;
 +    bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
 +    bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
 +    bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
 +    bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
 +    bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
 +    bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
 +}
 +
 +/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
 +static void set_icell_bb_supersub(const float *bb, int ci,
 +                                  real shx, real shy, real shz,
 +                                  float *bb_ci)
 +{
 +    int ia, m, i;
 +
 +#ifdef NBNXN_BBXXXX
 +    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
 +    for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
 +    {
 +        for (i = 0; i < STRIDE_PBB; i++)
 +        {
 +            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
 +            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
 +            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
 +            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
 +            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
 +            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
 +        }
 +    }
 +#else
 +    ia = ci*GPU_NSUBCELL*NNBSBB_B;
 +    for (i = 0; i < GPU_NSUBCELL*NNBSBB_B; i += NNBSBB_B)
 +    {
 +        bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
 +        bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
 +        bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
 +        bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
 +        bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
 +        bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
 +    }
 +#endif
 +}
 +
 +/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_simple(int ci,
 +                               real shx, real shy, real shz,
 +                               int na_c,
 +                               int stride, const real *x,
 +                               nbnxn_list_work_t *work)
 +{
 +    int  ia, i;
 +
 +    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
 +    {
 +        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
 +        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
 +        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
 +    }
 +}
 +
 +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_supersub(int ci,
 +                                 real shx, real shy, real shz,
 +                                 int na_c,
 +                                 int stride, const real *x,
 +                                 nbnxn_list_work_t *work)
 +{
 +    int  ia, i;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    ia = ci*GPU_NSUBCELL*na_c;
 +    for (i = 0; i < GPU_NSUBCELL*na_c; i++)
 +    {
 +        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
 +        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
 +        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
 +    }
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Copies PBC shifted super-cell packed atom coordinates to working array */
 +static void icell_set_x_supersub_sse8(int ci,
 +                                      real shx, real shy, real shz,
 +                                      int na_c,
 +                                      int stride, const real *x,
 +                                      nbnxn_list_work_t *work)
 +{
 +    int  si, io, ia, i, j;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    for (si = 0; si < GPU_NSUBCELL; si++)
 +    {
 +        for (i = 0; i < na_c; i += STRIDE_PBB)
 +        {
 +            io = si*na_c + i;
 +            ia = ci*GPU_NSUBCELL*na_c + io;
 +            for (j = 0; j < STRIDE_PBB; j++)
 +            {
 +                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
 +                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
 +                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +static real nbnxn_rlist_inc_nonloc_fac = 0.6;
 +
 +/* Due to the cluster size the effective pair-list is longer than
 + * that of a simple atom pair-list. This function gives the extra distance.
 + */
 +real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density)
 +{
 +    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density), 1.0/3.0));
 +}
 +
 +/* Estimates the interaction volume^2 for non-local interactions.
 + *
 + * zones  domain decomposition zones; zones->n entries are visited
 + * ls     cell lengths along x, y, z, as supplied by the caller
 + * r      interaction range used in the volume terms
 + *
 + * Only zones whose shift components sum to 1 contribute (presumably the
 + * zones shifted along exactly one dimension - the range of the shift
 + * entries is not visible here). For such a zone the dimensions with
 + * shift 0 provide the half cell-length sum cl, the cell area ca and
 + * the zone area za. The returned value is the sum over these zones of
 + * vold_est*za.
 + */
 +static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
 +{
 +    int  z, d;
 +    real cl, ca, za;
 +    real vold_est;
 +    real vol2_est_tot;
 +
 +    vol2_est_tot = 0;
 +
 +    /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
 +     * not home interaction volume^2. As these volumes are not additive,
 +     * this is an overestimate, but it would only be significant in the limit
 +     * of small cells, where we anyhow need to split the lists into
 +     * as small parts as possible.
 +     */
 +
 +    for (z = 0; z < zones->n; z++)
 +    {
 +        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
 +        {
 +            cl = 0; /* sum of half cell lengths over the unshifted dims */
 +            ca = 1; /* product of cell lengths over the unshifted dims */
 +            za = 1; /* product of zone extents over the unshifted dims */
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if (zones->shift[z][d] == 0)
 +                {
 +                    cl += 0.5*ls[d];
 +                    ca *= ls[d];
 +                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
 +                }
 +            }
 +
 +            /* 4 octants of a sphere */
 +            vold_est  = 0.25*M_PI*r*r*r*r;
 +            /* 4 quarter pie slices on the edges */
 +            vold_est += 4*cl*M_PI/6.0*r*r*r;
 +            /* One rectangular volume on a face */
 +            vold_est += ca*0.5*r*r;
 +
 +            vol2_est_tot += vold_est*za;
 +        }
 +    }
 +
 +    return vol2_est_tot;
 +}
 +
 +/* Estimates the average size of a full j-list for super/sub setup.
 + *
 + * nbs              search data; only nbs->grid[0], nbs->DomDec and
 + *                  nbs->zones are read here
 + * iloc             locality; LOCAL_I(iloc) selects the local estimate
 + * rlist            pair-list cut-off
 + * min_ci_balanced  target minimum number of i-entries for balancing
 + *
 + * Returns -1 when no limit is needed, which is the case when
 + * min_ci_balanced <= 0, when grid 0 already has at least min_ci_balanced
 + * cells or when it has no cells at all. Otherwise returns the estimated
 + * pair count divided by min_ci_balanced (rounded, at least 1) plus half
 + * the number of cluster pairs in a cj4 block.
 + *
 + * NOTE(review): ls[ZZ] is divided by grid->nc before grid->nc == 0 is
 + * tested further down; confirm that an empty grid can not produce
 + * a value that matters here.
 + */
 +static int get_nsubpair_max(const nbnxn_search_t nbs,
 +                            int                  iloc,
 +                            real                 rlist,
 +                            int                  min_ci_balanced)
 +{
 +    const nbnxn_grid_t *grid;
 +    rvec ls;
 +    real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
 +    int  nsubpair_max;
 +
 +    grid = &nbs->grid[0];
 +
 +    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
 +    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
 +    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
 +
 +    /* The average squared length of the diagonal of a sub cell */
 +    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
 +
 +    /* The formulas below are a heuristic estimate of the average nsj per si*/
 +    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
 +
 +    if (!nbs->DomDec || nbs->zones->n == 1)
 +    {
 +        nsp_est_nl = 0;
 +    }
 +    else
 +    {
 +        nsp_est_nl =
 +            sqr(grid->atom_density/grid->na_c)*
 +            nonlocal_vol2(nbs->zones, ls, r_eff_sup);
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Sub-cell interacts with itself */
 +        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
 +        /* 6/2 rectangular volume on the faces */
 +        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
 +        /* 12/2 quarter pie slices on the edges */
 +        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
 +        /* 4 octants of a sphere */
 +        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
 +
 +        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 +
 +        /* Subtract the non-local pair count */
 +        nsp_est -= nsp_est_nl;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
 +                    nsp_est, nsp_est_nl);
 +        }
 +    }
 +    else
 +    {
 +        nsp_est = nsp_est_nl;
 +    }
 +
 +    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
 +    {
 +        /* We don't need to worry */
 +        nsubpair_max = -1;
 +    }
 +    else
 +    {
 +        /* Thus the (average) maximum j-list size should be as follows */
 +        nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));
 +
 +        /* Since the target value is a maximum (this avoids high outliers,
 +         * which lead to load imbalance), not average, we add half the
 +         * number of pairs in a cj4 block to get the average about right.
 +         */
 +        nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
 +                nsp_est, nsubpair_max);
 +    }
 +
 +    return nsubpair_max;
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
 +{
 +    int i, j;
 +
 +    for (i = 0; i < nbl->nci; i++)
 +    {
 +        fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
 +                nbl->ci[i].ci, nbl->ci[i].shift,
 +                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
 +
 +        for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
 +        {
 +            fprintf(fp, "  cj %5d  imask %x\n",
 +                    nbl->cj[j].cj,
 +                    nbl->cj[j].excl);
 +        }
 +    }
 +}
 +
 +/* Debug list print function for super/sub lists.
 + * For every sci entry of nbl a header line is written to fp, followed by
 + * one line per j-cluster slot of each cj4 group showing the j-cluster
 + * index and the interaction mask imei[0].imask of the group. The bits set
 + * in that mask are counted in ncp, which is printed in a closing line
 + * per sci entry.
 + */
 +static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
 +{
 +    int i, j4, j, ncp, si;
 +
 +    for (i = 0; i < nbl->nsci; i++)
 +    {
 +        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
 +                nbl->sci[i].sci, nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
 +
 +        ncp = 0;
 +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                fprintf(fp, "  sj %5d  imask %x\n",
 +                        nbl->cj4[j4].cj[j],
 +                        nbl->cj4[j4].imei[0].imask);
-     for(i = 0; i <= m; i++)
++                for (si = 0; si < GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        ncp++;
 +                    }
 +                }
 +            }
 +        }
 +        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
 +                nbl->sci[i].sci, nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
 +                ncp);
 +    }
 +}
 +
 +/* Appends the nnbl pair lists nbl[0..nnbl-1], generated on multiple
 + * threads, to the combined list nblc.
 + *
 + * Only super/sub lists are supported; gmx_incons() is called when nblc
 + * is a simple list. The sci, cj4 and excl arrays of nblc are enlarged
 + * first when the combined counts do not fit. Each list is then copied
 + * behind the data already in nblc plus the data of the lists before it,
 + * with its cj4 and exclusion indices offset accordingly. The counts in
 + * nblc are only updated after all copies are done.
 + */
 +static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
 +                            nbnxn_pairlist_t *nblc)
 +{
 +    int nsci, ncj4, nexcl;
 +    int n, i;
 +
 +    if (nblc->bSimple)
 +    {
 +        gmx_incons("combine_nblists does not support simple lists");
 +    }
 +
 +    nsci  = nblc->nsci;
 +    ncj4  = nblc->ncj4;
 +    nexcl = nblc->nexcl;
 +    for (i = 0; i < nnbl; i++)
 +    {
 +        nsci  += nbl[i]->nsci;
 +        ncj4  += nbl[i]->ncj4;
 +        nexcl += nbl[i]->nexcl;
 +    }
 +
 +    if (nsci > nblc->sci_nalloc)
 +    {
 +        nb_realloc_sci(nblc, nsci);
 +    }
 +    if (ncj4 > nblc->cj4_nalloc)
 +    {
 +        nblc->cj4_nalloc = over_alloc_small(ncj4);
 +        nbnxn_realloc_void((void **)&nblc->cj4,
 +                           nblc->ncj4*sizeof(*nblc->cj4),
 +                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
 +                           nblc->alloc, nblc->free);
 +    }
 +    if (nexcl > nblc->excl_nalloc)
 +    {
 +        nblc->excl_nalloc = over_alloc_small(nexcl);
 +        nbnxn_realloc_void((void **)&nblc->excl,
 +                           nblc->nexcl*sizeof(*nblc->excl),
 +                           nblc->excl_nalloc*sizeof(*nblc->excl),
 +                           nblc->alloc, nblc->free);
 +    }
 +
 +    /* Each thread should copy its own data to the combined arrays,
 +     * as otherwise data will go back and forth between different caches.
 +     */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for (n = 0; n < nnbl; n++)
 +    {
 +        int sci_offset;
 +        int cj4_offset;
 +        int ci_offset; /* accumulated below but not read again in this loop */
 +        int excl_offset;
 +        int i, j4; /* declared in the loop body, so each thread has its own; this i hides the outer i */
 +        const nbnxn_pairlist_t *nbli;
 +
 +        /* Determine the offset in the combined data for our thread */
 +        sci_offset  = nblc->nsci;
 +        cj4_offset  = nblc->ncj4;
 +        ci_offset   = nblc->nci_tot;
 +        excl_offset = nblc->nexcl;
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            sci_offset  += nbl[i]->nsci;
 +            cj4_offset  += nbl[i]->ncj4;
 +            ci_offset   += nbl[i]->nci_tot;
 +            excl_offset += nbl[i]->nexcl;
 +        }
 +
 +        nbli = nbl[n];
 +
 +        for (i = 0; i < nbli->nsci; i++)
 +        {
 +            nblc->sci[sci_offset+i]                = nbli->sci[i];
 +            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
 +            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
 +        }
 +
 +        for (j4 = 0; j4 < nbli->ncj4; j4++)
 +        {
 +            nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
 +            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
 +            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
 +        }
 +
 +        for (j4 = 0; j4 < nbli->nexcl; j4++) /* j4 is reused here as the exclusion index */
 +        {
 +            nblc->excl[excl_offset+j4] = nbli->excl[j4];
 +        }
 +    }
 +
 +    for (n = 0; n < nnbl; n++) /* the counts are updated last, as the offsets above start from the old counts */
 +    {
 +        nblc->nsci    += nbl[n]->nsci;
 +        nblc->ncj4    += nbl[n]->ncj4;
 +        nblc->nci_tot += nbl[n]->nci_tot;
 +        nblc->nexcl   += nbl[n]->nexcl;
 +    }
 +}
 +
 +/* Returns the next ci to be processes by our thread */
 +static gmx_bool next_ci(const nbnxn_grid_t *grid,
 +                        int conv,
 +                        int nth, int ci_block,
 +                        int *ci_x, int *ci_y,
 +                        int *ci_b, int *ci)
 +{
 +    (*ci_b)++;
 +    (*ci)++;
 +
 +    if (*ci_b == ci_block)
 +    {
 +        /* Jump to the next block assigned to this task */
 +        *ci   += (nth - 1)*ci_block;
 +        *ci_b  = 0;
 +    }
 +
 +    if (*ci >= grid->nc*conv)
 +    {
 +        return FALSE;
 +    }
 +
 +    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
 +    {
 +        *ci_y += 1;
 +        if (*ci_y == grid->ncy)
 +        {
 +            *ci_x += 1;
 +            *ci_y  = 0;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +/* Returns the distance^2 for which we put cell pairs in the list
 + * without checking atom pair distances. This is usually < rlist^2.
 + *
 + * gridi, gridj  the two grids; their sx and sy spacings are averaged
 + * rlist         the pair-list cut-off
 + * simple        when FALSE, the spacings are divided by the number of
 + *               sub-cells along x and y
 + *
 + * With GMX_DOUBLE the result is scaled by (1+GMX_FLOAT_EPS) before
 + * the conversion to float.
 + */
 +static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
 +                                        const nbnxn_grid_t *gridj,
 +                                        real                rlist,
 +                                        gmx_bool            simple)
 +{
 +    /* If the distance between two sub-cell bounding boxes is less
 +     * than this distance, do not check the distance between
 +     * all particle pairs in the sub-cell, since then it is likely
 +     * that the box pair has atom pairs within the cut-off.
 +     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
 +     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
 +     * Using more than 0.5 gains at most 0.5%.
 +     * If forces are calculated more than twice, the performance gain
 +     * in the force calculation outweighs the cost of checking.
 +     * Note that with subcell lists, the atom-pair distance check
 +     * is only performed when only 1 out of 8 sub-cells is within range,
 +     * this is because the GPU is much faster than the cpu.
 +     */
 +    real bbx, bby;
 +    real rbb2;
 +
 +    bbx = 0.5*(gridi->sx + gridj->sx);
 +    bby = 0.5*(gridi->sy + gridj->sy);
 +    if (!simple)
 +    {
 +        bbx /= GPU_NSUBCELL_X;
 +        bby /= GPU_NSUBCELL_Y;
 +    }
 +
 +    rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby))); /* clamped at 0 when the half diagonal exceeds rlist */
 +
 +#ifndef GMX_DOUBLE
 +    return rbb2;
 +#else
 +    return (float)((1+GMX_FLOAT_EPS)*rbb2);
 +#endif
 +}
 +
 +static int get_ci_block_size(const nbnxn_grid_t *gridi,
 +                             gmx_bool bDomDec, int nth)
 +{
 +    const int ci_block_enum      = 5;
 +    const int ci_block_denom     = 11;
 +    const int ci_block_min_atoms = 16;
 +    int ci_block;
 +
 +    /* Here we decide how to distribute the blocks over the threads.
 +     * We use prime numbers to try to avoid that the grid size becomes
 +     * a multiple of the number of threads, which would lead to some
 +     * threads getting "inner" pairs and others getting boundary pairs,
 +     * which in turns will lead to load imbalance between threads.
 +     * Set the block size as 5/11/ntask times the average number of cells
 +     * in a y,z slab. This should ensure a quite uniform distribution
 +     * of the grid parts of the different thread along all three grid
 +     * zone boundaries with 3D domain decomposition. At the same time
 +     * the blocks will not become too small.
 +     */
 +    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
 +
 +    /* Ensure the blocks are not too small: avoids cache invalidation */
 +    if (ci_block*gridi->na_sc < ci_block_min_atoms)
 +    {
 +        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
 +    }
 +
 +    /* Without domain decomposition
 +     * or with less than 3 blocks per task, divide in nth blocks.
 +     */
 +    if (!bDomDec || ci_block*3*nth > gridi->nc)
 +    {
 +        ci_block = (gridi->nc + nth - 1)/nth;
 +    }
 +
 +    return ci_block;
 +}
 +
 +/* Generates the part of pair-list nbl assigned to our thread */
 +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 +                                     const nbnxn_grid_t *gridi,
 +                                     const nbnxn_grid_t *gridj,
 +                                     nbnxn_search_work_t *work,
 +                                     const nbnxn_atomdata_t *nbat,
 +                                     const t_blocka *excl,
 +                                     real rlist,
 +                                     int nb_kernel_type,
 +                                     int ci_block,
 +                                     gmx_bool bFBufferFlag,
 +                                     int nsubpair_max,
 +                                     gmx_bool progBal,
 +                                     int min_ci_balanced,
 +                                     int th, int nth,
 +                                     nbnxn_pairlist_t *nbl)
 +{
 +    int  na_cj_2log;
 +    matrix box;
 +    real rl2;
 +    float rbb2;
 +    int  d;
 +    int  ci_b, ci, ci_x, ci_y, ci_xy, cj;
 +    ivec shp;
 +    int  tx, ty, tz;
 +    int  shift;
 +    gmx_bool bMakeList;
 +    real shx, shy, shz;
 +    int  conv_i, cell0_i;
 +    const float *bb_i, *bbcz_i, *bbcz_j;
 +    const int *flags_i;
 +    real bx0, bx1, by0, by1, bz0, bz1;
 +    real bz1_frac;
 +    real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
 +    int  cxf, cxl, cyf, cyf_x, cyl;
 +    int  cx, cy;
 +    int  c0, c1, cs, cf, cl;
 +    int  ndistc;
 +    int  ncpcheck;
 +    int  gridi_flag_shift = 0, gridj_flag_shift = 0;
 +    unsigned *gridj_flag  = NULL;
 +    int  ncj_old_i, ncj_old_j;
 +
 +    nbs_cycle_start(&work->cc[enbsCCsearch]);
 +
 +    if (gridj->bSimple != nbl->bSimple)
 +    {
 +        gmx_incons("Grid incompatible with pair-list");
 +    }
 +
 +    sync_work(nbl);
 +    nbl->na_sc = gridj->na_sc;
 +    nbl->na_ci = gridj->na_c;
 +    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    na_cj_2log = get_2log(nbl->na_cj);
 +
 +    nbl->rlist  = rlist;
 +
 +    if (bFBufferFlag)
 +    {
 +        /* Determine conversion of clusters to flag blocks */
 +        gridi_flag_shift = 0;
 +        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridi_flag_shift++;
 +        }
 +        gridj_flag_shift = 0;
 +        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridj_flag_shift++;
 +        }
 +
 +        gridj_flag = work->buffer_flags.flag;
 +    }
 +
 +    copy_mat(nbs->box, box);
 +
 +    rl2 = nbl->rlist*nbl->rlist;
 +
 +    rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
 +    }
 +
 +    /* Set the shift range */
 +    for (d = 0; d < DIM; d++)
 +    {
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +
 +    if (nbl->bSimple && !gridi->bSimple)
 +    {
 +        conv_i  = gridi->na_sc/gridj->na_sc;
 +        bb_i    = gridi->bb_simple;
 +        bbcz_i  = gridi->bbcz_simple;
 +        flags_i = gridi->flags_simple;
 +    }
 +    else
 +    {
 +        conv_i  = 1;
 +        bb_i    = gridi->bb;
 +        bbcz_i  = gridi->bbcz;
 +        flags_i = gridi->flags;
 +    }
 +    cell0_i = gridi->cell0*conv_i;
 +
 +    bbcz_j = gridj->bbcz;
 +
 +    if (conv_i != 1)
 +    {
 +        /* Blocks of the conversion factor - 1 give a large repeat count
 +         * combined with a small block size. This should result in good
 +         * load balancing for both small and large domains.
 +         */
 +        ci_block = conv_i - 1;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
 +                gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
 +    }
 +
 +    ndistc   = 0;
 +    ncpcheck = 0;
 +
 +    /* Initially ci_b and ci to 1 before where we want them to start,
 +     * as they will both be incremented in next_ci.
 +     */
 +    ci_b = -1;
 +    ci   = th*ci_block - 1;
 +    ci_x = 0;
 +    ci_y = 0;
 +    while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
 +    {
 +        if (nbl->bSimple && flags_i[ci] == 0)
 +        {
 +            continue;
 +        }
 +
 +        ncj_old_i = nbl->ncj;
 +
 +        d2cx = 0;
 +        if (gridj != gridi && shp[XX] == 0)
 +        {
 +            if (nbl->bSimple)
 +            {
 +                bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
 +            }
 +            else
 +            {
 +                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
 +            }
 +            if (bx1 < gridj->c0[XX])
 +            {
 +                d2cx = sqr(gridj->c0[XX] - bx1);
 +
 +                if (d2cx >= rl2)
 +                {
 +                    continue;
 +                }
 +            }
 +        }
 +
 +        ci_xy = ci_x*gridi->ncy + ci_y;
 +
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
 +        {
 +            shz = tz*box[ZZ][ZZ];
 +
 +            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
 +            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
 +
 +            if (tz == 0)
 +            {
 +                d2z = 0;
 +            }
 +            else if (tz < 0)
 +            {
 +                d2z = sqr(bz1);
 +            }
 +            else
 +            {
 +                d2z = sqr(bz0 - box[ZZ][ZZ]);
 +            }
 +
 +            d2z_cx = d2z + d2cx;
 +
 +            if (d2z_cx >= rl2)
 +            {
 +                continue;
 +            }
 +
 +            bz1_frac =
 +                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
 +            if (bz1_frac < 0)
 +            {
 +                bz1_frac = 0;
 +            }
 +            /* The check with bz1_frac close to or larger than 1 comes later */
 +
 +            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
 +            {
 +                shy = ty*box[YY][YY] + tz*box[ZZ][YY];
 +
 +                if (nbl->bSimple)
 +                {
 +                    by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
 +                    by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
 +                }
 +                else
 +                {
 +                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
 +                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
 +                }
 +
 +                get_cell_range(by0, by1,
 +                               gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
 +                               d2z_cx, rl2,
 +                               &cyf, &cyl);
 +
 +                if (cyf > cyl)
 +                {
 +                    continue;
 +                }
 +
 +                d2z_cy = d2z;
 +                if (by1 < gridj->c0[YY])
 +                {
 +                    d2z_cy += sqr(gridj->c0[YY] - by1);
 +                }
 +                else if (by0 > gridj->c1[YY])
 +                {
 +                    d2z_cy += sqr(by0 - gridj->c1[YY]);
 +                }
 +
 +                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
 +                {
 +                    shift = XYZ2IS(tx, ty, tz);
 +
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                    if (gridi == gridj && shift > CENTRAL)
 +                    {
 +                        continue;
 +                    }
 +#endif
 +
 +                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
 +                        bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
 +                    }
 +                    else
 +                    {
 +                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
 +                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
 +                    }
 +
 +                    get_cell_range(bx0, bx1,
 +                                   gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
 +                                   d2z_cy, rl2,
 +                                   &cxf, &cxl);
 +
 +                    if (cxf > cxl)
 +                    {
 +                        continue;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci],
 +                                     nbl->work);
 +                    }
 +                    else
 +                    {
 +                        new_sci_entry(nbl, cell0_i+ci, shift, flags_i[ci],
 +                                      nbl->work);
 +                    }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                    if (cxf < ci_x)
 +#else
 +                    if (shift == CENTRAL && gridi == gridj &&
 +                        cxf < ci_x)
 +#endif
 +                    {
 +                        /* Leave the pairs with i > j.
 +                         * x is the major index, so skip half of it.
 +                         */
 +                        cxf = ci_x;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        set_icell_bb_simple(bb_i, ci, shx, shy, shz,
 +                                            nbl->work->bb_ci);
 +                    }
 +                    else
 +                    {
 +                        set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
 +                                              nbl->work->bb_ci);
 +                    }
 +
 +                    nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
 +                                     gridi->na_c, nbat->xstride, nbat->x,
 +                                     nbl->work);
 +
 +                    for (cx = cxf; cx <= cxl; cx++)
 +                    {
 +                        d2zx = d2z;
 +                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
 +                        }
 +                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
 +                        }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                        if (gridi == gridj &&
 +                            cx == 0 && cyf < ci_y)
 +#else
 +                        if (gridi == gridj &&
 +                            cx == 0 && shift == CENTRAL && cyf < ci_y)
 +#endif
 +                        {
 +                            /* Leave the pairs with i > j.
 +                             * Skip half of y when i and j have the same x.
 +                             */
 +                            cyf_x = ci_y;
 +                        }
 +                        else
 +                        {
 +                            cyf_x = cyf;
 +                        }
 +
 +                        for (cy = cyf_x; cy <= cyl; cy++)
 +                        {
 +                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
 +                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                            if (gridi == gridj &&
 +                                shift == CENTRAL && c0 < ci)
 +                            {
 +                                c0 = ci;
 +                            }
 +#endif
 +
 +                            d2zxy = d2zx;
 +                            if (gridj->c0[YY] + cy*gridj->sy > by1)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
 +                            }
 +                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
 +                            }
 +                            if (c1 > c0 && d2zxy < rl2)
 +                            {
 +                                cs = c0 + (int)(bz1_frac*(c1 - c0));
 +                                if (cs >= c1)
 +                                {
 +                                    cs = c1 - 1;
 +                                }
 +
 +                                d2xy = d2zxy - d2z;
 +
 +                                /* Find the lowest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cf = cs;
 +                                while (cf > c0 &&
 +                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
 +                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
 +                                {
 +                                    cf--;
 +                                }
 +
 +                                /* Find the highest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cl = cs;
 +                                while (cl < c1-1 &&
 +                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
 +                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
 +                                {
 +                                    cl++;
 +                                }
 +
 +#ifdef NBNXN_REFCODE
 +                                {
 +                                    /* Simple reference code, for debugging,
 +                                     * overrides the more complex code above.
 +                                     */
 +                                    int k;
 +                                    cf = c1;
 +                                    cl = -1;
 +                                    for (k = c0; k < c1; k++)
 +                                    {
 +                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k < cf)
 +                                        {
 +                                            cf = k;
 +                                        }
 +                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k > cl)
 +                                        {
 +                                            cl = k;
 +                                        }
 +                                    }
 +                                }
 +#endif
 +
 +                                if (gridi == gridj)
 +                                {
 +                                    /* We want each atom/cell pair only once,
 +                                     * only use cj >= ci.
 +                                     */
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                                    cf = max(cf, ci);
 +#else
 +                                    if (shift == CENTRAL)
 +                                    {
 +                                        cf = max(cf, ci);
 +                                    }
 +#endif
 +                                }
 +
 +                                if (cf <= cl)
 +                                {
 +                                    /* For f buffer flags with simple lists */
 +                                    ncj_old_j = nbl->ncj;
 +
 +                                    switch (nb_kernel_type)
 +                                    {
 +                                        case nbnxnk4x4_PlainC:
 +                                            check_subcell_list_space_simple(nbl, cl-cf+1);
 +
 +                                            make_cluster_list_simple(gridj,
 +                                                                     nbl, ci, cf, cl,
 +                                                                     (gridi == gridj && shift == CENTRAL),
 +                                                                     nbat->x,
 +                                                                     rl2, rbb2,
 +                                                                     &ndistc);
 +                                            break;
 +#ifdef GMX_NBNXN_SIMD_4XN
 +                                        case nbnxnk4xN_SIMD_4xN:
 +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
 +                                            make_cluster_list_simd_4xn(gridj,
 +                                                                       nbl, ci, cf, cl,
 +                                                                       (gridi == gridj && shift == CENTRAL),
 +                                                                       nbat->x,
 +                                                                       rl2, rbb2,
 +                                                                       &ndistc);
 +                                            break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +                                        case nbnxnk4xN_SIMD_2xNN:
 +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
 +                                            make_cluster_list_simd_2xnn(gridj,
 +                                                                        nbl, ci, cf, cl,
 +                                                                        (gridi == gridj && shift == CENTRAL),
 +                                                                        nbat->x,
 +                                                                        rl2, rbb2,
 +                                                                        &ndistc);
 +                                            break;
 +#endif
 +                                        case nbnxnk8x8x8_PlainC:
 +                                        case nbnxnk8x8x8_CUDA:
 +                                            check_subcell_list_space_supersub(nbl, cl-cf+1);
 +                                            for (cj = cf; cj <= cl; cj++)
 +                                            {
 +                                                make_cluster_list_supersub(nbs, gridi, gridj,
 +                                                                           nbl, ci, cj,
 +                                                                           (gridi == gridj && shift == CENTRAL && ci == cj),
 +                                                                           nbat->xstride, nbat->x,
 +                                                                           rl2, rbb2,
 +                                                                           &ndistc);
 +                                            }
 +                                            break;
 +                                    }
 +                                    ncpcheck += cl - cf + 1;
 +
 +                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
 +                                    {
 +                                        int cbf, cbl, cb;
 +
 +                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
 +                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
 +                                        for (cb = cbf; cb <= cbl; cb++)
 +                                        {
 +                                            gridj_flag[cb] = 1U<<th;
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +
 +                    /* Set the exclusions for this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        set_ci_top_excls(nbs,
 +                                         nbl,
 +                                         shift == CENTRAL && gridi == gridj,
 +                                         gridj->na_c_2log,
 +                                         na_cj_2log,
 +                                         &(nbl->ci[nbl->nci]),
 +                                         excl);
 +                    }
 +                    else
 +                    {
 +                        set_sci_top_excls(nbs,
 +                                          nbl,
 +                                          shift == CENTRAL && gridi == gridj,
 +                                          gridj->na_c_2log,
 +                                          &(nbl->sci[nbl->nsci]),
 +                                          excl);
 +                    }
 +
 +                    /* Close this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        close_ci_entry_simple(nbl);
 +                    }
 +                    else
 +                    {
 +                        close_ci_entry_supersub(nbl,
 +                                                nsubpair_max,
 +                                                progBal, min_ci_balanced,
 +                                                th, nth);
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (bFBufferFlag && nbl->ncj > ncj_old_i)
 +        {
 +            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
 +        }
 +    }
 +
 +    work->ndistc = ndistc;
 +
 +    nbs_cycle_stop(&work->cc[enbsCCsearch]);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "number of distance checks %d\n", ndistc);
 +        fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
 +                ncpcheck);
 +
 +        if (nbl->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug, nbl, nbs, rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
 +        }
 +
 +    }
 +}
 +/* OR the buffer flags of the first nsrc work entries of nbs into dest.
 + *
 + * nbs   search data; nbs->work[s].buffer_flags.flag is read for
 + *       s = 0 .. nsrc-1
 + * nsrc  number of entries of nbs->work to combine
 + * dest  flag set to update; dest itself is const-qualified, but the
 + *       array that dest->flag points to is written
 + *
 + * Bits already set in dest->flag are kept, since the sources are
 + * OR-ed in rather than copied.
 + * NOTE(review): dest->nflag elements are read from every source array;
 + * this assumes each source holds at least that many - confirm.
 + */
 +static void reduce_buffer_flags(const nbnxn_search_t        nbs,
 +                                int                         nsrc,
 +                                const nbnxn_buffer_flags_t *dest)
 +{
 +    int s, b;
 +    const unsigned *flag;
 +
 +    for (s = 0; s < nsrc; s++)
 +    {
 +        flag = nbs->work[s].buffer_flags.flag;
 +
 +        for (b = 0; b < dest->nflag; b++)
 +        {
 +            dest->flag[b] |= flag[b];
 +        }
 +    }
 +}
 +/* Print reduction statistics derived from the buffer flags to debug.
 + *
 + * flags  flag set to analyse; flags->nflag entries of flags->flag are read
 + * nout   number of low bits of each flag entry that are inspected
 + *
 + * Each non-zero flag entry is classified as:
 + *   keep  the entry equals 1, i.e. only bit 0 is set
 + *   copy  any other entry with exactly one of the lowest nout bits set
 + *   red   all remaining entries; the number of set bits is added
 + * elem is the total number of counted bits. All four totals are printed
 + * divided by flags->nflag.
 + * The debug file is written without a NULL check here; the call in
 + * nbnxn_make_pairlist is made inside an if (debug) block.
 + */
 +static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
 +{
 +    int nelem, nkeep, ncopy, nred, b, c, out;
 +
 +    nelem = 0;
 +    nkeep = 0;
 +    ncopy = 0;
 +    nred  = 0;
 +    for (b = 0; b < flags->nflag; b++)
 +    {
 +        if (flags->flag[b] == 1)
 +        {
 +            /* Only flag 0 is set, no copy or reduction required */
 +            nelem++;
 +            nkeep++;
 +        }
 +        else if (flags->flag[b] > 0)
 +        {
 +            c = 0;
 +            for (out = 0; out < nout; out++)
 +            {
 +                if (flags->flag[b] & (1U<<out))
 +                {
 +                    c++;
 +                }
 +            }
 +            nelem += c;
 +            if (c == 1)
 +            {
 +                ncopy++;
 +            }
 +            else
 +            {
 +                nred += c;
 +            }
 +        }
 +    }
 +
 +    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
 +            flags->nflag, nout,
 +            nelem/(double)(flags->nflag),
 +            nkeep/(double)(flags->nflag),
 +            ncopy/(double)(flags->nflag),
 +            nred/(double)(flags->nflag));
 +}
 +
 +/* Perform a count (linear) sort to sort the smaller lists to the end.
 + * This avoids load imbalance on the GPU, as large lists will be
 + * scheduled and executed first and the smaller lists later.
 + * Load balancing between multi-processors only happens at the end
 + * and there smaller lists lead to more effective load balancing.
 + * The sorting is done on the cj4 count, not on the actual pair counts.
 + * Not only does this make the sort faster, but it also results in
 + * better load balancing than using a list sorted on exact load.
 + * This function swaps the pointer in the pair list to avoid a copy operation.
 + *
 + * nbl  pair list whose sci array is reordered; nbl->work supplies the
 + *      count array (work->sort) and the scratch sci array
 + *      (work->sci_sort), both of which are (re)allocated here when needed
 + *
 + * Returns without doing anything when nbl->ncj4 <= nbl->nsci.
 + * Entries that fall in the same bin keep their relative order.
 + * work->sci_sort is kept at the same allocation size as nbl->sci, since
 + * the two arrays are exchanged on return.
 + */
 +static void sort_sci(nbnxn_pairlist_t *nbl)
 +{
 +    nbnxn_list_work_t *work;
 +    int                m, i, s, s0, s1;
 +    nbnxn_sci_t       *sci_sort;
 +
 +    if (nbl->ncj4 <= nbl->nsci)
 +    {
 +        /* nsci = 0 or all sci have size 1, sorting won't change the order */
 +        return;
 +    }
 +
 +    work = nbl->work;
 +
 +    /* We will distinguish differences up to double the average.
 +     * m is the highest bin index: entries with m or more cj4 all share
 +     * bin m, so m + 1 counters are needed.
 +     */
 +    m = (2*nbl->ncj4)/nbl->nsci;
 +
 +    if (m + 1 > work->sort_nalloc)
 +    {
 +        work->sort_nalloc = over_alloc_large(m + 1);
 +        srenew(work->sort, work->sort_nalloc);
 +    }
 +
 +    if (work->sci_sort_nalloc != nbl->sci_nalloc)
 +    {
 +        work->sci_sort_nalloc = nbl->sci_nalloc;
 +        nbnxn_realloc_void((void **)&work->sci_sort,
 +                           0,
 +                           work->sci_sort_nalloc*sizeof(*work->sci_sort),
 +                           nbl->alloc, nbl->free);
 +    }
 +
 +    /* Count the entries of each size */
-     for(s = 0; s < nbl->nsci; s++)
++    for (i = 0; i <= m; i++)
 +    {
 +        work->sort[i] = 0;
 +    }
-     s0           = work->sort[m];
++    for (s = 0; s < nbl->nsci; s++)
 +    {
 +        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
 +        work->sort[i]++;
 +    }
 +    /* Calculate the offset for each count.
 +     * The running sum starts at the largest bin and moves down, so
 +     * work->sort[i] becomes the number of entries in all bins above i
 +     * and the largest entries receive the lowest offsets.
 +     */
-     for(i = m - 1; i >= 0; i--)
++    s0            = work->sort[m];
 +    work->sort[m] = 0;
-     for(s = 0; s < nbl->nsci; s++)
++    for (i = m - 1; i >= 0; i--)
 +    {
 +        s1            = work->sort[i];
 +        work->sort[i] = work->sort[i + 1] + s0;
 +        s0            = s1;
 +    }
 +
 +    /* Sort entries directly into place */
 +    sci_sort = work->sci_sort;
++    for (s = 0; s < nbl->nsci; s++)
 +    {
 +        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
 +        sci_sort[work->sort[i]++] = nbl->sci[s];
 +    }
 +
 +    /* Swap the sci pointers so we use the new, sorted list.
 +     * The previous, unsorted array stays in work->sci_sort and serves
 +     * as the scratch array on the next call.
 +     */
 +    work->sci_sort = nbl->sci;
 +    nbl->sci       = sci_sort;
 +}
 +
 +/* Make a local or non-local pair-list, depending on iloc.
 + *
 + * nbs              search data: grids, zones, per-thread work and cycle
 + *                  counters; nbs->icell_set_x and nbs->search_count are
 + *                  updated here
 + * nbat             atom data; bUseBufferFlags is set to (nout > 1) and,
 + *                  when set, buffer_flags receives the OR of the
 + *                  per-thread flags
 + * excl             exclusions, passed on to nbnxn_make_pairlist_part
 + * rlist            pair-list radius, passed on to nbnxn_make_pairlist_part
 + * min_ci_balanced  when > 0 with a non-simple list, used to derive
 + *                  nsubpair_max via get_nsubpair_max(); otherwise
 + *                  nsubpair_max is 0
 + * nbl_list         the nnbl pair lists to fill; all are cleared first and
 + *                  the natpair_ljq/natpair_lj/natpair_q counts are set
 + * iloc             selects local (grid 0 vs 0 only) or non-local
 + *                  (zone pairs taken from nbs->zones) searching
 + * nb_kernel_type   selects the icell_set_x routine for simple lists and
 + *                  is passed on to nbnxn_make_pairlist_part
 + * nrnb             counters; eNR_NBNXN_DIST2 is incremented by the
 + *                  ndistc of every thread
 + *
 + * For each grid pair the nnbl lists are built in an OpenMP parallel loop,
 + * one loop iteration per list. With nbl_list->bCombined and nnbl > 1 the
 + * lists 1..nnbl-1 are handed to combine_nblists() together with list 0
 + * after each grid pair.
 + */
 +void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
 +                         nbnxn_atomdata_t     *nbat,
 +                         const t_blocka       *excl,
 +                         real                  rlist,
 +                         int                   min_ci_balanced,
 +                         nbnxn_pairlist_set_t *nbl_list,
 +                         int                   iloc,
 +                         int                   nb_kernel_type,
 +                         t_nrnb               *nrnb)
 +{
 +    nbnxn_grid_t *gridi, *gridj;
 +    gmx_bool bGPUCPU;
 +    int nzi, zi, zj0, zj1, zj;
 +    int nsubpair_max;
 +    int th;
 +    int nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int ci_block;
 +    gmx_bool CombineNBLists;
 +    gmx_bool progBal;
 +    int np_tot, np_noq, np_hlj, nap;
 +
 +    /* Check if we are running hybrid GPU + CPU nbnxn mode */
 +    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
 +
 +    nnbl            = nbl_list->nnbl;
 +    nbl             = nbl_list->nbl;
 +    CombineNBLists  = nbl_list->bCombined;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "ns making %d nblists\n", nnbl);
 +    }
 +
 +    nbat->bUseBufferFlags = (nbat->nout > 1);
 +    /* We should re-init the flags before making the first list */
 +    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
 +    {
 +        init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
 +    }
 +
 +    if (nbl_list->bSimple)
 +    {
 +        switch (nb_kernel_type)
 +        {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +            case nbnxnk4xN_SIMD_4xN:
 +                nbs->icell_set_x = icell_set_x_simd_4xn;
 +                break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +            case nbnxnk4xN_SIMD_2xNN:
 +                nbs->icell_set_x = icell_set_x_simd_2xnn;
 +                break;
 +#endif
 +            default:
 +                nbs->icell_set_x = icell_set_x_simple;
 +                break;
 +        }
 +    }
 +    else
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        nbs->icell_set_x = icell_set_x_supersub_sse8;
 +#else
 +        nbs->icell_set_x = icell_set_x_supersub;
 +#endif
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Only zone (grid) 0 vs 0 */
 +        nzi = 1;
 +        zj0 = 0;
 +        zj1 = 1;
 +    }
 +    else
 +    {
 +        nzi = nbs->zones->nizone;
 +    }
 +
 +    if (!nbl_list->bSimple && min_ci_balanced > 0)
 +    {
 +        nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
 +    }
 +    else
 +    {
 +        nsubpair_max = 0;
 +    }
 +
 +    /* Clear all pair-lists */
 +    for (th = 0; th < nnbl; th++)
 +    {
 +        clear_pairlist(nbl[th]);
 +    }
 +
 +    for (zi = 0; zi < nzi; zi++)
 +    {
 +        gridi = &nbs->grid[zi];
 +
 +        if (NONLOCAL_I(iloc))
 +        {
 +            zj0 = nbs->zones->izone[zi].j0;
 +            zj1 = nbs->zones->izone[zi].j1;
 +            if (zi == 0)
 +            {
 +                zj0++;
 +            }
 +        }
 +        for (zj = zj0; zj < zj1; zj++)
 +        {
 +            gridj = &nbs->grid[zj];
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
 +            }
 +
 +            nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 +
 +            if (nbl[0]->bSimple && !gridi->bSimple)
 +            {
 +                /* Hybrid list, determine blocking later */
 +                ci_block = 0;
 +            }
 +            else
 +            {
 +                ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
 +            }
 +
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                /* Re-init the thread-local work flag data before making
 +                 * the first list (not an elegant conditional).
 +                 */
 +                if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
 +                                              (bGPUCPU && zi == 0 && zj == 1)))
 +                {
 +                    init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
 +                }
 +
 +                if (CombineNBLists && th > 0)
 +                {
 +                    clear_pairlist(nbl[th]);
 +                }
 +
 +                /* With GPU: generate progressively smaller lists for
 +                 * load balancing for local only or non-local with 2 zones.
 +                 * NOTE(review): progBal is declared at function scope and
 +                 * the pragma above has no private clause, so all threads
 +                 * store to the same variable (always the same value);
 +                 * consider computing it before the parallel loop.
 +                 */
 +                progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
 +
 +                /* Divide the i super cell equally over the nblists */
 +                nbnxn_make_pairlist_part(nbs, gridi, gridj,
 +                                         &nbs->work[th], nbat, excl,
 +                                         rlist,
 +                                         nb_kernel_type,
 +                                         ci_block,
 +                                         nbat->bUseBufferFlags,
 +                                         nsubpair_max,
 +                                         progBal, min_ci_balanced,
 +                                         th, nnbl,
 +                                         nbl[th]);
 +            }
 +            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
 +
 +            np_tot = 0;
 +            np_noq = 0;
 +            np_hlj = 0;
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
 +
 +                if (nbl_list->bSimple)
 +                {
 +                    np_tot += nbl[th]->ncj;
 +                    np_noq += nbl[th]->work->ncj_noq;
 +                    np_hlj += nbl[th]->work->ncj_hlj;
 +                }
 +                else
 +                {
 +                    /* This count ignores potential subsequent pair pruning */
 +                    np_tot += nbl[th]->nci_tot;
 +                }
 +            }
 +            nap                   = nbl[0]->na_ci*nbl[0]->na_cj;
 +            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
 +            nbl_list->natpair_lj  = np_noq*nap;
 +            nbl_list->natpair_q   = np_hlj*nap/2;
 +
 +            if (CombineNBLists && nnbl > 1)
 +            {
 +                nbs_cycle_start(&nbs->cc[enbsCCcombine]);
 +
 +                combine_nblists(nnbl-1, nbl+1, nbl[0]);
 +
 +                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
 +            }
 +        }
 +    }
 +
 +    if (!nbl_list->bSimple)
 +    {
 +        /* Sort the entries on size, large ones first */
 +        if (CombineNBLists || nnbl == 1)
 +        {
 +            sort_sci(nbl[0]);
 +        }
 +        else
 +        {
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                sort_sci(nbl[th]);
 +            }
 +        }
 +    }
 +
 +    if (nbat->bUseBufferFlags)
 +    {
 +        reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
 +    }
 +
 +    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
 +    if (LOCAL_I(iloc))
 +    {
 +        nbs->search_count++;
 +    }
 +    if (nbs->print_cycles &&
 +        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
 +        nbs->search_count % 100 == 0)
 +    {
 +        nbs_cycle_print(stderr, nbs);
 +    }
 +
 +    if (debug && (CombineNBLists && nnbl > 1))
 +    {
 +        if (nbl[0]->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (gmx_debug_at)
 +        {
 +            if (nbl[0]->bSimple)
 +            {
 +                print_nblist_ci_cj(debug, nbl[0]);
 +            }
 +            else
 +            {
 +                print_nblist_sci_cj(debug, nbl[0]);
 +            }
 +        }
 +
 +        if (nbat->bUseBufferFlags)
 +        {
 +            print_reduction_cost(&nbat->buffer_flags, nnbl);
 +        }
 +    }
 +}
index 209534910a516b5c7680df44eafa81cbaa09d41a,0000000000000000000000000000000000000000..417b596c1cb47d705edf4b4fb17b547b1a5658a8
mode 100644,000000..100644
--- /dev/null
@@@ -1,2982 -1,0 +1,2982 @@@
-                          wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS ) ||
-                            ( !bEnergyGroupCG && wf[jj] <= GMX_REAL_EPS ) )
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "network.h"
 +#include "nsgrid.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +#include "ns.h"
 +#include "pbc.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "nrnb.h"
 +#include "txtdump.h"
 +#include "mtop_util.h"
 +
 +#include "domdec.h"
 +#include "adress.h"
 +
 +
 +/*
 + *    E X C L U S I O N   H A N D L I N G
 + */
 +
 +#ifdef DEBUG
 +static void SETEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    e[j] = e[j] | (1<<i);
 +}
 +static void RMEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    e[j] = e[j] & ~(1<<i);
 +}
 +static gmx_bool ISEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    return (gmx_bool)(e[j] & (1<<i));
 +}
 +static gmx_bool NOTEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    return !(ISEXCL(e, i, j));
 +}
 +#else
 +#define SETEXCL(e, i, j) (e)[((atom_id) (j))] |= (1<<((atom_id) (i)))
 +#define RMEXCL(e, i, j)  (e)[((atom_id) (j))] &= (~(1<<((atom_id) (i))))
 +#define ISEXCL(e, i, j)  (gmx_bool) ((e)[((atom_id) (j))] & (1<<((atom_id) (i))))
 +#define NOTEXCL(e, i, j) !(ISEXCL(e, i, j))
 +#endif
 +
 +static int
 +round_up_to_simd_width(int length, int simd_width)
 +{
 +    int offset, newlength;
 +
 +    offset = (simd_width > 0) ? length % simd_width : 0;
 +
 +    return (offset == 0) ? length : length-offset+simd_width;
 +}
 +/************************************************
 + *
 + *  U T I L I T I E S    F O R    N S
 + *
 + ************************************************/
 +
 +static void reallocate_nblist(t_nblist *nl)
 +{
 +    if (gmx_debug_at)
 +    {
 +        fprintf(debug, "reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
 +                nl->ielec, nl->ivdw, nl->igeometry, nl->type, nl->maxnri);
 +    }
 +    srenew(nl->iinr,   nl->maxnri);
 +    if (nl->igeometry == GMX_NBLIST_GEOMETRY_CG_CG)
 +    {
 +        srenew(nl->iinr_end, nl->maxnri);
 +    }
 +    srenew(nl->gid,    nl->maxnri);
 +    srenew(nl->shift,  nl->maxnri);
 +    srenew(nl->jindex, nl->maxnri+1);
 +}
 +
 +
 +static void init_nblist(FILE *log, t_nblist *nl_sr, t_nblist *nl_lr,
 +                        int maxsr, int maxlr,
 +                        int ivdw, int ivdwmod,
 +                        int ielec, int ielecmod,
 +                        int igeometry, int type)
 +{
 +    t_nblist *nl;
 +    int       homenr;
 +    int       i, nn;
 +
 +    for (i = 0; (i < 2); i++)
 +    {
 +        nl     = (i == 0) ? nl_sr : nl_lr;
 +        homenr = (i == 0) ? maxsr : maxlr;
 +
 +        if (nl == NULL)
 +        {
 +            continue;
 +        }
 +
 +
 +        /* Set coul/vdw in neighborlist, and for the normal loops we determine
 +         * an index of which one to call.
 +         */
 +        nl->ivdw        = ivdw;
 +        nl->ivdwmod     = ivdwmod;
 +        nl->ielec       = ielec;
 +        nl->ielecmod    = ielecmod;
 +        nl->type        = type;
 +        nl->igeometry   = igeometry;
 +
 +        if (nl->type == GMX_NBLIST_INTERACTION_FREE_ENERGY)
 +        {
 +            nl->igeometry  = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +        }
 +
 +        /* This will also set the simd_padding_width field */
 +        gmx_nonbonded_set_kernel_pointers( (i == 0) ? log : NULL, nl);
 +
 +        /* maxnri is influenced by the number of shifts (maximum is 8)
 +         * and the number of energy groups.
 +         * If it is not enough, nl memory will be reallocated during the run.
 +         * 4 seems to be a reasonable factor, which only causes reallocation
 +         * during runs with tiny and many energygroups.
 +         */
 +        nl->maxnri      = homenr*4;
 +        nl->maxnrj      = 0;
 +        nl->maxlen      = 0;
 +        nl->nri         = -1;
 +        nl->nrj         = 0;
 +        nl->iinr        = NULL;
 +        nl->gid         = NULL;
 +        nl->shift       = NULL;
 +        nl->jindex      = NULL;
 +        reallocate_nblist(nl);
 +        nl->jindex[0] = 0;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Initiating neighbourlist (ielec=%d, ivdw=%d, type=%d) for %s interactions,\nwith %d SR, %d LR atoms.\n",
 +                    nl->ielec, nl->ivdw, nl->type, gmx_nblist_geometry_names[nl->igeometry], maxsr, maxlr);
 +        }
 +    }
 +}
 +
 +void init_neighbor_list(FILE *log, t_forcerec *fr, int homenr)
 +{
 +    /* Make maxlr tunable! (does not seem to be a big difference though)
 +     * This parameter determines the number of i particles in a long range
 +     * neighbourlist. Too few means many function calls, too many means
 +     * cache trashing.
 +     */
 +    int        maxsr, maxsr_wat, maxlr, maxlr_wat;
 +    int        ielec, ielecf, ivdw, ielecmod, ielecmodf, ivdwmod, type;
 +    int        solvent;
 +    int        igeometry_def, igeometry_w, igeometry_ww;
 +    int        i;
 +    t_nblists *nbl;
 +
 +    /* maxsr     = homenr-fr->nWatMol*3; */
 +    maxsr     = homenr;
 +
 +    if (maxsr < 0)
 +    {
 +        gmx_fatal(FARGS, "%s, %d: Negative number of short range atoms.\n"
 +                  "Call your Gromacs dealer for assistance.", __FILE__, __LINE__);
 +    }
 +    /* This is just for initial allocation, so we do not reallocate
 +     * all the nlist arrays many times in a row.
 +     * The numbers seem very accurate, but they are uncritical.
 +     */
 +    maxsr_wat = min(fr->nWatMol, (homenr+2)/3);
 +    if (fr->bTwinRange)
 +    {
 +        maxlr     = 50;
 +        maxlr_wat = min(maxsr_wat, maxlr);
 +    }
 +    else
 +    {
 +        maxlr = maxlr_wat = 0;
 +    }
 +
 +    /* Determine the values for ielec/ivdw. */
 +    ielec    = fr->nbkernel_elec_interaction;
 +    ivdw     = fr->nbkernel_vdw_interaction;
 +    ielecmod = fr->nbkernel_elec_modifier;
 +    ivdwmod  = fr->nbkernel_vdw_modifier;
 +    type     = GMX_NBLIST_INTERACTION_STANDARD;
 +
 +    fr->ns.bCGlist = (getenv("GMX_NBLISTCG") != 0);
 +    if (!fr->ns.bCGlist)
 +    {
 +        igeometry_def = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +    }
 +    else
 +    {
 +        igeometry_def = GMX_NBLIST_GEOMETRY_CG_CG;
 +        if (log != NULL)
 +        {
 +            fprintf(log, "\nUsing charge-group - charge-group neighbor lists and kernels\n\n");
 +        }
 +    }
 +
 +    if (fr->solvent_opt == esolTIP4P)
 +    {
 +        igeometry_w  = GMX_NBLIST_GEOMETRY_WATER4_PARTICLE;
 +        igeometry_ww = GMX_NBLIST_GEOMETRY_WATER4_WATER4;
 +    }
 +    else
 +    {
 +        igeometry_w  = GMX_NBLIST_GEOMETRY_WATER3_PARTICLE;
 +        igeometry_ww = GMX_NBLIST_GEOMETRY_WATER3_WATER3;
 +    }
 +
 +    for (i = 0; i < fr->nnblists; i++)
 +    {
 +        nbl = &(fr->nblists[i]);
 +
 +        if ((fr->adress_type != eAdressOff) && (i >= fr->nnblists/2))
 +        {
 +            type = GMX_NBLIST_INTERACTION_ADRESS;
 +        }
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ], &nbl->nlist_lr[eNL_VDWQQ],
 +                    maxsr, maxlr, ivdw, ivdwmod, ielec, ielecmod, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDW], &nbl->nlist_lr[eNL_VDW],
 +                    maxsr, maxlr, ivdw, ivdwmod, GMX_NBKERNEL_ELEC_NONE, eintmodNONE, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ], &nbl->nlist_lr[eNL_QQ],
 +                    maxsr, maxlr, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_WATER], &nbl->nlist_lr[eNL_VDWQQ_WATER],
 +                    maxsr_wat, maxlr_wat, ivdw, ivdwmod, ielec, ielecmod, igeometry_w, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ_WATER], &nbl->nlist_lr[eNL_QQ_WATER],
 +                    maxsr_wat, maxlr_wat, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_w, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_WATERWATER], &nbl->nlist_lr[eNL_VDWQQ_WATERWATER],
 +                    maxsr_wat, maxlr_wat, ivdw, ivdwmod, ielec, ielecmod, igeometry_ww, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ_WATERWATER], &nbl->nlist_lr[eNL_QQ_WATERWATER],
 +                    maxsr_wat, maxlr_wat, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_ww, type);
 +
 +        /* Did we get the solvent loops so we can use optimized water kernels? */
 +        if (nbl->nlist_sr[eNL_VDWQQ_WATER].kernelptr_vf == NULL
 +            || nbl->nlist_sr[eNL_QQ_WATER].kernelptr_vf == NULL
 +#ifndef DISABLE_WATERWATER_NLIST
 +            || nbl->nlist_sr[eNL_VDWQQ_WATERWATER].kernelptr_vf == NULL
 +            || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf == NULL
 +#endif
 +            )
 +        {
 +            fr->solvent_opt = esolNO;
 +            fprintf(log, "Note: The available nonbonded kernels do not support water optimization - disabling.\n");
 +        }
 +
 +        if (fr->efep != efepNO)
 +        {
 +            if ((fr->bEwald) && (fr->sc_alphacoul > 0)) /* need to handle long range differently if using softcore */
 +            {
 +                ielecf    = GMX_NBKERNEL_ELEC_EWALD;
 +                ielecmodf = eintmodNONE;
 +            }
 +            else
 +            {
 +                ielecf    = ielec;
 +                ielecmodf = ielecmod;
 +            }
 +
 +            init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_FREE], &nbl->nlist_lr[eNL_VDWQQ_FREE],
 +                        maxsr, maxlr, ivdw, ivdwmod, ielecf, ielecmod, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +            init_nblist(log, &nbl->nlist_sr[eNL_VDW_FREE], &nbl->nlist_lr[eNL_VDW_FREE],
 +                        maxsr, maxlr, ivdw, ivdwmod, GMX_NBKERNEL_ELEC_NONE, eintmodNONE, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +            init_nblist(log, &nbl->nlist_sr[eNL_QQ_FREE], &nbl->nlist_lr[eNL_QQ_FREE],
 +                        maxsr, maxlr, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielecf, ielecmod, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +        }
 +    }
 +    /* QMMM MM list */
 +    if (fr->bQMMM && fr->qr->QMMMscheme != eQMMMschemeoniom)
 +    {
 +        init_nblist(log, &fr->QMMMlist, NULL,
 +                    maxsr, maxlr, 0, 0, ielec, ielecmod, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_STANDARD);
 +    }
 +
 +    if (log != NULL)
 +    {
 +        fprintf(log, "\n");
 +    }
 +
 +    fr->ns.nblist_initialized = TRUE;
 +}
 +
 +static void reset_nblist(t_nblist *nl)
 +{
 +    nl->nri       = -1;
 +    nl->nrj       = 0;
 +    nl->maxlen    = 0;
 +    if (nl->jindex)
 +    {
 +        nl->jindex[0] = 0;
 +    }
 +}
 +
 +static void reset_neighbor_lists(t_forcerec *fr, gmx_bool bResetSR, gmx_bool bResetLR)
 +{
 +    int n, i;
 +
 +    if (fr->bQMMM)
 +    {
 +        /* only reset the short-range nblist */
 +        reset_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for (n = 0; n < fr->nnblists; n++)
 +    {
 +        for (i = 0; i < eNL_NR; i++)
 +        {
 +            if (bResetSR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_sr[i]) );
 +            }
 +            if (bResetLR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_lr[i]) );
 +            }
 +        }
 +    }
 +}
 +
 +
 +
 +
 +static inline void new_i_nblist(t_nblist *nlist,
 +                                gmx_bool bLR, atom_id i_atom, int shift, int gid)
 +{
 +    int    i, k, nri, nshift;
 +
 +    nri = nlist->nri;
 +
 +    /* Check whether we have to increase the i counter */
 +    if ((nri == -1) ||
 +        (nlist->iinr[nri]  != i_atom) ||
 +        (nlist->shift[nri] != shift) ||
 +        (nlist->gid[nri]   != gid))
 +    {
 +        /* This is something else. Now see if any entries have
 +         * been added in the list of the previous atom.
 +         */
 +        if ((nri == -1) ||
 +            ((nlist->jindex[nri+1] > nlist->jindex[nri]) &&
 +             (nlist->gid[nri] != -1)))
 +        {
 +            /* If so increase the counter */
 +            nlist->nri++;
 +            nri++;
 +            if (nlist->nri >= nlist->maxnri)
 +            {
 +                nlist->maxnri += over_alloc_large(nlist->nri);
 +                reallocate_nblist(nlist);
 +            }
 +        }
 +        /* Set the number of neighbours and the atom number */
 +        nlist->jindex[nri+1] = nlist->jindex[nri];
 +        nlist->iinr[nri]     = i_atom;
 +        nlist->gid[nri]      = gid;
 +        nlist->shift[nri]    = shift;
 +    }
 +}
 +
 +static inline void close_i_nblist(t_nblist *nlist)
 +{
 +    int nri = nlist->nri;
 +    int len;
 +
 +    if (nri >= 0)
 +    {
 +        /* Add elements up to padding. Since we allocate memory in units
 +         * of the simd_padding width, we do not have to check for possible
 +         * list reallocation here.
 +         */
 +        while ((nlist->nrj % nlist->simd_padding_width) != 0)
 +        {
 +            /* Use -4 here, so we can write forces for 4 atoms before real data */
 +            nlist->jjnr[nlist->nrj++] = -4;
 +        }
 +        nlist->jindex[nri+1] = nlist->nrj;
 +
 +        len = nlist->nrj -  nlist->jindex[nri];
 +
 +        /* nlist length for water i molecules is treated statically
 +         * in the innerloops
 +         */
 +        if (len > nlist->maxlen)
 +        {
 +            nlist->maxlen = len;
 +        }
 +    }
 +}
 +
 +static inline void close_nblist(t_nblist *nlist)
 +{
 +    /* Only close this nblist when it has been initialized.
 +     * Avoid the creation of i-lists with no j-particles.
 +     */
 +    if (nlist->nrj == 0)
 +    {
 +        /* Some assembly kernels do not support empty lists,
 +         * make sure here that we don't generate any empty lists.
 +         * With the current ns code this branch is taken in two cases:
 +         * No i-particles at all: nri=-1 here
 +         * There are i-particles, but no j-particles; nri=0 here
 +         */
 +        nlist->nri = 0;
 +    }
 +    else
 +    {
 +        /* Close list number nri by incrementing the count */
 +        nlist->nri++;
 +    }
 +}
 +
 +static inline void close_neighbor_lists(t_forcerec *fr, gmx_bool bMakeQMMMnblist)
 +{
 +    int n, i;
 +
 +    if (bMakeQMMMnblist)
 +    {
 +        close_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for (n = 0; n < fr->nnblists; n++)
 +    {
 +        for (i = 0; (i < eNL_NR); i++)
 +        {
 +            close_nblist(&(fr->nblists[n].nlist_sr[i]));
 +            close_nblist(&(fr->nblists[n].nlist_lr[i]));
 +        }
 +    }
 +}
 +
 +
 +static inline void add_j_to_nblist(t_nblist *nlist, atom_id j_atom, gmx_bool bLR)
 +{
 +    int nrj = nlist->nrj;
 +
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = round_up_to_simd_width(over_alloc_small(nlist->nrj + 1), nlist->simd_padding_width);
 +
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
 +                    bLR ? "LR" : "SR", nlist->ielec, nlist->ivdw, nlist->type, nlist->igeometry, nlist->maxnrj);
 +        }
 +
 +        srenew(nlist->jjnr, nlist->maxnrj);
 +    }
 +
 +    nlist->jjnr[nrj] = j_atom;
 +    nlist->nrj++;
 +}
 +
 +static inline void add_j_to_nblist_cg(t_nblist *nlist,
 +                                      atom_id j_start, int j_end,
 +                                      t_excl *bexcl, gmx_bool i_is_j,
 +                                      gmx_bool bLR)
 +{
 +    int nrj = nlist->nrj;
 +    int j;
 +
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = over_alloc_small(nlist->nrj + 1);
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
 +                    bLR ? "LR" : "SR", nlist->ielec, nlist->ivdw, nlist->type, nlist->igeometry, nlist->maxnrj);
 +        }
 +
 +        srenew(nlist->jjnr, nlist->maxnrj);
 +        srenew(nlist->jjnr_end, nlist->maxnrj);
 +        srenew(nlist->excl, nlist->maxnrj*MAX_CGCGSIZE);
 +    }
 +
 +    nlist->jjnr[nrj]     = j_start;
 +    nlist->jjnr_end[nrj] = j_end;
 +
 +    if (j_end - j_start > MAX_CGCGSIZE)
 +    {
 +        gmx_fatal(FARGS, "The charge-group - charge-group neighborlist do not support charge groups larger than %d, found a charge group of size %d", MAX_CGCGSIZE, j_end-j_start);
 +    }
 +
 +    /* Set the exclusions */
 +    for (j = j_start; j < j_end; j++)
 +    {
 +        nlist->excl[nrj*MAX_CGCGSIZE + j - j_start] = bexcl[j];
 +    }
 +    if (i_is_j)
 +    {
 +        /* Avoid double counting of intra-cg interactions */
 +        for (j = 1; j < j_end-j_start; j++)
 +        {
 +            nlist->excl[nrj*MAX_CGCGSIZE + j] |= (1<<j) - 1;
 +        }
 +    }
 +
 +    nlist->nrj++;
 +}
 +
 +typedef void
 +    put_in_list_t (gmx_bool              bHaveVdW[],
 +                   int                   ngid,
 +                   t_mdatoms     *       md,
 +                   int                   icg,
 +                   int                   jgid,
 +                   int                   nj,
 +                   atom_id               jjcg[],
 +                   atom_id               index[],
 +                   t_excl                bExcl[],
 +                   int                   shift,
 +                   t_forcerec     *      fr,
 +                   gmx_bool              bLR,
 +                   gmx_bool              bDoVdW,
 +                   gmx_bool              bDoCoul,
 +                   int                   solvent_opt);
 +
 +static void
 +put_in_list_at(gmx_bool              bHaveVdW[],
 +               int                   ngid,
 +               t_mdatoms     *       md,
 +               int                   icg,
 +               int                   jgid,
 +               int                   nj,
 +               atom_id               jjcg[],
 +               atom_id               index[],
 +               t_excl                bExcl[],
 +               int                   shift,
 +               t_forcerec     *      fr,
 +               gmx_bool              bLR,
 +               gmx_bool              bDoVdW,
 +               gmx_bool              bDoCoul,
 +               int                   solvent_opt)
 +{
 +    /* The a[] index has been removed,
 +     * to put it back in i_atom should be a[i0] and jj should be a[jj].
 +     */
 +    t_nblist  *   vdwc;
 +    t_nblist  *   vdw;
 +    t_nblist  *   coul;
 +    t_nblist  *   vdwc_free  = NULL;
 +    t_nblist  *   vdw_free   = NULL;
 +    t_nblist  *   coul_free  = NULL;
 +    t_nblist  *   vdwc_ww    = NULL;
 +    t_nblist  *   coul_ww    = NULL;
 +
 +    int           i, j, jcg, igid, gid, nbl_ind, ind_ij;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg, len;
 +
 +    int          *cginfo;
 +    int          *type, *typeB;
 +    real         *charge, *chargeB;
 +    real          qi, qiB, qq, rlj;
 +    gmx_bool      bFreeEnergy, bFree, bFreeJ, bNotEx, *bPert;
 +    gmx_bool      bDoVdW_i, bDoCoul_i, bDoCoul_i_sol;
 +    int           iwater, jwater;
 +    t_nblist     *nlist;
 +
 +    /* Copy some pointers */
 +    cginfo  = fr->cginfo;
 +    charge  = md->chargeA;
 +    chargeB = md->chargeB;
 +    type    = md->typeA;
 +    typeB   = md->typeB;
 +    bPert   = md->bPerturbed;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(cginfo[icg]);
 +
 +    iwater = (solvent_opt != esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
 +
 +    bFreeEnergy = FALSE;
 +    if (md->nPerturbed)
 +    {
 +        /* Check if any of the particles involved are perturbed.
 +         * If not we can do the cheaper normal put_in_list
 +         * and use more solvent optimization.
 +         */
 +        for (i = 0; i < nicg; i++)
 +        {
 +            bFreeEnergy |= bPert[i0+i];
 +        }
 +        /* Loop over the j charge groups */
 +        for (j = 0; (j < nj && !bFreeEnergy); j++)
 +        {
 +            jcg = jjcg[j];
 +            jj0 = index[jcg];
 +            jj1 = index[jcg+1];
 +            /* Finally loop over the atoms in the j-charge group */
 +            for (jj = jj0; jj < jj1; jj++)
 +            {
 +                bFreeEnergy |= bPert[jj];
 +            }
 +        }
 +    }
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[GID(igid, jgid, ngid)];
 +    }
 +    if (bLR)
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_lr;
 +    }
 +    else
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_sr;
 +    }
 +
 +    if (iwater != esolNO)
 +    {
 +        vdwc = &nlist[eNL_VDWQQ_WATER];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ_WATER];
 +#ifndef DISABLE_WATERWATER_NLIST
 +        vdwc_ww = &nlist[eNL_VDWQQ_WATERWATER];
 +        coul_ww = &nlist[eNL_QQ_WATERWATER];
 +#endif
 +    }
 +    else
 +    {
 +        vdwc = &nlist[eNL_VDWQQ];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ];
 +    }
 +
 +    if (!bFreeEnergy)
 +    {
 +        if (iwater != esolNO)
 +        {
 +            /* Loop over the atoms in the i charge group */
 +            i_atom  = i0;
 +            gid     = GID(igid, jgid, ngid);
 +            /* Create new i_atom for each energy group */
 +            if (bDoCoul && bDoVdW)
 +            {
 +                new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(vdwc_ww, bLR, i_atom, shift, gid);
 +#endif
 +            }
 +            if (bDoVdW)
 +            {
 +                new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +            }
 +            if (bDoCoul)
 +            {
 +                new_i_nblist(coul, bLR, i_atom, shift, gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(coul_ww, bLR, i_atom, shift, gid);
 +#endif
 +            }
 +            /* Loop over the j charge groups */
 +            for (j = 0; (j < nj); j++)
 +            {
 +                jcg = jjcg[j];
 +
 +                if (jcg == icg)
 +                {
 +                    continue;
 +                }
 +
 +                jj0    = index[jcg];
 +                jwater = GET_CGINFO_SOLOPT(cginfo[jcg]);
 +
 +                if (iwater == esolSPC && jwater == esolSPC)
 +                {
 +                    /* Interaction between two SPC molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw, jj0, bLR);
 +                    }
 +                    else
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST
 +                        /* Add entries for the three atoms - only do VdW if we need to */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc, jj0, bLR);
 +                        }
 +                        add_j_to_nblist(coul, jj0+1, bLR);
 +                        add_j_to_nblist(coul, jj0+2, bLR);
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww, jj0, bLR);
 +                        }
 +#endif
 +                    }
 +                }
 +                else if (iwater == esolTIP4P && jwater == esolTIP4P)
 +                {
 +                    /* Interaction between two TIP4p molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw, jj0, bLR);
 +                    }
 +                    else
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST
 +                        /* Add entries for the four atoms - only do VdW if we need to */
 +                        if (bDoVdW)
 +                        {
 +                            add_j_to_nblist(vdw, jj0, bLR);
 +                        }
 +                        add_j_to_nblist(coul, jj0+1, bLR);
 +                        add_j_to_nblist(coul, jj0+2, bLR);
 +                        add_j_to_nblist(coul, jj0+3, bLR);
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww, jj0, bLR);
 +                        }
 +#endif
 +                    }
 +                }
 +                else
 +                {
 +                    /* j charge group is not water, but i is.
 +                     * Add entries to the water-other_atom lists; the geometry of the water
 +                     * molecule doesn't matter - that is taken care of in the nonbonded kernel,
 +                     * so we don't care if it is SPC or TIP4P...
 +                     */
 +
 +                    jj1 = index[jcg+1];
 +
 +                    if (!bDoVdW)
 +                    {
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                    else if (!bDoCoul)
 +                    {
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                add_j_to_nblist(vdw, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        /* _charge_ _groups_ interact with both coulomb and LJ */
 +                        /* Check which atoms we should add to the lists!       */
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (charge[jj] != 0)
 +                                {
 +                                    add_j_to_nblist(vdwc, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(vdw, jj, bLR);
 +                                }
 +                            }
 +                            else if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +#ifndef DISABLE_WATERWATER_NLIST
 +            close_i_nblist(coul_ww);
 +            close_i_nblist(vdwc_ww);
 +#endif
 +        }
 +        else
 +        {
 +            /* no solvent as i charge group */
 +            /* Loop over the atoms in the i charge group */
 +            for (i = 0; i < nicg; i++)
 +            {
 +                i_atom  = i0+i;
 +                gid     = GID(igid, jgid, ngid);
 +                qi      = charge[i_atom];
 +
 +                /* Create new i_atom for each energy group */
 +                if (bDoVdW && bDoCoul)
 +                {
 +                    new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +                }
 +                if (bDoVdW)
 +                {
 +                    new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +                }
 +                if (bDoCoul)
 +                {
 +                    new_i_nblist(coul, bLR, i_atom, shift, gid);
 +                }
 +                bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
 +                bDoCoul_i = (bDoCoul && qi != 0);
 +
 +                if (bDoVdW_i || bDoCoul_i)
 +                {
 +                    /* Loop over the j charge groups */
 +                    for (j = 0; (j < nj); j++)
 +                    {
 +                        jcg = jjcg[j];
 +
 +                        /* Check for large charge groups */
 +                        if (jcg == icg)
 +                        {
 +                            jj0 = i0 + i + 1;
 +                        }
 +                        else
 +                        {
 +                            jj0 = index[jcg];
 +                        }
 +
 +                        jj1 = index[jcg+1];
 +                        /* Finally loop over the atoms in the j-charge group */
 +                        for (jj = jj0; jj < jj1; jj++)
 +                        {
 +                            bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                            if (bNotEx)
 +                            {
 +                                if (!bDoVdW_i)
 +                                {
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i)
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc, jj, bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +                close_i_nblist(vdw);
 +                close_i_nblist(coul);
 +                close_i_nblist(vdwc);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* we are doing free energy */
 +        vdwc_free = &nlist[eNL_VDWQQ_FREE];
 +        vdw_free  = &nlist[eNL_VDW_FREE];
 +        coul_free = &nlist[eNL_QQ_FREE];
 +        /* Loop over the atoms in the i charge group */
 +        for (i = 0; i < nicg; i++)
 +        {
 +            i_atom  = i0+i;
 +            gid     = GID(igid, jgid, ngid);
 +            qi      = charge[i_atom];
 +            qiB     = chargeB[i_atom];
 +
 +            /* Create new i_atom for each energy group */
 +            if (bDoVdW && bDoCoul)
 +            {
 +                new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +            }
 +            if (bDoVdW)
 +            {
 +                new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +            }
 +            if (bDoCoul)
 +            {
 +                new_i_nblist(coul, bLR, i_atom, shift, gid);
 +            }
 +
 +            new_i_nblist(vdw_free, bLR, i_atom, shift, gid);
 +            new_i_nblist(coul_free, bLR, i_atom, shift, gid);
 +            new_i_nblist(vdwc_free, bLR, i_atom, shift, gid);
 +
 +            bDoVdW_i  = (bDoVdW  &&
 +                         (bHaveVdW[type[i_atom]] || bHaveVdW[typeB[i_atom]]));
 +            bDoCoul_i = (bDoCoul && (qi != 0 || qiB != 0));
 +            /* For TIP4P the first atom does not have a charge,
 +             * but the last three do. So we should still put an atom
 +             * without LJ but with charge in the water-atom neighborlist
 +             * for a TIP4p i charge group.
 +             * For SPC type water the first atom has LJ and charge,
 +             * so there is no such problem.
 +             */
 +            if (iwater == esolNO)
 +            {
 +                bDoCoul_i_sol = bDoCoul_i;
 +            }
 +            else
 +            {
 +                bDoCoul_i_sol = bDoCoul;
 +            }
 +
 +            if (bDoVdW_i || bDoCoul_i_sol)
 +            {
 +                /* Loop over the j charge groups */
 +                for (j = 0; (j < nj); j++)
 +                {
 +                    jcg = jjcg[j];
 +
 +                    /* Check for large charge groups */
 +                    if (jcg == icg)
 +                    {
 +                        jj0 = i0 + i + 1;
 +                    }
 +                    else
 +                    {
 +                        jj0 = index[jcg];
 +                    }
 +
 +                    jj1 = index[jcg+1];
 +                    /* Finally loop over the atoms in the j-charge group */
 +                    bFree = bPert[i_atom];
 +                    for (jj = jj0; (jj < jj1); jj++)
 +                    {
 +                        bFreeJ = bFree || bPert[jj];
 +                        /* Complicated if, because the water H's should also
 +                         * see perturbed j-particles
 +                         */
 +                        if (iwater == esolNO || i == 0 || bFreeJ)
 +                        {
 +                            bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                            if (bNotEx)
 +                            {
 +                                if (bFreeJ)
 +                                {
 +                                    if (!bDoVdW_i)
 +                                    {
 +                                        if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(coul_free, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (!bDoCoul_i)
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]])
 +                                        {
 +                                            add_j_to_nblist(vdw_free, jj, bLR);
 +                                        }
 +                                    }
 +                                    else
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]])
 +                                        {
 +                                            if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                            {
 +                                                add_j_to_nblist(vdwc_free, jj, bLR);
 +                                            }
 +                                            else
 +                                            {
 +                                                add_j_to_nblist(vdw_free, jj, bLR);
 +                                            }
 +                                        }
 +                                        else if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(coul_free, jj, bLR);
 +                                        }
 +                                    }
 +                                }
 +                                else if (!bDoVdW_i)
 +                                {
 +                                    /* This is done whether or not bWater is set */
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i_sol)
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc, jj, bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +            close_i_nblist(vdw_free);
 +            close_i_nblist(coul_free);
 +            close_i_nblist(vdwc_free);
 +        }
 +    }
 +}
 +
 +static void
 +put_in_list_adress(gmx_bool              bHaveVdW[],
 +                   int                   ngid,
 +                   t_mdatoms     *       md,
 +                   int                   icg,
 +                   int                   jgid,
 +                   int                   nj,
 +                   atom_id               jjcg[],
 +                   atom_id               index[],
 +                   t_excl                bExcl[],
 +                   int                   shift,
 +                   t_forcerec     *      fr,
 +                   gmx_bool              bLR,
 +                   gmx_bool              bDoVdW,
 +                   gmx_bool              bDoCoul,
 +                   int                   solvent_opt)
 +{
 +    /* Neighbour-list builder used with AdResS.
 +     * For every atom of i charge group icg this walks the nj charge groups
 +     * listed in jjcg[] (atom ranges are taken from index[]) and appends each
 +     * non-excluded j atom to a VdW+Coulomb, VdW-only or Coulomb-only list.
 +     * Which of the three is used follows from bDoVdW/bDoCoul, bHaveVdW[type]
 +     * and the charges of the i and j atom; an i atom with neither VdW nor
 +     * charge gets no j atoms at all.
 +     * Pairs whose weights wf[] are both >= 1-GMX_REAL_EPS or both
 +     * <= GMX_REAL_EPS go to the regular lists, every other (hybrid) pair
 +     * goes to the matching *_adress list.
 +     * bLR selects the nlist_lr lists instead of nlist_sr; shift and the
 +     * energy-group pair id are handed to new_i_nblist() for each i atom.
 +     * Calls gmx_fatal() when md->nPerturbed is non-zero.
 +     *
 +     * The a[] index has been removed,
 +     * to put it back in i_atom should be a[i0] and jj should be a[jj].
 +     */
 +    t_nblist  *   vdwc;
 +    t_nblist  *   vdw;
 +    t_nblist  *   coul;
 +    t_nblist  *   vdwc_adress  = NULL;
 +    t_nblist  *   vdw_adress   = NULL;
 +    t_nblist  *   coul_adress  = NULL;
 +    t_nblist  *   vdwc_ww      = NULL;
 +    t_nblist  *   coul_ww      = NULL;
 +
 +    int           i, j, jcg, igid, gid, nbl_ind, nbl_ind_adress;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg, len;
 +
 +    int          *cginfo;
 +    int          *type, *typeB;
 +    real         *charge, *chargeB;
 +    real         *wf;
 +    real          qi, qiB, qq, rlj;
 +    gmx_bool      bFreeEnergy, bFree, bFreeJ, bNotEx, *bPert;
 +    gmx_bool      bDoVdW_i, bDoCoul_i, bDoCoul_i_sol;
 +    gmx_bool      b_hybrid;
 +    gmx_bool      j_all_atom;
 +    int           iwater, jwater;
 +    t_nblist     *nlist, *nlist_adress;
 +    gmx_bool      bEnergyGroupCG;
 +
 +    /* Copy some pointers */
 +    cginfo  = fr->cginfo;
 +    charge  = md->chargeA;
 +    chargeB = md->chargeB;
 +    type    = md->typeA;
 +    typeB   = md->typeB;
 +    bPert   = md->bPerturbed;
 +    wf      = md->wf;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(cginfo[icg]);
 +
 +    iwater = (solvent_opt != esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
 +
 +    if (md->nPerturbed)
 +    {
 +        gmx_fatal(FARGS, "AdResS does not support free energy pertubation\n");
 +    }
 +
 +    /* Unpack pointers to neighbourlist structs.
 +     * With exactly two list sets, set 0 holds the regular lists and set 1
 +     * the AdResS lists; otherwise the regular set is looked up through
 +     * gid2nblists[] and its AdResS partner lies nnblists/2 entries further.
 +     */
 +    if (fr->nnblists == 2)
 +    {
 +        nbl_ind        = 0;
 +        nbl_ind_adress = 1;
 +    }
 +    else
 +    {
 +        nbl_ind        = fr->gid2nblists[GID(igid, jgid, ngid)];
 +        nbl_ind_adress = nbl_ind+fr->nnblists/2;
 +    }
 +    if (bLR)
 +    {
 +        nlist        = fr->nblists[nbl_ind].nlist_lr;
 +        nlist_adress = fr->nblists[nbl_ind_adress].nlist_lr;
 +    }
 +    else
 +    {
 +        nlist        = fr->nblists[nbl_ind].nlist_sr;
 +        nlist_adress = fr->nblists[nbl_ind_adress].nlist_sr;
 +    }
 +
 +
 +    vdwc = &nlist[eNL_VDWQQ];
 +    vdw  = &nlist[eNL_VDW];
 +    coul = &nlist[eNL_QQ];
 +
 +    vdwc_adress = &nlist_adress[eNL_VDWQQ];
 +    vdw_adress  = &nlist_adress[eNL_VDW];
 +    coul_adress = &nlist_adress[eNL_QQ];
 +
 +    /* We do not support solvent optimization with AdResS for now.
 +       For this we would need hybrid solvent-other kernels.
 +       (iwater, assigned above, is therefore not read anywhere below.) */
 +
 +    /* no solvent as i charge group */
 +    /* Loop over the atoms in the i charge group */
 +    for (i = 0; i < nicg; i++)
 +    {
 +        i_atom  = i0+i;
 +        gid     = GID(igid, jgid, ngid);
 +        qi      = charge[i_atom];
 +
 +        /* Create new i_atom for each energy group */
 +        if (bDoVdW && bDoCoul)
 +        {
 +            new_i_nblist(vdwc, bLR, i_atom, shift, gid);
 +            new_i_nblist(vdwc_adress, bLR, i_atom, shift, gid);
 +
 +        }
 +        if (bDoVdW)
 +        {
 +            new_i_nblist(vdw, bLR, i_atom, shift, gid);
 +            new_i_nblist(vdw_adress, bLR, i_atom, shift, gid);
 +
 +        }
 +        if (bDoCoul)
 +        {
 +            new_i_nblist(coul, bLR, i_atom, shift, gid);
 +            new_i_nblist(coul_adress, bLR, i_atom, shift, gid);
 +        }
 +        bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
 +        bDoCoul_i = (bDoCoul && qi != 0);
 +
 +        /* Here we find out whether the energy groups interaction belong to a
 +         * coarse-grained (vsite) or atomistic interaction. Note that, because
 +         * interactions between coarse-grained and other (atomistic) energy
 +         * groups are excluded automatically by grompp, it is sufficient to
 +         * check for the group id of atom i (igid) */
 +        bEnergyGroupCG = !egp_explicit(fr, igid);
 +
 +        if (bDoVdW_i || bDoCoul_i)
 +        {
 +            /* Loop over the j charge groups */
 +            for (j = 0; (j < nj); j++)
 +            {
 +                jcg = jjcg[j];
 +
 +                /* Check for large charge groups: within the i charge group
 +                 * itself only atoms after the current i atom are visited.
 +                 */
 +                if (jcg == icg)
 +                {
 +                    jj0 = i0 + i + 1;
 +                }
 +                else
 +                {
 +                    jj0 = index[jcg];
 +                }
 +
 +                jj1 = index[jcg+1];
 +                /* Finally loop over the atoms in the j-charge group */
 +                for (jj = jj0; jj < jj1; jj++)
 +                {
 +                    bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                    /* Now we have to exclude interactions which will be zero
 +                     * anyway due to the AdResS weights (in previous implementations
 +                     * this was done in the force kernel). This is necessary as
 +                     * pure interactions (those with b_hybrid=false, i.e. w_i*w_j==1 or 0)
 +                     * are put into neighbour lists which will be passed to the
 +                     * standard (optimized) kernels for speed. The interactions with
 +                     * b_hybrid=true are placed into the _adress neighbour lists and
 +                     * processed by the generic AdResS kernel.
 +                     *
 +                     * A pair is skipped when the i energy group is coarse-grained
 +                     * and both weights are >= 1-GMX_REAL_EPS, or when the i energy
 +                     * group is explicit and the j weight is <= GMX_REAL_EPS.
 +                     */
 +                    if ( (bEnergyGroupCG &&
-                         (wf[i_atom] <= GMX_REAL_EPS && wf[jj] <= GMX_REAL_EPS));
++                          wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS ) ||
++                         ( !bEnergyGroupCG && wf[jj] <= GMX_REAL_EPS ) )
 +                    {
 +                        continue;
 +                    }
 +                    /* Hybrid unless both weights are ~1 or both are ~0 */
 +                    b_hybrid = !((wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS) ||
++                                 (wf[i_atom] <= GMX_REAL_EPS && wf[jj] <= GMX_REAL_EPS));
 +
 +                    if (bNotEx)
 +                    {
 +                        if (!bDoVdW_i)
 +                        {
 +                            if (charge[jj] != 0)
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(coul, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(coul_adress, jj, bLR);
 +                                }
 +                            }
 +                        }
 +                        else if (!bDoCoul_i)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(vdw, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(vdw_adress, jj, bLR);
 +                                }
 +                            }
 +                        }
 +                        else
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (charge[jj] != 0)
 +                                {
 +                                    if (!b_hybrid)
 +                                    {
 +                                        add_j_to_nblist(vdwc, jj, bLR);
 +                                    }
 +                                    else
 +                                    {
 +                                        add_j_to_nblist(vdwc_adress, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (!b_hybrid)
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                    else
 +                                    {
 +                                        add_j_to_nblist(vdw_adress, jj, bLR);
 +                                    }
 +
 +                                }
 +                            }
 +                            else if (charge[jj] != 0)
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(coul, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(coul_adress, jj, bLR);
 +                                }
 +
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            /* NOTE(review): unlike the function above, the lists are closed
 +             * only inside the bDoVdW_i || bDoCoul_i branch - confirm intended.
 +             */
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +            close_i_nblist(vdw_adress);
 +            close_i_nblist(coul_adress);
 +            close_i_nblist(vdwc_adress);
 +        }
 +    }
 +}
 +
 +static void
 +put_in_list_qmmm(gmx_bool              bHaveVdW[],
 +                 int                   ngid,
 +                 t_mdatoms     *       md,
 +                 int                   icg,
 +                 int                   jgid,
 +                 int                   nj,
 +                 atom_id               jjcg[],
 +                 atom_id               index[],
 +                 t_excl                bExcl[],
 +                 int                   shift,
 +                 t_forcerec     *      fr,
 +                 gmx_bool              bLR,
 +                 gmx_bool              bDoVdW,
 +                 gmx_bool              bDoCoul,
 +                 int                   solvent_opt)
 +{   /* Fills the single list fr->QMMMlist: every atom of charge group icg
 +     * becomes an i entry, and each non-excluded atom of the other charge
 +     * groups in jjcg[0..nj-1] is appended as one of its j atoms. Entries of
 +     * jjcg[] equal to icg are skipped completely.
 +     * bHaveVdW, md, bDoVdW, bDoCoul and solvent_opt are not read here; the
 +     * signature matches the other put_in_list_* functions in this file.
 +     */
 +    t_nblist  *   coul;
 +    int           i, j, jcg, igid, gid;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg;
 +    gmx_bool      bNotEx;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(fr->cginfo[icg]);
 +
 +    coul = &fr->QMMMlist;
 +
 +    /* Loop over atoms in the ith charge group */
 +    for (i = 0; i < nicg; i++)
 +    {
 +        i_atom = i0+i;
 +        gid    = GID(igid, jgid, ngid);
 +        /* Create new i_atom for each energy group */
 +        new_i_nblist(coul, bLR, i_atom, shift, gid);
 +
 +        /* Loop over the j charge groups */
 +        for (j = 0; j < nj; j++)
 +        {
 +            jcg = jjcg[j];
 +
 +            /* Charge groups cannot have QM and MM atoms simultaneously */
 +            if (jcg != icg)
 +            {
 +                jj0 = index[jcg];
 +                jj1 = index[jcg+1];
 +                /* Finally loop over the atoms in the j-charge group */
 +                for (jj = jj0; jj < jj1; jj++)
 +                {
 +                    bNotEx = NOTEXCL(bExcl, i, jj);
 +                    if (bNotEx)
 +                    {
 +                        add_j_to_nblist(coul, jj, bLR);
 +                    }
 +                }
 +            }
 +        }
 +        close_i_nblist(coul);
 +    }
 +}
 +
 +static void
 +put_in_list_cg(gmx_bool              bHaveVdW[],
 +               int                   ngid,
 +               t_mdatoms     *       md,
 +               int                   icg,
 +               int                   jgid,
 +               int                   nj,
 +               atom_id               jjcg[],
 +               atom_id               index[],
 +               t_excl                bExcl[],
 +               int                   shift,
 +               t_forcerec     *      fr,
 +               gmx_bool              bLR,
 +               gmx_bool              bDoVdW,
 +               gmx_bool              bDoCoul,
 +               int                   solvent_opt)
 +{   /* Charge-group based variant: opens one i entry covering the whole
 +     * charge group icg in the eNL_VDWQQ list and adds each j charge group
 +     * of jjcg[0..nj-1] as an atom range through add_j_to_nblist_cg().
 +     * bLR picks the nlist_lr list instead of nlist_sr.
 +     * bHaveVdW, md, bDoVdW, bDoCoul and solvent_opt are not read here.
 +     */
 +    int          cginfo;
 +    int          igid, gid, nbl_ind;
 +    t_nblist *   vdwc;
 +    int          j, jcg;
 +
 +    cginfo = fr->cginfo[icg];
 +
 +    igid = GET_CGINFO_GID(cginfo);
 +    gid  = GID(igid, jgid, ngid);
 +
 +    /* Unpack pointers to neighbourlist structs:
 +     * a single list set is used directly, otherwise the set is looked up
 +     * by energy-group pair through gid2nblists[].
 +     */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[gid];
 +    }
 +    if (bLR)
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_lr[eNL_VDWQQ];
 +    }
 +    else
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_sr[eNL_VDWQQ];
 +    }
 +
 +    /* Make a new neighbor list for charge group icg.
 +     * Currently simply one neighbor list is made with LJ and Coulomb.
 +     * If required, zero interactions could be removed here
 +     * or in the force loop.
 +     * The i entry starts at atom index[icg]; iinr_end records index[icg+1].
 +     */
 +    new_i_nblist(vdwc, bLR, index[icg], shift, gid);
 +    vdwc->iinr_end[vdwc->nri] = index[icg+1];
 +
 +    for (j = 0; (j < nj); j++)
 +    {
 +        jcg = jjcg[j];
 +        /* Skip the icg-icg pairs if all self interactions are excluded */
 +        if (!(jcg == icg && GET_CGINFO_EXCL_INTRA(cginfo)))
 +        {
 +            /* Here we add the j charge group jcg to the list,
 +             * exclusions are also added to the list.
 +             */
 +            add_j_to_nblist_cg(vdwc, index[jcg], index[jcg+1], bExcl, icg == jcg, bLR);
 +        }
 +    }
 +
 +    close_i_nblist(vdwc);
 +}
 +
 +static void setexcl(atom_id start, atom_id end, t_blocka *excl, gmx_bool b,
 +                    t_excl bexcl[])
 +{   /* For every atom i in [start, end) and every entry excl->a[k] with k in
 +     * [excl->index[i], excl->index[i+1]), applies SETEXCL (b true) or
 +     * RMEXCL (b false) to bexcl for the pair (i-start, excl->a[k]).
 +     * The first index is thus relative to start, the second is excl->a[k]
 +     * unchanged.
 +     */
 +    atom_id i, k;
 +
 +    if (b)
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            for (k = excl->index[i]; k < excl->index[i+1]; k++)
 +            {
 +                SETEXCL(bexcl, i-start, excl->a[k]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            for (k = excl->index[i]; k < excl->index[i+1]; k++)
 +            {
 +                RMEXCL(bexcl, i-start, excl->a[k]);
 +            }
 +        }
 +    }
 +}
 +
 +int calc_naaj(int icg, int cgtot)
 +{   /* Returns either cgtot/2 or 1 + cgtot/2, chosen from the parity of
 +     * cgtot, of cgtot/2 and of icg as spelled out branch by branch below.
 +     * ns_simple_core() uses the result as the number of consecutive aaj[]
 +     * entries, starting at aaj[icg], that are searched for i charge group
 +     * icg.
 +     */
 +    int naaj;
 +
 +    if ((cgtot % 2) == 1)
 +    {
 +        /* Odd number of charge groups, easy */
 +        naaj = 1 + (cgtot/2);
 +    }
 +    else if ((cgtot % 4) == 0)
 +    {
 +        /* Multiple of four is hard:
 +         * the extra entry goes to even icg in the lower half
 +         * and to odd icg in the upper half.
 +         */
 +        if (icg < cgtot/2)
 +        {
 +            if ((icg % 2) == 0)
 +            {
 +                naaj = 1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj = cgtot/2;
 +            }
 +        }
 +        else
 +        {
 +            if ((icg % 2) == 1)
 +            {
 +                naaj = 1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj = cgtot/2;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* cgtot/2 = odd: every even icg gets the extra entry */
 +        if ((icg % 2) == 0)
 +        {
 +            naaj = 1+(cgtot/2);
 +        }
 +        else
 +        {
 +            naaj = cgtot/2;
 +        }
 +    }
 +#ifdef DEBUG
 +    fprintf(log, "naaj=%d\n", naaj); /* NOTE(review): `log` is not declared in
 +                                      * this function; confirm that a
 +                                      * file-scope one exists before
 +                                      * building with DEBUG */
 +#endif
 +
 +    return naaj;
 +}
 +
 +/************************************************
 + *
 + *  S I M P L E      C O R E     S T U F F
 + *
 + ************************************************/
 +
 +static real calc_image_tric(rvec xi, rvec xj, matrix box,
 +                            rvec b_inv, int *shift)
 +{
 +    /* Returns the squared length of xj - xi after whole multiples
 +     * tz, ty and tx of box[ZZ], box[YY] and box[XX] have been subtracted,
 +     * in that order, and stores XYZ2IS(tx, ty, tz) in *shift.
 +     * b_inv[] is multiplied with each component to get the multiple, so it
 +     * presumably holds the reciprocal box lengths (the caller in this file
 +     * fills it with divide_err(1.0, box_size[m])).
 +     *
 +     * This code assumes that the cut-off is smaller than
 +     * a half times the smallest diagonal element of the box.
 +     */
 +    const real h25 = 2.5;
 +    real       dx, dy, dz;
 +    real       r2;
 +    int        tx, ty, tz;
 +
 +    /* Compute diff vector */
 +    dz = xj[ZZ] - xi[ZZ];
 +    dy = xj[YY] - xi[YY];
 +    dx = xj[XX] - xi[XX];
 +
 +    /* Perform NINT operation, using trunc operation, therefore
 +     * we first add 2.5 then subtract 2 again
 +     * (the assignment to the int t? variables does the truncation).
 +     * The ZZ correction also changes dy and dx, and the YY correction
 +     * changes dx, so the three dimensions must be handled in this order.
 +     */
 +    tz  = dz*b_inv[ZZ] + h25;
 +    tz -= 2;
 +    dz -= tz*box[ZZ][ZZ];
 +    dy -= tz*box[ZZ][YY];
 +    dx -= tz*box[ZZ][XX];
 +
 +    ty  = dy*b_inv[YY] + h25;
 +    ty -= 2;
 +    dy -= ty*box[YY][YY];
 +    dx -= ty*box[YY][XX];
 +
 +    tx  = dx*b_inv[XX]+h25;
 +    tx -= 2;
 +    dx -= tx*box[XX][XX];
 +
 +    /* Distance squared */
 +    r2 = (dx*dx) + (dy*dy) + (dz*dz);
 +
 +    *shift = XYZ2IS(tx, ty, tz);
 +
 +    return r2;
 +}
 +
 +static real calc_image_rect(rvec xi, rvec xj, rvec box_size,
 +                            rvec b_inv, int *shift)
 +{   /* Rectangular-box version: returns the squared length of xj - xi after
 +     * tx, ty and tz whole box lengths box_size[] have been taken out per
 +     * dimension, and stores XYZ2IS(tx, ty, tz) in *shift.
 +     * Unlike the triclinic version the three dimensions are independent.
 +     */
 +    const real h15 = 1.5;
 +    real       ddx, ddy, ddz;
 +    real       dx, dy, dz;
 +    real       r2;
 +    int        tx, ty, tz;
 +
 +    /* Compute diff vector */
 +    dx = xj[XX] - xi[XX];
 +    dy = xj[YY] - xi[YY];
 +    dz = xj[ZZ] - xi[ZZ];
 +
 +    /* Perform NINT operation, using trunc operation, therefore
 +     * we first add 1.5 then subtract 1 again
 +     * (the assignment to the int t? variables does the truncation).
 +     */
 +    tx = dx*b_inv[XX] + h15;
 +    ty = dy*b_inv[YY] + h15;
 +    tz = dz*b_inv[ZZ] + h15;
 +    tx--;
 +    ty--;
 +    tz--;
 +
 +    /* Correct diff vector for translation.
 +     * dd? is the negated reduced difference; only its square is used.
 +     */
 +    ddx = tx*box_size[XX] - dx;
 +    ddy = ty*box_size[YY] - dy;
 +    ddz = tz*box_size[ZZ] - dz;
 +
 +    /* Distance squared */
 +    r2 = (ddx*ddx) + (ddy*ddy) + (ddz*ddz);
 +
 +    *shift = XYZ2IS(tx, ty, tz);
 +
 +    return r2;
 +}
 +
 +static void add_simple(t_ns_buf *nsbuf, int nrj, atom_id cg_j,
 +                       gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                       int icg, int jgid, t_block *cgs, t_excl bexcl[],
 +                       int shift, t_forcerec *fr, put_in_list_t *put_in_list)
 +{   /* Buffers j charge group cg_j (nrj atoms) in nsbuf. When the atom count
 +     * nsbuf->nj would exceed MAX_CG, the buffered charge groups are first
 +     * handed to put_in_list() and the buffer is emptied. FALSE, TRUE, TRUE
 +     * fill the argument slots that the put_in_list_* functions in this file
 +     * name bLR, bDoVdW and bDoCoul.
 +     */
 +    if (nsbuf->nj + nrj > MAX_CG)
 +    {
 +        put_in_list(bHaveVdW, ngid, md, icg, jgid, nsbuf->ncg, nsbuf->jcg,
 +                    cgs->index, bexcl, shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +        /* Reset buffer contents */
 +        nsbuf->ncg = nsbuf->nj = 0;
 +    }
 +    nsbuf->jcg[nsbuf->ncg++] = cg_j;
 +    nsbuf->nj               += nrj;
 +}
 +
 +static void ns_inner_tric(rvec x[], int icg, int *i_egp_flags,
 +                          int njcg, atom_id jcg[],
 +                          matrix box, rvec b_inv, real rcut2,
 +                          t_block *cgs, t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                          t_excl bexcl[], t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{   /* Triclinic inner search for i charge group icg: each of the njcg charge
 +     * groups in jcg[] whose image distance squared to x[icg], as returned by
 +     * calc_image_tric(), is below rcut2 is buffered with add_simple() in
 +     * ns_buf[jgid][shift], unless i_egp_flags[jgid] has EGP_EXCL set.
 +     * x[] is indexed by charge group. The local nsbuf is not used.
 +     */
 +    int       shift;
 +    int       j, nrj, jgid;
 +    int      *cginfo = fr->cginfo;
 +    atom_id   cg_j, *cgindex;
 +    t_ns_buf *nsbuf;
 +
 +    cgindex = cgs->index;
 +    shift   = CENTRAL;
 +    for (j = 0; (j < njcg); j++)
 +    {
 +        cg_j   = jcg[j];
 +        nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +        if (calc_image_tric(x[icg], x[cg_j], box, b_inv, &shift) < rcut2)
 +        {
 +            jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +            if (!(i_egp_flags[jgid] & EGP_EXCL))
 +            {
 +                add_simple(&ns_buf[jgid][shift], nrj, cg_j,
 +                           bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, shift, fr,
 +                           put_in_list);
 +            }
 +        }
 +    }
 +}
 +
 +static void ns_inner_rect(rvec x[], int icg, int *i_egp_flags,
 +                          int njcg, atom_id jcg[],
 +                          gmx_bool bBox, rvec box_size, rvec b_inv, real rcut2,
 +                          t_block *cgs, t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                          t_excl bexcl[], t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{   /* Rectangular inner search for i charge group icg over the njcg charge
 +     * groups in jcg[]. With bBox set, the image distance and shift come from
 +     * calc_image_rect(); without it, the plain distance2() is used, the
 +     * shift is always CENTRAL, box_size and b_inv are not read, and
 +     * rcut2 == 0 accepts every pair. Accepted charge groups are buffered
 +     * with add_simple() unless i_egp_flags[jgid] has EGP_EXCL set.
 +     * The local nsbuf is not used.
 +     */
 +    int       shift;
 +    int       j, nrj, jgid;
 +    int      *cginfo = fr->cginfo;
 +    atom_id   cg_j, *cgindex;
 +    t_ns_buf *nsbuf;
 +
 +    cgindex = cgs->index;
 +    if (bBox)
 +    {
 +        shift = CENTRAL;
 +        for (j = 0; (j < njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if (calc_image_rect(x[icg], x[cg_j], box_size, b_inv, &shift) < rcut2)
 +            {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][shift], nrj, cg_j,
 +                               bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, shift, fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (j = 0; (j < njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if ((rcut2 == 0) || (distance2(x[icg], x[cg_j]) < rcut2))
 +            {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][CENTRAL], nrj, cg_j,
 +                               bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, CENTRAL, fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* ns_simple_core needs to be adapted for QMMM still 2005 */
 +
 +static int ns_simple_core(t_forcerec *fr,
 +                          gmx_localtop_t *top,
 +                          t_mdatoms *md,
 +                          matrix box, rvec box_size,
 +                          t_excl bexcl[], atom_id *aaj,
 +                          int ngid, t_ns_buf **ns_buf,
 +                          put_in_list_t *put_in_list, gmx_bool bHaveVdW[])
 +{   /* Simple (non-grid) search over the i charge groups fr->cg0 .. fr->hcg-1.
 +     * For each one: its exclusions are set in bexcl, calc_naaj() entries of
 +     * aaj[] starting at aaj[icg] are tested against the squared cut-off
 +     * sqr(fr->rlist) by the triclinic or rectangular inner routine, all
 +     * buffers in ns_buf are flushed through put_in_list(), and the
 +     * exclusions are removed again.
 +     * Returns the summed calc_naaj() counts, i.e. the number of charge-group
 +     * pairs tested. The locals jcg, i0 and nri are not used.
 +     */
 +    int          naaj, k;
 +    real         rlist2;
 +    int          nsearch, icg, jcg, igid, i0, nri, nn;
 +    int         *cginfo;
 +    t_ns_buf    *nsbuf;
 +    /* atom_id  *i_atoms; */
 +    t_block     *cgs  = &(top->cgs);
 +    t_blocka    *excl = &(top->excls);
 +    rvec         b_inv;
 +    int          m;
 +    gmx_bool     bBox, bTriclinic;
 +    int         *i_egp_flags;
 +
 +    rlist2 = sqr(fr->rlist);
 +
 +    bBox = (fr->ePBC != epbcNONE);
 +    if (bBox)
 +    {
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            b_inv[m] = divide_err(1.0, box_size[m]);
 +        }
 +        bTriclinic = TRICLINIC(box);
 +    }
 +    else
 +    {
 +        bTriclinic = FALSE; /* b_inv stays unset here; ns_inner_rect() does
 +                             * not read it when bBox is FALSE */
 +    }
 +
 +    cginfo = fr->cginfo;
 +
 +    nsearch = 0;
 +    for (icg = fr->cg0; (icg < fr->hcg); icg++)
 +    {
 +        /*
 +           i0        = cgs->index[icg];
 +           nri       = cgs->index[icg+1]-i0;
 +           i_atoms   = &(cgs->a[i0]);
 +           i_eg_excl = fr->eg_excl + ngid*md->cENER[*i_atoms];
 +           setexcl(nri,i_atoms,excl,TRUE,bexcl);
 +         */
 +        igid        = GET_CGINFO_GID(cginfo[icg]);
 +        i_egp_flags = fr->egp_flags + ngid*igid;
 +        setexcl(cgs->index[icg], cgs->index[icg+1], excl, TRUE, bexcl);
 +
 +        naaj = calc_naaj(icg, cgs->nr);
 +        if (bTriclinic)
 +        {
 +            ns_inner_tric(fr->cg_cm, icg, i_egp_flags, naaj, &(aaj[icg]),
 +                          box, b_inv, rlist2, cgs, ns_buf,
 +                          bHaveVdW, ngid, md, bexcl, fr, put_in_list);
 +        }
 +        else
 +        {
 +            ns_inner_rect(fr->cg_cm, icg, i_egp_flags, naaj, &(aaj[icg]),
 +                          bBox, box_size, b_inv, rlist2, cgs, ns_buf,
 +                          bHaveVdW, ngid, md, bexcl, fr, put_in_list);
 +        }
 +        nsearch += naaj;
 +        /* Flush whatever is still buffered for this i charge group */
 +        for (nn = 0; (nn < ngid); nn++)
 +        {
 +            for (k = 0; (k < SHIFTS); k++)
 +            {
 +                nsbuf = &(ns_buf[nn][k]);
 +                if (nsbuf->ncg > 0)
 +                {
 +                    put_in_list(bHaveVdW, ngid, md, icg, nn, nsbuf->ncg, nsbuf->jcg,
 +                                cgs->index, bexcl, k, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                    nsbuf->ncg = nsbuf->nj = 0;
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg], cgs->index[icg+1], excl, FALSE, bexcl);
 +    }
 +    close_neighbor_lists(fr, FALSE);
 +
 +    return nsearch;
 +}
 +
 +/************************************************
 + *
 + *    N S 5     G R I D     S T U F F
 + *
 + ************************************************/
 +
 +static inline void get_dx(int Nx, real gridx, real rc2, int xgi, real x,
 +                          int *dx0, int *dx1, real *dcx2)
 +{
 +    real dcx, tmp;
 +    int  xgi0, xgi1, i;
 +
 +    if (xgi < 0)
 +    {
 +        *dx0 = 0;
 +        xgi0 = -1;
 +        *dx1 = -1;
 +        xgi1 = 0;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        *dx0 = Nx;
 +        xgi0 = Nx-1;
 +        *dx1 = Nx-1;
 +        xgi1 = Nx;
 +    }
 +    else
 +    {
 +        dcx2[xgi] = 0;
 +        *dx0      = xgi;
 +        xgi0      = xgi-1;
 +        *dx1      = xgi;
 +        xgi1      = xgi+1;
 +    }
 +
 +    for (i = xgi0; i >= 0; i--)
 +    {
 +        dcx = (i+1)*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        *dx0    = i;
 +        dcx2[i] = tmp;
 +    }
 +    for (i = xgi1; i < Nx; i++)
 +    {
 +        dcx = i*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        *dx1    = i;
 +        dcx2[i] = tmp;
 +    }
 +}
 +
 +static inline void get_dx_dd(int Nx, real gridx, real rc2, int xgi, real x,
 +                             int ncpddc, int shift_min, int shift_max,
 +                             int *g0, int *g1, real *dcx2)
 +{
 +    real dcx, tmp;
 +    int  g_min, g_max, shift_home;
 +
 +    if (xgi < 0)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = 0;
 +        *g1   = -1;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = Nx;
 +        *g1   = Nx - 1;
 +    }
 +    else
 +    {
 +        if (ncpddc == 0)
 +        {
 +            g_min = 0;
 +            g_max = Nx - 1;
 +        }
 +        else
 +        {
 +            if (xgi < ncpddc)
 +            {
 +                shift_home = 0;
 +            }
 +            else
 +            {
 +                shift_home = -1;
 +            }
 +            g_min = (shift_min == shift_home ? 0          : ncpddc);
 +            g_max = (shift_max == shift_home ? ncpddc - 1 : Nx - 1);
 +        }
 +        if (shift_min > 0)
 +        {
 +            *g0 = g_min;
 +            *g1 = g_min - 1;
 +        }
 +        else if (shift_max < 0)
 +        {
 +            *g0 = g_max + 1;
 +            *g1 = g_max;
 +        }
 +        else
 +        {
 +            *g0       = xgi;
 +            *g1       = xgi;
 +            dcx2[xgi] = 0;
 +        }
 +    }
 +
 +    while (*g0 > g_min)
 +    {
 +        /* Check one grid cell down */
 +        dcx = ((*g0 - 1) + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g0)--;
 +        dcx2[*g0] = tmp;
 +    }
 +
 +    while (*g1 < g_max)
 +    {
 +        /* Check one grid cell up */
 +        dcx = (*g1 + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g1)++;
 +        dcx2[*g1] = tmp;
 +    }
 +}
 +
 +
 +#define sqr(x) ((x)*(x))
 +#define calc_dx2(XI, YI, ZI, y) (sqr(XI-y[XX]) + sqr(YI-y[YY]) + sqr(ZI-y[ZZ]))
 +#define calc_cyl_dx2(XI, YI, y) (sqr(XI-y[XX]) + sqr(YI-y[YY]))
 +/****************************************************
 + *
 + *    F A S T   N E I G H B O R  S E A R C H I N G
 + *
 + *    Optimized neighboursearching routine using grid
 + *    at least 1x1x1, see GROMACS manual
 + *
 + ****************************************************/
 +
 +
 +static void get_cutoff2(t_forcerec *fr, gmx_bool bDoLongRange,
 +                        real *rvdw2, real *rcoul2,
 +                        real *rs2, real *rm2, real *rl2)
 +{
 +    *rs2 = sqr(fr->rlist);
 +
 +    if (bDoLongRange && fr->bTwinRange)
 +    {
 +        /* The VdW and elec. LR cut-off's could be different,
 +         * so we can not simply set them to rlistlong.
 +         */
 +        if (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(fr->vdwtype) &&
 +            fr->rvdw > fr->rlist)
 +        {
 +            *rvdw2  = sqr(fr->rlistlong);
 +        }
 +        else
 +        {
 +            *rvdw2  = sqr(fr->rvdw);
 +        }
 +        if (EEL_MIGHT_BE_ZERO_AT_CUTOFF(fr->eeltype) &&
 +            fr->rcoulomb > fr->rlist)
 +        {
 +            *rcoul2 = sqr(fr->rlistlong);
 +        }
 +        else
 +        {
 +            *rcoul2 = sqr(fr->rcoulomb);
 +        }
 +    }
 +    else
 +    {
 +        /* Workaround for a gcc -O3 or -ffast-math problem */
 +        *rvdw2  = *rs2;
 +        *rcoul2 = *rs2;
 +    }
 +    *rm2 = min(*rvdw2, *rcoul2);
 +    *rl2 = max(*rvdw2, *rcoul2);
 +}
 +
 +static void init_nsgrid_lists(t_forcerec *fr, int ngid, gmx_ns_t *ns)
 +{
 +    real rvdw2, rcoul2, rs2, rm2, rl2;
 +    int  j;
 +
 +    get_cutoff2(fr, TRUE, &rvdw2, &rcoul2, &rs2, &rm2, &rl2);
 +
 +    /* Short range buffers */
 +    snew(ns->nl_sr, ngid);
 +    /* Counters */
 +    snew(ns->nsr, ngid);
 +    snew(ns->nlr_ljc, ngid);
 +    snew(ns->nlr_one, ngid);
 +
 +    /* Always allocate both list types, since rcoulomb might now change with PME load balancing */
 +    /* Long range VdW and Coul buffers */
 +    snew(ns->nl_lr_ljc, ngid);
 +    /* Long range VdW or Coul only buffers */
 +    snew(ns->nl_lr_one, ngid);
 +
 +    for (j = 0; (j < ngid); j++)
 +    {
 +        snew(ns->nl_sr[j], MAX_CG);
 +        snew(ns->nl_lr_ljc[j], MAX_CG);
 +        snew(ns->nl_lr_one[j], MAX_CG);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "ns5_core: rs2 = %g, rm2 = %g, rl2 = %g (nm^2)\n",
 +                rs2, rm2, rl2);
 +    }
 +}
 +
 +static int nsgrid_core(FILE *log, t_commrec *cr, t_forcerec *fr,
 +                       matrix box, rvec box_size, int ngid,
 +                       gmx_localtop_t *top,
 +                       t_grid *grid, rvec x[],
 +                       t_excl bexcl[], gmx_bool *bExcludeAlleg,
 +                       t_nrnb *nrnb, t_mdatoms *md,
 +                       real *lambda, real *dvdlambda,
 +                       gmx_grppairener_t *grppener,
 +                       put_in_list_t *put_in_list,
 +                       gmx_bool bHaveVdW[],
 +                       gmx_bool bDoLongRange, gmx_bool bMakeQMMMnblist)
 +{
 +    gmx_ns_t     *ns;
 +    atom_id     **nl_lr_ljc, **nl_lr_one, **nl_sr;
 +    int          *nlr_ljc, *nlr_one, *nsr;
 +    gmx_domdec_t *dd     = NULL;
 +    t_block      *cgs    = &(top->cgs);
 +    int          *cginfo = fr->cginfo;
 +    /* atom_id *i_atoms,*cgsindex=cgs->index; */
 +    ivec          sh0, sh1, shp;
 +    int           cell_x, cell_y, cell_z;
 +    int           d, tx, ty, tz, dx, dy, dz, cj;
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    int           zsh_ty, zsh_tx, ysh_tx;
 +#endif
 +    int           dx0, dx1, dy0, dy1, dz0, dz1;
 +    int           Nx, Ny, Nz, shift = -1, j, nrj, nns, nn = -1;
 +    real          gridx, gridy, gridz, grid_x, grid_y, grid_z;
 +    real         *dcx2, *dcy2, *dcz2;
 +    int           zgi, ygi, xgi;
 +    int           cg0, cg1, icg = -1, cgsnr, i0, igid, nri, naaj, max_jcg;
 +    int           jcg0, jcg1, jjcg, cgj0, jgid;
 +    int          *grida, *gridnra, *gridind;
 +    gmx_bool      rvdw_lt_rcoul, rcoul_lt_rvdw;
 +    rvec          xi, *cgcm, grid_offset;
 +    real          r2, rs2, rvdw2, rcoul2, rm2, rl2, XI, YI, ZI, dcx, dcy, dcz, tmp1, tmp2;
 +    int          *i_egp_flags;
 +    gmx_bool      bDomDec, bTriclinicX, bTriclinicY;
 +    ivec          ncpddc;
 +
 +    ns = &fr->ns;
 +
 +    bDomDec = DOMAINDECOMP(cr);
 +    if (bDomDec)
 +    {
 +        dd = cr->dd;
 +    }
 +
 +    bTriclinicX = ((YY < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[YY] == 1) && box[YY][XX] != 0) ||
 +                   (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ] == 1) && box[ZZ][XX] != 0));
 +    bTriclinicY =  (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ] == 1) && box[ZZ][YY] != 0);
 +
 +    cgsnr    = cgs->nr;
 +
 +    get_cutoff2(fr, bDoLongRange, &rvdw2, &rcoul2, &rs2, &rm2, &rl2);
 +
 +    rvdw_lt_rcoul = (rvdw2 >= rcoul2);
 +    rcoul_lt_rvdw = (rcoul2 >= rvdw2);
 +
 +    if (bMakeQMMMnblist)
 +    {
 +        rm2 = rl2;
 +        rs2 = rl2;
 +    }
 +
 +    nl_sr     = ns->nl_sr;
 +    nsr       = ns->nsr;
 +    nl_lr_ljc = ns->nl_lr_ljc;
 +    nl_lr_one = ns->nl_lr_one;
 +    nlr_ljc   = ns->nlr_ljc;
 +    nlr_one   = ns->nlr_one;
 +
 +    /* Unpack arrays */
 +    cgcm    = fr->cg_cm;
 +    Nx      = grid->n[XX];
 +    Ny      = grid->n[YY];
 +    Nz      = grid->n[ZZ];
 +    grida   = grid->a;
 +    gridind = grid->index;
 +    gridnra = grid->nra;
 +    nns     = 0;
 +
 +    gridx      = grid->cell_size[XX];
 +    gridy      = grid->cell_size[YY];
 +    gridz      = grid->cell_size[ZZ];
 +    grid_x     = 1/gridx;
 +    grid_y     = 1/gridy;
 +    grid_z     = 1/gridz;
 +    copy_rvec(grid->cell_offset, grid_offset);
 +    copy_ivec(grid->ncpddc, ncpddc);
 +    dcx2       = grid->dcx2;
 +    dcy2       = grid->dcy2;
 +    dcz2       = grid->dcz2;
 +
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    zsh_ty = floor(-box[ZZ][YY]/box[YY][YY]+0.5);
 +    zsh_tx = floor(-box[ZZ][XX]/box[XX][XX]+0.5);
 +    ysh_tx = floor(-box[YY][XX]/box[XX][XX]+0.5);
 +    if (zsh_tx != 0 && ysh_tx != 0)
 +    {
 +        /* This could happen due to rounding, when both ratios are 0.5 */
 +        ysh_tx = 0;
 +    }
 +#endif
 +
 +    debug_gmx();
 +
 +    if (fr->n_tpi)
 +    {
 +        /* We only want a list for the test particle */
 +        cg0 = cgsnr - 1;
 +    }
 +    else
 +    {
 +        cg0 = grid->icg0;
 +    }
 +    cg1 = grid->icg1;
 +
 +    /* Set the shift range */
 +    for (d = 0; d < DIM; d++)
 +    {
 +        sh0[d] = -1;
 +        sh1[d] = 1;
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(fr->ePBC) || (bDomDec && dd->nc[d] > 1))
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +
 +    /* Loop over charge groups */
 +    for (icg = cg0; (icg < cg1); icg++)
 +    {
 +        igid = GET_CGINFO_GID(cginfo[icg]);
 +        /* Skip this charge group if all energy groups are excluded! */
 +        if (bExcludeAlleg[igid])
 +        {
 +            continue;
 +        }
 +
 +        i0   = cgs->index[icg];
 +
 +        if (bMakeQMMMnblist)
 +        {
 +            /* Skip this charge group if it is not a QM atom while making a
 +             * QM/MM neighbourlist
 +             */
 +            if (md->bQM[i0] == FALSE)
 +            {
 +                continue; /* MM particle, go to next particle */
 +            }
 +
 +            /* Compute the number of charge groups that fall within the control
 +             * of this one (icg)
 +             */
 +            naaj    = calc_naaj(icg, cgsnr);
 +            jcg0    = icg;
 +            jcg1    = icg + naaj;
 +            max_jcg = cgsnr;
 +        }
 +        else
 +        {
 +            /* make a normal neighbourlist */
 +
 +            if (bDomDec)
 +            {
 +                /* Get the j charge-group and dd cell shift ranges */
 +                dd_get_ns_ranges(cr->dd, icg, &jcg0, &jcg1, sh0, sh1);
 +                max_jcg = 0;
 +            }
 +            else
 +            {
 +                /* Compute the number of charge groups that fall within the control
 +                 * of this one (icg)
 +                 */
 +                naaj = calc_naaj(icg, cgsnr);
 +                jcg0 = icg;
 +                jcg1 = icg + naaj;
 +
 +                if (fr->n_tpi)
 +                {
 +                    /* The i-particle is awlways the test particle,
 +                     * so we want all j-particles
 +                     */
 +                    max_jcg = cgsnr - 1;
 +                }
 +                else
 +                {
 +                    max_jcg  = jcg1 - cgsnr;
 +                }
 +            }
 +        }
 +
 +        i_egp_flags = fr->egp_flags + igid*ngid;
 +
 +        /* Set the exclusions for the atoms in charge group icg using a bitmask */
 +        setexcl(i0, cgs->index[icg+1], &top->excls, TRUE, bexcl);
 +
 +        ci2xyz(grid, icg, &cell_x, &cell_y, &cell_z);
 +
 +        /* Changed iicg to icg, DvdS 990115
 +         * (but see consistency check above, DvdS 990330)
 +         */
 +#ifdef NS5DB
 +        fprintf(log, "icg=%5d, naaj=%5d, cell %d %d %d\n",
 +                icg, naaj, cell_x, cell_y, cell_z);
 +#endif
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
 +        {
 +            ZI = cgcm[icg][ZZ]+tz*box[ZZ][ZZ];
 +            /* Calculate range of cells in Z direction that have the shift tz */
 +            zgi = cell_z + tz*Nz;
 +#define FAST_DD_NS
 +#ifndef FAST_DD_NS
 +            get_dx(Nz, gridz, rl2, zgi, ZI, &dz0, &dz1, dcz2);
 +#else
 +            get_dx_dd(Nz, gridz, rl2, zgi, ZI-grid_offset[ZZ],
 +                      ncpddc[ZZ], sh0[ZZ], sh1[ZZ], &dz0, &dz1, dcz2);
 +#endif
 +            if (dz0 > dz1)
 +            {
 +                continue;
 +            }
 +            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
 +            {
 +                YI = cgcm[icg][YY]+ty*box[YY][YY]+tz*box[ZZ][YY];
 +                /* Calculate range of cells in Y direction that have the shift ty */
 +                if (bTriclinicY)
 +                {
 +                    ygi = (int)(Ny + (YI - grid_offset[YY])*grid_y) - Ny;
 +                }
 +                else
 +                {
 +                    ygi = cell_y + ty*Ny;
 +                }
 +#ifndef FAST_DD_NS
 +                get_dx(Ny, gridy, rl2, ygi, YI, &dy0, &dy1, dcy2);
 +#else
 +                get_dx_dd(Ny, gridy, rl2, ygi, YI-grid_offset[YY],
 +                          ncpddc[YY], sh0[YY], sh1[YY], &dy0, &dy1, dcy2);
 +#endif
 +                if (dy0 > dy1)
 +                {
 +                    continue;
 +                }
 +                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
 +                {
 +                    XI = cgcm[icg][XX]+tx*box[XX][XX]+ty*box[YY][XX]+tz*box[ZZ][XX];
 +                    /* Calculate range of cells in X direction that have the shift tx */
 +                    if (bTriclinicX)
 +                    {
 +                        xgi = (int)(Nx + (XI - grid_offset[XX])*grid_x) - Nx;
 +                    }
 +                    else
 +                    {
 +                        xgi = cell_x + tx*Nx;
 +                    }
 +#ifndef FAST_DD_NS
 +                    get_dx(Nx, gridx, rl2, xgi*Nx, XI, &dx0, &dx1, dcx2);
 +#else
 +                    get_dx_dd(Nx, gridx, rl2, xgi, XI-grid_offset[XX],
 +                              ncpddc[XX], sh0[XX], sh1[XX], &dx0, &dx1, dcx2);
 +#endif
 +                    if (dx0 > dx1)
 +                    {
 +                        continue;
 +                    }
 +                    /* Adress: an explicit cg that has a weigthing function of 0 is excluded
 +                     *  from the neigbour list as it will not interact  */
 +                    if (fr->adress_type != eAdressOff)
 +                    {
 +                        if (md->wf[cgs->index[icg]] <= GMX_REAL_EPS && egp_explicit(fr, igid))
 +                        {
 +                            continue;
 +                        }
 +                    }
 +                    /* Get shift vector */
 +                    shift = XYZ2IS(tx, ty, tz);
 +#ifdef NS5DB
 +                    range_check(shift, 0, SHIFTS);
 +#endif
 +                    for (nn = 0; (nn < ngid); nn++)
 +                    {
 +                        nsr[nn]      = 0;
 +                        nlr_ljc[nn]  = 0;
 +                        nlr_one[nn]  = 0;
 +                    }
 +#ifdef NS5DB
 +                    fprintf(log, "shift: %2d, dx0,1: %2d,%2d, dy0,1: %2d,%2d, dz0,1: %2d,%2d\n",
 +                            shift, dx0, dx1, dy0, dy1, dz0, dz1);
 +                    fprintf(log, "cgcm: %8.3f  %8.3f  %8.3f\n", cgcm[icg][XX],
 +                            cgcm[icg][YY], cgcm[icg][ZZ]);
 +                    fprintf(log, "xi:   %8.3f  %8.3f  %8.3f\n", XI, YI, ZI);
 +#endif
 +                    for (dx = dx0; (dx <= dx1); dx++)
 +                    {
 +                        tmp1 = rl2 - dcx2[dx];
 +                        for (dy = dy0; (dy <= dy1); dy++)
 +                        {
 +                            tmp2 = tmp1 - dcy2[dy];
 +                            if (tmp2 > 0)
 +                            {
 +                                for (dz = dz0; (dz <= dz1); dz++)
 +                                {
 +                                    if (tmp2 > dcz2[dz])
 +                                    {
 +                                        /* Find grid-cell cj in which possible neighbours are */
 +                                        cj   = xyz2ci(Ny, Nz, dx, dy, dz);
 +
 +                                        /* Check out how many cgs (nrj) there in this cell */
 +                                        nrj  = gridnra[cj];
 +
 +                                        /* Find the offset in the cg list */
 +                                        cgj0 = gridind[cj];
 +
 +                                        /* Check if all j's are out of range so we
 +                                         * can skip the whole cell.
 +                                         * Should save some time, especially with DD.
 +                                         */
 +                                        if (nrj == 0 ||
 +                                            (grida[cgj0] >= max_jcg &&
 +                                             (grida[cgj0] >= jcg1 || grida[cgj0+nrj-1] < jcg0)))
 +                                        {
 +                                            continue;
 +                                        }
 +
 +                                        /* Loop over cgs */
 +                                        for (j = 0; (j < nrj); j++)
 +                                        {
 +                                            jjcg = grida[cgj0+j];
 +
 +                                            /* check whether this guy is in range! */
 +                                            if ((jjcg >= jcg0 && jjcg < jcg1) ||
 +                                                (jjcg < max_jcg))
 +                                            {
 +                                                r2 = calc_dx2(XI, YI, ZI, cgcm[jjcg]);
 +                                                if (r2 < rl2)
 +                                                {
 +                                                    /* jgid = gid[cgsatoms[cgsindex[jjcg]]]; */
 +                                                    jgid = GET_CGINFO_GID(cginfo[jjcg]);
 +                                                    /* check energy group exclusions */
 +                                                    if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                                                    {
 +                                                        if (r2 < rs2)
 +                                                        {
 +                                                            if (nsr[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to short-range list */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nsr[jgid], nl_sr[jgid],
 +                                                                            cgs->index, /* cgsatoms, */ bexcl,
 +                                                                            shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                                                                nsr[jgid] = 0;
 +                                                            }
 +                                                            nl_sr[jgid][nsr[jgid]++] = jjcg;
 +                                                        }
 +                                                        else if (r2 < rm2)
 +                                                        {
 +                                                            if (nlr_ljc[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to LJ+coulomb long-range list */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nlr_ljc[jgid], nl_lr_ljc[jgid], top->cgs.index,
 +                                                                            bexcl, shift, fr, TRUE, TRUE, TRUE, fr->solvent_opt);
 +                                                                nlr_ljc[jgid] = 0;
 +                                                            }
 +                                                            nl_lr_ljc[jgid][nlr_ljc[jgid]++] = jjcg;
 +                                                        }
 +                                                        else
 +                                                        {
 +                                                            if (nlr_one[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to long-range list with only coul, or only LJ */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nlr_one[jgid], nl_lr_one[jgid], top->cgs.index,
 +                                                                            bexcl, shift, fr, TRUE, rvdw_lt_rcoul, rcoul_lt_rvdw, fr->solvent_opt);
 +                                                                nlr_one[jgid] = 0;
 +                                                            }
 +                                                            nl_lr_one[jgid][nlr_one[jgid]++] = jjcg;
 +                                                        }
 +                                                    }
 +                                                }
 +                                                nns++;
 +                                            }
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                    /* CHECK whether there is anything left in the buffers */
 +                    for (nn = 0; (nn < ngid); nn++)
 +                    {
 +                        if (nsr[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nsr[nn], nl_sr[nn],
 +                                        cgs->index, /* cgsatoms, */ bexcl,
 +                                        shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                        }
 +
 +                        if (nlr_ljc[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nlr_ljc[nn],
 +                                        nl_lr_ljc[nn], top->cgs.index,
 +                                        bexcl, shift, fr, TRUE, TRUE, TRUE, fr->solvent_opt);
 +                        }
 +
 +                        if (nlr_one[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nlr_one[nn],
 +                                        nl_lr_one[nn], top->cgs.index,
 +                                        bexcl, shift, fr, TRUE, rvdw_lt_rcoul, rcoul_lt_rvdw, fr->solvent_opt);
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,&top->atoms.excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg], cgs->index[icg+1], &top->excls, FALSE, bexcl);
 +    }
 +    /* No need to perform any left-over force calculations anymore (as we used to do here)
 +     * since we now save the proper long-range lists for later evaluation.
 +     */
 +
 +    debug_gmx();
 +
 +    /* Close neighbourlists */
 +    close_neighbor_lists(fr, bMakeQMMMnblist);
 +
 +    return nns;
 +}
 +
 +void ns_realloc_natoms(gmx_ns_t *ns, int natoms)
 +{
 +    int i;
 +
 +    if (natoms > ns->nra_alloc)
 +    {
 +        ns->nra_alloc = over_alloc_dd(natoms);
 +        srenew(ns->bexcl, ns->nra_alloc);
 +        for (i = 0; i < ns->nra_alloc; i++)
 +        {
 +            ns->bexcl[i] = 0;
 +        }
 +    }
 +}
 +
 +void init_ns(FILE *fplog, const t_commrec *cr,
 +             gmx_ns_t *ns, t_forcerec *fr,
 +             const gmx_mtop_t *mtop,
 +             matrix box)
 +{
 +    int  mt, icg, nr_in_cg, maxcg, i, j, jcg, ngid, ncg;
 +    t_block *cgs;
 +    char *ptr;
 +
 +    /* Compute largest charge groups size (# atoms) */
 +    nr_in_cg = 1;
 +    for (mt = 0; mt < mtop->nmoltype; mt++)
 +    {
 +        cgs = &mtop->moltype[mt].cgs;
 +        for (icg = 0; (icg < cgs->nr); icg++)
 +        {
 +            nr_in_cg = max(nr_in_cg, (int)(cgs->index[icg+1]-cgs->index[icg]));
 +        }
 +    }
 +
 +    /* Verify whether largest charge group is <= max cg.
 +     * This is determined by the type of the local exclusion type
 +     * Exclusions are stored in bits. (If the type is not large
 +     * enough, enlarge it, unsigned char -> unsigned short -> unsigned long)
 +     */
 +    maxcg = sizeof(t_excl)*8;
 +    if (nr_in_cg > maxcg)
 +    {
 +        gmx_fatal(FARGS, "Max #atoms in a charge group: %d > %d\n",
 +                  nr_in_cg, maxcg);
 +    }
 +
 +    ngid = mtop->groups.grps[egcENER].nr;
 +    snew(ns->bExcludeAlleg, ngid);
 +    for (i = 0; i < ngid; i++)
 +    {
 +        ns->bExcludeAlleg[i] = TRUE;
 +        for (j = 0; j < ngid; j++)
 +        {
 +            if (!(fr->egp_flags[i*ngid+j] & EGP_EXCL))
 +            {
 +                ns->bExcludeAlleg[i] = FALSE;
 +            }
 +        }
 +    }
 +
 +    if (fr->bGrid)
 +    {
 +        /* Grid search */
 +        ns->grid = init_grid(fplog, fr);
 +        init_nsgrid_lists(fr, ngid, ns);
 +    }
 +    else
 +    {
 +        /* Simple search */
 +        snew(ns->ns_buf, ngid);
 +        for (i = 0; (i < ngid); i++)
 +        {
 +            snew(ns->ns_buf[i], SHIFTS);
 +        }
 +        ncg = ncg_mtop(mtop);
 +        snew(ns->simple_aaj, 2*ncg);
 +        for (jcg = 0; (jcg < ncg); jcg++)
 +        {
 +            ns->simple_aaj[jcg]     = jcg;
 +            ns->simple_aaj[jcg+ncg] = jcg;
 +        }
 +    }
 +
 +    /* Create array that determines whether or not atoms have VdW */
 +    snew(ns->bHaveVdW, fr->ntype);
 +    for (i = 0; (i < fr->ntype); i++)
 +    {
 +        for (j = 0; (j < fr->ntype); j++)
 +        {
 +            ns->bHaveVdW[i] = (ns->bHaveVdW[i] ||
 +                               (fr->bBHAM ?
 +                                ((BHAMA(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (BHAMB(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (BHAMC(fr->nbfp, fr->ntype, i, j) != 0)) :
 +                                ((C6(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (C12(fr->nbfp, fr->ntype, i, j) != 0))));
 +        }
 +    }
 +    if (debug)
 +    {
 +        pr_bvec(debug, 0, "bHaveVdW", ns->bHaveVdW, fr->ntype, TRUE);
 +    }
 +
 +    ns->nra_alloc = 0;
 +    ns->bexcl     = NULL;
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* This could be reduced with particle decomposition */
 +        ns_realloc_natoms(ns, mtop->natoms);
 +    }
 +
 +    ns->nblist_initialized = FALSE;
 +
 +    /* nbr list debug dump */
 +    {
 +        char *ptr = getenv("GMX_DUMP_NL");
 +        if (ptr)
 +        {
 +            ns->dump_nl = strtol(ptr, NULL, 10);
 +            if (fplog)
 +            {
 +                fprintf(fplog, "GMX_DUMP_NL = %d", ns->dump_nl);
 +            }
 +        }
 +        else
 +        {
 +            ns->dump_nl = 0;
 +        }
 +    }
 +}
 +
 +
 +int search_neighbours(FILE *log, t_forcerec *fr,
 +                      rvec x[], matrix box,
 +                      gmx_localtop_t *top,
 +                      gmx_groups_t *groups,
 +                      t_commrec *cr,
 +                      t_nrnb *nrnb, t_mdatoms *md,
 +                      real *lambda, real *dvdlambda,
 +                      gmx_grppairener_t *grppener,
 +                      gmx_bool bFillGrid,
 +                      gmx_bool bDoLongRangeNS,
 +                      gmx_bool bPadListsForKernels)
 +{
 +    t_block  *cgs = &(top->cgs);
 +    rvec     box_size, grid_x0, grid_x1;
 +    int      i, j, m, ngid;
 +    real     min_size, grid_dens;
 +    int      nsearch;
 +    gmx_bool     bGrid;
 +    char     *ptr;
 +    gmx_bool     *i_egp_flags;
 +    int      cg_start, cg_end, start, end;
 +    gmx_ns_t *ns;
 +    t_grid   *grid;
 +    gmx_domdec_zones_t *dd_zones;
 +    put_in_list_t *put_in_list;
 +
 +    ns = &fr->ns;
 +
 +    /* Set some local variables */
 +    bGrid = fr->bGrid;
 +    ngid  = groups->grps[egcENER].nr;
 +
 +    for (m = 0; (m < DIM); m++)
 +    {
 +        box_size[m] = box[m][m];
 +    }
 +
 +    if (fr->ePBC != epbcNONE)
 +    {
 +        if (sqr(fr->rlistlong) >= max_cutoff2(fr->ePBC, box))
 +        {
 +            gmx_fatal(FARGS, "One of the box vectors has become shorter than twice the cut-off length or box_yy-|box_zy| or box_zz has become smaller than the cut-off.");
 +        }
 +        if (!bGrid)
 +        {
 +            min_size = min(box_size[XX], min(box_size[YY], box_size[ZZ]));
 +            if (2*fr->rlistlong >= min_size)
 +            {
 +                gmx_fatal(FARGS, "One of the box diagonal elements has become smaller than twice the cut-off length.");
 +            }
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        ns_realloc_natoms(ns, cgs->index[cgs->nr]);
 +    }
 +    debug_gmx();
 +
 +    /* Reset the neighbourlists */
 +    reset_neighbor_lists(fr, TRUE, TRUE);
 +
 +    if (bGrid && bFillGrid)
 +    {
 +
 +        grid = ns->grid;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_zones = domdec_zones(cr->dd);
 +        }
 +        else
 +        {
 +            dd_zones = NULL;
 +
 +            get_nsgrid_boundaries(grid->nboundeddim, box, NULL, NULL, NULL, NULL,
 +                                  cgs->nr, fr->cg_cm, grid_x0, grid_x1, &grid_dens);
 +
 +            grid_first(log, grid, NULL, NULL, fr->ePBC, box, grid_x0, grid_x1,
 +                       fr->rlistlong, grid_dens);
 +        }
 +        debug_gmx();
 +
 +        /* Don't know why this all is... (DvdS 3/99) */
 +#ifndef SEGV
 +        start = 0;
 +        end   = cgs->nr;
 +#else
 +        start = fr->cg0;
 +        end   = (cgs->nr+1)/2;
 +#endif
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            end = cgs->nr;
 +            fill_grid(log, dd_zones, grid, end, -1, end, fr->cg_cm);
 +            grid->icg0 = 0;
 +            grid->icg1 = dd_zones->izone[dd_zones->nizone-1].cg1;
 +        }
 +        else
 +        {
 +            fill_grid(log, NULL, grid, cgs->nr, fr->cg0, fr->hcg, fr->cg_cm);
 +            grid->icg0 = fr->cg0;
 +            grid->icg1 = fr->hcg;
 +            debug_gmx();
 +
 +            if (PARTDECOMP(cr))
 +            {
 +                mv_grid(cr, grid);
 +            }
 +            debug_gmx();
 +        }
 +
 +        calc_elemnr(log, grid, start, end, cgs->nr);
 +        calc_ptrs(grid);
 +        grid_last(log, grid, start, end, cgs->nr);
 +
 +        if (gmx_debug_at)
 +        {
 +            check_grid(debug, grid);
 +            print_grid(debug, grid);
 +        }
 +    }
 +    else if (fr->n_tpi)
 +    {
 +        /* Set the grid cell index for the test particle only.
 +         * The cell to cg index is not corrected, but that does not matter.
 +         */
 +        fill_grid(log, NULL, ns->grid, fr->hcg, fr->hcg-1, fr->hcg, fr->cg_cm);
 +    }
 +    debug_gmx();
 +
 +    if (fr->adress_type == eAdressOff)
 +    {
 +        if (!fr->ns.bCGlist)
 +        {
 +            put_in_list = put_in_list_at;
 +        }
 +        else
 +        {
 +            put_in_list = put_in_list_cg;
 +        }
 +    }
 +    else
 +    {
 +        put_in_list = put_in_list_adress;
 +    }
 +
 +    /* Do the core! */
 +    if (bGrid)
 +    {
 +        grid    = ns->grid;
 +        nsearch = nsgrid_core(log, cr, fr, box, box_size, ngid, top,
 +                              grid, x, ns->bexcl, ns->bExcludeAlleg,
 +                              nrnb, md, lambda, dvdlambda, grppener,
 +                              put_in_list, ns->bHaveVdW,
 +                              bDoLongRangeNS, FALSE);
 +
 +        /* neighbour searching withouth QMMM! QM atoms have zero charge in
 +         * the classical calculation. The charge-charge interaction
 +         * between QM and MM atoms is handled in the QMMM core calculation
 +         * (see QMMM.c). The VDW however, we'd like to compute classically
 +         * and the QM MM atom pairs have just been put in the
 +         * corresponding neighbourlists. in case of QMMM we still need to
 +         * fill a special QMMM neighbourlist that contains all neighbours
 +         * of the QM atoms. If bQMMM is true, this list will now be made:
 +         */
 +        if (fr->bQMMM && fr->qr->QMMMscheme != eQMMMschemeoniom)
 +        {
 +            nsearch += nsgrid_core(log, cr, fr, box, box_size, ngid, top,
 +                                   grid, x, ns->bexcl, ns->bExcludeAlleg,
 +                                   nrnb, md, lambda, dvdlambda, grppener,
 +                                   put_in_list_qmmm, ns->bHaveVdW,
 +                                   bDoLongRangeNS, TRUE);
 +        }
 +    }
 +    else
 +    {
 +        nsearch = ns_simple_core(fr, top, md, box, box_size,
 +                                 ns->bexcl, ns->simple_aaj,
 +                                 ngid, ns->ns_buf, put_in_list, ns->bHaveVdW);
 +    }
 +    debug_gmx();
 +
 +#ifdef DEBUG
 +    pr_nsblock(log);
 +#endif
 +
 +    inc_nrnb(nrnb, eNR_NS, nsearch);
 +    /* inc_nrnb(nrnb,eNR_LR,fr->nlr); */
 +
 +    return nsearch;
 +}
 +
 +int natoms_beyond_ns_buffer(t_inputrec *ir, t_forcerec *fr, t_block *cgs,
 +                            matrix scale_tot, rvec *x)
 +{
 +    int  cg0, cg1, cg, a0, a1, a, i, j;
 +    real rint, hbuf2, scale;
 +    rvec *cg_cm, cgsc;
 +    gmx_bool bIsotropic;
 +    int  nBeyond;
 +
 +    nBeyond = 0;
 +
 +    rint = max(ir->rcoulomb, ir->rvdw);
 +    if (ir->rlist < rint)
 +    {
 +        gmx_fatal(FARGS, "The neighbor search buffer has negative size: %f nm",
 +                  ir->rlist - rint);
 +    }
 +    cg_cm = fr->cg_cm;
 +
 +    cg0 = fr->cg0;
 +    cg1 = fr->hcg;
 +
 +    if (!EI_DYNAMICS(ir->eI) || !DYNAMIC_BOX(*ir))
 +    {
 +        hbuf2 = sqr(0.5*(ir->rlist - rint));
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            a0 = cgs->index[cg];
 +            a1 = cgs->index[cg+1];
 +            for (a = a0; a < a1; a++)
 +            {
 +                if (distance2(cg_cm[cg], x[a]) > hbuf2)
 +                {
 +                    nBeyond++;
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        bIsotropic = TRUE;
 +        scale      = scale_tot[0][0];
 +        for (i = 1; i < DIM; i++)
 +        {
 +            /* With anisotropic scaling, the original spherical ns volumes become
 +             * ellipsoids. To avoid costly transformations we use the minimum
 +             * eigenvalue of the scaling matrix for determining the buffer size.
 +             * Since the lower half is 0, the eigenvalues are the diagonal elements.
 +             */
 +            scale = min(scale, scale_tot[i][i]);
 +            if (scale_tot[i][i] != scale_tot[i-1][i-1])
 +            {
 +                bIsotropic = FALSE;
 +            }
 +            for (j = 0; j < i; j++)
 +            {
 +                if (scale_tot[i][j] != 0)
 +                {
 +                    bIsotropic = FALSE;
 +                }
 +            }
 +        }
 +        hbuf2 = sqr(0.5*(scale*ir->rlist - rint));
 +        if (bIsotropic)
 +        {
 +            for (cg = cg0; cg < cg1; cg++)
 +            {
 +                svmul(scale, cg_cm[cg], cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for (a = a0; a < a1; a++)
 +                {
 +                    if (distance2(cgsc, x[a]) > hbuf2)
 +                    {
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Anistropic scaling */
 +            for (cg = cg0; cg < cg1; cg++)
 +            {
 +                /* Since scale_tot contains the transpose of the scaling matrix,
 +                 * we need to multiply with the transpose.
 +                 */
 +                tmvmul_ur0(scale_tot, cg_cm[cg], cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for (a = a0; a < a1; a++)
 +                {
 +                    if (distance2(cgsc, x[a]) > hbuf2)
 +                    {
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return nBeyond;
 +}
Simple merge
Simple merge
Simple merge
Simple merge