Merge branch release-4-6 into master

author Mark Abraham <mark.j.abraham@gmail.com>

Fri, 26 Jul 2013 14:47:13 +0000 (16:47 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Fri, 26 Jul 2013 14:47:13 +0000 (16:47 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 26 Jul 2013 14:47:13 +0000 (16:47 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 26 Jul 2013 14:47:13 +0000 (16:47 +0200)
diff --cc CMakeLists.txt
Simple merge
diff --cc src/gromacs/CMakeLists.txt

index e9878f118356da72f2b2c365820f613dae82d03d,0000000000000000000000000000000000000000..44c59fce569a6b0a1fe1a148a74ce1519c342b18

mode 100644,000000..100644
--- 1/src/gromacs/CMakeLists.txt
--- /dev/null
+++ b/src/gromacs/CMakeLists.txt
@@@ -1,140 -1,0 +1,141 @@@
-                       ${FFT_LIBRARIES} ${XML_LIBRARIES} ${GSL_LIBRARIES}
+ +#
+ +# This file is part of the GROMACS molecular simulation package.
+ +#
+ +# Copyright (c) 2010,2011,2012,2013, by the GROMACS development team, led by
+ +# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ +# others, as listed in the AUTHORS file in the top-level source
+ +# directory and at http://www.gromacs.org.
+ +#
+ +# GROMACS is free software; you can redistribute it and/or
+ +# modify it under the terms of the GNU Lesser General Public License
+ +# as published by the Free Software Foundation; either version 2.1
+ +# of the License, or (at your option) any later version.
+ +#
+ +# GROMACS is distributed in the hope that it will be useful,
+ +# but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +# Lesser General Public License for more details.
+ +#
+ +# You should have received a copy of the GNU Lesser General Public
+ +# License along with GROMACS; if not, see
+ +# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ +#
+ +# If you want to redistribute modifications to GROMACS, please
+ +# consider that scientific software is very special. Version
+ +# control is crucial - bugs must be traceable. We will be happy to
+ +# consider code for inclusion in the official distribution, but
+ +# derived work must not be called official GROMACS. Details are found
+ +# in the README & COPYING files - if they are missing, get the
+ +# official version at http://www.gromacs.org.
+ +#
+ +# To help us fund GROMACS development, we humbly ask that you cite
+ +# the research papers on the package. Check out http://www.gromacs.org.
+ +
+ +set(LIBGROMACS_SOURCES)
+ +
+ +add_subdirectory(legacyheaders)
+ +add_subdirectory(gmxlib)
+ +add_subdirectory(mdlib)
+ +add_subdirectory(gmxpreprocess)
+ +add_subdirectory(gmxana)
+ +add_subdirectory(analysisdata)
+ +add_subdirectory(commandline)
+ +add_subdirectory(fft)
+ +add_subdirectory(linearalgebra)
+ +add_subdirectory(onlinehelp)
+ +add_subdirectory(options)
+ +add_subdirectory(selection)
+ +add_subdirectory(trajectoryanalysis)
+ +add_subdirectory(utility)
+ +
+ +file(GLOB LIBGROMACS_HEADERS *.h)
+ +install(FILES ${LIBGROMACS_HEADERS} DESTINATION ${INCL_INSTALL_DIR}/gromacs
+ +        COMPONENT development)
+ +
+ +list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES})
+ +
+ +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/version.h.cmakein ${CMAKE_CURRENT_BINARY_DIR}/version.h)
+ +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.h
+ +    DESTINATION ${INCL_INSTALL_DIR}/gromacs
+ +    COMPONENT development)
+ +
+ +# Add target that generates gitversion.c every time make is run
+ +# if git version info is requested
+ +# This code is here instead of utility/CMakeLists.txt because CMake
+ +# ignores set_source_file_properties from subdirectories.
+ +if (GMX_GIT_VERSION_INFO)
+ +    set(GENERATED_VERSION_FILE ${CMAKE_CURRENT_BINARY_DIR}/utility/gitversion.c)
+ +    add_custom_target(gmx_version ALL
+ +            COMMAND ${CMAKE_COMMAND}
+ +                -D GIT_EXECUTABLE="${GIT_EXECUTABLE}"
+ +                -D GIT_VERSION="${GIT_VERSION}"
+ +                -D PROJECT_VERSION="${PROJECT_VERSION}"
+ +                -D PROJECT_SOURCE_DIR="${PROJECT_SOURCE_DIR}"
+ +                -D VERSION_C_CMAKEIN="${CMAKE_CURRENT_SOURCE_DIR}/utility/gitversion.c.cmakein"
+ +                -D VERSION_C_OUT=${GENERATED_VERSION_FILE}
+ +                -P ${CMAKE_SOURCE_DIR}/cmake/gmxGenerateVersionInfo.cmake
+ +            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ +            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/utility/gitversion.c.cmakein
+ +            COMMENT "Generating git version information")
+ +    set_source_files_properties(${GENERATED_VERSION_FILE}
+ +                                PROPERTIES GENERATED true)
+ +    list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
+ +endif()
+ +
+ +# apply gcc 4.4.x bug workaround
+ +if(GMX_USE_GCC44_BUG_WORKAROUND)
+ +   include(gmxGCC44O3BugWorkaround)
+ +   gmx_apply_gcc44_bug_workaround("gmxlib/bondfree.c")
+ +   gmx_apply_gcc44_bug_workaround("mdlib/force.c")
+ +   gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
+ +endif()
+ +
+ +add_library(libgromacs ${LIBGROMACS_SOURCES})
+ +if (GMX_GIT_VERSION_INFO)
+ +    add_dependencies(libgromacs gmx_version)
+ +endif ()
+ +
+ +if(GMX_BUILD_OWN_FFTW)
+ +    # This dependency has to be made here rather than the CMakeLists.txt that
+ +    # does the FFTW build, because of the order in which
+ +    # add_subdirectory() calls are made in the top-level CMakeLists.txt; the
+ +    # md library target does not necessarily exist yet. Also enabling and
+ +    # disabling GMX_BUILD_OWN_FFTW changes dependencies correctly.
+ +    add_dependencies(libgromacs gmxfftw)
+ +endif()
+ +
+ +target_link_libraries(libgromacs ${GMX_GPU_LIBRARIES}
+ +                      ${GMX_EXTRA_LIBRARIES}
++                      ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
++                      ${XML_LIBRARIES} ${GSL_LIBRARIES}
+ +                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS})
+ +set_target_properties(libgromacs PROPERTIES
+ +                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
+ +                      SOVERSION ${SOVERSION}
+ +                      COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+ +
+ +install(TARGETS libgromacs DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
+ +
+ +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgromacs.pc.cmakein
+ +               ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc @ONLY)
+ +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc
+ +        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+ +        RENAME "libgromacs${GMX_LIBS_SUFFIX}.pc"
+ +        COMPONENT development)
+ +
+ +if (INSTALL_CUDART_LIB) #can be set manual by user
+ +    if (GMX_GPU)
+ +        foreach(CUDA_LIB ${CUDA_LIBRARIES})
+ +            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+ +            if(IS_CUDART) #libcuda should not be installed
+ +                #install also name-links (linker uses those)
+ +                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+ +                install(FILES ${CUDA_LIBS} DESTINATION
+ +                    ${LIB_INSTALL_DIR} COMPONENT libraries)
+ +            endif()
+ +        endforeach()
+ +    else()
+ +        message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
+ +    endif()
+ +endif ()
diff --cc src/gromacs/gmxana/dlist.c

index cb01f0ea75cf5888a34150b5c47f3d3f38579ee8,0000000000000000000000000000000000000000..a8d407773f82699d2a5d27c8ae9c16b3bf9522b5

mode 100644,000000..100644
--- 1/src/gromacs/gmxana/dlist.c
--- /dev/null
+++ b/src/gromacs/gmxana/dlist.c
@@@ -1,441 -1,0 +1,443 @@@
-                      (strcmp(*(atoms->atomname[i]), "O1") == 0))
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Green Red Orange Magenta Azure Cyan Skyblue
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <stdlib.h>
+ +
+ +#include "string2.h"
+ +#include "smalloc.h"
+ +#include "gstat.h"
+ +#include "gmx_fatal.h"
+ +#include "index.h"
+ +
+ +t_dlist *mk_dlist(FILE *log,
+ +                  t_atoms *atoms, int *nlist,
+ +                  gmx_bool bPhi, gmx_bool bPsi, gmx_bool bChi, gmx_bool bHChi,
+ +                  int maxchi, int r0, gmx_residuetype_t rt)
+ +{
+ +    int       ires, i, j, k, ii;
+ +    t_dihatms atm, prev;
+ +    int       nl = 0, nc[edMax];
+ +    char     *thisres;
+ +    t_dlist  *dl;
+ +
+ +    snew(dl, atoms->nres+1);
+ +    prev.C = prev.Cn[1] = -1; /* Keep the compiler quiet */
+ +    for (i = 0; (i < edMax); i++)
+ +    {
+ +        nc[i] = 0;
+ +    }
+ +    ires = -1;
+ +    i    =  0;
+ +    while (i < atoms->nr)
+ +    {
+ +        ires = atoms->atom[i].resind;
+ +
+ +        /* Initiate all atom numbers to -1 */
+ +        atm.minC = atm.H = atm.N = atm.C = atm.O = atm.minCalpha = -1;
+ +        for (j = 0; (j < MAXCHI+3); j++)
+ +        {
+ +            atm.Cn[j] = -1;
+ +        }
+ +
+ +        /* Look for atoms in this residue */
+ +        /* maybe should allow for chis to hydrogens? */
+ +        while ((i < atoms->nr) && (atoms->atom[i].resind == ires))
+ +        {
+ +            if ((strcmp(*(atoms->atomname[i]), "H") == 0) ||
+ +                (strcmp(*(atoms->atomname[i]), "H1") == 0) ||
+ +                (strcmp(*(atoms->atomname[i]), "HN") == 0) )
+ +            {
+ +                atm.H = i;
+ +            }
+ +            else if (strcmp(*(atoms->atomname[i]), "N") == 0)
+ +            {
+ +                atm.N = i;
+ +            }
+ +            else if (strcmp(*(atoms->atomname[i]), "C") == 0)
+ +            {
+ +                atm.C = i;
+ +            }
+ +            else if ((strcmp(*(atoms->atomname[i]), "O") == 0) ||
++                     (strcmp(*(atoms->atomname[i]), "O1") == 0) ||
++                     (strcmp(*(atoms->atomname[i]), "OC1") == 0) ||
++                     (strcmp(*(atoms->atomname[i]), "OT1") == 0))
+ +            {
+ +                atm.O = i;
+ +            }
+ +            else if (strcmp(*(atoms->atomname[i]), "CA") == 0)
+ +            {
+ +                atm.Cn[1] = i;
+ +            }
+ +            else if (strcmp(*(atoms->atomname[i]), "CB") == 0)
+ +            {
+ +                atm.Cn[2] = i;
+ +            }
+ +            else if ((strcmp(*(atoms->atomname[i]), "CG") == 0)  ||
+ +                     (strcmp(*(atoms->atomname[i]), "CG1") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "OG") == 0)  ||
+ +                     (strcmp(*(atoms->atomname[i]), "OG1") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "SG") == 0))
+ +            {
+ +                atm.Cn[3] = i;
+ +            }
+ +            else if ((strcmp(*(atoms->atomname[i]), "CD") == 0)  ||
+ +                     (strcmp(*(atoms->atomname[i]), "CD1") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "SD") == 0)  ||
+ +                     (strcmp(*(atoms->atomname[i]), "OD1") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "ND1") == 0))
+ +            {
+ +                atm.Cn[4] = i;
+ +            }
+ +            /* by grs - split the Cn[4] into 2 bits to check allowing dih to H */
+ +            else if (bHChi && ((strcmp(*(atoms->atomname[i]), "HG")  == 0) ||
+ +                               (strcmp(*(atoms->atomname[i]), "HG1")  == 0)) )
+ +            {
+ +                atm.Cn[4] = i;
+ +            }
+ +            else if ((strcmp(*(atoms->atomname[i]), "CE") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "CE1") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "OE1") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "NE") == 0))
+ +            {
+ +                atm.Cn[5] = i;
+ +            }
+ +            else if ((strcmp(*(atoms->atomname[i]), "CZ") == 0) ||
+ +                     (strcmp(*(atoms->atomname[i]), "NZ") == 0))
+ +            {
+ +                atm.Cn[6] = i;
+ +            }
+ +            /* HChi flag here too */
+ +            else if (bHChi && (strcmp(*(atoms->atomname[i]), "NH1") == 0))
+ +            {
+ +                atm.Cn[7] = i;
+ +            }
+ +            i++;
+ +        }
+ +
+ +        thisres = *(atoms->resinfo[ires].name);
+ +
+ +        /* added by grs - special case for aromatics, whose chis above 2 are
+ +           not real and produce rubbish output - so set back to -1 */
+ +        if (strcmp(thisres, "PHE") == 0 ||
+ +            strcmp(thisres, "TYR") == 0 ||
+ +            strcmp(thisres, "PTR") == 0 ||
+ +            strcmp(thisres, "TRP") == 0 ||
+ +            strcmp(thisres, "HIS") == 0 ||
+ +            strcmp(thisres, "HISA") == 0 ||
+ +            strcmp(thisres, "HISB") == 0)
+ +        {
+ +            for (ii = 5; ii <= 7; ii++)
+ +            {
+ +                atm.Cn[ii] = -1;
+ +            }
+ +        }
+ +        /* end fixing aromatics */
+ +
+ +        /* Special case for Pro, has no H */
+ +        if (strcmp(thisres, "PRO") == 0)
+ +        {
+ +            atm.H = atm.Cn[4];
+ +        }
+ +        /* Carbon from previous residue */
+ +        if (prev.C != -1)
+ +        {
+ +            atm.minC = prev.C;
+ +        }
+ +        /* Alpha-carbon from previous residue */
+ +        if (prev.Cn[1] != -1)
+ +        {
+ +            atm.minCalpha = prev.Cn[1];
+ +        }
+ +        prev = atm;
+ +
+ +        /* Check how many dihedrals we have */
+ +        if ((atm.N != -1) && (atm.Cn[1] != -1) && (atm.C != -1) &&
+ +            (atm.O != -1) && ((atm.H != -1) || (atm.minC != -1)))
+ +        {
+ +            dl[nl].resnr     = ires+1;
+ +            dl[nl].atm       = atm;
+ +            dl[nl].atm.Cn[0] = atm.N;
+ +            if ((atm.Cn[3] != -1) && (atm.Cn[2] != -1) && (atm.Cn[1] != -1))
+ +            {
+ +                nc[0]++;
+ +                if (atm.Cn[4] != -1)
+ +                {
+ +                    nc[1]++;
+ +                    if (atm.Cn[5] != -1)
+ +                    {
+ +                        nc[2]++;
+ +                        if (atm.Cn[6] != -1)
+ +                        {
+ +                            nc[3]++;
+ +                            if (atm.Cn[7] != -1)
+ +                            {
+ +                                nc[4]++;
+ +                                if (atm.Cn[8] != -1)
+ +                                {
+ +                                    nc[5]++;
+ +                                }
+ +                            }
+ +                        }
+ +                    }
+ +                }
+ +            }
+ +            if ((atm.minC != -1) && (atm.minCalpha != -1))
+ +            {
+ +                nc[6]++;
+ +            }
+ +            dl[nl].index = gmx_residuetype_get_index(rt, thisres);
+ +
+ +            sprintf(dl[nl].name, "%s%d", thisres, ires+r0);
+ +            nl++;
+ +        }
+ +        else if (debug)
+ +        {
+ +            fprintf(debug, "Could not find N atom but could find other atoms"
+ +                    " in residue %s%d\n", thisres, ires+r0);
+ +        }
+ +    }
+ +    fprintf(stderr, "\n");
+ +    fprintf(log, "\n");
+ +    fprintf(log, "There are %d residues with dihedrals\n", nl);
+ +    j = 0;
+ +    if (bPhi)
+ +    {
+ +        j += nl;
+ +    }
+ +    if (bPsi)
+ +    {
+ +        j += nl;
+ +    }
+ +    if (bChi)
+ +    {
+ +        for (i = 0; (i < maxchi); i++)
+ +        {
+ +            j += nc[i];
+ +        }
+ +    }
+ +    fprintf(log, "There are %d dihedrals\n", j);
+ +    fprintf(log, "Dihedral: ");
+ +    if (bPhi)
+ +    {
+ +        fprintf(log, " Phi  ");
+ +    }
+ +    if (bPsi)
+ +    {
+ +        fprintf(log, " Psi  ");
+ +    }
+ +    if (bChi)
+ +    {
+ +        for (i = 0; (i < maxchi); i++)
+ +        {
+ +            fprintf(log, "Chi%d  ", i+1);
+ +        }
+ +    }
+ +    fprintf(log, "\nNumber:   ");
+ +    if (bPhi)
+ +    {
+ +        fprintf(log, "%4d  ", nl);
+ +    }
+ +    if (bPsi)
+ +    {
+ +        fprintf(log, "%4d  ", nl);
+ +    }
+ +    if (bChi)
+ +    {
+ +        for (i = 0; (i < maxchi); i++)
+ +        {
+ +            fprintf(log, "%4d  ", nc[i]);
+ +        }
+ +    }
+ +    fprintf(log, "\n");
+ +
+ +    *nlist = nl;
+ +
+ +    return dl;
+ +}
+ +
+ +gmx_bool has_dihedral(int Dih, t_dlist *dl)
+ +{
+ +    gmx_bool b = FALSE;
+ +    int      ddd;
+ +
+ +    switch (Dih)
+ +    {
+ +        case edPhi:
+ +            b = ((dl->atm.H != -1) && (dl->atm.N != -1) && (dl->atm.Cn[1] != -1) && (dl->atm.C != -1));
+ +            break;
+ +        case edPsi:
+ +            b = ((dl->atm.N != -1) && (dl->atm.Cn[1] != -1) && (dl->atm.C != -1) && (dl->atm.O != -1));
+ +            break;
+ +        case edOmega:
+ +            b = ((dl->atm.minCalpha != -1) && (dl->atm.minC != -1) && (dl->atm.N != -1) && (dl->atm.Cn[1] != -1));
+ +            break;
+ +        case edChi1:
+ +        case edChi2:
+ +        case edChi3:
+ +        case edChi4:
+ +        case edChi5:
+ +        case edChi6:
+ +            ddd = Dih - edChi1;
+ +            b   = ((dl->atm.Cn[ddd] != -1) &&  (dl->atm.Cn[ddd+1] != -1) &&
+ +                   (dl->atm.Cn[ddd+2] != -1) && (dl->atm.Cn[ddd+3] != -1));
+ +            break;
+ +        default:
+ +            pr_dlist(stdout, 1, dl, 1, 0, TRUE, TRUE, TRUE, TRUE, MAXCHI);
+ +            gmx_fatal(FARGS, "Non existant dihedral %d in file %s, line %d",
+ +                      Dih, __FILE__, __LINE__);
+ +    }
+ +    return b;
+ +}
+ +
+ +static void pr_one_ro(FILE *fp, t_dlist *dl, int nDih, real gmx_unused dt)
+ +{
+ +    int k;
+ +    for (k = 0; k < NROT; k++)
+ +    {
+ +        fprintf(fp, "  %6.2f", dl->rot_occ[nDih][k]);
+ +    }
+ +    fprintf(fp, "\n");
+ +}
+ +
+ +static void pr_ntr_s2(FILE *fp, t_dlist *dl, int nDih, real dt)
+ +{
+ +    fprintf(fp, "  %6.2f  %6.2f\n", (dt == 0) ? 0 : dl->ntr[nDih]/dt, dl->S2[nDih]);
+ +}
+ +
+ +void pr_dlist(FILE *fp, int nl, t_dlist dl[], real dt, int printtype,
+ +              gmx_bool bPhi, gmx_bool bPsi, gmx_bool bChi, gmx_bool bOmega, int maxchi)
+ +{
+ +    int   i, Xi;
+ +
+ +    void  (*pr_props)(FILE *, t_dlist *, int, real);
+ +
+ +    /* Analysis of dihedral transitions etc */
+ +
+ +    if (printtype == edPrintST)
+ +    {
+ +        pr_props = pr_ntr_s2;
+ +        fprintf(stderr, "Now printing out transitions and OPs...\n");
+ +    }
+ +    else
+ +    {
+ +        pr_props = pr_one_ro;
+ +        fprintf(stderr, "Now printing out rotamer occupancies...\n");
+ +        fprintf(fp, "\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n\n");
+ +    }
+ +
+ +    /* change atom numbers from 0 based to 1 based */
+ +    for (i = 0; (i < nl); i++)
+ +    {
+ +        fprintf(fp, "Residue %s\n", dl[i].name);
+ +        if (printtype == edPrintST)
+ +        {
+ +            fprintf(fp, " Angle [   AI,   AJ,   AK,   AL]  #tr/ns  S^2D  \n"
+ +                    "--------------------------------------------\n");
+ +        }
+ +        else
+ +        {
+ +            fprintf(fp, " Angle [   AI,   AJ,   AK,   AL]  rotamers  0  g(-)  t  g(+)\n"
+ +                    "--------------------------------------------\n");
+ +        }
+ +        if (bPhi)
+ +        {
+ +            fprintf(fp, "   Phi [%5d,%5d,%5d,%5d]",
+ +                    (dl[i].atm.H == -1) ? 1+dl[i].atm.minC : 1+dl[i].atm.H,
+ +                    1+dl[i].atm.N, 1+dl[i].atm.Cn[1], 1+dl[i].atm.C);
+ +            pr_props(fp, &dl[i], edPhi, dt);
+ +        }
+ +        if (bPsi)
+ +        {
+ +            fprintf(fp, "   Psi [%5d,%5d,%5d,%5d]", 1+dl[i].atm.N, 1+dl[i].atm.Cn[1],
+ +                    1+dl[i].atm.C, 1+dl[i].atm.O);
+ +            pr_props(fp, &dl[i], edPsi, dt);
+ +        }
+ +        if (bOmega && has_dihedral(edOmega, &(dl[i])))
+ +        {
+ +            fprintf(fp, " Omega [%5d,%5d,%5d,%5d]", 1+dl[i].atm.minCalpha, 1+dl[i].atm.minC,
+ +                    1+dl[i].atm.N, 1+dl[i].atm.Cn[1]);
+ +            pr_props(fp, &dl[i], edOmega, dt);
+ +        }
+ +        for (Xi = 0; Xi < MAXCHI; Xi++)
+ +        {
+ +            if (bChi && (Xi < maxchi) && (dl[i].atm.Cn[Xi+3] != -1) )
+ +            {
+ +                fprintf(fp, "   Chi%d[%5d,%5d,%5d,%5d]", Xi+1, 1+dl[i].atm.Cn[Xi],
+ +                        1+dl[i].atm.Cn[Xi+1], 1+dl[i].atm.Cn[Xi+2],
+ +                        1+dl[i].atm.Cn[Xi+3]);
+ +                pr_props(fp, &dl[i], Xi+edChi1, dt); /* Xi+2 was wrong here */
+ +            }
+ +        }
+ +        fprintf(fp, "\n");
+ +    }
+ +}
+ +
+ +
+ +
+ +int pr_trans(FILE *fp, int nl, t_dlist dl[], real dt, int Xi)
+ +{
+ +    /* never called at the moment */
+ +
+ +    int  i, nn, nz;
+ +
+ +    nz = 0;
+ +    fprintf(fp, "\\begin{table}[h]\n");
+ +    fprintf(fp, "\\caption{Number of dihedral transitions per nanosecond}\n");
+ +    fprintf(fp, "\\begin{tabular}{|l|l|}\n");
+ +    fprintf(fp, "\\hline\n");
+ +    fprintf(fp, "Residue\t&$\\chi_%d$\t\\\\\n", Xi+1);
+ +    for (i = 0; (i < nl); i++)
+ +    {
+ +        nn = dl[i].ntr[Xi]/dt;
+ +
+ +        if (nn == 0)
+ +        {
+ +            fprintf(fp, "%s\t&\\HL{%d}\t\\\\\n", dl[i].name, nn);
+ +            nz++;
+ +        }
+ +        else if (nn > 0)
+ +        {
+ +            fprintf(fp, "%s\t&\\%d\t\\\\\n", dl[i].name, nn);
+ +        }
+ +    }
+ +    fprintf(fp, "\\hline\n");
+ +    fprintf(fp, "\\end{tabular}\n");
+ +    fprintf(fp, "\\end{table}\n\n");
+ +
+ +    return nz;
+ +}
diff --cc src/gromacs/gmxana/gmx_enemat.c

index 5849aa2021bd0809d4dc435c1b46b370f0dd0a6f,0000000000000000000000000000000000000000..44a3ee850cd6a5f220e9cee91e1476f0aa8a722d

mode 100644,000000..100644
--- 1/src/gromacs/gmxana/gmx_enemat.c
--- /dev/null
+++ b/src/gromacs/gmxana/gmx_enemat.c
@@@ -1,583 -1,0 +1,587 @@@
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Green Red Orange Magenta Azure Cyan Skyblue
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +#include <string.h>
+ +#include <math.h>
+ +
+ +#include "string2.h"
+ +#include "typedefs.h"
+ +#include "gmx_fatal.h"
+ +#include "vec.h"
+ +#include "smalloc.h"
+ +#include "enxio.h"
+ +#include "statutil.h"
+ +#include "names.h"
+ +#include "macros.h"
+ +#include "xvgr.h"
+ +#include "gstat.h"
+ +#include "physics.h"
+ +#include "matio.h"
+ +#include "strdb.h"
+ +#include "gmx_ana.h"
+ +
+ +
+ +static int search_str2(int nstr, char **str, char *key)
+ +{
+ +    int  i, n;
+ +    int  keylen = strlen(key);
+ +    /* Linear search */
+ +    n = 0;
+ +    while ( (n < keylen) && ((key[n] < '0') || (key[n] > '9')) )
+ +    {
+ +        n++;
+ +    }
+ +    for (i = 0; (i < nstr); i++)
+ +    {
+ +        if (gmx_strncasecmp(str[i], key, n) == 0)
+ +        {
+ +            return i;
+ +        }
+ +    }
+ +
+ +    return -1;
+ +}
+ +
+ +int gmx_enemat(int argc, char *argv[])
+ +{
+ +    const char     *desc[] = {
+ +        "[TT]g_enemat[tt] extracts an energy matrix from the energy file ([TT]-f[tt]).",
+ +        "With [TT]-groups[tt] a file must be supplied with on each",
+ +        "line a group of atoms to be used. For these groups matrix of",
+ +        "interaction energies will be extracted from the energy file",
+ +        "by looking for energy groups with names corresponding to pairs",
+ +        "of groups of atoms, e.g. if your [TT]-groups[tt] file contains:[BR]",
+ +        "[TT]2[tt][BR]",
+ +        "[TT]Protein[tt][BR]",
+ +        "[TT]SOL[tt][BR]",
+ +        "then energy groups with names like 'Coul-SR:Protein-SOL' and ",
+ +        "'LJ:Protein-SOL' are expected in the energy file (although",
+ +        "[TT]g_enemat[tt] is most useful if many groups are analyzed",
+ +        "simultaneously). Matrices for different energy types are written",
+ +        "out separately, as controlled by the",
+ +        "[TT]-[no]coul[tt], [TT]-[no]coulr[tt], [TT]-[no]coul14[tt], ",
+ +        "[TT]-[no]lj[tt], [TT]-[no]lj14[tt], ",
+ +        "[TT]-[no]bham[tt] and [TT]-[no]free[tt] options.",
+ +        "Finally, the total interaction energy energy per group can be ",
+ +        "calculated ([TT]-etot[tt]).[PAR]",
+ +
+ +        "An approximation of the free energy can be calculated using:",
+ +        "[MATH]E[SUB]free[sub] = E[SUB]0[sub] + kT [LOG][CHEVRON][EXP](E-E[SUB]0[sub])/kT[exp][chevron][log][math], where '[MATH][CHEVRON][chevron][math]'",
+ +        "stands for time-average. A file with reference free energies",
+ +        "can be supplied to calculate the free energy difference",
+ +        "with some reference state. Group names (e.g. residue names)",
+ +        "in the reference file should correspond to the group names",
+ +        "as used in the [TT]-groups[tt] file, but a appended number",
+ +        "(e.g. residue number) in the [TT]-groups[tt] will be ignored",
+ +        "in the comparison."
+ +    };
+ +    static gmx_bool bSum      = FALSE;
+ +    static gmx_bool bMeanEmtx = TRUE;
+ +    static int      skip      = 0, nlevels = 20;
+ +    static real     cutmax    = 1e20, cutmin = -1e20, reftemp = 300.0;
+ +    static gmx_bool bCoulSR   = TRUE, bCoulLR = FALSE, bCoul14 = FALSE;
+ +    static gmx_bool bLJSR     = TRUE, bLJLR = FALSE, bLJ14 = FALSE, bBhamSR = FALSE, bBhamLR = FALSE,
+ +                    bFree     = TRUE;
+ +    t_pargs         pa[]      = {
+ +        { "-sum",  FALSE, etBOOL, {&bSum},
+ +          "Sum the energy terms selected rather than display them all" },
+ +        { "-skip", FALSE, etINT,  {&skip},
+ +          "Skip number of frames between data points" },
+ +        { "-mean", FALSE, etBOOL, {&bMeanEmtx},
+ +          "with [TT]-groups[tt] extracts matrix of mean energies instead of "
+ +          "matrix for each timestep" },
+ +        { "-nlevels", FALSE, etINT, {&nlevels}, "number of levels for matrix colors"},
+ +        { "-max", FALSE, etREAL, {&cutmax}, "max value for energies"},
+ +        { "-min", FALSE, etREAL, {&cutmin}, "min value for energies"},
+ +        { "-coulsr", FALSE, etBOOL, {&bCoulSR}, "extract Coulomb SR energies"},
+ +        { "-coullr", FALSE, etBOOL, {&bCoulLR}, "extract Coulomb LR energies"},
+ +        { "-coul14", FALSE, etBOOL, {&bCoul14}, "extract Coulomb 1-4 energies"},
+ +        { "-ljsr", FALSE, etBOOL, {&bLJSR}, "extract Lennard-Jones SR energies"},
+ +        { "-ljlr", FALSE, etBOOL, {&bLJLR}, "extract Lennard-Jones LR energies"},
+ +        { "-lj14", FALSE, etBOOL, {&bLJ14}, "extract Lennard-Jones 1-4 energies"},
+ +        { "-bhamsr", FALSE, etBOOL, {&bBhamSR}, "extract Buckingham SR energies"},
+ +        { "-bhamlr", FALSE, etBOOL, {&bBhamLR}, "extract Buckingham LR energies"},
+ +        { "-free", FALSE, etBOOL, {&bFree}, "calculate free energy"},
+ +        { "-temp", FALSE, etREAL, {&reftemp},
+ +          "reference temperature for free energy calculation"}
+ +    };
+ +    /* We will define egSP more energy-groups:
+ +       egTotal (total energy) */
+ +#define egTotal egNR
+ +#define egSP 1
+ +    gmx_bool       egrp_use[egNR+egSP];
+ +    ener_file_t    in;
+ +    FILE          *out;
+ +    int            timecheck = 0;
+ +    gmx_enxnm_t   *enm       = NULL;
+ +    t_enxframe    *fr;
+ +    int            teller = 0;
+ +    real           sum;
+ +    gmx_bool       bCont, bRef;
+ +    gmx_bool       bCutmax, bCutmin;
+ +    real         **eneset, *time = NULL;
+ +    int           *set, i, j, k, prevk, m = 0, n, nre, nset, nenergy;
+ +    char         **groups = NULL;
+ +    char           groupname[255], fn[255];
+ +    int            ngroups;
+ +    t_rgb          rlo, rhi, rmid;
+ +    real           emax, emid, emin;
+ +    real        ***emat, **etot, *groupnr;
+ +    double         beta, expE, **e, *eaver, *efree = NULL, edum;
+ +    char           label[234];
+ +    char         **ereflines, **erefres = NULL;
+ +    real          *eref  = NULL, *edif = NULL;
+ +    int            neref = 0;
+ +    output_env_t   oenv;
+ +
+ +    t_filenm       fnm[] = {
+ +        { efEDR, "-f", NULL, ffOPTRD },
+ +        { efDAT, "-groups", "groups.dat", ffREAD },
+ +        { efDAT, "-eref",   "eref.dat", ffOPTRD },
+ +        { efXPM, "-emat",   "emat", ffWRITE },
+ +        { efXVG, "-etot",   "energy", ffWRITE }
+ +    };
+ +#define NFILE asize(fnm)
+ +
+ +    parse_common_args(&argc, argv, PCA_CAN_VIEW | PCA_CAN_TIME | PCA_BE_NICE,
+ +                      NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, NULL, &oenv);
+ +
++    for (i = 0; (i < egNR+egSP); i++)
++    {
++        egrp_use[i] = FALSE;
++    }
+ +    egrp_use[egCOULSR] = bCoulSR;
+ +    egrp_use[egLJSR]   = bLJSR;
+ +    egrp_use[egBHAMSR] = bBhamSR;
+ +    egrp_use[egCOULLR] = bCoulLR;
+ +    egrp_use[egLJLR]   = bLJLR;
+ +    egrp_use[egBHAMLR] = bBhamLR;
+ +    egrp_use[egCOUL14] = bCoul14;
+ +    egrp_use[egLJ14]   = bLJ14;
+ +    egrp_use[egTotal]  = TRUE;
+ +
+ +    bRef = opt2bSet("-eref", NFILE, fnm);
+ +    in   = open_enx(ftp2fn(efEDR, NFILE, fnm), "r");
+ +    do_enxnms(in, &nre, &enm);
+ +
+ +    if (nre == 0)
+ +    {
+ +        gmx_fatal(FARGS, "No energies!\n");
+ +    }
+ +
+ +    bCutmax = opt2parg_bSet("-max", asize(pa), pa);
+ +    bCutmin = opt2parg_bSet("-min", asize(pa), pa);
+ +
+ +    nenergy = 0;
+ +
+ +    /* Read groupnames from input file and construct selection of
+ +       energy groups from it*/
+ +
+ +    fprintf(stderr, "Will read groupnames from inputfile\n");
+ +    ngroups = get_lines(opt2fn("-groups", NFILE, fnm), &groups);
+ +    fprintf(stderr, "Read %d groups\n", ngroups);
+ +    snew(set, sqr(ngroups)*egNR/2);
+ +    n     = 0;
+ +    prevk = 0;
+ +    for (i = 0; (i < ngroups); i++)
+ +    {
+ +        fprintf(stderr, "\rgroup %d", i);
+ +        for (j = i; (j < ngroups); j++)
+ +        {
+ +            for (m = 0; (m < egNR); m++)
+ +            {
+ +                if (egrp_use[m])
+ +                {
+ +                    sprintf(groupname, "%s:%s-%s", egrp_nm[m], groups[i], groups[j]);
+ +#ifdef DEBUG
+ +                    fprintf(stderr, "\r%-15s %5d", groupname, n);
+ +#endif
+ +                    for (k = prevk; (k < prevk+nre); k++)
+ +                    {
+ +                        if (strcmp(enm[k%nre].name, groupname) == 0)
+ +                        {
+ +                            set[n++] = k;
+ +                            break;
+ +                        }
+ +                    }
+ +                    if (k == prevk+nre)
+ +                    {
+ +                        fprintf(stderr, "WARNING! could not find group %s (%d,%d)"
+ +                                "in energy file\n", groupname, i, j);
+ +                    }
+ +                    else
+ +                    {
+ +                        prevk = k;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +    fprintf(stderr, "\n");
+ +    nset = n;
+ +    snew(eneset, nset+1);
+ +    fprintf(stderr, "Will select half-matrix of energies with %d elements\n", n);
+ +
+ +    /* Start reading energy frames */
+ +    snew(fr, 1);
+ +    do
+ +    {
+ +        do
+ +        {
+ +            bCont = do_enx(in, fr);
+ +            if (bCont)
+ +            {
+ +                timecheck = check_times(fr->t);
+ +            }
+ +        }
+ +        while (bCont && (timecheck < 0));
+ +
+ +        if (timecheck == 0)
+ +        {
+ +#define DONTSKIP(cnt) (skip) ? ((cnt % skip) == 0) : TRUE
+ +
+ +            if (bCont)
+ +            {
+ +                fprintf(stderr, "\rRead frame: %d, Time: %.3f", teller, fr->t);
+ +
+ +                if ((nenergy % 1000) == 0)
+ +                {
+ +                    srenew(time, nenergy+1000);
+ +                    for (i = 0; (i <= nset); i++)
+ +                    {
+ +                        srenew(eneset[i], nenergy+1000);
+ +                    }
+ +                }
+ +                time[nenergy] = fr->t;
+ +                sum           = 0;
+ +                for (i = 0; (i < nset); i++)
+ +                {
+ +                    eneset[i][nenergy] = fr->ener[set[i]].e;
+ +                    sum               += fr->ener[set[i]].e;
+ +                }
+ +                if (bSum)
+ +                {
+ +                    eneset[nset][nenergy] = sum;
+ +                }
+ +                nenergy++;
+ +            }
+ +            teller++;
+ +        }
+ +    }
+ +    while (bCont && (timecheck == 0));
+ +
+ +    fprintf(stderr, "\n");
+ +
+ +    fprintf(stderr, "Will build energy half-matrix of %d groups, %d elements, "
+ +            "over %d frames\n", ngroups, nset, nenergy);
+ +
+ +    snew(emat, egNR+egSP);
+ +    for (j = 0; (j < egNR+egSP); j++)
+ +    {
+ +        if (egrp_use[m])
+ +        {
+ +            snew(emat[j], ngroups);
+ +            for (i = 0; (i < ngroups); i++)
+ +            {
+ +                snew(emat[j][i], ngroups);
+ +            }
+ +        }
+ +    }
+ +    snew(groupnr, ngroups);
+ +    for (i = 0; (i < ngroups); i++)
+ +    {
+ +        groupnr[i] = i+1;
+ +    }
+ +    rlo.r  = 1.0, rlo.g  = 0.0, rlo.b  = 0.0;
+ +    rmid.r = 1.0, rmid.g = 1.0, rmid.b = 1.0;
+ +    rhi.r  = 0.0, rhi.g  = 0.0, rhi.b  = 1.0;
+ +    if (bMeanEmtx)
+ +    {
+ +        snew(e, ngroups);
+ +        for (i = 0; (i < ngroups); i++)
+ +        {
+ +            snew(e[i], nenergy);
+ +        }
+ +        n = 0;
+ +        for (i = 0; (i < ngroups); i++)
+ +        {
+ +            for (j = i; (j < ngroups); j++)
+ +            {
+ +                for (m = 0; (m < egNR); m++)
+ +                {
+ +                    if (egrp_use[m])
+ +                    {
+ +                        for (k = 0; (k < nenergy); k++)
+ +                        {
+ +                            emat[m][i][j] += eneset[n][k];
+ +                            e[i][k]       += eneset[n][k]; /* *0.5; */
+ +                            e[j][k]       += eneset[n][k]; /* *0.5; */
+ +                        }
+ +                        n++;
+ +                        emat[egTotal][i][j] += emat[m][i][j];
+ +                        emat[m][i][j]       /= nenergy;
+ +                        emat[m][j][i]        = emat[m][i][j];
+ +                    }
+ +                }
+ +                emat[egTotal][i][j] /= nenergy;
+ +                emat[egTotal][j][i]  = emat[egTotal][i][j];
+ +            }
+ +        }
+ +        if (bFree)
+ +        {
+ +            if (bRef)
+ +            {
+ +                fprintf(stderr, "Will read reference energies from inputfile\n");
+ +                neref = get_lines(opt2fn("-eref", NFILE, fnm), &ereflines);
+ +                fprintf(stderr, "Read %d reference energies\n", neref);
+ +                snew(eref, neref);
+ +                snew(erefres, neref);
+ +                for (i = 0; (i < neref); i++)
+ +                {
+ +                    snew(erefres[i], 5);
+ +                    sscanf(ereflines[i], "%s %lf", erefres[i], &edum);
+ +                    eref[i] = edum;
+ +                }
+ +            }
+ +            snew(eaver, ngroups);
+ +            for (i = 0; (i < ngroups); i++)
+ +            {
+ +                for (k = 0; (k < nenergy); k++)
+ +                {
+ +                    eaver[i] += e[i][k];
+ +                }
+ +                eaver[i] /= nenergy;
+ +            }
+ +            beta = 1.0/(BOLTZ*reftemp);
+ +            snew(efree, ngroups);
+ +            snew(edif, ngroups);
+ +            for (i = 0; (i < ngroups); i++)
+ +            {
+ +                expE = 0;
+ +                for (k = 0; (k < nenergy); k++)
+ +                {
+ +                    expE += exp(beta*(e[i][k]-eaver[i]));
+ +                }
+ +                efree[i] = log(expE/nenergy)/beta + eaver[i];
+ +                if (bRef)
+ +                {
+ +                    n = search_str2(neref, erefres, groups[i]);
+ +                    if (n != -1)
+ +                    {
+ +                        edif[i] = efree[i]-eref[n];
+ +                    }
+ +                    else
+ +                    {
+ +                        edif[i] = efree[i];
+ +                        fprintf(stderr, "WARNING: group %s not found "
+ +                                "in reference energies.\n", groups[i]);
+ +                    }
+ +                }
+ +                else
+ +                {
+ +                    edif[i] = 0;
+ +                }
+ +            }
+ +        }
+ +
+ +        emid             = 0.0; /*(emin+emax)*0.5;*/
+ +        egrp_nm[egTotal] = "total";
+ +        for (m = 0; (m < egNR+egSP); m++)
+ +        {
+ +            if (egrp_use[m])
+ +            {
+ +                emin = 1e10;
+ +                emax = -1e10;
+ +                for (i = 0; (i < ngroups); i++)
+ +                {
+ +                    for (j = i; (j < ngroups); j++)
+ +                    {
+ +                        if (emat[m][i][j] > emax)
+ +                        {
+ +                            emax = emat[m][i][j];
+ +                        }
+ +                        else if (emat[m][i][j] < emin)
+ +                        {
+ +                            emin = emat[m][i][j];
+ +                        }
+ +                    }
+ +                }
+ +                if (emax == emin)
+ +                {
+ +                    fprintf(stderr, "Matrix of %s energy is uniform at %f "
+ +                            "(will not produce output).\n", egrp_nm[m], emax);
+ +                }
+ +                else
+ +                {
+ +                    fprintf(stderr, "Matrix of %s energy ranges from %f to %f\n",
+ +                            egrp_nm[m], emin, emax);
+ +                    if ((bCutmax) || (emax > cutmax))
+ +                    {
+ +                        emax = cutmax;
+ +                    }
+ +                    if ((bCutmin) || (emin < cutmin))
+ +                    {
+ +                        emin = cutmin;
+ +                    }
+ +                    if ((emax == cutmax) || (emin == cutmin))
+ +                    {
+ +                        fprintf(stderr, "Energy range adjusted: %f to %f\n", emin, emax);
+ +                    }
+ +
+ +                    sprintf(fn, "%s%s", egrp_nm[m], ftp2fn(efXPM, NFILE, fnm));
+ +                    sprintf(label, "%s Interaction Energies", egrp_nm[m]);
+ +                    out = ffopen(fn, "w");
+ +                    if (emin >= emid)
+ +                    {
+ +                        write_xpm(out, 0, label, "Energy (kJ/mol)",
+ +                                  "Residue Index", "Residue Index",
+ +                                  ngroups, ngroups, groupnr, groupnr, emat[m],
+ +                                  emid, emax, rmid, rhi, &nlevels);
+ +                    }
+ +                    else if (emax <= emid)
+ +                    {
+ +                        write_xpm(out, 0, label, "Energy (kJ/mol)",
+ +                                  "Residue Index", "Residue Index",
+ +                                  ngroups, ngroups, groupnr, groupnr, emat[m],
+ +                                  emin, emid, rlo, rmid, &nlevels);
+ +                    }
+ +                    else
+ +                    {
+ +                        write_xpm3(out, 0, label, "Energy (kJ/mol)",
+ +                                   "Residue Index", "Residue Index",
+ +                                   ngroups, ngroups, groupnr, groupnr, emat[m],
+ +                                   emin, emid, emax, rlo, rmid, rhi, &nlevels);
+ +                    }
+ +                    ffclose(out);
+ +                }
+ +            }
+ +        }
+ +        snew(etot, egNR+egSP);
+ +        for (m = 0; (m < egNR+egSP); m++)
+ +        {
+ +            snew(etot[m], ngroups);
+ +            for (i = 0; (i < ngroups); i++)
+ +            {
+ +                for (j = 0; (j < ngroups); j++)
+ +                {
+ +                    etot[m][i] += emat[m][i][j];
+ +                }
+ +            }
+ +        }
+ +
+ +        out = xvgropen(ftp2fn(efXVG, NFILE, fnm), "Mean Energy", "Residue", "kJ/mol",
+ +                       oenv);
+ +        xvgr_legend(out, 0, NULL, oenv);
+ +        j = 0;
+ +        for (m = 0; (m < egNR+egSP); m++)
+ +        {
+ +            if (egrp_use[m])
+ +            {
+ +                fprintf(out, "@ legend string %d \"%s\"\n", j++, egrp_nm[m]);
+ +            }
+ +        }
+ +        if (bFree)
+ +        {
+ +            fprintf(out, "@ legend string %d \"%s\"\n", j++, "Free");
+ +        }
+ +        if (bFree)
+ +        {
+ +            fprintf(out, "@ legend string %d \"%s\"\n", j++, "Diff");
+ +        }
+ +        fprintf(out, "@TYPE xy\n");
+ +        fprintf(out, "#%3s", "grp");
+ +        for (m = 0; (m < egNR+egSP); m++)
+ +        {
+ +            if (egrp_use[m])
+ +            {
+ +                fprintf(out, " %9s", egrp_nm[m]);
+ +            }
+ +        }
+ +        if (bFree)
+ +        {
+ +            fprintf(out, " %9s", "Free");
+ +        }
+ +        if (bFree)
+ +        {
+ +            fprintf(out, " %9s", "Diff");
+ +        }
+ +        fprintf(out, "\n");
+ +        for (i = 0; (i < ngroups); i++)
+ +        {
+ +            fprintf(out, "%3.0f", groupnr[i]);
+ +            for (m = 0; (m < egNR+egSP); m++)
+ +            {
+ +                if (egrp_use[m])
+ +                {
+ +                    fprintf(out, " %9.5g", etot[m][i]);
+ +                }
+ +            }
+ +            if (bFree)
+ +            {
+ +                fprintf(out, " %9.5g", efree[i]);
+ +            }
+ +            if (bRef)
+ +            {
+ +                fprintf(out, " %9.5g", edif[i]);
+ +            }
+ +            fprintf(out, "\n");
+ +        }
+ +        ffclose(out);
+ +    }
+ +    else
+ +    {
+ +        fprintf(stderr, "While typing at your keyboard, suddenly...\n"
+ +                "...nothing happens.\nWARNING: Not Implemented Yet\n");
+ +/*
+ +    out=ftp2FILE(efMAT,NFILE,fnm,"w");
+ +    n=0;
+ +    emin=emax=0.0;
+ +    for (k=0; (k<nenergy); k++) {
+ +      for (i=0; (i<ngroups); i++)
+ +    for (j=i+1; (j<ngroups); j++)
+ +      emat[i][j]=eneset[n][k];
+ +      sprintf(label,"t=%.0f ps",time[k]);
+ +      write_matrix(out,ngroups,1,ngroups,groupnr,emat,label,emin,emax,nlevels);
+ +      n++;
+ +    }
+ +    ffclose(out);
+ + */
+ +    }
+ +    close_enx(in);
+ +
+ +    return 0;
+ +}
diff --cc src/gromacs/gmxlib/bondfree.c

index 230ccea7ac5fa2779ae8d87445c98e5d2a5afe17,0000000000000000000000000000000000000000..f03f8738f3a6beb0a17df84d9c2ba8f671102406

mode 100644,000000..100644
--- 1/src/gromacs/gmxlib/bondfree.c
--- /dev/null
+++ b/src/gromacs/gmxlib/bondfree.c
@@@ -1,4452 -1,0 +1,4401 @@@
- #ifdef GMX_X86_SSE2
- #define SIMD_BONDEDS
- 
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROningen Mixture of Alchemy and Childrens' Stories
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +#include "physics.h"
+ +#include "vec.h"
+ +#include "maths.h"
+ +#include "txtdump.h"
+ +#include "bondf.h"
+ +#include "smalloc.h"
+ +#include "pbc.h"
+ +#include "ns.h"
+ +#include "macros.h"
+ +#include "names.h"
+ +#include "gmx_fatal.h"
+ +#include "mshift.h"
+ +#include "main.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "force.h"
+ +#include "nonbonded.h"
+ +
- /* Below are 3 SIMD vector operations.
-  * Currently these are only used here, but they should be moved to
-  * a general SIMD include file when used elsewhere.
-  */
- 
- /* SIMD inner-product of multiple vectors */
- static gmx_inline gmx_mm_pr
- gmx_iprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
-              gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz)
- {
-     gmx_mm_pr ret;
- 
-     ret = gmx_mul_pr(ax, bx);
-     ret = gmx_madd_pr(ay, by, ret);
-     ret = gmx_madd_pr(az, bz, ret);
- 
-     return ret;
- }
- 
- /* SIMD norm squared of multiple vectors */
- static gmx_inline gmx_mm_pr
- gmx_norm2_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az)
- {
-     gmx_mm_pr ret;
- 
-     ret = gmx_mul_pr(ax, ax);
-     ret = gmx_madd_pr(ay, ay, ret);
-     ret = gmx_madd_pr(az, az, ret);
- 
-     return ret;
- }
- 
- /* SIMD cross-product of multiple vectors */
- static gmx_inline void
- gmx_cprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
-              gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz,
-              gmx_mm_pr *cx, gmx_mm_pr *cy, gmx_mm_pr *cz)
- {
-     *cx = gmx_mul_pr(ay, bz);
-     *cx = gmx_nmsub_pr(az, by, *cx);
- 
-     *cy = gmx_mul_pr(az, bx);
-     *cy = gmx_nmsub_pr(ax, bz, *cy);
- 
-     *cz = gmx_mul_pr(ax, by);
-     *cz = gmx_nmsub_pr(ay, bx, *cz);
- }
- 
++/* Include the SIMD macro file and then check for support */
+ +#include "gmx_simd_macros.h"
++#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_TRIGONOMETRIC
++#define SIMD_BONDEDS
++#include "gmx_simd_vec.h"
+ +#endif
+ +
+ +/* Flat table of 16 rows by 16 columns (256 integer entries in total).
+ + * NOTE(review): judging by the name this is the coefficient matrix used for
+ + * CMAP interpolation; the code using it is not visible from here - confirm.
+ + * Find a better place for this?
+ + */
+ +const int cmap_coeff_matrix[] = {
+ +    1, 0, -3,  2, 0, 0,  0,  0, -3,  0,  9, -6,  2,  0, -6,  4,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  3,  0, -9,  6, -2,  0,  6, -4,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  9, -6,  0,  0, -6,  4,
+ +    0, 0,  3, -2, 0, 0,  0,  0,  0,  0, -9,  6,  0,  0,  6, -4,
+ +    0, 0,  0,  0, 1, 0, -3,  2, -2,  0,  6, -4,  1,  0, -3,  2,
+ +    0, 0,  0,  0, 0, 0,  0,  0, -1,  0,  3, -2,  1,  0, -3,  2,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  2,  0,  0,  3, -2,
+ +    0, 0,  0,  0, 0, 0,  3, -2,  0,  0, -6,  4,  0,  0,  3, -2,
+ +    0, 1, -2,  1, 0, 0,  0,  0,  0, -3,  6, -3,  0,  2, -4,  2,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  3, -6,  3,  0, -2,  4, -2,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  3,  0,  0,  2, -2,
+ +    0, 0, -1,  1, 0, 0,  0,  0,  0,  0,  3, -3,  0,  0, -2,  2,
+ +    0, 0,  0,  0, 0, 1, -2,  1,  0, -2,  4, -2,  0,  1, -2,  1,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0, -1,  2, -1,  0,  1, -2,  1,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  1, -1,  0,  0, -1,  1,
+ +    0, 0,  0,  0, 0, 0, -1,  1,  0,  0,  2, -2,  0,  0, -1,  1
+ +};
+ +
+ +
+ +
+ +/* Return the 1-based atom number belonging to index i.
+ + *
+ + * When global_atom_index is NULL the index itself is used, otherwise the
+ + * entry global_atom_index[i] is looked up. In both cases 1 is added.
+ + */
+ +int glatnr(int *global_atom_index, int i)
+ +{
+ +    /* No lookup table available: number the atom by its own index */
+ +    if (global_atom_index == NULL)
+ +    {
+ +        return i + 1;
+ +    }
+ +
+ +    return global_atom_index[i] + 1;
+ +}
+ +
+ +/* Store the difference xi - xj in dx.
+ + *
+ + * With a non-NULL pbc the work is delegated to pbc_dx_aiuc() and its return
+ + * value is passed on; without pbc a plain rvec_sub() is done and CENTRAL is
+ + * returned. The bonded routines in this file use the result as fshift index.
+ + */
+ +static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
+ +{
+ +    if (!pbc)
+ +    {
+ +        /* No periodicity information: plain vector difference */
+ +        rvec_sub(xi, xj, dx);
+ +        return CENTRAL;
+ +    }
+ +
+ +    return pbc_dx_aiuc(pbc, xi, xj, dx);
+ +}
+ +
+ +#ifdef SIMD_BONDEDS
+ +
-     /* Using -0.0 should lead to only the sign bit being set */
-     gmx_mm_pr sign_mask_S = gmx_set1_pr(-0.0);
+ +/* SIMD PBC data structure, containing 1/boxdiag and the box vectors.
+ + * All fields are filled by set_pbc_simd() through gmx_set1_pr() or
+ + * gmx_setzero_pr(); the per-field notes below state which scalar is used.
+ + */
+ +typedef struct {
+ +    gmx_mm_pr inv_bzz; /* 1/box[ZZ][ZZ], or 0 when that entry was not set */
+ +    gmx_mm_pr inv_byy; /* 1/box[YY][YY], or 0 when that entry was not set */
+ +    gmx_mm_pr inv_bxx; /* 1/box[XX][XX], or 0 when that entry was not set */
+ +    gmx_mm_pr bzx;     /* box[ZZ][XX], zero without pbc */
+ +    gmx_mm_pr bzy;     /* box[ZZ][YY], zero without pbc */
+ +    gmx_mm_pr bzz;     /* box[ZZ][ZZ], zero without pbc */
+ +    gmx_mm_pr byx;     /* box[YY][XX], zero without pbc */
+ +    gmx_mm_pr byy;     /* box[YY][YY], zero without pbc */
+ +    gmx_mm_pr bxx;     /* box[XX][XX], zero without pbc */
+ +} pbc_simd_t;
+ +
+ +/* Fill *pbc_simd from a normal t_pbc struct.
+ + *
+ + * With pbc == NULL every field ends up zero. Otherwise the box elements are
+ + * copied and the inverse diagonal is set for the first pbc->ndim_ePBC
+ + * dimensions; the remaining inverse-diagonal entries stay zero.
+ + */
+ +static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
+ +{
+ +    rvec inv_diag;
+ +    int  dim;
+ +
+ +    /* A zero inverse diagonal effectively turns off PBC */
+ +    clear_rvec(inv_diag);
+ +
+ +    if (pbc == NULL)
+ +    {
+ +        pbc_simd->bzx = gmx_setzero_pr();
+ +        pbc_simd->bzy = gmx_setzero_pr();
+ +        pbc_simd->bzz = gmx_setzero_pr();
+ +        pbc_simd->byx = gmx_setzero_pr();
+ +        pbc_simd->byy = gmx_setzero_pr();
+ +        pbc_simd->bxx = gmx_setzero_pr();
+ +    }
+ +    else
+ +    {
+ +        for (dim = 0; dim < pbc->ndim_ePBC; dim++)
+ +        {
+ +            inv_diag[dim] = 1.0/pbc->box[dim][dim];
+ +        }
+ +
+ +        pbc_simd->bzx = gmx_set1_pr(pbc->box[ZZ][XX]);
+ +        pbc_simd->bzy = gmx_set1_pr(pbc->box[ZZ][YY]);
+ +        pbc_simd->bzz = gmx_set1_pr(pbc->box[ZZ][ZZ]);
+ +        pbc_simd->byx = gmx_set1_pr(pbc->box[YY][XX]);
+ +        pbc_simd->byy = gmx_set1_pr(pbc->box[YY][YY]);
+ +        pbc_simd->bxx = gmx_set1_pr(pbc->box[XX][XX]);
+ +    }
+ +
+ +    pbc_simd->inv_bzz = gmx_set1_pr(inv_diag[ZZ]);
+ +    pbc_simd->inv_byy = gmx_set1_pr(inv_diag[YY]);
+ +    pbc_simd->inv_bxx = gmx_set1_pr(inv_diag[XX]);
+ +}
+ +
+ +/* Correct distance vector *dx,*dy,*dz for PBC using SIMD.
+ + *
+ + * The correction is applied in the order z, y, x. For each dimension a shift
+ + * count sh is obtained by rounding the component times the inverse box
+ + * diagonal, after which the components are updated with the matching box
+ + * vector elements.
+ + * NOTE(review): this assumes gmx_nmsub_pr(a, b, c) evaluates c - a*b; the
+ + * macro definition is not visible here - confirm.
+ + */
+ +static gmx_inline void
+ +pbc_dx_simd(gmx_mm_pr *dx, gmx_mm_pr *dy, gmx_mm_pr *dz,
+ +            const pbc_simd_t *pbc)
+ +{
+ +    gmx_mm_pr sh;
+ +
+ +    /* z: uses bzx, bzy and bzz, so all three components are updated */
+ +    sh  = gmx_round_pr(gmx_mul_pr(*dz, pbc->inv_bzz));
+ +    *dx = gmx_nmsub_pr(sh, pbc->bzx, *dx);
+ +    *dy = gmx_nmsub_pr(sh, pbc->bzy, *dy);
+ +    *dz = gmx_nmsub_pr(sh, pbc->bzz, *dz);
+ +
+ +    /* y: computed from the already z-corrected *dy; updates x and y */
+ +    sh  = gmx_round_pr(gmx_mul_pr(*dy, pbc->inv_byy));
+ +    *dx = gmx_nmsub_pr(sh, pbc->byx, *dx);
+ +    *dy = gmx_nmsub_pr(sh, pbc->byy, *dy);
+ +
+ +    /* x: computed from the z- and y-corrected *dx; updates x only */
+ +    sh  = gmx_round_pr(gmx_mul_pr(*dx, pbc->inv_bxx));
+ +    *dx = gmx_nmsub_pr(sh, pbc->bxx, *dx);
+ +}
+ +
+ +#endif /* SIMD_BONDEDS */
+ +
+ +/*
+ + * Morse potential bond by Frank Everdij
+ + *
+ + * Three parameters needed:
+ + *
+ + * b0 = equilibrium distance in nm
+ + * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
+ + * cb = well depth in kJ/mol
+ + *
+ + * Note: the potential is referenced to be +cb at infinite separation
+ + *       and zero at the equilibrium distance!
+ + *
+ + * forceatoms is read in (type, ai, aj) triples until index nbonds is reached.
+ + * The A and B state parameters are interpolated linearly with lambda.
+ + * Forces are added to f and fshift, dV/dlambda is accumulated in *dvdlambda
+ + * and the summed potential energy is returned.
+ + * md, fcd and global_atom_index are not used.
+ + */
+ +
+ +real morse_bonds(int nbonds,
+ +                 const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                 const rvec x[], rvec f[], rvec fshift[],
+ +                 const t_pbc *pbc, const t_graph *g,
+ +                 real lambda, real *dvdlambda,
+ +                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                 int gmx_unused *global_atom_index)
+ +{
+ +    const real one = 1.0;
+ +    const real two = 2.0;
+ +    real       dr, dr2, temp, omtemp, cbomtemp, fbond, vbond, fij, vtot;
+ +    real       b0, be, cb, b0A, beA, cbA, b0B, beB, cbB, L1;
+ +    rvec       dx;
+ +    int        i, m, ki, type, ai, aj;
+ +    ivec       dt;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        /* Each interaction occupies three entries: type, atom i, atom j */
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        b0A   = forceparams[type].morse.b0A;
+ +        beA   = forceparams[type].morse.betaA;
+ +        cbA   = forceparams[type].morse.cbA;
+ +
+ +        b0B   = forceparams[type].morse.b0B;
+ +        beB   = forceparams[type].morse.betaB;
+ +        cbB   = forceparams[type].morse.cbB;
+ +
+ +        L1 = one-lambda;                            /* 1 */
+ +        b0 = L1*b0A + lambda*b0B;                   /* 3 */
+ +        be = L1*beA + lambda*beB;                   /* 3 */
+ +        cb = L1*cbA + lambda*cbB;                   /* 3 */
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3          */
+ +        dr2  = iprod(dx, dx);                       /*   5          */
+ +        dr   = dr2*gmx_invsqrt(dr2);                /*  10          */
+ +        temp = exp(-be*(dr-b0));                    /*  12          */
+ +
+ +        /* When the exponential is exactly one no energy or force is added
+ +         * for this pair; only cbB-cbA is added to *dvdlambda.
+ +         */
+ +        if (temp == one)
+ +        {
+ +            /* bonds are constrained. This may _not_ include bond constraints if they are lambda dependent */
+ +            *dvdlambda += cbB-cbA;
+ +            continue;
+ +        }
+ +
+ +        omtemp    = one-temp;                                                                                        /*   1          */
+ +        cbomtemp  = cb*omtemp;                                                                                       /*   1          */
+ +        vbond     = cbomtemp*omtemp;                                                                                 /*   1          */
+ +        fbond     = -two*be*temp*cbomtemp*gmx_invsqrt(dr2);                                                          /*   9          */
+ +        vtot     += vbond;                                                                                           /*   1          */
+ +
+ +        *dvdlambda += (cbB - cbA) * omtemp * omtemp - (2-2*omtemp)*omtemp * cb * ((b0B-b0A)*be - (beB-beA)*(dr-b0)); /* 15 */
+ +
+ +        /* With a graph the shift index comes from the graph shifts instead
+ +         * of from the value returned by pbc_rvec_sub()
+ +         */
+ +        if (g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +            ki = IVEC2IS(dt);
+ +        }
+ +
+ +        /* fbond already contains 1/dr, so fbond*dx is the force vector */
+ +        for (m = 0; (m < DIM); m++)                    /*  15          */
+ +        {
+ +            fij                 = fbond*dx[m];
+ +            f[ai][m]           += fij;
+ +            f[aj][m]           -= fij;
+ +            fshift[ki][m]      += fij;
+ +            fshift[CENTRAL][m] -= fij;
+ +        }
+ +    }                                         /*  83 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +/* Cubic bond stretch potential.
+ + *
+ + * With d = r - b0 for every bonded pair: V = kb*d^2 + kb*kcub*d^3.
+ + * forceatoms is read in (type, ai, aj) triples until index nbonds is reached.
+ + * Forces are added to f and fshift and the summed energy is returned.
+ + * Pairs at exactly zero distance contribute nothing.
+ + * lambda, dvdlambda, md, fcd and global_atom_index are not used.
+ + */
+ +real cubic_bonds(int nbonds,
+ +                 const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                 const rvec x[], rvec f[], rvec fshift[],
+ +                 const t_pbc *pbc, const t_graph *g,
+ +                 real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                 int gmx_unused *global_atom_index)
+ +{
+ +    const real three = 3.0;
+ +    const real two   = 2.0;
+ +    real       k_quad, r_eq, k_cub;
+ +    real       r, r2, dev, k_dev, k_dev2, fscal, epot, fcomp, esum;
+ +    rvec       dvec;
+ +    int        idx, d, shift, ftype, at_i, at_j;
+ +    ivec       dshift;
+ +
+ +    esum = 0.0;
+ +    idx  = 0;
+ +    while (idx < nbonds)
+ +    {
+ +        /* One interaction = three consecutive entries */
+ +        ftype = forceatoms[idx];
+ +        at_i  = forceatoms[idx+1];
+ +        at_j  = forceatoms[idx+2];
+ +        idx  += 3;
+ +
+ +        r_eq   = forceparams[ftype].cubic.b0;
+ +        k_quad = forceparams[ftype].cubic.kb;
+ +        k_cub  = forceparams[ftype].cubic.kcub;
+ +
+ +        shift = pbc_rvec_sub(pbc, x[at_i], x[at_j], dvec);  /*   3          */
+ +        r2    = iprod(dvec, dvec);                          /*   5          */
+ +
+ +        /* Coinciding atoms are skipped entirely */
+ +        if (r2 != 0.0)
+ +        {
+ +            r      = r2*gmx_invsqrt(r2);                    /*  10          */
+ +            dev    = r-r_eq;
+ +            k_dev  = k_quad*dev;
+ +            k_dev2 = k_dev*dev;
+ +
+ +            epot   = k_dev2 + k_cub*k_dev2*dev;
+ +            fscal  = -(two*k_dev + three*k_dev2*k_cub)/r;
+ +
+ +            esum  += epot;    /* 21 */
+ +
+ +            /* With a graph the shift index is taken from the graph */
+ +            if (g)
+ +            {
+ +                ivec_sub(SHIFT_IVEC(g, at_i), SHIFT_IVEC(g, at_j), dshift);
+ +                shift = IVEC2IS(dshift);
+ +            }
+ +            for (d = 0; d < DIM; d++)                       /*  15          */
+ +            {
+ +                fcomp                = fscal*dvec[d];
+ +                f[at_i][d]          += fcomp;
+ +                f[at_j][d]          -= fcomp;
+ +                fshift[shift][d]    += fcomp;
+ +                fshift[CENTRAL][d]  -= fcomp;
+ +            }
+ +        }
+ +    }                                         /*  54 TOTAL    */
+ +    return esum;
+ +}
+ +
+ +/* FENE bond potential.
+ + *
+ + * forceatoms is read in (type, ai, aj) triples until index nbonds is reached.
+ + * With r2 the squared length of the vector between the two atoms:
+ + *   V = -0.5*kb*bm^2*log(1 - r2/bm^2)
+ + * and the force on atom ai is -kb/(1 - r2/bm^2) times that vector.
+ + * Pairs at exactly zero distance are skipped. A pair with r2 >= bm^2 is a
+ + * fatal error; the message reports the atom numbers given by glatnr().
+ + * Forces are added to f and fshift and the summed energy is returned.
+ + * lambda, dvdlambda, md and fcd are not used.
+ + */
+ +real FENE_bonds(int nbonds,
+ +                const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                const rvec x[], rvec f[], rvec fshift[],
+ +                const t_pbc *pbc, const t_graph *g,
+ +                real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                int *global_atom_index)
+ +{
+ +    const real half = 0.5;
+ +    const real one  = 1.0;
+ +    real       bm, kb;
+ +    real       dr2, bm2, omdr2obm2, fbond, vbond, fij, vtot;
+ +    rvec       dx;
+ +    int        i, m, ki, type, ai, aj;
+ +    ivec       dt;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        /* Each interaction occupies three entries: type, atom i, atom j */
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        bm   = forceparams[type].fene.bm;
+ +        kb   = forceparams[type].fene.kb;
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx);     /*   3          */
+ +        dr2  = iprod(dx, dx);                           /*   5          */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        bm2 = bm*bm;
+ +
+ +        /* The logarithm below is only defined for dr2 < bm2 */
+ +        if (dr2 >= bm2)
+ +        {
+ +            gmx_fatal(FARGS,
+ +                      "r^2 (%f) >= bm^2 (%f) in FENE bond between atoms %d and %d",
+ +                      dr2, bm2,
+ +                      glatnr(global_atom_index, ai),
+ +                      glatnr(global_atom_index, aj));
+ +        }
+ +
+ +        omdr2obm2  = one - dr2/bm2;
+ +
+ +        vbond      = -half*kb*bm2*log(omdr2obm2);
+ +        fbond      = -kb/omdr2obm2;
+ +
+ +        vtot      += vbond;   /* 35 */
+ +
+ +        /* With a graph the shift index is taken from the graph shifts */
+ +        if (g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +            ki = IVEC2IS(dt);
+ +        }
+ +        for (m = 0; (m < DIM); m++)                    /*  15          */
+ +        {
+ +            fij                 = fbond*dx[m];
+ +            f[ai][m]           += fij;
+ +            f[aj][m]           -= fij;
+ +            fshift[ki][m]      += fij;
+ +            fshift[CENTRAL][m] -= fij;
+ +        }
+ +    }                                         /*  58 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +/* Harmonic potential with linear lambda interpolation of the parameters.
+ + *
+ + * The force constant and reference value are mixed as (1-lambda)*A + lambda*B.
+ + * With d = x - reference, *V receives 0.5*k*d^2 and *F receives -k*d.
+ + * The return value is the derivative of the energy with respect to lambda.
+ + * The original author counted 19 flops for this routine.
+ + */
+ +real harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ +              real *V, real *F)
+ +{
+ +    const real half = 0.5;
+ +    real       one_minus_lambda, k_mix, x_ref, delta, delta2;
+ +    real       force, energy, dvdl;
+ +
+ +    /* Interpolate between the A and B state parameters */
+ +    one_minus_lambda = 1.0-lambda;
+ +    k_mix            = one_minus_lambda*kA+lambda*kB;
+ +    x_ref            = one_minus_lambda*xA+lambda*xB;
+ +
+ +    /* Deviation from the interpolated reference value */
+ +    delta  = x-x_ref;
+ +    delta2 = delta*delta;
+ +
+ +    force  = -k_mix*delta;
+ +    energy = half*k_mix*delta2;
+ +    dvdl   = half*(kB-kA)*delta2 + (xA-xB)*k_mix*delta;
+ +
+ +    *F = force;
+ +    *V = energy;
+ +
+ +    return dvdl;
+ +}
+ +
+ +
+ +/* Harmonic bond stretching.
+ + *
+ + * forceatoms is read in (type, ai, aj) triples until index nbonds is reached.
+ + * For each pair the distance is passed to harmonic() together with the
+ + * A and B state parameters; the returned dV/dlambda is added to *dvdlambda.
+ + * Forces are added to f and fshift and the summed energy is returned.
+ + * md, fcd and global_atom_index are not used.
+ + */
+ +real bonds(int nbonds,
+ +           const t_iatom forceatoms[], const t_iparams forceparams[],
+ +           const rvec x[], rvec f[], rvec fshift[],
+ +           const t_pbc *pbc, const t_graph *g,
+ +           real lambda, real *dvdlambda,
+ +           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +           int gmx_unused *global_atom_index)
+ +{
+ +    int  i, m, ki, ai, aj, type;
+ +    real dr, dr2, fbond, vbond, fij, vtot;
+ +    rvec dx;
+ +    ivec dt;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        /* Each interaction occupies three entries: type, atom i, atom j */
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +        dr   = dr2*gmx_invsqrt(dr2);                /*  10            */
+ +
+ +        /* NOTE(review): *dvdlambda is updated before the zero-distance test
+ +         * below, so a pair with dr2 == 0 still contributes here while adding
+ +         * no energy or force. What dr is in that case depends on
+ +         * gmx_invsqrt(0), which is not visible from here - confirm that this
+ +         * cannot put a non-finite value into *dvdlambda.
+ +         */
+ +        *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ +                               forceparams[type].harmonic.krB,
+ +                               forceparams[type].harmonic.rA,
+ +                               forceparams[type].harmonic.rB,
+ +                               dr, lambda, &vbond, &fbond); /*  19  */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +
+ +        vtot  += vbond;            /* 1*/
+ +        /* Divide by the distance so that fbond*dx is the force vector */
+ +        fbond *= gmx_invsqrt(dr2); /*   6             */
+ +#ifdef DEBUG
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
+ +                    dr, vbond, fbond);
+ +        }
+ +#endif
+ +        /* With a graph the shift index is taken from the graph shifts */
+ +        if (g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +            ki = IVEC2IS(dt);
+ +        }
+ +        for (m = 0; (m < DIM); m++)     /*  15                */
+ +        {
+ +            fij                 = fbond*dx[m];
+ +            f[ai][m]           += fij;
+ +            f[aj][m]           -= fij;
+ +            fshift[ki][m]      += fij;
+ +            fshift[CENTRAL][m] -= fij;
+ +        }
+ +    }               /* 59 TOTAL       */
+ +    return vtot;
+ +}
+ +
+ +/* Distance restraint with a force-free window between pairs of atoms.
+ + *
+ + * forceatoms is consumed as consecutive (type, ai, aj) triplets while the
+ + * running index is below nbonds. The bounds low/up1/up2 and the force
+ + * constant k are interpolated linearly in lambda between the A and B
+ + * entries of forceparams[type].restraint. With dr = |dx|:
+ + *   dr <  low         : V = 0.5*k*(dr - low)^2
+ + *   low <= dr <= up1  : V = 0
+ + *   up1 <  dr <= up2  : V = 0.5*k*(dr - up1)^2
+ + *   dr >  up2         : V = k*(up2 - up1)*(0.5*(up2 - up1) + dr - up2)
+ + * The lambda derivative of V is accumulated into *dvdlambda, forces are
+ + * added to f[] and fshift[], and the summed potential is returned.
+ + * md, fcd and global_atom_index are not used.
+ + */
+ +real restraint_bonds(int nbonds,
+ +                     const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                     const rvec x[], rvec f[], rvec fshift[],
+ +                     const t_pbc *pbc, const t_graph *g,
+ +                     real lambda, real *dvdlambda,
+ +                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                     int gmx_unused *global_atom_index)
+ +{
+ +    int  i, m, ki, ai, aj, type;
+ +    real dr, dr2, fbond, vbond, fij, vtot;
+ +    real L1;
+ +    real low, dlow, up1, dup1, up2, dup2, k, dk;
+ +    real drh, drh2;
+ +    rvec dx;
+ +    ivec dt;
+ +
+ +    /* Weight of the A state; lambda is the weight of the B state */
+ +    L1   = 1.0 - lambda;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +        /* NOTE(review): for dr2 == 0 this is 0*gmx_invsqrt(0); whether that
+ +         * is 0 or NaN depends on gmx_invsqrt - confirm, because dr feeds
+ +         * the *dvdlambda terms below before the zero-distance skip.
+ +         */
+ +        dr   = dr2*gmx_invsqrt(dr2);                /*  10            */
+ +
+ +        /* Interpolated parameters and their derivatives w.r.t. lambda */
+ +        low  = L1*forceparams[type].restraint.lowA + lambda*forceparams[type].restraint.lowB;
+ +        dlow =   -forceparams[type].restraint.lowA +        forceparams[type].restraint.lowB;
+ +        up1  = L1*forceparams[type].restraint.up1A + lambda*forceparams[type].restraint.up1B;
+ +        dup1 =   -forceparams[type].restraint.up1A +        forceparams[type].restraint.up1B;
+ +        up2  = L1*forceparams[type].restraint.up2A + lambda*forceparams[type].restraint.up2B;
+ +        dup2 =   -forceparams[type].restraint.up2A +        forceparams[type].restraint.up2B;
+ +        k    = L1*forceparams[type].restraint.kA   + lambda*forceparams[type].restraint.kB;
+ +        dk   =   -forceparams[type].restraint.kA   +        forceparams[type].restraint.kB;
+ +        /* 24 */
+ +
+ +        if (dr < low)
+ +        {
+ +            /* Below the lower bound: harmonic around low */
+ +            drh         = dr - low;
+ +            drh2        = drh*drh;
+ +            vbond       = 0.5*k*drh2;
+ +            fbond       = -k*drh;
+ +            *dvdlambda += 0.5*dk*drh2 - k*dlow*drh;
+ +        } /* 11 */
+ +        else if (dr <= up1)
+ +        {
+ +            /* Inside the window: no energy, no force, no dvdlambda term */
+ +            vbond = 0;
+ +            fbond = 0;
+ +        }
+ +        else if (dr <= up2)
+ +        {
+ +            /* Between the two upper bounds: harmonic around up1 */
+ +            drh         = dr - up1;
+ +            drh2        = drh*drh;
+ +            vbond       = 0.5*k*drh2;
+ +            fbond       = -k*drh;
+ +            *dvdlambda += 0.5*dk*drh2 - k*dup1*drh;
+ +        } /* 11       */
+ +        else
+ +        {
+ +            /* Beyond up2: linear in dr with constant force -k*(up2 - up1);
+ +             * energy and force are continuous with the branch above at
+ +             * dr == up2.
+ +             */
+ +            drh         = dr - up2;
+ +            vbond       = k*(up2 - up1)*(0.5*(up2 - up1) + drh);
+ +            fbond       = -k*(up2 - up1);
+ +            *dvdlambda += dk*(up2 - up1)*(0.5*(up2 - up1) + drh)
+ +                + k*(dup2 - dup1)*(up2 - up1 + drh)
+ +                - k*(up2 - up1)*dup2;
+ +        }
+ +
+ +        /* Coincident atoms: skip energy and force (the force is scaled by
+ +         * 1/|dx| below). The *dvdlambda contribution has already been added.
+ +         */
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        vtot  += vbond;            /* 1*/
+ +        /* Turn the scalar force into a multiplier for the vector dx */
+ +        fbond *= gmx_invsqrt(dr2); /*   6             */
+ +#ifdef DEBUG
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
+ +                    dr, vbond, fbond);
+ +        }
+ +#endif
+ +        if (g)
+ +        {
+ +            /* With a graph, take the shift index from the graph shift
+ +             * vectors of the two atoms, overriding the pbc_rvec_sub result.
+ +             */
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +            ki = IVEC2IS(dt);
+ +        }
+ +        for (m = 0; (m < DIM); m++)             /*  15                */
+ +        {
+ +            fij                 = fbond*dx[m];
+ +            f[ai][m]           += fij;
+ +            f[aj][m]           -= fij;
+ +            fshift[ki][m]      += fij;
+ +            fshift[CENTRAL][m] -= fij;
+ +        }
+ +    }                   /* 59 TOTAL   */
+ +
+ +    return vtot;
+ +}
+ +
+ +/* Harmonic spring with zero reference length between listed atom pairs.
+ + *
+ + * forceatoms holds (type, ai, aj) triplets. For each pair the force
+ + * constant is sqr(md->chargeA[aj])*ONE_4PI_EPS0/alpha, with alpha read from
+ + * forceparams[type].polarize, and the energy and scalar force come from
+ + * harmonic(ksh, ksh, 0, 0, dr, lambda, ...). The value returned by
+ + * harmonic() is accumulated into *dvdlambda. Forces are added to f[] and
+ + * fshift[]; the summed potential is returned. Pairs at exactly zero
+ + * distance contribute neither energy nor force.
+ + *
+ + * NOTE(review): for a zero distance, dr is formed as 0*gmx_invsqrt(0) and
+ + * handed to harmonic() before the zero-distance test; whether that product
+ + * is 0 or NaN depends on gmx_invsqrt - confirm.
+ + */
+ +real polarize(int nbonds,
+ +              const t_iatom forceatoms[], const t_iparams forceparams[],
+ +              const rvec x[], rvec f[], rvec fshift[],
+ +              const t_pbc *pbc, const t_graph *g,
+ +              real lambda, real *dvdlambda,
+ +              const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ +              int gmx_unused *global_atom_index)
+ +{
+ +    int  idx, d, shift, iat, jat, ftype;
+ +    real dist, dist2, fscal, vpair, fd, kspring;
+ +    real vsum = 0.0;
+ +    rvec dx;
+ +    ivec dshift;
+ +
+ +    /* One interaction per three entries: type, ai, aj */
+ +    for (idx = 0; idx < nbonds; idx += 3)
+ +    {
+ +        ftype   = forceatoms[idx];
+ +        iat     = forceatoms[idx + 1];
+ +        jat     = forceatoms[idx + 2];
+ +        kspring = sqr(md->chargeA[jat])*ONE_4PI_EPS0/forceparams[ftype].polarize.alpha;
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "POL: local ai = %d aj = %d ksh = %.3f\n", iat, jat, kspring);
+ +        }
+ +
+ +        shift = pbc_rvec_sub(pbc, x[iat], x[jat], dx);
+ +        dist2 = iprod(dx, dx);
+ +        dist  = dist2*gmx_invsqrt(dist2);
+ +
+ +        *dvdlambda += harmonic(kspring, kspring, 0, 0, dist, lambda, &vpair, &fscal);
+ +
+ +        /* Only pairs with a non-zero separation get energy and force */
+ +        if (dist2 != 0.0)
+ +        {
+ +            vsum  += vpair;
+ +            /* Scale so that fscal*dx is the force vector */
+ +            fscal *= gmx_invsqrt(dist2);
+ +
+ +            if (g)
+ +            {
+ +                /* Shift index from the graph overrides the pbc one */
+ +                ivec_sub(SHIFT_IVEC(g, iat), SHIFT_IVEC(g, jat), dshift);
+ +                shift = IVEC2IS(dshift);
+ +            }
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                fd                  = fscal*dx[d];
+ +                f[iat][d]          += fd;
+ +                f[jat][d]          -= fd;
+ +                fshift[shift][d]   += fd;
+ +                fshift[CENTRAL][d] -= fd;
+ +            }
+ +        }
+ +    }
+ +    return vsum;
+ +}
+ +
+ +/* Harmonic spring with zero reference length plus a quartic term beyond
+ + * a cut-off distance, between listed atom pairs.
+ + *
+ + * forceatoms holds (type, ai, aj) triplets. The harmonic force constant is
+ + * sqr(md->chargeA[aj])*ONE_4PI_EPS0/alpha; alpha, khyp and drcut are read
+ + * from forceparams[type].anharm_polarize. For dr > drcut the energy gains
+ + * khyp*(dr - drcut)^4 and the scalar force -4*khyp*(dr - drcut)^3. The
+ + * value returned by harmonic() is accumulated into *dvdlambda. Forces are
+ + * added to f[] and fshift[]; the summed potential is returned. Pairs at
+ + * exactly zero distance contribute neither energy nor force.
+ + *
+ + * NOTE(review): for a zero distance, dr is formed as 0*gmx_invsqrt(0) and
+ + * handed to harmonic() before the zero-distance test; whether that product
+ + * is 0 or NaN depends on gmx_invsqrt - confirm.
+ + */
+ +real anharm_polarize(int nbonds,
+ +                     const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                     const rvec x[], rvec f[], rvec fshift[],
+ +                     const t_pbc *pbc, const t_graph *g,
+ +                     real lambda, real *dvdlambda,
+ +                     const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ +                     int gmx_unused *global_atom_index)
+ +{
+ +    int  idx, d, shift, iat, jat, ftype;
+ +    real dist, dist2, fscal, vpair, fd, kspring, kquart, cutoff, excess, excess3;
+ +    real vsum = 0.0;
+ +    rvec dx;
+ +    ivec dshift;
+ +
+ +    /* One interaction per three entries: type, ai, aj */
+ +    for (idx = 0; idx < nbonds; idx += 3)
+ +    {
+ +        ftype   = forceatoms[idx];
+ +        iat     = forceatoms[idx + 1];
+ +        jat     = forceatoms[idx + 2];
+ +        kspring = sqr(md->chargeA[jat])*ONE_4PI_EPS0/forceparams[ftype].anharm_polarize.alpha;
+ +        kquart  = forceparams[ftype].anharm_polarize.khyp;
+ +        cutoff  = forceparams[ftype].anharm_polarize.drcut;
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "POL: local ai = %d aj = %d ksh = %.3f\n", iat, jat, kspring);
+ +        }
+ +
+ +        shift = pbc_rvec_sub(pbc, x[iat], x[jat], dx);
+ +        dist2 = iprod(dx, dx);
+ +        dist  = dist2*gmx_invsqrt(dist2);
+ +
+ +        *dvdlambda += harmonic(kspring, kspring, 0, 0, dist, lambda, &vpair, &fscal);
+ +
+ +        /* Only pairs with a non-zero separation get energy and force */
+ +        if (dist2 != 0.0)
+ +        {
+ +            if (dist > cutoff)
+ +            {
+ +                /* Quartic term in the distance beyond the cut-off */
+ +                excess  = dist-cutoff;
+ +                excess3 = excess*excess*excess;
+ +                vpair  += kquart*excess*excess3;
+ +                fscal  -= 4*kquart*excess3;
+ +            }
+ +            /* Scale so that fscal*dx is the force vector */
+ +            fscal *= gmx_invsqrt(dist2);
+ +            vsum  += vpair;
+ +
+ +            if (g)
+ +            {
+ +                /* Shift index from the graph overrides the pbc one */
+ +                ivec_sub(SHIFT_IVEC(g, iat), SHIFT_IVEC(g, jat), dshift);
+ +                shift = IVEC2IS(dshift);
+ +            }
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                fd                  = fscal*dx[d];
+ +                f[iat][d]          += fd;
+ +                f[jat][d]          -= fd;
+ +                fshift[shift][d]   += fd;
+ +                fshift[CENTRAL][d] -= fd;
+ +            }
+ +        }
+ +    }
+ +    return vsum;
+ +}
+ +
+ +real water_pol(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec f[], rvec gmx_unused fshift[],
+ +               const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ +               real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +               int gmx_unused *global_atom_index)
+ +{
+ +    /* This routine implements anisotropic polarizibility for water, through
+ +     * a shell connected to a dummy with spring constant that differ in the
+ +     * three spatial dimensions in the molecular frame.
+ +     */
+ +    int  i, m, aO, aH1, aH2, aD, aS, type, type0;
+ +    rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj;
+ +#ifdef DEBUG
+ +    rvec df;
+ +#endif
+ +    real vtot, fij, r_HH, r_OD, r_nW, tx, ty, tz, qS;
+ +
+ +    vtot = 0.0;
+ +    if (nbonds > 0)
+ +    {
+ +        type0  = forceatoms[0];
+ +        aS     = forceatoms[5];
+ +        qS     = md->chargeA[aS];
+ +        kk[XX] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_x;
+ +        kk[YY] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
+ +        kk[ZZ] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
+ +        r_HH   = 1.0/forceparams[type0].wpol.rHH;
+ +        r_OD   = 1.0/forceparams[type0].wpol.rOD;
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "WPOL: qS  = %10.5f aS = %5d\n", qS, aS);
+ +            fprintf(debug, "WPOL: kk  = %10.3f        %10.3f        %10.3f\n",
+ +                    kk[XX], kk[YY], kk[ZZ]);
+ +            fprintf(debug, "WPOL: rOH = %10.3f  rHH = %10.3f  rOD = %10.3f\n",
+ +                    forceparams[type0].wpol.rOH,
+ +                    forceparams[type0].wpol.rHH,
+ +                    forceparams[type0].wpol.rOD);
+ +        }
+ +        for (i = 0; (i < nbonds); i += 6)
+ +        {
+ +            type = forceatoms[i];
+ +            if (type != type0)
+ +            {
+ +                gmx_fatal(FARGS, "Sorry, type = %d, type0 = %d, file = %s, line = %d",
+ +                          type, type0, __FILE__, __LINE__);
+ +            }
+ +            aO   = forceatoms[i+1];
+ +            aH1  = forceatoms[i+2];
+ +            aH2  = forceatoms[i+3];
+ +            aD   = forceatoms[i+4];
+ +            aS   = forceatoms[i+5];
+ +
+ +            /* Compute vectors describing the water frame */
+ +            rvec_sub(x[aH1], x[aO], dOH1);
+ +            rvec_sub(x[aH2], x[aO], dOH2);
+ +            rvec_sub(x[aH2], x[aH1], dHH);
+ +            rvec_sub(x[aD], x[aO], dOD);
+ +            rvec_sub(x[aS], x[aD], dDS);
+ +            cprod(dOH1, dOH2, nW);
+ +
+ +            /* Compute inverse length of normal vector
+ +             * (this one could be precomputed, but I'm too lazy now)
+ +             */
+ +            r_nW = gmx_invsqrt(iprod(nW, nW));
+ +            /* This is for precision, but does not make a big difference,
+ +             * it can go later.
+ +             */
+ +            r_OD = gmx_invsqrt(iprod(dOD, dOD));
+ +
+ +            /* Normalize the vectors in the water frame */
+ +            svmul(r_nW, nW, nW);
+ +            svmul(r_HH, dHH, dHH);
+ +            svmul(r_OD, dOD, dOD);
+ +
+ +            /* Compute displacement of shell along components of the vector */
+ +            dx[ZZ] = iprod(dDS, dOD);
+ +            /* Compute projection on the XY plane: dDS - dx[ZZ]*dOD */
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                proj[m] = dDS[m]-dx[ZZ]*dOD[m];
+ +            }
+ +
+ +            /*dx[XX] = iprod(dDS,nW);
+ +               dx[YY] = iprod(dDS,dHH);*/
+ +            dx[XX] = iprod(proj, nW);
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                proj[m] -= dx[XX]*nW[m];
+ +            }
+ +            dx[YY] = iprod(proj, dHH);
+ +            /*#define DEBUG*/
+ +#ifdef DEBUG
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "WPOL: dx2=%10g  dy2=%10g  dz2=%10g  sum=%10g  dDS^2=%10g\n",
+ +                        sqr(dx[XX]), sqr(dx[YY]), sqr(dx[ZZ]), iprod(dx, dx), iprod(dDS, dDS));
+ +                fprintf(debug, "WPOL: dHH=(%10g,%10g,%10g)\n", dHH[XX], dHH[YY], dHH[ZZ]);
+ +                fprintf(debug, "WPOL: dOD=(%10g,%10g,%10g), 1/r_OD = %10g\n",
+ +                        dOD[XX], dOD[YY], dOD[ZZ], 1/r_OD);
+ +                fprintf(debug, "WPOL: nW =(%10g,%10g,%10g), 1/r_nW = %10g\n",
+ +                        nW[XX], nW[YY], nW[ZZ], 1/r_nW);
+ +                fprintf(debug, "WPOL: dx  =%10g, dy  =%10g, dz  =%10g\n",
+ +                        dx[XX], dx[YY], dx[ZZ]);
+ +                fprintf(debug, "WPOL: dDSx=%10g, dDSy=%10g, dDSz=%10g\n",
+ +                        dDS[XX], dDS[YY], dDS[ZZ]);
+ +            }
+ +#endif
+ +            /* Now compute the forces and energy */
+ +            kdx[XX] = kk[XX]*dx[XX];
+ +            kdx[YY] = kk[YY]*dx[YY];
+ +            kdx[ZZ] = kk[ZZ]*dx[ZZ];
+ +            vtot   += iprod(dx, kdx);
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                /* This is a tensor operation but written out for speed */
+ +                tx        =  nW[m]*kdx[XX];
+ +                ty        = dHH[m]*kdx[YY];
+ +                tz        = dOD[m]*kdx[ZZ];
+ +                fij       = -tx-ty-tz;
+ +#ifdef DEBUG
+ +                df[m] = fij;
+ +#endif
+ +                f[aS][m] += fij;
+ +                f[aD][m] -= fij;
+ +            }
+ +#ifdef DEBUG
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "WPOL: vwpol=%g\n", 0.5*iprod(dx, kdx));
+ +                fprintf(debug, "WPOL: df = (%10g, %10g, %10g)\n", df[XX], df[YY], df[ZZ]);
+ +            }
+ +#endif
+ +        }
+ +    }
+ +    return 0.5*vtot;
+ +}
+ +
+ +/* Screened Coulomb interaction for a single pair of particles.
+ + *
+ + * With r the length of the vector from pbc_rvec_sub(pbc, xi, xj, ...), the
+ + * returned energy is
+ + *   qq*ONE_4PI_EPS0/r * (1 - (1 + 0.5*afac*r)*exp(-afac*r)).
+ + * The matching force is added to fi, subtracted from fj and mirrored into
+ + * fshift[] at the shift index returned by pbc_rvec_sub and at CENTRAL.
+ + */
+ +static real do_1_thole(const rvec xi, const rvec xj, rvec fi, rvec fj,
+ +                       const t_pbc *pbc, real qq,
+ +                       rvec fshift[], real afac)
+ +{
+ +    rvec dx;
+ +    real dist2, inv_dist, scaled_dist, vcoul, screen, expfac, fscal, fd;
+ +    int  d, shift;
+ +
+ +    shift       = pbc_rvec_sub(pbc, xi, xj, dx);
+ +
+ +    dist2       = iprod(dx, dx);
+ +    inv_dist    = gmx_invsqrt(dist2);
+ +    /* afac*r, written as a division by 1/r */
+ +    scaled_dist = afac/inv_dist;
+ +    /* Unscreened Coulomb energy */
+ +    vcoul       = qq*ONE_4PI_EPS0*inv_dist;
+ +    expfac      = exp(-scaled_dist);
+ +    /* Screening factor multiplying the Coulomb energy */
+ +    screen      = (1-(1+0.5*scaled_dist)*expfac);
+ +    /* Scalar that turns dx into the force vector */
+ +    fscal       = ((vcoul*inv_dist)*screen - vcoul*0.5*afac*expfac*(scaled_dist+1))*inv_dist;
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "THOLE: v0 = %.3f v1 = %.3f r12= % .3f r12bar = %.3f fscal = %.3f  ebar = %.3f\n", vcoul, screen, 1/inv_dist, scaled_dist, fscal, expfac);
+ +    }
+ +
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        fd                  = fscal*dx[d];
+ +        fi[d]              += fd;
+ +        fj[d]              -= fd;
+ +        fshift[shift][d]   += fd;
+ +        fshift[CENTRAL][d] -= fd;
+ +    }
+ +
+ +    return vcoul*screen;
+ +}
+ +
+ +/* Screened Coulomb interaction between two pairs of particles.
+ + *
+ + * forceatoms holds five entries per interaction: type, a1, da1, a2, da2.
+ + * The charge product is md->chargeA[da1]*md->chargeA[da2] and the screening
+ + * length factor is a*pow(alpha1*alpha2, -1/6) with a, alpha1, alpha2 from
+ + * forceparams[type].thole. do_1_thole() is evaluated for the four cross
+ + * combinations, with +qq for a1-a2 and da1-da2 and -qq for the mixed ones.
+ + * Returns the summed energy; g, lambda, dvdlambda, fcd and
+ + * global_atom_index are not used.
+ + */
+ +real thole_pol(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph gmx_unused *g,
+ +               real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +               const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ +               int gmx_unused *global_atom_index)
+ +{
+ +    int  idx, ftype, atom1, shell1, atom2, shell2;
+ +    real charge1, charge2, qprod, thole_a, alpha1, alpha2, screen_fac;
+ +    real vsum = 0;
+ +
+ +    /* Interaction between two pairs of particles with opposite charge */
+ +    for (idx = 0; idx < nbonds; idx += 5)
+ +    {
+ +        ftype      = forceatoms[idx];
+ +        atom1      = forceatoms[idx + 1];
+ +        shell1     = forceatoms[idx + 2];
+ +        atom2      = forceatoms[idx + 3];
+ +        shell2     = forceatoms[idx + 4];
+ +        charge1    = md->chargeA[shell1];
+ +        charge2    = md->chargeA[shell2];
+ +        thole_a    = forceparams[ftype].thole.a;
+ +        alpha1     = forceparams[ftype].thole.alpha1;
+ +        alpha2     = forceparams[ftype].thole.alpha2;
+ +        qprod      = charge1*charge2;
+ +        screen_fac = thole_a*pow(alpha1*alpha2, -1.0/6.0);
+ +
+ +        /* Same call order as always: forces and energy are accumulated */
+ +        vsum += do_1_thole(x[atom1], x[atom2], f[atom1], f[atom2], pbc, qprod, fshift, screen_fac);
+ +        vsum += do_1_thole(x[shell1], x[atom2], f[shell1], f[atom2], pbc, -qprod, fshift, screen_fac);
+ +        vsum += do_1_thole(x[atom1], x[shell2], f[atom1], f[shell2], pbc, -qprod, fshift, screen_fac);
+ +        vsum += do_1_thole(x[shell1], x[shell2], f[shell1], f[shell2], pbc, qprod, fshift, screen_fac);
+ +    }
+ +
+ +    return vsum;
+ +}
+ +
+ +/* Angle between the bonds i-j and k-j.
+ + *
+ + * Fills r_ij and r_kj through pbc_rvec_sub(pbc, xi, xj, ...) and
+ + * pbc_rvec_sub(pbc, xk, xj, ...), storing the values those calls return in
+ + * *t1 and *t2. *costh receives cos_angle(r_ij, r_kj); the return value is
+ + * its arc cosine.
+ + */
+ +real bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ +                rvec r_ij, rvec r_kj, real *costh,
+ +                int *t1, int *t2)
+ +{
+ +    real cosine, angle;
+ +
+ +    *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij);
+ +    *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj);
+ +
+ +    cosine = cos_angle(r_ij, r_kj);
+ +    *costh = cosine;
+ +    angle  = acos(cosine);
+ +
+ +    return angle;
+ +}
+ +
+ +/* Harmonic angle potential over (type, ai, aj, ak) quadruplets.
+ + *
+ + * For every entry the angle at aj is obtained from bond_angle() and passed
+ + * to harmonic() together with the A/B force constants and the A/B reference
+ + * values from forceparams[type].harmonic (reference values multiplied by
+ + * DEG2RAD). The value returned by harmonic() is accumulated into
+ + * *dvdlambda and the energy into the return value. Forces are added to f[]
+ + * and fshift[] only when cos(theta)^2 < 1; the energy is added regardless.
+ + * md, fcd and global_atom_index are not used.
+ + */
+ +real angles(int nbonds,
+ +            const t_iatom forceatoms[], const t_iparams forceparams[],
+ +            const rvec x[], rvec f[], rvec fshift[],
+ +            const t_pbc *pbc, const t_graph *g,
+ +            real lambda, real *dvdlambda,
+ +            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +            int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ai, aj, ak, t1, t2, type;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; i < nbonds; )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2);  /*  41                */
+ +
+ +        *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ +                               forceparams[type].harmonic.krB,
+ +                               forceparams[type].harmonic.rA*DEG2RAD,
+ +                               forceparams[type].harmonic.rB*DEG2RAD,
+ +                               theta, lambda, &va, &dVdt);  /*  21  */
+ +        vtot += va;
+ +
+ +        cos_theta2 = sqr(cos_theta);
+ +        /* 1/sqrt(1 - cos^2) below is only finite when cos^2 < 1 */
+ +        if (cos_theta2 < 1)
+ +        {
+ +            int  m;
+ +            real st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            real nrkj_1, nrij_1;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st  = dVdt*gmx_invsqrt(1 - cos_theta2); /*  12            */
+ +            sth = st*cos_theta;                     /*   1            */
+ +#ifdef DEBUG
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
+ +                        theta*RAD2DEG, va, dVdt);
+ +            }
+ +#endif
+ +            nrij2 = iprod(r_ij, r_ij);      /*   5            */
+ +            nrkj2 = iprod(r_kj, r_kj);      /*   5            */
+ +
+ +            nrij_1 = gmx_invsqrt(nrij2);    /*  10            */
+ +            nrkj_1 = gmx_invsqrt(nrkj2);    /*  10            */
+ +
+ +            cik = st*nrij_1*nrkj_1;         /*   2            */
+ +            cii = sth*nrij_1*nrij_1;        /*   2            */
+ +            ckk = sth*nrkj_1*nrkj_1;        /*   2            */
+ +
+ +            for (m = 0; m < DIM; m++)
+ +            {           /*  39                */
+ +                f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m] - ckk*r_kj[m]);
+ +                /* The three forces sum to zero */
+ +                f_j[m]    = -f_i[m] - f_k[m];
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +            if (g != NULL)
+ +            {
+ +                /* With a graph, take both shift indices relative to atom
+ +                 * aj from the graph shift vectors, overriding the values
+ +                 * set by bond_angle().
+ +                 */
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +        }                                           /* 161 TOTAL      */
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +#ifdef SIMD_BONDEDS
+ +
+ +/* As angles, but using SIMD to calculate many angles at once.
+ + * This routine does not calculate energies and shift forces.
+ + *
+ + * Only the A-state parameters (harmonic.krA, harmonic.rA) are read; lambda
+ + * is not used. forceatoms holds (type, ai, aj, ak) quadruplets and nbonds
+ + * is the number of entries, not the number of angles. Forces are added
+ + * to f[].
+ + */
+ +static gmx_inline void
+ +angles_noener_simd(int nbonds,
+ +                   const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                   const rvec x[], rvec f[],
+ +                   const t_pbc *pbc, const t_graph gmx_unused *g,
+ +                   real gmx_unused lambda,
+ +                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                   int gmx_unused *global_atom_index)
+ +{
+ +#define UNROLL GMX_SIMD_WIDTH_HERE
+ +    /* Number of forceatoms entries per angle: type + 3 atoms */
+ +    const int      nfa1 = 4;
+ +    int            i, iu, s, m;
+ +    int            type, ai[UNROLL], aj[UNROLL], ak[UNROLL];
+ +    /* Each buffer has UNROLL spare elements; the pointers obtained from
+ +     * gmx_simd_align_real() below are the ones actually used.
+ +     */
+ +    real           coeff_array[2*UNROLL+UNROLL], *coeff;
+ +    real           dr_array[2*DIM*UNROLL+UNROLL], *dr;
+ +    real           f_buf_array[6*UNROLL+UNROLL], *f_buf;
+ +    gmx_mm_pr      k_S, theta0_S;
+ +    gmx_mm_pr      rijx_S, rijy_S, rijz_S;
+ +    gmx_mm_pr      rkjx_S, rkjy_S, rkjz_S;
+ +    gmx_mm_pr      one_S;
+ +    gmx_mm_pr      rij_rkj_S;
+ +    gmx_mm_pr      nrij2_S, nrij_1_S;
+ +    gmx_mm_pr      nrkj2_S, nrkj_1_S;
+ +    gmx_mm_pr      cos_S, sin_S;
+ +    gmx_mm_pr      theta_S;
+ +    gmx_mm_pr      st_S, sth_S;
+ +    gmx_mm_pr      cik_S, cii_S, ckk_S;
+ +    gmx_mm_pr      f_ix_S, f_iy_S, f_iz_S;
+ +    gmx_mm_pr      f_kx_S, f_ky_S, f_kz_S;
+ +    pbc_simd_t     pbc_simd;
+ +
+ +    /* Ensure register memory alignment */
+ +    coeff = gmx_simd_align_real(coeff_array);
+ +    dr    = gmx_simd_align_real(dr_array);
+ +    f_buf = gmx_simd_align_real(f_buf_array);
+ +
+ +    set_pbc_simd(pbc, &pbc_simd);
+ +
+ +    one_S = gmx_set1_pr(1.0);
+ +
+ +    /* nbonds is the number of angles times nfa1, here we step UNROLL angles */
+ +    for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+ +    {
+ +        /* Collect atoms for UNROLL angles.
+ +         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ +         */
+ +        iu = i;
+ +        for (s = 0; s < UNROLL; s++)
+ +        {
+ +            type  = forceatoms[iu];
+ +            ai[s] = forceatoms[iu+1];
+ +            aj[s] = forceatoms[iu+2];
+ +            ak[s] = forceatoms[iu+3];
+ +
+ +            /* coeff layout: [0,UNROLL) force constants,
+ +             * [UNROLL,2*UNROLL) reference angles in radians.
+ +             */
+ +            coeff[s]        = forceparams[type].harmonic.krA;
+ +            coeff[UNROLL+s] = forceparams[type].harmonic.rA*DEG2RAD;
+ +
+ +            /* If you can't use pbc_dx_simd below for PBC, e.g. because
+ +             * you can't round in SIMD, use pbc_rvec_sub here.
+ +             */
+ +            /* Store the non PBC corrected distances packed and aligned:
+ +             * component m of r_ij at dr[m*UNROLL + s], component m of r_kj
+ +             * at dr[(DIM+m)*UNROLL + s].
+ +             */
+ +            for (m = 0; m < DIM; m++)
+ +            {
+ +                dr[s +      m *UNROLL] = x[ai[s]][m] - x[aj[s]][m];
+ +                dr[s + (DIM+m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m];
+ +            }
+ +
+ +            /* At the end fill the arrays with identical entries: iu stops
+ +             * advancing at the last angle, so the remaining lanes repeat it.
+ +             */
+ +            if (iu + nfa1 < nbonds)
+ +            {
+ +                iu += nfa1;
+ +            }
+ +        }
+ +
+ +        k_S       = gmx_load_pr(coeff);
+ +        theta0_S  = gmx_load_pr(coeff+UNROLL);
+ +
+ +        rijx_S    = gmx_load_pr(dr + 0*UNROLL);
+ +        rijy_S    = gmx_load_pr(dr + 1*UNROLL);
+ +        rijz_S    = gmx_load_pr(dr + 2*UNROLL);
+ +        rkjx_S    = gmx_load_pr(dr + 3*UNROLL);
+ +        rkjy_S    = gmx_load_pr(dr + 4*UNROLL);
+ +        rkjz_S    = gmx_load_pr(dr + 5*UNROLL);
+ +
+ +        pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
+ +        pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
+ +
+ +        rij_rkj_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
+ +                                 rkjx_S, rkjy_S, rkjz_S);
+ +
+ +        nrij2_S   = gmx_norm2_pr(rijx_S, rijy_S, rijz_S);
+ +        nrkj2_S   = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
+ +
+ +        nrij_1_S  = gmx_invsqrt_pr(nrij2_S);
+ +        nrkj_1_S  = gmx_invsqrt_pr(nrkj2_S);
+ +
+ +        cos_S     = gmx_mul_pr(rij_rkj_S, gmx_mul_pr(nrij_1_S, nrkj_1_S));
+ +
+ +        theta_S   = gmx_acos_pr(cos_S);
+ +
+ +        /* Despite its name, sin_S holds 1/sqrt(max(1 - cos^2, 0)) */
+ +        sin_S     = gmx_invsqrt_pr(gmx_max_pr(gmx_sub_pr(one_S, gmx_mul_pr(cos_S, cos_S)),
+ +                                              gmx_setzero_pr()));
+ +        /* k*(theta0 - theta)/sqrt(1 - cos^2) */
+ +        st_S      = gmx_mul_pr(gmx_mul_pr(k_S, gmx_sub_pr(theta0_S, theta_S)),
+ +                               sin_S);
+ +        sth_S     = gmx_mul_pr(st_S, cos_S);
+ +
+ +        cik_S     = gmx_mul_pr(st_S,  gmx_mul_pr(nrij_1_S, nrkj_1_S));
+ +        cii_S     = gmx_mul_pr(sth_S, gmx_mul_pr(nrij_1_S, nrij_1_S));
+ +        ckk_S     = gmx_mul_pr(sth_S, gmx_mul_pr(nrkj_1_S, nrkj_1_S));
+ +
+ +        /* NOTE(review): gmx_nmsub_pr(a, b, c) is presumably c - a*b, which
+ +         * would give f_i = cii*r_ij - cik*r_kj and f_k = ckk*r_kj - cik*r_ij
+ +         * - confirm against the SIMD macro definitions.
+ +         */
+ +        f_ix_S    = gmx_mul_pr(cii_S, rijx_S);
+ +        f_ix_S    = gmx_nmsub_pr(cik_S, rkjx_S, f_ix_S);
+ +        f_iy_S    = gmx_mul_pr(cii_S, rijy_S);
+ +        f_iy_S    = gmx_nmsub_pr(cik_S, rkjy_S, f_iy_S);
+ +        f_iz_S    = gmx_mul_pr(cii_S, rijz_S);
+ +        f_iz_S    = gmx_nmsub_pr(cik_S, rkjz_S, f_iz_S);
+ +        f_kx_S    = gmx_mul_pr(ckk_S, rkjx_S);
+ +        f_kx_S    = gmx_nmsub_pr(cik_S, rijx_S, f_kx_S);
+ +        f_ky_S    = gmx_mul_pr(ckk_S, rkjy_S);
+ +        f_ky_S    = gmx_nmsub_pr(cik_S, rijy_S, f_ky_S);
+ +        f_kz_S    = gmx_mul_pr(ckk_S, rkjz_S);
+ +        f_kz_S    = gmx_nmsub_pr(cik_S, rijz_S, f_kz_S);
+ +
+ +        /* f_buf layout: f_i components in blocks 0..2, f_k in blocks 3..5 */
+ +        gmx_store_pr(f_buf + 0*UNROLL, f_ix_S);
+ +        gmx_store_pr(f_buf + 1*UNROLL, f_iy_S);
+ +        gmx_store_pr(f_buf + 2*UNROLL, f_iz_S);
+ +        gmx_store_pr(f_buf + 3*UNROLL, f_kx_S);
+ +        gmx_store_pr(f_buf + 4*UNROLL, f_ky_S);
+ +        gmx_store_pr(f_buf + 5*UNROLL, f_kz_S);
+ +
+ +        /* Scatter the forces back; the iu < nbonds test stops before the
+ +         * padding lanes so the repeated last angle is applied only once.
+ +         */
+ +        iu = i;
+ +        s  = 0;
+ +        do
+ +        {
+ +            for (m = 0; m < DIM; m++)
+ +            {
+ +                f[ai[s]][m] += f_buf[s + m*UNROLL];
+ +                /* The force on j is minus the sum of those on i and k */
+ +                f[aj[s]][m] -= f_buf[s + m*UNROLL] + f_buf[s + (DIM+m)*UNROLL];
+ +                f[ak[s]][m] += f_buf[s + (DIM+m)*UNROLL];
+ +            }
+ +            s++;
+ +            iu += nfa1;
+ +        }
+ +        while (s < UNROLL && iu < nbonds);
+ +    }
+ +#undef UNROLL
+ +}
+ +
+ +#endif /* SIMD_BONDEDS */
+ +
+ +/* Harmonic restraint on the vector dr = -a*r_ij - (1-a)*r_kj.
+ + *
+ + * forceatoms holds (type, ai, aj, ak) quadruplets. The force constant klin
+ + * and the weight a are interpolated linearly in lambda between the A and B
+ + * entries of forceparams[type].linangle. The energy per entry is
+ + * 0.5*klin*|dr|^2; its sum is returned. Forces are added to f[] and
+ + * fshift[], and a lambda-derivative term is accumulated into *dvdlambda.
+ + * md, fcd and global_atom_index are not used.
+ + */
+ +real linear_angles(int nbonds,
+ +                   const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                   const rvec x[], rvec f[], rvec fshift[],
+ +                   const t_pbc *pbc, const t_graph *g,
+ +                   real lambda, real *dvdlambda,
+ +                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                   int gmx_unused *global_atom_index)
+ +{
+ +    int  i, m, ai, aj, ak, t1, t2, type;
+ +    rvec f_i, f_j, f_k;
+ +    real L1, kA, kB, aA, aB, dr, dr2, va, vtot, a, b, klin;
+ +    ivec jt, dt_ij, dt_kj;
+ +    rvec r_ij, r_kj, r_ik, dx;
+ +
+ +    /* Weight of the A state; lambda is the weight of the B state */
+ +    L1   = 1-lambda;
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        kA   = forceparams[type].linangle.klinA;
+ +        kB   = forceparams[type].linangle.klinB;
+ +        klin = L1*kA + lambda*kB;
+ +
+ +        aA   = forceparams[type].linangle.aA;
+ +        aB   = forceparams[type].linangle.aB;
+ +        a    = L1*aA+lambda*aB;
+ +        b    = 1-a;
+ +
+ +        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ +        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ +        rvec_sub(r_ij, r_kj, r_ik);
+ +
+ +        dr2 = 0;
+ +        for (m = 0; (m < DIM); m++)
+ +        {
+ +            /* Component m of the restrained vector; dx keeps a copy for
+ +             * the lambda derivative below.
+ +             */
+ +            dr        = -a * r_ij[m] - b * r_kj[m];
+ +            dr2      += dr*dr;
+ +            dx[m]     = dr;
+ +            f_i[m]    = a*klin*dr;
+ +            f_k[m]    = b*klin*dr;
+ +            /* The three forces sum to zero */
+ +            f_j[m]    = -(f_i[m]+f_k[m]);
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] += f_j[m];
+ +            f[ak][m] += f_k[m];
+ +        }
+ +        va          = 0.5*klin*dr2;
+ +        /* NOTE(review): differentiating dr = -a*r_ij - (1-a)*r_kj with
+ +         * respect to a gives -(r_ij - r_kj). If rvec_sub(a, b, c) stores
+ +         * a - b, so that r_ik = r_ij - r_kj, the sign of the (aB-aA) term
+ +         * looks inverted - confirm with a finite-difference check.
+ +         */
+ +        *dvdlambda += 0.5*(kB-kA)*dr2 + klin*(aB-aA)*iprod(dx, r_ik);
+ +
+ +        vtot += va;
+ +
+ +        if (g)
+ +        {
+ +            /* With a graph, take both shift indices relative to atom aj
+ +             * from the graph shift vectors, overriding the pbc_rvec_sub
+ +             * results.
+ +             */
+ +            copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +            t1 = IVEC2IS(dt_ij);
+ +            t2 = IVEC2IS(dt_kj);
+ +        }
+ +        rvec_inc(fshift[t1], f_i);
+ +        rvec_inc(fshift[CENTRAL], f_j);
+ +        rvec_inc(fshift[t2], f_k);
+ +    }                                         /* 57 TOTAL     */
+ +    return vtot;
+ +}
+ +
+ +real urey_bradley(int nbonds,
+ +                  const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                  const rvec x[], rvec f[], rvec fshift[],
+ +                  const t_pbc *pbc, const t_graph *g,
+ +                  real lambda, real *dvdlambda,
+ +                  const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                  int gmx_unused *global_atom_index)
+ +{
+ +    int  i, m, ai, aj, ak, t1, t2, type, ki;
+ +    rvec r_ij, r_kj, r_ik;
+ +    real cos_theta, cos_theta2, theta;
+ +    real dVdt, va, vtot, dr, dr2, vbond, fbond, fik;
+ +    real kthA, th0A, kUBA, r13A, kthB, th0B, kUBB, r13B;
+ +    ivec jt, dt_ij, dt_kj, dt_ik;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type  = forceatoms[i++];
+ +        ai    = forceatoms[i++];
+ +        aj    = forceatoms[i++];
+ +        ak    = forceatoms[i++];
+ +        th0A  = forceparams[type].u_b.thetaA*DEG2RAD;
+ +        kthA  = forceparams[type].u_b.kthetaA;
+ +        r13A  = forceparams[type].u_b.r13A;
+ +        kUBA  = forceparams[type].u_b.kUBA;
+ +        th0B  = forceparams[type].u_b.thetaB*DEG2RAD;
+ +        kthB  = forceparams[type].u_b.kthetaB;
+ +        r13B  = forceparams[type].u_b.r13B;
+ +        kUBB  = forceparams[type].u_b.kUBB;
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2);                     /*  41             */
+ +
+ +        *dvdlambda += harmonic(kthA, kthB, th0A, th0B, theta, lambda, &va, &dVdt); /*  21  */
+ +        vtot       += va;
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);                               /*   3      */
+ +        dr2  = iprod(r_ik, r_ik);                                                   /*   5            */
+ +        dr   = dr2*gmx_invsqrt(dr2);                                                /*  10            */
+ +
+ +        *dvdlambda += harmonic(kUBA, kUBB, r13A, r13B, dr, lambda, &vbond, &fbond); /*  19  */
+ +
+ +        cos_theta2 = sqr(cos_theta);                                                /*   1            */
+ +        if (cos_theta2 < 1)
+ +        {
+ +            real st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st  = dVdt*gmx_invsqrt(1 - cos_theta2); /*  12            */
+ +            sth = st*cos_theta;                     /*   1            */
+ +#ifdef DEBUG
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
+ +                        theta*RAD2DEG, va, dVdt);
+ +            }
+ +#endif
+ +            nrkj2 = iprod(r_kj, r_kj);  /*   5                */
+ +            nrij2 = iprod(r_ij, r_ij);
+ +
+ +            cik = st*gmx_invsqrt(nrkj2*nrij2); /*  12         */
+ +            cii = sth/nrij2;                   /*  10         */
+ +            ckk = sth/nrkj2;                   /*  10         */
+ +
+ +            for (m = 0; (m < DIM); m++)        /*  39         */
+ +            {
+ +                f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
+ +                f_j[m]    = -f_i[m]-f_k[m];
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +        }                                       /* 161 TOTAL  */
+ +        /* Time for the bond calculations */
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        vtot  += vbond;            /* 1*/
+ +        fbond *= gmx_invsqrt(dr2); /*   6             */
+ +
+ +        if (g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), dt_ik);
+ +            ki = IVEC2IS(dt_ik);
+ +        }
+ +        for (m = 0; (m < DIM); m++)     /*  15                */
+ +        {
+ +            fik                 = fbond*r_ik[m];
+ +            f[ai][m]           += fik;
+ +            f[ak][m]           -= fik;
+ +            fshift[ki][m]      += fik;
+ +            fshift[CENTRAL][m] -= fik;
+ +        }
+ +    }
+ +    return vtot;
+ +}
+ +/* Quartic angle potential.
+ + *
+ + * forceatoms is read as consecutive (type, ai, aj, ak) entries until index
+ + * nbonds is reached.  For each entry the energy is
+ + *   qangle.c[0] + sum_{j=1..4} qangle.c[j]*dt^j,
+ + * where dt is the ai-aj-ak bond angle minus qangle.theta*DEG2RAD.
+ + * The forces are added to f and to the shift forces fshift, and the summed
+ + * energy is returned.  lambda and dvdlambda are not used here.
+ + */
+ +real quartic_angles(int nbonds,
+ +                    const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                    const rvec x[], rvec f[], rvec fshift[],
+ +                    const t_pbc *pbc, const t_graph *g,
+ +                    real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                    int gmx_unused *global_atom_index)
+ +{
+ +    int  i, j, ai, aj, ak, t1, t2, type;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, cos_theta2, theta, dt, dVdt, va, dtp, c, vtot;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2); /*  41         */
+ +
+ +        dt = theta - forceparams[type].qangle.theta*DEG2RAD;   /* 2          */
+ +
+ +        dVdt = 0;
+ +        va   = forceparams[type].qangle.c[0];
+ +        dtp  = 1.0;
+ +        for (j = 1; j <= 4; j++) /* dtp holds dt^(j-1) on entry, dt^j on exit */
+ +        {
+ +            c     = forceparams[type].qangle.c[j];
+ +            dVdt -= j*c*dtp; /* dVdt accumulates minus the derivative of va to dt */
+ +            dtp  *= dt;
+ +            va   += c*dtp;
+ +        }
+ +        /* 20 */
+ +
+ +        vtot += va;
+ +
+ +        cos_theta2 = sqr(cos_theta);            /*   1                */
+ +        if (cos_theta2 < 1) /* no force is added when 1 - cos_theta2 is not positive */
+ +        {
+ +            int  m;
+ +            real st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st  = dVdt*gmx_invsqrt(1 - cos_theta2); /*  12            */
+ +            sth = st*cos_theta;                     /*   1            */
+ +#ifdef DEBUG
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
+ +                        theta*RAD2DEG, va, dVdt);
+ +            }
+ +#endif
+ +            nrkj2 = iprod(r_kj, r_kj);  /*   5                */
+ +            nrij2 = iprod(r_ij, r_ij);
+ +
+ +            cik = st*gmx_invsqrt(nrkj2*nrij2); /*  12         */
+ +            cii = sth/nrij2;                   /*  10         */
+ +            ckk = sth/nrkj2;                   /*  10         */
+ +
+ +            for (m = 0; (m < DIM); m++)        /*  39         */
+ +            {
+ +                f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
+ +                f_j[m]    = -f_i[m]-f_k[m]; /* the three forces sum to zero */
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +            if (g) /* with a graph, t1 and t2 are taken from the graph shift vectors */
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +        }                                       /* 153 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +/* Returns the signed dihedral angle for the positions xi, xj, xk and xl.
+ + *
+ + * On return r_ij, r_kj and r_kl hold the difference vectors produced by
+ + * pbc_rvec_sub, t1, t2 and t3 hold the values pbc_rvec_sub returned for them,
+ + * m = cprod(r_ij, r_kj), n = cprod(r_kj, r_kl) and *sign is -1 or +1.
+ + */
+ +real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl,
+ +               const t_pbc *pbc,
+ +               rvec r_ij, rvec r_kj, rvec r_kl, rvec m, rvec n,
+ +               real *sign, int *t1, int *t2, int *t3)
+ +{
+ +    real angle, side;
+ +
+ +    /* Difference vectors, 3 flops each */
+ +    *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij);
+ +    *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj);
+ +    *t3 = pbc_rvec_sub(pbc, xk, xl, r_kl);
+ +
+ +    /* The two plane normals, 9 flops each */
+ +    cprod(r_ij, r_kj, m);
+ +    cprod(r_kj, r_kl, n);
+ +
+ +    /* Angle between the normals: 49 flops (assuming 25 for atan2) */
+ +    angle = gmx_angle(m, n);
+ +
+ +    /* The sign of the angle follows the sign of r_ij . n (5 flops) */
+ +    side = iprod(r_ij, n);
+ +    if (side < 0.0)
+ +    {
+ +        (*sign) = -1.0;
+ +    }
+ +    else
+ +    {
+ +        (*sign) = 1.0;
+ +    }
+ +    angle = (*sign)*angle; /* 1 flop, 82 in total */
+ +
+ +    return angle;
+ +}
+ +
+ +
+ +#ifdef SIMD_BONDEDS
+ +
+ +/* As dih_angle above, but calculates GMX_SIMD_WIDTH_HERE dihedral angles at
+ + * once using SIMD, and also calculates the pre-factors required for the
+ + * dihedral force update (nrkj_m2_S, nrkj_n2_S, p and q).
+ + * ai, aj, ak and al each hold one atom index per SIMD lane.
+ + * dr is used as scratch space for 3*DIM*GMX_SIMD_WIDTH_HERE reals and is read
+ + * with gmx_load_pr; p and q are written with gmx_store_pr.
+ + * NOTE(review): presumably dr, p and q therefore have to be SIMD register
+ + * aligned - confirm against the gmx_load_pr/gmx_store_pr requirements.
+ + */
+ +static gmx_inline void
+ +dih_angle_simd(const rvec *x,
+ +               const int *ai, const int *aj, const int *ak, const int *al,
+ +               const pbc_simd_t *pbc,
+ +               real *dr,
+ +               gmx_mm_pr *phi_S,
+ +               gmx_mm_pr *mx_S, gmx_mm_pr *my_S, gmx_mm_pr *mz_S,
+ +               gmx_mm_pr *nx_S, gmx_mm_pr *ny_S, gmx_mm_pr *nz_S,
+ +               gmx_mm_pr *nrkj_m2_S,
+ +               gmx_mm_pr *nrkj_n2_S,
+ +               real *p,
+ +               real *q)
+ +{
+ +#define UNROLL GMX_SIMD_WIDTH_HERE
+ +    int       s, m;
+ +    gmx_mm_pr rijx_S, rijy_S, rijz_S;
+ +    gmx_mm_pr rkjx_S, rkjy_S, rkjz_S;
+ +    gmx_mm_pr rklx_S, rkly_S, rklz_S;
+ +    gmx_mm_pr cx_S, cy_S, cz_S;
+ +    gmx_mm_pr cn_S;
+ +    gmx_mm_pr s_S;
+ +    gmx_mm_pr ipr_S;
+ +    gmx_mm_pr iprm_S, iprn_S;
+ +    gmx_mm_pr nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
+ +    gmx_mm_pr p_S, q_S;
+ +    gmx_mm_pr fmin_S = gmx_set1_pr(GMX_FLOAT_MIN);
-     /* Set sign of the angle with the sign of ipr_S.
-      * Since phi is currently positive, we can use OR instead of XOR.
-      */
-     *phi_S     = gmx_or_pr(*phi_S, gmx_and_pr(ipr_S, sign_mask_S));
+ +
+ +    for (s = 0; s < UNROLL; s++)
+ +    {
+ +        /* If you can't use pbc_dx_simd below for PBC, e.g. because
+ +         * you can't round in SIMD, use pbc_rvec_sub here.
+ +         */
+ +        for (m = 0; m < DIM; m++)
+ +        {
+ +            dr[s + (0*DIM + m)*UNROLL] = x[ai[s]][m] - x[aj[s]][m]; /* r_ij */
+ +            dr[s + (1*DIM + m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m]; /* r_kj */
+ +            dr[s + (2*DIM + m)*UNROLL] = x[ak[s]][m] - x[al[s]][m]; /* r_kl */
+ +        }
+ +    }
+ +
+ +    rijx_S = gmx_load_pr(dr + 0*UNROLL);
+ +    rijy_S = gmx_load_pr(dr + 1*UNROLL);
+ +    rijz_S = gmx_load_pr(dr + 2*UNROLL);
+ +    rkjx_S = gmx_load_pr(dr + 3*UNROLL);
+ +    rkjy_S = gmx_load_pr(dr + 4*UNROLL);
+ +    rkjz_S = gmx_load_pr(dr + 5*UNROLL);
+ +    rklx_S = gmx_load_pr(dr + 6*UNROLL);
+ +    rkly_S = gmx_load_pr(dr + 7*UNROLL);
+ +    rklz_S = gmx_load_pr(dr + 8*UNROLL);
+ +
+ +    pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
+ +    pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
+ +    pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc);
+ +
+ +    gmx_cprod_pr(rijx_S, rijy_S, rijz_S,
+ +                 rkjx_S, rkjy_S, rkjz_S,
+ +                 mx_S, my_S, mz_S); /* m from r_ij and r_kj */
+ +
+ +    gmx_cprod_pr(rkjx_S, rkjy_S, rkjz_S,
+ +                 rklx_S, rkly_S, rklz_S,
+ +                 nx_S, ny_S, nz_S); /* n from r_kj and r_kl */
+ +
+ +    gmx_cprod_pr(*mx_S, *my_S, *mz_S,
+ +                 *nx_S, *ny_S, *nz_S,
+ +                 &cx_S, &cy_S, &cz_S); /* c from m and n, only its norm is used */
+ +
+ +    cn_S       = gmx_sqrt_pr(gmx_norm2_pr(cx_S, cy_S, cz_S));
+ +
+ +    s_S        = gmx_iprod_pr(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
+ +
+ +    /* Determine the dihedral angle, the sign might need correction */
+ +    *phi_S     = gmx_atan2_pr(cn_S, s_S);
+ +
+ +    ipr_S      = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
+ +                              *nx_S, *ny_S, *nz_S);
+ +
+ +    iprm_S     = gmx_norm2_pr(*mx_S, *my_S, *mz_S);
+ +    iprn_S     = gmx_norm2_pr(*nx_S, *ny_S, *nz_S);
+ +
+ +    nrkj2_S    = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
+ +
+ +    /* Avoid division by zero. When zero, the result is multiplied by 0
+ +     * anyhow, so the 3 max below do not affect the final result.
+ +     */
+ +    nrkj2_S    = gmx_max_pr(nrkj2_S, fmin_S);
+ +    nrkj_1_S   = gmx_invsqrt_pr(nrkj2_S);
+ +    nrkj_2_S   = gmx_mul_pr(nrkj_1_S, nrkj_1_S);
+ +    nrkj_S     = gmx_mul_pr(nrkj2_S, nrkj_1_S);
+ +
+ +    iprm_S     = gmx_max_pr(iprm_S, fmin_S);
+ +    iprn_S     = gmx_max_pr(iprn_S, fmin_S);
+ +    *nrkj_m2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprm_S)); /* nrkj/iprm, as in do_dih_fup */
+ +    *nrkj_n2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprn_S)); /* nrkj/iprn, as in do_dih_fup */
+ +
++    /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
++    *phi_S     = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S);
+ +
+ +    p_S        = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
+ +                              rkjx_S, rkjy_S, rkjz_S);
+ +    p_S        = gmx_mul_pr(p_S, nrkj_2_S);
+ +
+ +    q_S        = gmx_iprod_pr(rklx_S, rkly_S, rklz_S,
+ +                              rkjx_S, rkjy_S, rkjz_S);
+ +    q_S        = gmx_mul_pr(q_S, nrkj_2_S);
+ +
+ +    gmx_store_pr(p, p_S);
+ +    gmx_store_pr(q, q_S);
+ +#undef UNROLL
+ +}
+ +
+ +#endif /* SIMD_BONDEDS */
+ +
+ +/* Distributes the dihedral force for the atoms i, j, k and l over f and
+ + * over the shift forces fshift, given ddphi and the vectors r_ij, r_kj,
+ + * r_kl, m and n (as filled in by dih_angle).
+ + * Nothing is added unless both iprod(m, m) and iprod(n, n) exceed
+ + * iprod(r_kj, r_kj)*GMX_REAL_EPS.
+ + * The shift indices t1, t2 and t3 passed in are only partly used: with a
+ + * graph g all three are recomputed from the graph, otherwise t3 is
+ + * recomputed from x[l] - x[j] when pbc is set, or set to CENTRAL.
+ + */
+ +void do_dih_fup(int i, int j, int k, int l, real ddphi,
+ +                rvec r_ij, rvec r_kj, rvec r_kl,
+ +                rvec m, rvec n, rvec f[], rvec fshift[],
+ +                const t_pbc *pbc, const t_graph *g,
+ +                const rvec x[], int t1, int t2, int t3)
+ +{
+ +    /* 143 FLOPS */
+ +    rvec f_i, f_j, f_k, f_l;
+ +    rvec uvec, vvec, svec, dx_jl;
+ +    real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
+ +    real a, b, p, q, toler;
+ +    ivec jt, dt_ij, dt_kj, dt_lj;
+ +
+ +    iprm  = iprod(m, m);       /*  5    */
+ +    iprn  = iprod(n, n);       /*  5  */
+ +    nrkj2 = iprod(r_kj, r_kj); /*  5  */
+ +    toler = nrkj2*GMX_REAL_EPS;
+ +    if ((iprm > toler) && (iprn > toler))
+ +    {
+ +        nrkj_1 = gmx_invsqrt(nrkj2); /* 10    */
+ +        nrkj_2 = nrkj_1*nrkj_1;      /*  1    */
+ +        nrkj   = nrkj2*nrkj_1;       /*  1    */
+ +        a      = -ddphi*nrkj/iprm;   /* 11    */
+ +        svmul(a, m, f_i);            /*  3    */
+ +        b     = ddphi*nrkj/iprn;     /* 11    */
+ +        svmul(b, n, f_l);            /*  3  */
+ +        p     = iprod(r_ij, r_kj);   /*  5    */
+ +        p    *= nrkj_2;              /*  1    */
+ +        q     = iprod(r_kl, r_kj);   /*  5    */
+ +        q    *= nrkj_2;              /*  1    */
+ +        svmul(p, f_i, uvec);         /*  3    */
+ +        svmul(q, f_l, vvec);         /*  3    */
+ +        rvec_sub(uvec, vvec, svec);  /*  3    */
+ +        rvec_sub(f_i, svec, f_j);    /*  3    */
+ +        rvec_add(f_l, svec, f_k);    /*  3    */
+ +        rvec_inc(f[i], f_i);         /*  3    */
+ +        rvec_dec(f[j], f_j);         /*  3    */
+ +        rvec_dec(f[k], f_k);         /*  3    */
+ +        rvec_inc(f[l], f_l);         /*  3    */
+ +
+ +        if (g) /* shift indices of i, k and l relative to j, from the graph */
+ +        {
+ +            copy_ivec(SHIFT_IVEC(g, j), jt);
+ +            ivec_sub(SHIFT_IVEC(g, i), jt, dt_ij);
+ +            ivec_sub(SHIFT_IVEC(g, k), jt, dt_kj);
+ +            ivec_sub(SHIFT_IVEC(g, l), jt, dt_lj);
+ +            t1 = IVEC2IS(dt_ij);
+ +            t2 = IVEC2IS(dt_kj);
+ +            t3 = IVEC2IS(dt_lj);
+ +        }
+ +        else if (pbc) /* only the shift index is used, dx_jl itself is not */
+ +        {
+ +            t3 = pbc_rvec_sub(pbc, x[l], x[j], dx_jl);
+ +        }
+ +        else
+ +        {
+ +            t3 = CENTRAL;
+ +        }
+ +
+ +        rvec_inc(fshift[t1], f_i);
+ +        rvec_dec(fshift[CENTRAL], f_j);
+ +        rvec_dec(fshift[t2], f_k);
+ +        rvec_inc(fshift[t3], f_l);
+ +    }
+ +    /* 112 TOTAL    */
+ +}
+ +
+ +/* As do_dih_fup above, but without shift forces.
+ + *
+ + * Adds the dihedral force for the atoms i, j, k and l to f, given ddphi and
+ + * the vectors r_ij, r_kj, r_kl, m and n.  Nothing is added unless both
+ + * iprod(m, m) and iprod(n, n) exceed iprod(r_kj, r_kj)*GMX_REAL_EPS.
+ + */
+ +static void
+ +do_dih_fup_noshiftf(int i, int j, int k, int l, real ddphi,
+ +                    rvec r_ij, rvec r_kj, rvec r_kl,
+ +                    rvec m, rvec n, rvec f[])
+ +{
+ +    /* No shift-index bookkeeping is needed here, so only the force
+ +     * temporaries are declared.
+ +     */
+ +    rvec f_i, f_j, f_k, f_l;
+ +    rvec uvec, vvec, svec;
+ +    real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
+ +    real a, b, p, q, toler;
+ +
+ +    iprm  = iprod(m, m);       /*  5    */
+ +    iprn  = iprod(n, n);       /*  5  */
+ +    nrkj2 = iprod(r_kj, r_kj); /*  5  */
+ +    toler = nrkj2*GMX_REAL_EPS;
+ +    if ((iprm > toler) && (iprn > toler))
+ +    {
+ +        nrkj_1 = gmx_invsqrt(nrkj2); /* 10    */
+ +        nrkj_2 = nrkj_1*nrkj_1;      /*  1    */
+ +        nrkj   = nrkj2*nrkj_1;       /*  1    */
+ +        a      = -ddphi*nrkj/iprm;   /* 11    */
+ +        svmul(a, m, f_i);            /*  3    */
+ +        b     = ddphi*nrkj/iprn;     /* 11    */
+ +        svmul(b, n, f_l);            /*  3  */
+ +        p     = iprod(r_ij, r_kj);   /*  5    */
+ +        p    *= nrkj_2;              /*  1    */
+ +        q     = iprod(r_kl, r_kj);   /*  5    */
+ +        q    *= nrkj_2;              /*  1    */
+ +        svmul(p, f_i, uvec);         /*  3    */
+ +        svmul(q, f_l, vvec);         /*  3    */
+ +        rvec_sub(uvec, vvec, svec);  /*  3    */
+ +        rvec_sub(f_i, svec, f_j);    /*  3    */
+ +        rvec_add(f_l, svec, f_k);    /*  3    */
+ +        rvec_inc(f[i], f_i);         /*  3    */
+ +        rvec_dec(f[j], f_j);         /*  3    */
+ +        rvec_dec(f[k], f_k);         /*  3    */
+ +        rvec_inc(f[l], f_l);         /*  3    */
+ +    }
+ +}
+ +
+ +/* As do_dih_fup_noshiftf above, but with pre-calculated pre-factors.
+ + *
+ + * p and q are the projection factors, (f_i_x, f_i_y, f_i_z) is the force on
+ + * atom i and (mf_l_x, mf_l_y, mf_l_z) is minus the force on atom l.
+ + * The forces on all four atoms are added to f.
+ + */
+ +static gmx_inline void
+ +do_dih_fup_noshiftf_precalc(int i, int j, int k, int l,
+ +                            real p, real q,
+ +                            real f_i_x, real f_i_y, real f_i_z,
+ +                            real mf_l_x, real mf_l_y, real mf_l_z,
+ +                            rvec f[])
+ +{
+ +    rvec force_i, force_j, force_k, force_l;
+ +    rvec p_fi, q_fl, mix;
+ +
+ +    /* Force on i as passed in, scaled by p */
+ +    force_i[XX] = f_i_x;
+ +    force_i[YY] = f_i_y;
+ +    force_i[ZZ] = f_i_z;
+ +    svmul(p, force_i, p_fi);
+ +
+ +    /* Force on l is passed in with opposite sign, scaled by q */
+ +    force_l[XX] = -mf_l_x;
+ +    force_l[YY] = -mf_l_y;
+ +    force_l[ZZ] = -mf_l_z;
+ +    svmul(q, force_l, q_fl);
+ +
+ +    /* The forces on j and k follow from those on i and l */
+ +    rvec_sub(p_fi, q_fl, mix);
+ +    rvec_sub(force_i, mix, force_j);
+ +    rvec_add(force_l, mix, force_k);
+ +
+ +    rvec_inc(f[i], force_i);
+ +    rvec_dec(f[j], force_j);
+ +    rvec_dec(f[k], force_k);
+ +    rvec_inc(f[l], force_l);
+ +}
+ +
+ +
+ +/* Periodic dihedral term cp*(1 + cos(mult*phi - ph0)), where cp and ph0 are
+ + * interpolated linearly in lambda between the A and B parameters and the
+ + * phiA/phiB values are multiplied by DEG2RAD.
+ + * Stores the energy in *V and -cp*mult*sin(mult*phi - ph0) in *F;
+ + * returns the derivative of the energy with respect to lambda.
+ + * That is about 40 flops.
+ + */
+ +real dopdihs(real cpA, real cpB, real phiA, real phiB, int mult,
+ +             real phi, real lambda, real *V, real *F)
+ +{
+ +    real one_min_lambda = 1.0 - lambda;
+ +    real phase          = (one_min_lambda*phiA + lambda*phiB)*DEG2RAD;
+ +    real dphase         = (phiB - phiA)*DEG2RAD;
+ +    real kphi           = one_min_lambda*cpA + lambda*cpB;
+ +    real arg, sin_arg, one_plus_cos, dvdl;
+ +
+ +    arg          = mult*phi - phase;
+ +    sin_arg      = sin(arg);
+ +    one_plus_cos = 1.0 + cos(arg);
+ +
+ +    dvdl = (cpB - cpA)*one_plus_cos + kphi*dphase*sin_arg;
+ +
+ +    *V = kphi*one_plus_cos;
+ +    *F = -kphi*mult*sin_arg;
+ +
+ +    return dvdl;
+ +}
+ +
+ +/* Like dopdihs, but only stores -cp*mult*sin(mult*phi - ph0) in *F;
+ + * no energy and no lambda derivative are computed (about 20 flops).
+ + */
+ +static void
+ +dopdihs_noener(real cpA, real cpB, real phiA, real phiB, int mult,
+ +               real phi, real lambda, real *F)
+ +{
+ +    real one_min_lambda = 1.0 - lambda;
+ +    real phase          = (one_min_lambda*phiA + lambda*phiB)*DEG2RAD;
+ +    real kphi           = one_min_lambda*cpA + lambda*cpB;
+ +    real arg, sin_arg;
+ +
+ +    arg     = mult*phi - phase;
+ +    sin_arg = sin(arg);
+ +
+ +    *F = -kphi*mult*sin_arg;
+ +}
+ +
+ +/* Stores the lambda-interpolated force constant in *cp and the angle
+ + * argument mult*phi - ph0 in *mdphi, where ph0 is the lambda-interpolated
+ + * phiA/phiB multiplied by DEG2RAD.
+ + */
+ +static void
+ +dopdihs_mdphi(real cpA, real cpB, real phiA, real phiB, int mult,
+ +              real phi, real lambda, real *cp, real *mdphi)
+ +{
+ +    real one_min_lambda = 1.0 - lambda;
+ +    real phase          = (one_min_lambda*phiA + lambda*phiB)*DEG2RAD;
+ +
+ +    *cp = one_min_lambda*cpA + lambda*cpB;
+ +
+ +    *mdphi = mult*phi - phase;
+ +}
+ +
+ +static real dopdihs_min(real cpA, real cpB, real phiA, real phiB, int mult,
+ +                        real phi, real lambda, real *V, real *F)
+ +/* Similar to dopdihs, except for a minus sign and a different treatment
+ + * of mult/phi0: the energy is cp*(1 - cos(mult*(phi - ph0))).
+ + * Stores the energy in *V and cp*mult*sin(mult*(phi - ph0)) in *F;
+ + * returns the value used as lambda derivative (about 40 flops).
+ + */
+ +{
+ +    real one_min_lambda = 1.0 - lambda;
+ +    real phase          = (one_min_lambda*phiA + lambda*phiB)*DEG2RAD;
+ +    real dphase         = (phiB - phiA)*DEG2RAD;
+ +    real kphi           = one_min_lambda*cpA + lambda*cpB;
+ +    real arg, sin_arg, one_min_cos, dvdl;
+ +
+ +    arg         = mult*(phi-phase);
+ +    sin_arg     = sin(arg);
+ +    one_min_cos = 1.0-cos(arg);
+ +
+ +    /* NOTE(review): differentiating cp*(1 - cos(mult*(phi - ph0))) to
+ +     * lambda gives -cp*mult*dph0*sin(...) for the phase term, whereas the
+ +     * expression below uses +cp*dph0*sin(...); confirm this is intended.
+ +     */
+ +    dvdl = (cpB-cpA)*one_min_cos + kphi*dphase*sin_arg;
+ +
+ +    *V = kphi*one_min_cos;
+ +    *F = kphi*mult*sin_arg;
+ +
+ +    return dvdl;
+ +}
+ +
+ +real pdihs(int nbonds,
+ +           const t_iatom forceatoms[], const t_iparams forceparams[],
+ +           const rvec x[], rvec f[], rvec fshift[],
+ +           const t_pbc *pbc, const t_graph *g,
+ +           real lambda, real *dvdlambda,
+ +           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +           int gmx_unused *global_atom_index)
+ +{
+ +    int  i, type, ai, aj, ak, al;
+ +    int  t1, t2, t3;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +    real phi, sign, ddphi, vpd, vtot;
+ +
+ +    vtot = 0.0;
+ +
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &sign, &t1, &t2, &t3);  /*  84      */
+ +        *dvdlambda += dopdihs(forceparams[type].pdihs.cpA,
+ +                              forceparams[type].pdihs.cpB,
+ +                              forceparams[type].pdihs.phiA,
+ +                              forceparams[type].pdihs.phiB,
+ +                              forceparams[type].pdihs.mult,
+ +                              phi, lambda, &vpd, &ddphi);
+ +
+ +        vtot += vpd;
+ +        do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ +                   f, fshift, pbc, g, x, t1, t2, t3); /* 112          */
+ +
+ +#ifdef DEBUG
+ +        fprintf(debug, "pdih: (%d,%d,%d,%d) phi=%g\n",
+ +                ai, aj, ak, al, phi);
+ +#endif
+ +    } /* 223 TOTAL  */
+ +
+ +    return vtot;
+ +}
+ +
+ +void make_dp_periodic(real *dp)  /* 1 flop? */
+ +{
+ +    /* Shift *dp by one full turn of 2*pi when it is below -pi or at/above pi;
+ +     * values in [-pi, pi) are left untouched.
+ +     */
+ +    if (*dp < -M_PI)
+ +    {
+ +        *dp += 2*M_PI;
+ +    }
+ +    else if (*dp >= M_PI)
+ +    {
+ +        *dp -= 2*M_PI;
+ +    }
+ +}
+ +
+ +/* As pdihs above, but without calculating energies and shift forces.
+ + * Consecutive forceatoms entries that refer to the same four atoms are
+ + * handled together: the angle is computed once and their force factors
+ + * are summed before the force is distributed.
+ + */
+ +static void
+ +pdihs_noener(int nbonds,
+ +             const t_iatom forceatoms[], const t_iparams forceparams[],
+ +             const rvec x[], rvec f[],
+ +             const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ +             real lambda,
+ +             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +             int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ai, aj, ak, al;
+ +    int  t1, t2, t3;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +    real phi, sign, ddphi_sum, ddphi_one;
+ +
+ +    i = 0;
+ +    while (i < nbonds)
+ +    {
+ +        /* Each entry is (type, ai, aj, ak, al) */
+ +        ai = forceatoms[i+1];
+ +        aj = forceatoms[i+2];
+ +        ak = forceatoms[i+3];
+ +        al = forceatoms[i+4];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &sign, &t1, &t2, &t3);
+ +
+ +        /* Sum the force factors of all consecutive dihedrals on these
+ +         * atoms, so the angle and force distribution are computed once.
+ +         */
+ +        ddphi_sum = 0;
+ +        for (;; )
+ +        {
+ +            const t_iparams *ip = &forceparams[forceatoms[i]];
+ +
+ +            dopdihs_noener(ip->pdihs.cpA,
+ +                           ip->pdihs.cpB,
+ +                           ip->pdihs.phiA,
+ +                           ip->pdihs.phiB,
+ +                           ip->pdihs.mult,
+ +                           phi, lambda, &ddphi_one);
+ +            ddphi_sum += ddphi_one;
+ +
+ +            i += 5;
+ +
+ +            /* Stop at the end of the list or at a different atom quadruplet */
+ +            if (i >= nbonds ||
+ +                forceatoms[i+1] != ai ||
+ +                forceatoms[i+2] != aj ||
+ +                forceatoms[i+3] != ak ||
+ +                forceatoms[i+4] != al)
+ +            {
+ +                break;
+ +            }
+ +        }
+ +
+ +        do_dih_fup_noshiftf(ai, aj, ak, al, ddphi_sum, r_ij, r_kj, r_kl, m, n, f);
+ +    }
+ +}
+ +
+ +
+ +#ifdef SIMD_BONDEDS
+ +
+ +/* As pdihs_noner above, but using SIMD to calculate many dihedrals at once */
+ +static void
+ +pdihs_noener_simd(int nbonds,
+ +                  const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                  const rvec x[], rvec f[],
+ +                  const t_pbc *pbc, const t_graph gmx_unused *g,
+ +                  real gmx_unused lambda,
+ +                  const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                  int gmx_unused *global_atom_index)
+ +{
+ +#define UNROLL GMX_SIMD_WIDTH_HERE
+ +    const int       nfa1 = 5;
+ +    int             i, iu, s;
+ +    int             type, ai[UNROLL], aj[UNROLL], ak[UNROLL], al[UNROLL];
+ +    int             t1[UNROLL], t2[UNROLL], t3[UNROLL];
+ +    real            ddphi;
+ +    real            dr_array[3*DIM*UNROLL+UNROLL], *dr;
+ +    real            buf_array[7*UNROLL+UNROLL], *buf;
+ +    real           *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
+ +    gmx_mm_pr       phi0_S, phi_S;
+ +    gmx_mm_pr       mx_S, my_S, mz_S;
+ +    gmx_mm_pr       nx_S, ny_S, nz_S;
+ +    gmx_mm_pr       nrkj_m2_S, nrkj_n2_S;
+ +    gmx_mm_pr       cp_S, mdphi_S, mult_S;
+ +    gmx_mm_pr       sin_S, cos_S;
+ +    gmx_mm_pr       mddphi_S;
+ +    gmx_mm_pr       sf_i_S, msf_l_S;
+ +    pbc_simd_t      pbc_simd;
+ +
+ +    /* Ensure SIMD register alignment */
+ +    dr  = gmx_simd_align_real(dr_array);
+ +    buf = gmx_simd_align_real(buf_array);
+ +
+ +    /* Extract aligned pointer for parameters and variables */
+ +    cp    = buf + 0*UNROLL;
+ +    phi0  = buf + 1*UNROLL;
+ +    mult  = buf + 2*UNROLL;
+ +    p     = buf + 3*UNROLL;
+ +    q     = buf + 4*UNROLL;
+ +    sf_i  = buf + 5*UNROLL;
+ +    msf_l = buf + 6*UNROLL;
+ +
+ +    set_pbc_simd(pbc, &pbc_simd);
+ +
+ +    /* nbonds is the number of dihedrals times nfa1, here we step UNROLL dihs */
+ +    for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+ +    {
+ +        /* Collect atoms quadruplets for UNROLL dihedrals.
+ +         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ +         */
+ +        iu = i;
+ +        for (s = 0; s < UNROLL; s++)
+ +        {
+ +            type  = forceatoms[iu];
+ +            ai[s] = forceatoms[iu+1];
+ +            aj[s] = forceatoms[iu+2];
+ +            ak[s] = forceatoms[iu+3];
+ +            al[s] = forceatoms[iu+4];
+ +
+ +            cp[s]   = forceparams[type].pdihs.cpA;
+ +            phi0[s] = forceparams[type].pdihs.phiA*DEG2RAD;
+ +            mult[s] = forceparams[type].pdihs.mult;
+ +
+ +            /* At the end fill the arrays with identical entries */
+ +            if (iu + nfa1 < nbonds)
+ +            {
+ +                iu += nfa1;
+ +            }
+ +        }
+ +
+ +        /* Caclulate UNROLL dihedral angles at once */
+ +        dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+ +                       dr,
+ +                       &phi_S,
+ +                       &mx_S, &my_S, &mz_S,
+ +                       &nx_S, &ny_S, &nz_S,
+ +                       &nrkj_m2_S,
+ +                       &nrkj_n2_S,
+ +                       p, q);
+ +
+ +        cp_S     = gmx_load_pr(cp);
+ +        phi0_S   = gmx_load_pr(phi0);
+ +        mult_S   = gmx_load_pr(mult);
+ +
+ +        mdphi_S  = gmx_sub_pr(gmx_mul_pr(mult_S, phi_S), phi0_S);
+ +
+ +        /* Calculate UNROLL sines at once */
+ +        gmx_sincos_pr(mdphi_S, &sin_S, &cos_S);
+ +        mddphi_S = gmx_mul_pr(gmx_mul_pr(cp_S, mult_S), sin_S);
+ +        sf_i_S   = gmx_mul_pr(mddphi_S, nrkj_m2_S);
+ +        msf_l_S  = gmx_mul_pr(mddphi_S, nrkj_n2_S);
+ +
+ +        /* After this m?_S will contain f[i] */
+ +        mx_S     = gmx_mul_pr(sf_i_S, mx_S);
+ +        my_S     = gmx_mul_pr(sf_i_S, my_S);
+ +        mz_S     = gmx_mul_pr(sf_i_S, mz_S);
+ +
+ +        /* After this m?_S will contain -f[l] */
+ +        nx_S     = gmx_mul_pr(msf_l_S, nx_S);
+ +        ny_S     = gmx_mul_pr(msf_l_S, ny_S);
+ +        nz_S     = gmx_mul_pr(msf_l_S, nz_S);
+ +
+ +        gmx_store_pr(dr + 0*UNROLL, mx_S);
+ +        gmx_store_pr(dr + 1*UNROLL, my_S);
+ +        gmx_store_pr(dr + 2*UNROLL, mz_S);
+ +        gmx_store_pr(dr + 3*UNROLL, nx_S);
+ +        gmx_store_pr(dr + 4*UNROLL, ny_S);
+ +        gmx_store_pr(dr + 5*UNROLL, nz_S);
+ +
+ +        iu = i;
+ +        s  = 0;
+ +        do
+ +        {
+ +            do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+ +                                        p[s], q[s],
+ +                                        dr[     XX *UNROLL+s],
+ +                                        dr[     YY *UNROLL+s],
+ +                                        dr[     ZZ *UNROLL+s],
+ +                                        dr[(DIM+XX)*UNROLL+s],
+ +                                        dr[(DIM+YY)*UNROLL+s],
+ +                                        dr[(DIM+ZZ)*UNROLL+s],
+ +                                        f);
+ +            s++;
+ +            iu += nfa1;
+ +        }
+ +        while (s < UNROLL && iu < nbonds);
+ +    }
+ +#undef UNROLL
+ +}
+ +
+ +#endif /* SIMD_BONDEDS */
+ +
+ +
+ +real idihs(int nbonds,
+ +           const t_iatom forceatoms[], const t_iparams forceparams[],
+ +           const rvec x[], rvec f[], rvec fshift[],
+ +           const t_pbc *pbc, const t_graph *g,
+ +           real lambda, real *dvdlambda,
+ +           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +           int gmx_unused *global_atom_index)
+ +{
+ +    int  i, type, ai, aj, ak, al;
+ +    int  t1, t2, t3;
+ +    real phi, phi0, dphi0, ddphi, sign, vtot;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +    real L1, kk, dp, dp2, kA, kB, pA, pB, dvdl_term;
+ +
+ +    L1        = 1.0-lambda;
+ +    dvdl_term = 0;
+ +    vtot      = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &sign, &t1, &t2, &t3);  /*  84                */
+ +
+ +        /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
+ +         * force changes if we just apply a normal harmonic.
+ +         * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
+ +         * This means we will never have the periodicity problem, unless
+ +         * the dihedral is Pi away from phiO, which is very unlikely due to
+ +         * the potential.
+ +         */
+ +        kA = forceparams[type].harmonic.krA;
+ +        kB = forceparams[type].harmonic.krB;
+ +        pA = forceparams[type].harmonic.rA;
+ +        pB = forceparams[type].harmonic.rB;
+ +
+ +        kk    = L1*kA + lambda*kB;
+ +        phi0  = (L1*pA + lambda*pB)*DEG2RAD;
+ +        dphi0 = (pB - pA)*DEG2RAD;
+ +
+ +        dp = phi-phi0;
+ +
+ +        make_dp_periodic(&dp);
+ +
+ +        dp2 = dp*dp;
+ +
+ +        vtot += 0.5*kk*dp2;
+ +        ddphi = -kk*dp;
+ +
+ +        dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;
+ +
+ +        do_dih_fup(ai, aj, ak, al, (real)(-ddphi), r_ij, r_kj, r_kl, m, n,
+ +                   f, fshift, pbc, g, x, t1, t2, t3); /* 112          */
+ +        /* 218 TOTAL  */
+ +#ifdef DEBUG
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "idih: (%d,%d,%d,%d) phi=%g\n",
+ +                    ai, aj, ak, al, phi);
+ +        }
+ +#endif
+ +    }
+ +
+ +    *dvdlambda += dvdl_term;
+ +    return vtot;
+ +}
+ +
+ +
+ +/*! \brief returns dx, rdist, and dpdl for functions posres() and fbposres()
+ + *
+ + * For every dimension a reference position is built from pos0A/pos0B
+ + * (and comA_sc/comB_sc for erscCOM), interpolated linearly in lambda.
+ + * It is split into ref and rdist[m] depending on refcoord_scaling.
+ + * dx is the distance from ref+rdist to x, computed with pbc_dx when pbc is
+ + * set, and dpdl holds the B minus A difference of the reference.
+ + */
+ +static void posres_dx(const rvec x, const rvec pos0A, const rvec pos0B,
+ +                      const rvec comA_sc, const rvec comB_sc,
+ +                      real lambda,
+ +                      t_pbc *pbc, int refcoord_scaling, int npbcdim,
+ +                      rvec dx, rvec rdist, rvec dpdl)
+ +{
+ +    int  m, row;
+ +    real refA, refB, ref = 0.;
+ +    real one_min_lambda  = 1.0-lambda;
+ +    rvec target;
+ +
+ +    for (m = 0; m < DIM; m++)
+ +    {
+ +        refA = pos0A[m];
+ +        refB = pos0B[m];
+ +        if (m >= npbcdim)
+ +        {
+ +            /* Dimension without pbc */
+ +            ref      = one_min_lambda*refA + lambda*refB;
+ +            rdist[m] = 0;
+ +            dpdl[m]  = refB - refA;
+ +        }
+ +        else if (refcoord_scaling == erscNO)
+ +        {
+ +            ref      = 0;
+ +            rdist[m] = one_min_lambda*refA + lambda*refB;
+ +            dpdl[m]  = refB - refA;
+ +        }
+ +        else if (refcoord_scaling == erscALL)
+ +        {
+ +            /* Box relative coordinates are stored for dimensions with pbc */
+ +            refA *= pbc->box[m][m];
+ +            refB *= pbc->box[m][m];
+ +            for (row = m+1; row < npbcdim; row++)
+ +            {
+ +                refA += pos0A[row]*pbc->box[row][m];
+ +                refB += pos0B[row]*pbc->box[row][m];
+ +            }
+ +            ref      = one_min_lambda*refA + lambda*refB;
+ +            rdist[m] = 0;
+ +            dpdl[m]  = refB - refA;
+ +        }
+ +        else if (refcoord_scaling == erscCOM)
+ +        {
+ +            ref      = one_min_lambda*comA_sc[m] + lambda*comB_sc[m];
+ +            rdist[m] = one_min_lambda*refA       + lambda*refB;
+ +            dpdl[m]  = comB_sc[m] - comA_sc[m] + refB - refA;
+ +        }
+ +        else
+ +        {
+ +            gmx_fatal(FARGS, "No such scaling method implemented");
+ +        }
+ +
+ +        /* We do pbc_dx with ref+rdist,
+ +         * since with only ref we can be up to half a box vector wrong.
+ +         */
+ +        target[m] = ref + rdist[m];
+ +    }
+ +
+ +    if (!pbc)
+ +    {
+ +        rvec_sub(x, target, dx);
+ +    }
+ +    else
+ +    {
+ +        pbc_dx(pbc, x, target, dx);
+ +    }
+ +}
+ +
+ +/*! \brief Flat-bottomed position restraints.
+ + *
+ + * Walks over the (type, atom) pairs in forceatoms[], obtains the displacement
+ + * dx of each atom from its reference position through posres_dx() and applies
+ + * a harmonic penalty 0.5*k*(distance - r)^2 outside the flat region of size r.
+ + * The geometry (sphere, cylinder in the x-y plane, or a layer along one axis)
+ + * comes from the parameters; a negative r inverts the restraint, i.e. the
+ + * atom is pushed out of the region instead of being kept inside it.
+ + *
+ + * Forces are added to f[], the diagonal virial contribution is accumulated
+ + * in vir_diag.  Returns the total flat-bottomed potential. */
+ +real fbposres(int nbonds,
+ +              const t_iatom forceatoms[], const t_iparams forceparams[],
+ +              const rvec x[], rvec f[], rvec vir_diag,
+ +              t_pbc *pbc,
+ +              int refcoord_scaling, int ePBC, rvec com)
+ +{
+ +    int              i, ai, m, d, type, npbcdim, fbdim;
+ +    const t_iparams *pr;
+ +    real             vtot, kk, v;
+ +    real             dr, dr2, rfb, rfb2, fact, invdr;
+ +    rvec             com_sc, rdist, dx, dpdl, fm;
+ +    gmx_bool         bInvert;
+ +
+ +    npbcdim = ePBC2npbcdim(ePBC);
+ +
+ +    if (refcoord_scaling == erscCOM)
+ +    {
+ +        /* Multiply com by the (triangular) box matrix */
+ +        clear_rvec(com_sc);
+ +        for (m = 0; m < npbcdim; m++)
+ +        {
+ +            for (d = m; d < npbcdim; d++)
+ +            {
+ +                com_sc[m] += com[d]*pbc->box[d][m];
+ +            }
+ +        }
+ +    }
+ +
+ +    vtot = 0.0;
+ +    /* Each entry in forceatoms[] is a (type, atom) pair */
+ +    for (i = 0; i < nbonds; i += 2)
+ +    {
+ +        type = forceatoms[i];
+ +        ai   = forceatoms[i+1];
+ +        pr   = &forceparams[type];
+ +
+ +        /* Same displacement as for normal posres, with identical A and B
+ +         * states and lambda == 0.
+ +         */
+ +        posres_dx(x[ai], pr->fbposres.pos0, pr->fbposres.pos0,
+ +                  com_sc, com_sc, 0.0,
+ +                  pbc, refcoord_scaling, npbcdim,
+ +                  dx, rdist, dpdl);
+ +
+ +        v = 0.0;
+ +        clear_rvec(fm);
+ +
+ +        kk   = pr->fbposres.k;
+ +        rfb  = pr->fbposres.r;
+ +        rfb2 = sqr(rfb);
+ +
+ +        /* A negative radius requests the inverted restraint:
+ +         * the particle is pushed out of the sphere/cylinder/layer.
+ +         */
+ +        bInvert = (rfb < 0.) ? TRUE : FALSE;
+ +        if (bInvert)
+ +        {
+ +            rfb = -rfb;
+ +        }
+ +
+ +        switch (pr->fbposres.geom)
+ +        {
+ +            case efbposresSPHERE:
+ +                /* Spherical flat-bottom restraint */
+ +                dr2 = norm2(dx);
+ +                if (dr2 > 0.0 && (bInvert ? (dr2 < rfb2) : (dr2 > rfb2)))
+ +                {
+ +                    dr   = sqrt(dr2);
+ +                    v    = 0.5*kk*sqr(dr - rfb);
+ +                    fact = -kk*(dr-rfb)/dr; /* Force points to the center pos0 */
+ +                    svmul(fact, dx, fm);
+ +                }
+ +                break;
+ +            case efbposresCYLINDER:
+ +                /* Cylindrical flat-bottom restraint in the x-y plane, fm[ZZ] stays 0 */
+ +                dr2 = sqr(dx[XX])+sqr(dx[YY]);
+ +                if (dr2 > 0.0 && (bInvert ? (dr2 < rfb2) : (dr2 > rfb2)))
+ +                {
+ +                    dr     = sqrt(dr2);
+ +                    invdr  = 1./dr;
+ +                    v      = 0.5*kk*sqr(dr - rfb);
+ +                    fm[XX] = -kk*(dr-rfb)*dx[XX]*invdr; /* Force points to the center */
+ +                    fm[YY] = -kk*(dr-rfb)*dx[YY]*invdr;
+ +                }
+ +                break;
+ +            case efbposresX:
+ +            case efbposresY:
+ +            case efbposresZ:
+ +                /* 1D flat-bottom potential along XX, YY or ZZ */
+ +                fbdim = pr->fbposres.geom - efbposresX;
+ +                dr    = dx[fbdim];
+ +                if (bInvert ? (0 < dr && dr < rfb) : (dr > rfb))
+ +                {
+ +                    /* Positive side of the layer */
+ +                    v         = 0.5*kk*sqr(dr - rfb);
+ +                    fm[fbdim] = -kk*(dr - rfb);
+ +                }
+ +                else if (bInvert ? ((-rfb) < dr && dr < 0) : (dr < (-rfb)))
+ +                {
+ +                    /* Negative side of the layer */
+ +                    v         = 0.5*kk*sqr(dr + rfb);
+ +                    fm[fbdim] = -kk*(dr + rfb);
+ +                }
+ +                break;
+ +        }
+ +
+ +        vtot += v;
+ +
+ +        for (m = 0; m < DIM; m++)
+ +        {
+ +            f[ai][m]    += fm[m];
+ +            /* Here we correct for the pbc_dx which included rdist */
+ +            vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm[m];
+ +        }
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +
+ +/*! \brief Harmonic position restraints with free-energy perturbation.
+ + *
+ + * For every (type, atom) pair in forceatoms[] the displacement dx of the atom
+ + * from its lambda-interpolated reference position is obtained from
+ + * posres_dx().  Per dimension the potential 0.5*kk*dx^2 is accumulated, with
+ + * kk interpolated linearly between fcA and fcB.
+ + *
+ + * f and vir_diag may both be NULL; then only the energy and dV/dlambda are
+ + * evaluated.  The derivative with respect to lambda is added to *dvdlambda.
+ + * Returns the total restraint potential. */
+ +real posres(int nbonds,
+ +            const t_iatom forceatoms[], const t_iparams forceparams[],
+ +            const rvec x[], rvec f[], rvec vir_diag,
+ +            t_pbc *pbc,
+ +            real lambda, real *dvdlambda,
+ +            int refcoord_scaling, int ePBC, rvec comA, rvec comB)
+ +{
+ +    int              i, ai, m, d, type, npbcdim = 0;
+ +    const t_iparams *pr;
+ +    real             L1;
+ +    real             vtot, kk, fm;
+ +    rvec             comA_sc, comB_sc, rdist, dpdl, dx;
+ +    gmx_bool         bForceValid = TRUE;
+ +
+ +    if ((f == NULL) || (vir_diag == NULL))    /* should both be null together! */
+ +    {
+ +        bForceValid = FALSE;
+ +    }
+ +
+ +    npbcdim = ePBC2npbcdim(ePBC);
+ +
+ +    if (refcoord_scaling == erscCOM)
+ +    {
+ +        /* Multiply both centers of mass by the (triangular) box matrix */
+ +        clear_rvec(comA_sc);
+ +        clear_rvec(comB_sc);
+ +        for (m = 0; m < npbcdim; m++)
+ +        {
+ +            for (d = m; d < npbcdim; d++)
+ +            {
+ +                comA_sc[m] += comA[d]*pbc->box[d][m];
+ +                comB_sc[m] += comB[d]*pbc->box[d][m];
+ +            }
+ +        }
+ +    }
+ +
+ +    L1 = 1.0 - lambda;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        pr   = &forceparams[type];
+ +
+ +        /* return dx, rdist, and dpdl */
+ +        posres_dx(x[ai], forceparams[type].posres.pos0A, forceparams[type].posres.pos0B,
+ +                  comA_sc, comB_sc, lambda,
+ +                  pbc, refcoord_scaling, npbcdim,
+ +                  dx, rdist, dpdl);
+ +
+ +        for (m = 0; (m < DIM); m++)
+ +        {
+ +            kk          = L1*pr->posres.fcA[m] + lambda*pr->posres.fcB[m];
+ +            fm          = -kk*dx[m];
+ +            vtot       += 0.5*kk*dx[m]*dx[m];
+ +            /* dx = x - pos(lambda) and dpdl = d pos/d lambda, hence
+ +             * d(dx)/d lambda = -dpdl and the reference-position part of
+ +             * dV/dlambda is kk*dx*(-dpdl) = fm*dpdl.
+ +             */
+ +            *dvdlambda +=
+ +                0.5*(pr->posres.fcB[m] - pr->posres.fcA[m])*dx[m]*dx[m]
+ +                + fm*dpdl[m];
+ +
+ +            /* Here we correct for the pbc_dx which included rdist */
+ +            if (bForceValid)
+ +            {
+ +                f[ai][m]    += fm;
+ +                vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm;
+ +            }
+ +        }
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +/* Shared kernel for angle restraints.
+ + *
+ + * The restrained angle is the one between the vector ai->aj and either the
+ + * vector ak->al (bZAxis == FALSE, four atoms per entry) or the fixed z-axis
+ + * (bZAxis == TRUE, two atoms per entry).  Energy and dV/dphi come from
+ + * dopdihs_min() using the pdihs parameters.  Forces are added to f[] and
+ + * fshift[], the lambda derivative to *dvdlambda; the summed energy is returned.
+ + */
+ +static real low_angres(int nbonds,
+ +                       const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                       const rvec x[], rvec f[], rvec fshift[],
+ +                       const t_pbc *pbc, const t_graph *g,
+ +                       real lambda, real *dvdlambda,
+ +                       gmx_bool bZAxis)
+ +{
+ +    int              i, m, type, ai, aj, ak, al;
+ +    int              t1, t2;
+ +    real             phi, cos_phi, cos_phi2, vid, vtot, dVdphi;
+ +    rvec             r_ij, r_kl, f_i, f_k = {0, 0, 0};
+ +    real             st, sth, nrij2, nrkl2, c, cij, ckl;
+ +    ivec             dt;
+ +    const t_iparams *ip;
+ +
+ +    /* Initialized only to silence compiler warnings; with bZAxis set
+ +     * these are never read.
+ +     */
+ +    t2 = 0;
+ +    ak = 0;
+ +    al = 0;
+ +
+ +    vtot = 0.0;
+ +    i    = 0;
+ +    while (i < nbonds)
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ip   = &forceparams[type];
+ +        t1   = pbc_rvec_sub(pbc, x[aj], x[ai], r_ij);
+ +        if (bZAxis)
+ +        {
+ +            /* The second vector is the unit vector along z */
+ +            r_kl[XX] = 0;
+ +            r_kl[YY] = 0;
+ +            r_kl[ZZ] = 1;
+ +        }
+ +        else
+ +        {
+ +            ak = forceatoms[i++];
+ +            al = forceatoms[i++];
+ +            t2 = pbc_rvec_sub(pbc, x[al], x[ak], r_kl);
+ +        }
+ +
+ +        cos_phi = cos_angle(r_ij, r_kl);
+ +        phi     = acos(cos_phi);
+ +
+ +        *dvdlambda += dopdihs_min(ip->pdihs.cpA,
+ +                                  ip->pdihs.cpB,
+ +                                  ip->pdihs.phiA,
+ +                                  ip->pdihs.phiB,
+ +                                  ip->pdihs.mult,
+ +                                  phi, lambda, &vid, &dVdphi);
+ +
+ +        vtot += vid;
+ +
+ +        cos_phi2 = sqr(cos_phi);
+ +        if (!(cos_phi2 < 1))
+ +        {
+ +            /* No force is applied for (anti)parallel vectors */
+ +            continue;
+ +        }
+ +
+ +        st    = -dVdphi*gmx_invsqrt(1 - cos_phi2);
+ +        sth   = st*cos_phi;
+ +        nrij2 = iprod(r_ij, r_ij);
+ +        nrkl2 = iprod(r_kl, r_kl);
+ +
+ +        c   = st*gmx_invsqrt(nrij2*nrkl2);
+ +        cij = sth/nrij2;
+ +        ckl = sth/nrkl2;
+ +
+ +        for (m = 0; m < DIM; m++)
+ +        {
+ +            f_i[m]    = (c*r_kl[m]-cij*r_ij[m]);
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] -= f_i[m];
+ +            if (!bZAxis)
+ +            {
+ +                f_k[m]    = (c*r_ij[m]-ckl*r_kl[m]);
+ +                f[ak][m] += f_k[m];
+ +                f[al][m] -= f_k[m];
+ +            }
+ +        }
+ +
+ +        /* Shift forces; with a graph the shift index comes from the graph */
+ +        if (g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +            t1 = IVEC2IS(dt);
+ +        }
+ +        rvec_inc(fshift[t1], f_i);
+ +        rvec_dec(fshift[CENTRAL], f_i);
+ +        if (!bZAxis)
+ +        {
+ +            if (g)
+ +            {
+ +                ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, al), dt);
+ +                t2 = IVEC2IS(dt);
+ +            }
+ +            rvec_inc(fshift[t2], f_k);
+ +            rvec_dec(fshift[CENTRAL], f_k);
+ +        }
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +/* Angle restraint between the vectors of two atom pairs;
+ + * forwards to low_angres() with the z-axis mode switched off.
+ + */
+ +real angres(int nbonds,
+ +            const t_iatom forceatoms[], const t_iparams forceparams[],
+ +            const rvec x[], rvec f[], rvec fshift[],
+ +            const t_pbc *pbc, const t_graph *g,
+ +            real lambda, real *dvdlambda,
+ +            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +            int gmx_unused *global_atom_index)
+ +{
+ +    const gmx_bool bZAxis = FALSE;
+ +
+ +    return low_angres(nbonds, forceatoms, forceparams,
+ +                      x, f, fshift,
+ +                      pbc, g,
+ +                      lambda, dvdlambda,
+ +                      bZAxis);
+ +}
+ +
+ +/* Angle restraint between the vector of one atom pair and the z-axis;
+ + * forwards to low_angres() with the z-axis mode switched on.
+ + */
+ +real angresz(int nbonds,
+ +             const t_iatom forceatoms[], const t_iparams forceparams[],
+ +             const rvec x[], rvec f[], rvec fshift[],
+ +             const t_pbc *pbc, const t_graph *g,
+ +             real lambda, real *dvdlambda,
+ +             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +             int gmx_unused *global_atom_index)
+ +{
+ +    const gmx_bool bZAxis = TRUE;
+ +
+ +    return low_angres(nbonds, forceatoms, forceparams,
+ +                      x, f, fshift,
+ +                      pbc, g,
+ +                      lambda, dvdlambda,
+ +                      bZAxis);
+ +}
+ +
+ +/* Dihedral restraints with a flat bottom.
+ + *
+ + * Each entry of forceatoms[] holds a type and four atoms.  The reference
+ + * angle phi0, the half-width dphi of the flat region (both given in degrees
+ + * in the parameters) and the force constant kfac are interpolated linearly
+ + * in lambda.  Outside [phi0-dphi, phi0+dphi] a harmonic penalty in the
+ + * excess deviation is applied.  Forces go through do_dih_fup(), the lambda
+ + * derivative is added to *dvdlambda and the summed energy is returned.
+ + */
+ +real dihres(int nbonds,
+ +            const t_iatom forceatoms[], const t_iparams forceparams[],
+ +            const rvec x[], rvec f[], rvec fshift[],
+ +            const t_pbc *pbc, const t_graph *g,
+ +            real lambda, real *dvdlambda,
+ +            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +            int gmx_unused  *global_atom_index)
+ +{
+ +    const t_iparams *ip;
+ +    real             vtot;
+ +    int              ai, aj, ak, al, i, k, t1, t2, t3;
+ +    real             phi0A, phi0B, dphiA, dphiB, kfacA, kfacB, phi0, dphi, kfac;
+ +    real             phi, ddphi, ddp, ddp2, dp, sign, d2r, L1;
+ +    rvec             r_ij, r_kj, r_kl, m, n;
+ +
+ +    L1   = 1.0-lambda;
+ +    d2r  = DEG2RAD;
+ +    vtot = 0;
+ +    k    = 0;
+ +
+ +    /* Five integers per restraint: type and four atom indices */
+ +    for (i = 0; i < nbonds; i += 5)
+ +    {
+ +        ip = &forceparams[forceatoms[i]];
+ +        ai = forceatoms[i+1];
+ +        aj = forceatoms[i+2];
+ +        ak = forceatoms[i+3];
+ +        al = forceatoms[i+4];
+ +
+ +        /* State A and B parameters, angles converted to radians */
+ +        phi0A = ip->dihres.phiA*d2r;
+ +        dphiA = ip->dihres.dphiA*d2r;
+ +        kfacA = ip->dihres.kfacA;
+ +
+ +        phi0B = ip->dihres.phiB*d2r;
+ +        dphiB = ip->dihres.dphiB*d2r;
+ +        kfacB = ip->dihres.kfacB;
+ +
+ +        phi0 = L1*phi0A + lambda*phi0B;
+ +        dphi = L1*dphiA + lambda*dphiB;
+ +        kfac = L1*kfacA + lambda*kfacB;
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &sign, &t1, &t2, &t3);
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "dihres[%d]: %d %d %d %d : phi=%f, dphi=%f, kfac=%f\n",
+ +                    k++, ai, aj, ak, al, phi0, dphi, kfac);
+ +        }
+ +
+ +        /* phi can jump if phi0 is close to Pi/-Pi, which would cause huge
+ +         * force changes with a plain harmonic.  Therefore phi-phi0 is
+ +         * computed first and mapped into (-Pi,Pi).  The periodicity problem
+ +         * then only occurs when the dihedral is Pi away from phi0, which is
+ +         * very unlikely due to the potential.
+ +         */
+ +        dp = phi-phi0;
+ +        make_dp_periodic(&dp);
+ +
+ +        /* Deviation in excess of the flat region, zero inside it */
+ +        ddp = 0;
+ +        if (dp > dphi)
+ +        {
+ +            ddp = dp-dphi;
+ +        }
+ +        else if (dp < -dphi)
+ +        {
+ +            ddp = dp+dphi;
+ +        }
+ +
+ +        if (ddp == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        ddp2  = ddp*ddp;
+ +        vtot += 0.5*kfac*ddp2;
+ +        ddphi = kfac*ddp;
+ +
+ +        *dvdlambda += 0.5*(kfacB - kfacA)*ddp2;
+ +        /* lambda dependence from changing restraint distances */
+ +        if (ddp > 0)
+ +        {
+ +            *dvdlambda -= kfac*ddp*((dphiB - dphiA)+(phi0B - phi0A));
+ +        }
+ +        else if (ddp < 0)
+ +        {
+ +            *dvdlambda += kfac*ddp*((dphiB - dphiA)-(phi0B - phi0A));
+ +        }
+ +        do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ +                   f, fshift, pbc, g, x, t1, t2, t3);
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +
+ +/* Placeholder with the common bonded-function signature: ignores all of its
+ + * arguments and reports through gmx_impl() that the function is missing.
+ + */
+ +real unimplemented(int gmx_unused nbonds,
+ +                   const t_iatom gmx_unused forceatoms[],
+ +                   const t_iparams gmx_unused forceparams[],
+ +                   const rvec gmx_unused x[],
+ +                   rvec gmx_unused f[],
+ +                   rvec gmx_unused fshift[],
+ +                   const t_pbc gmx_unused *pbc,
+ +                   const t_graph  gmx_unused *g,
+ +                   real gmx_unused lambda,
+ +                   real gmx_unused *dvdlambda,
+ +                   const t_mdatoms  gmx_unused *md,
+ +                   t_fcdata gmx_unused *fcd,
+ +                   int gmx_unused *global_atom_index)
+ +{
+ +    const real vdummy = 0.0;
+ +
+ +    gmx_impl("*** you are using a not implemented function");
+ +
+ +    /* Only here to give the compiler a return value */
+ +    return vdummy;
+ +}
+ +
+ +/* Ryckaert-Bellemans dihedrals.
+ + *
+ + * The energy of each dihedral is the polynomial sum_j parm[j]*cos(phi)^j for
+ + * j = 0..5, evaluated with phi shifted by Pi (polymer convention) and with
+ + * the coefficients interpolated linearly in lambda.  Forces are applied
+ + * through do_dih_fup(), the lambda derivative is added to *dvdlambda and the
+ + * summed energy is returned.
+ + */
+ +real rbdihs(int nbonds,
+ +            const t_iatom forceatoms[], const t_iparams forceparams[],
+ +            const rvec x[], rvec f[], rvec fshift[],
+ +            const t_pbc *pbc, const t_graph *g,
+ +            real lambda, real *dvdlambda,
+ +            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +            int gmx_unused *global_atom_index)
+ +{
+ +    const real c0 = 0.0, c1 = 1.0;
+ +    int        type, ai, aj, ak, al, i, j;
+ +    int        t1, t2, t3;
+ +    rvec       r_ij, r_kj, r_kl, m, n;
+ +    real       parmA[NR_RBDIHS];
+ +    real       parmB[NR_RBDIHS];
+ +    real       parm[NR_RBDIHS];
+ +    real       cos_phi, phi, rbp, rbpBA;
+ +    real       v, sign, ddphi, sin_phi;
+ +    real       cosfac, vtot;
+ +    real       L1        = 1.0-lambda;
+ +    real       dvdl_term = 0;
+ +
+ +    vtot = 0.0;
+ +    /* Five integers per dihedral: type and four atom indices */
+ +    for (i = 0; i < nbonds; i += 5)
+ +    {
+ +        type = forceatoms[i];
+ +        ai   = forceatoms[i+1];
+ +        aj   = forceatoms[i+2];
+ +        ak   = forceatoms[i+3];
+ +        al   = forceatoms[i+4];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &sign, &t1, &t2, &t3);
+ +
+ +        /* Change to polymer convention: shift phi by Pi towards zero */
+ +        phi += (phi < c0) ? M_PI : -M_PI;
+ +
+ +        cos_phi = cos(phi);
+ +        /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
+ +        sin_phi = sin(phi);
+ +
+ +        for (j = 0; j < NR_RBDIHS; j++)
+ +        {
+ +            parmA[j] = forceparams[type].rbdihs.rbcA[j];
+ +            parmB[j] = forceparams[type].rbdihs.rbcB[j];
+ +            parm[j]  = L1*parmA[j]+lambda*parmB[j];
+ +        }
+ +
+ +        /* Constant term of the series */
+ +        v          = parm[0];
+ +        dvdl_term += (parmB[0]-parmA[0]);
+ +        ddphi      = c0;
+ +        cosfac     = c1;
+ +
+ +        /* Coefficients 1..5 multiply cos^1..cos^5.  In each pass cosfac
+ +         * holds cos^(j-1) for the derivative and cos^j for the energy.
+ +         */
+ +        for (j = 1; j <= 5; j++)
+ +        {
+ +            rbp        = parm[j];
+ +            rbpBA      = parmB[j]-parmA[j];
+ +            ddphi     += ((real)j)*rbp*cosfac;
+ +            cosfac    *= cos_phi;
+ +            v         += cosfac*rbp;
+ +            dvdl_term += cosfac*rbpBA;
+ +        }
+ +
+ +        /* Chain rule: d cos(phi)/d phi = -sin(phi) */
+ +        ddphi = -ddphi*sin_phi;
+ +
+ +        do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ +                   f, fshift, pbc, g, x, t1, t2, t3);
+ +        vtot += v;
+ +    }
+ +    *dvdlambda += dvdl_term;
+ +
+ +    return vtot;
+ +}
+ +
+ +/* Maps a grid index onto the periodic CMAP grid and finds its neighbours.
+ + *
+ + * ip           index along one grid dimension; values up to one period
+ + *              outside [0, grid_spacing) are wrapped back into range
+ + * grid_spacing number of grid points along the dimension (expected > 0)
+ + * ipm1         out: index of the previous grid point, with wrap-around
+ + * ipp1         out: index of the next grid point, with wrap-around
+ + * ipp2         out: index of the point after the next, with wrap-around
+ + *
+ + * Returns the wrapped value of ip.
+ + */
+ +int cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
+ +{
+ +    int im1, ip1, ip2;
+ +
+ +    /* Periodic wrap: -1 is the last grid point and grid_spacing is the
+ +     * first one.  ip == grid_spacing must be wrapped as well, otherwise the
+ +     * neighbour indices below end up outside the grid.
+ +     */
+ +    if (ip < 0)
+ +    {
+ +        ip = ip + grid_spacing;
+ +    }
+ +    else if (ip >= grid_spacing)
+ +    {
+ +        ip = ip - grid_spacing;
+ +    }
+ +
+ +    im1 = ip - 1;
+ +    ip1 = ip + 1;
+ +    ip2 = ip + 2;
+ +
+ +    /* Wrap the neighbours at both ends of the grid */
+ +    if (ip == 0)
+ +    {
+ +        im1 = grid_spacing - 1;
+ +    }
+ +    else if (ip == grid_spacing-2)
+ +    {
+ +        ip2 = 0;
+ +    }
+ +    else if (ip == grid_spacing-1)
+ +    {
+ +        ip1 = 0;
+ +        ip2 = 1;
+ +    }
+ +
+ +    *ipm1 = im1;
+ +    *ipp1 = ip1;
+ +    *ipp2 = ip2;
+ +
+ +    return ip;
+ +
+ +}
+ +
+ +/* CMAP correction for two coupled torsions.
+ + *
+ + * Each entry of forceatoms[] holds a type and five atoms; atoms 1-4 form the
+ + * first torsion and atoms 2-5 the second.  Both angles are located on the
+ + * grid of the CMAP type, the 16 coefficients of a bicubic patch are built
+ + * from the four surrounding grid points (value, both first derivatives and
+ + * the cross derivative) and the patch is evaluated for the energy and its
+ + * derivatives with respect to both angles.  Forces are added to f[] and
+ + * fshift[]; lambda and dvdlambda are not used.  Returns the summed energy.
+ + */
+ +real cmap_dihs(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const gmx_cmap_t *cmap_grid,
+ +               const rvec x[], rvec f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph *g,
+ +               real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +               int  gmx_unused *global_atom_index)
+ +{
+ +    int         i, j, k, n, idx;
+ +    int         ai, aj, ak, al, am;
+ +    int         a1i, a1j, a1k, a1l, a2i, a2j, a2k, a2l;
+ +    int         type, cmapA;
+ +    int         t11, t21, t31, t12, t22, t32;
+ +    int         iphi1, ip1m1, ip1p1, ip1p2;
+ +    int         iphi2, ip2m1, ip2p1, ip2p2;
+ +    int         l1, l2, l3;
+ +    int         pos1, pos2, pos3, pos4, tmp;
+ +
+ +    real        ty[4], ty1[4], ty2[4], ty12[4], tc[16], tx[16];
+ +    real        phi1, cos_phi1, sin_phi1, sign1, xphi1;
+ +    real        phi2, cos_phi2, sin_phi2, sign2, xphi2;
+ +    real        dx, xx, tt, tu, e, df1, df2, ddf1, ddf2, ddf12, vtot;
+ +    real        ra21, rb21, rg21, rg1, rgr1, ra2r1, rb2r1, rabr1;
+ +    real        ra22, rb22, rg22, rg2, rgr2, ra2r2, rb2r2, rabr2;
+ +    real        fg1, hg1, fga1, hgb1, gaa1, gbb1;
+ +    real        fg2, hg2, fga2, hgb2, gaa2, gbb2;
+ +    real        fac;
+ +
+ +    rvec        r1_ij, r1_kj, r1_kl, m1, n1;
+ +    rvec        r2_ij, r2_kj, r2_kl, m2, n2;
+ +    rvec        f1_i, f1_j, f1_k, f1_l;
+ +    rvec        f2_i, f2_j, f2_k, f2_l;
+ +    rvec        a1, b1, a2, b2;
+ +    rvec        f1, g1, h1, f2, g2, h2;
+ +    rvec        dtf1, dtg1, dth1, dtf2, dtg2, dth2;
+ +    ivec        jt1, dt1_ij, dt1_kj, dt1_lj;
+ +    ivec        jt2, dt2_ij, dt2_kj, dt2_lj;
+ +
+ +    const real *cmapd;
+ +
+ +    /* Transposed indices into the 4x4 coefficient array tc[] */
+ +    int         loop_index[4][4] = {
+ +        {0, 4, 8, 12},
+ +        {1, 5, 9, 13},
+ +        {2, 6, 10, 14},
+ +        {3, 7, 11, 15}
+ +    };
+ +
+ +    /* Total CMAP energy */
+ +    vtot = 0;
+ +
+ +    for (n = 0; n < nbonds; )
+ +    {
+ +        /* Five atoms are involved in the two torsions */
+ +        type   = forceatoms[n++];
+ +        ai     = forceatoms[n++];
+ +        aj     = forceatoms[n++];
+ +        ak     = forceatoms[n++];
+ +        al     = forceatoms[n++];
+ +        am     = forceatoms[n++];
+ +
+ +        /* Which CMAP type is this */
+ +        cmapA = forceparams[type].cmap.cmapA;
+ +        cmapd = cmap_grid->cmapdata[cmapA].cmap;
+ +
+ +        /* First torsion */
+ +        a1i   = ai;
+ +        a1j   = aj;
+ +        a1k   = ak;
+ +        a1l   = al;
+ +
+ +        phi1  = dih_angle(x[a1i], x[a1j], x[a1k], x[a1l], pbc, r1_ij, r1_kj, r1_kl, m1, n1,
+ +                          &sign1, &t11, &t21, &t31);  /* 84 */
+ +
+ +        cos_phi1 = cos(phi1);
+ +
+ +        /* a1 = r1_ij x r1_kj */
+ +        a1[0] = r1_ij[1]*r1_kj[2]-r1_ij[2]*r1_kj[1];
+ +        a1[1] = r1_ij[2]*r1_kj[0]-r1_ij[0]*r1_kj[2];
+ +        a1[2] = r1_ij[0]*r1_kj[1]-r1_ij[1]*r1_kj[0]; /* 9 */
+ +
+ +        /* b1 = r1_kl x r1_kj */
+ +        b1[0] = r1_kl[1]*r1_kj[2]-r1_kl[2]*r1_kj[1];
+ +        b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
+ +        b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
+ +
+ +        tmp = pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
+ +
+ +        ra21  = iprod(a1, a1);       /* 5 */
+ +        rb21  = iprod(b1, b1);       /* 5 */
+ +        rg21  = iprod(r1_kj, r1_kj); /* 5 */
+ +        rg1   = sqrt(rg21);
+ +
+ +        rgr1  = 1.0/rg1;
+ +        ra2r1 = 1.0/ra21;
+ +        rb2r1 = 1.0/rb21;
+ +        rabr1 = sqrt(ra2r1*rb2r1);
+ +
+ +        sin_phi1 = rg1 * rabr1 * iprod(a1, h1) * (-1);
+ +
+ +        /* Recompute the angle from whichever of sin/cos is better conditioned */
+ +        if (cos_phi1 < -0.5 || cos_phi1 > 0.5)
+ +        {
+ +            phi1 = asin(sin_phi1);
+ +
+ +            if (cos_phi1 < 0)
+ +            {
+ +                if (phi1 > 0)
+ +                {
+ +                    phi1 = M_PI - phi1;
+ +                }
+ +                else
+ +                {
+ +                    phi1 = -M_PI - phi1;
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            phi1 = acos(cos_phi1);
+ +
+ +            if (sin_phi1 < 0)
+ +            {
+ +                phi1 = -phi1;
+ +            }
+ +        }
+ +
+ +        xphi1 = phi1 + M_PI; /* 1 */
+ +
+ +        /* Second torsion */
+ +        a2i   = aj;
+ +        a2j   = ak;
+ +        a2k   = al;
+ +        a2l   = am;
+ +
+ +        phi2  = dih_angle(x[a2i], x[a2j], x[a2k], x[a2l], pbc, r2_ij, r2_kj, r2_kl, m2, n2,
+ +                          &sign2, &t12, &t22, &t32); /* 84 */
+ +
+ +        cos_phi2 = cos(phi2);
+ +
+ +        /* a2 = r2_ij x r2_kj */
+ +        a2[0] = r2_ij[1]*r2_kj[2]-r2_ij[2]*r2_kj[1];
+ +        a2[1] = r2_ij[2]*r2_kj[0]-r2_ij[0]*r2_kj[2];
+ +        a2[2] = r2_ij[0]*r2_kj[1]-r2_ij[1]*r2_kj[0]; /* 9 */
+ +
+ +        /* b2 = r2_kl x r2_kj */
+ +        b2[0] = r2_kl[1]*r2_kj[2]-r2_kl[2]*r2_kj[1];
+ +        b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
+ +        b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
+ +
+ +        tmp = pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
+ +
+ +        ra22  = iprod(a2, a2);         /* 5 */
+ +        rb22  = iprod(b2, b2);         /* 5 */
+ +        rg22  = iprod(r2_kj, r2_kj);   /* 5 */
+ +        rg2   = sqrt(rg22);
+ +
+ +        rgr2  = 1.0/rg2;
+ +        ra2r2 = 1.0/ra22;
+ +        rb2r2 = 1.0/rb22;
+ +        rabr2 = sqrt(ra2r2*rb2r2);
+ +
+ +        sin_phi2 = rg2 * rabr2 * iprod(a2, h2) * (-1);
+ +
+ +        if (cos_phi2 < -0.5 || cos_phi2 > 0.5)
+ +        {
+ +            phi2 = asin(sin_phi2);
+ +
+ +            if (cos_phi2 < 0)
+ +            {
+ +                if (phi2 > 0)
+ +                {
+ +                    phi2 = M_PI - phi2;
+ +                }
+ +                else
+ +                {
+ +                    phi2 = -M_PI - phi2;
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            phi2 = acos(cos_phi2);
+ +
+ +            if (sin_phi2 < 0)
+ +            {
+ +                phi2 = -phi2;
+ +            }
+ +        }
+ +
+ +        xphi2 = phi2 + M_PI; /* 1 */
+ +
+ +        /* Range mangling: bring both shifted angles into [0, 2*Pi) */
+ +        if (xphi1 < 0)
+ +        {
+ +            xphi1 = xphi1 + 2*M_PI;
+ +        }
+ +        else if (xphi1 >= 2*M_PI)
+ +        {
+ +            xphi1 = xphi1 - 2*M_PI;
+ +        }
+ +
+ +        if (xphi2 < 0)
+ +        {
+ +            xphi2 = xphi2 + 2*M_PI;
+ +        }
+ +        else if (xphi2 >= 2*M_PI)
+ +        {
+ +            xphi2 = xphi2 - 2*M_PI;
+ +        }
+ +
+ +        /* Width of one grid cell in radians */
+ +        dx = 2*M_PI / cmap_grid->grid_spacing;
+ +
+ +        /* Where on the grid are we */
+ +        iphi1 = (int)(xphi1/dx);
+ +        iphi2 = (int)(xphi2/dx);
+ +
+ +        iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1, &ip1p1, &ip1p2);
+ +        iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1, &ip2p1, &ip2p2);
+ +
+ +        /* The four corners of the grid cell, counter-clockwise */
+ +        pos1    = iphi1*cmap_grid->grid_spacing+iphi2;
+ +        pos2    = ip1p1*cmap_grid->grid_spacing+iphi2;
+ +        pos3    = ip1p1*cmap_grid->grid_spacing+ip2p1;
+ +        pos4    = iphi1*cmap_grid->grid_spacing+ip2p1;
+ +
+ +        /* Four values are stored per grid point; they are used below as the
+ +         * function value, the two first derivatives and the cross derivative.
+ +         */
+ +        ty[0]   = cmapd[pos1*4];
+ +        ty[1]   = cmapd[pos2*4];
+ +        ty[2]   = cmapd[pos3*4];
+ +        ty[3]   = cmapd[pos4*4];
+ +
+ +        ty1[0]   = cmapd[pos1*4+1];
+ +        ty1[1]   = cmapd[pos2*4+1];
+ +        ty1[2]   = cmapd[pos3*4+1];
+ +        ty1[3]   = cmapd[pos4*4+1];
+ +
+ +        ty2[0]   = cmapd[pos1*4+2];
+ +        ty2[1]   = cmapd[pos2*4+2];
+ +        ty2[2]   = cmapd[pos3*4+2];
+ +        ty2[3]   = cmapd[pos4*4+2];
+ +
+ +        ty12[0]   = cmapd[pos1*4+3];
+ +        ty12[1]   = cmapd[pos2*4+3];
+ +        ty12[2]   = cmapd[pos3*4+3];
+ +        ty12[3]   = cmapd[pos4*4+3];
+ +
+ +        /* Switch to degrees */
+ +        dx    = 360.0 / cmap_grid->grid_spacing;
+ +        xphi1 = xphi1 * RAD2DEG;
+ +        xphi2 = xphi2 * RAD2DEG;
+ +
+ +        for (i = 0; i < 4; i++) /* 16 */
+ +        {
+ +            tx[i]    = ty[i];
+ +            tx[i+4]  = ty1[i]*dx;
+ +            tx[i+8]  = ty2[i]*dx;
+ +            tx[i+12] = ty12[i]*dx*dx;
+ +        }
+ +
+ +        /* Coefficients of the bicubic patch: tc = cmap_coeff_matrix^T * tx */
+ +        idx = 0;
+ +        for (i = 0; i < 4; i++) /* 1056 */
+ +        {
+ +            for (j = 0; j < 4; j++)
+ +            {
+ +                xx = 0;
+ +                for (k = 0; k < 16; k++)
+ +                {
+ +                    xx = xx + cmap_coeff_matrix[k*16+idx]*tx[k];
+ +                }
+ +
+ +                idx++;
+ +                tc[i*4+j] = xx;
+ +            }
+ +        }
+ +
+ +        /* Fractional position inside the grid cell */
+ +        tt    = (xphi1-iphi1*dx)/dx;
+ +        tu    = (xphi2-iphi2*dx)/dx;
+ +
+ +        e     = 0;
+ +        df1   = 0;
+ +        df2   = 0;
+ +        ddf1  = 0;
+ +        ddf2  = 0;
+ +        ddf12 = 0;
+ +
+ +        /* Horner evaluation of the patch and its derivatives */
+ +        for (i = 3; i >= 0; i--)
+ +        {
+ +            l1 = loop_index[i][3];
+ +            l2 = loop_index[i][2];
+ +            l3 = loop_index[i][1];
+ +
+ +            e     = tt * e    + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
+ +            df1   = tu * df1  + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
+ +            df2   = tt * df2  + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
+ +            ddf1  = tu * ddf1 + 2.0*3.0*tc[l1]*tt+2.0*tc[l2];
+ +            ddf2  = tt * ddf2 + 2.0*3.0*tc[4*i+3]*tu+2.0*tc[4*i+2];
+ +        }
+ +
+ +        ddf12 = tc[5] + 2.0*tc[9]*tt + 3.0*tc[13]*tt*tt + 2.0*tu*(tc[6]+2.0*tc[10]*tt+3.0*tc[14]*tt*tt) +
+ +            3.0*tu*tu*(tc[7]+2.0*tc[11]*tt+3.0*tc[15]*tt*tt);
+ +
+ +        /* Convert derivatives from per grid cell to per radian */
+ +        fac     = RAD2DEG/dx;
+ +        df1     = df1   * fac;
+ +        df2     = df2   * fac;
+ +        ddf1    = ddf1  * fac * fac;
+ +        ddf2    = ddf2  * fac * fac;
+ +        ddf12   = ddf12 * fac * fac;
+ +
+ +        /* CMAP energy */
+ +        vtot += e;
+ +
+ +        /* Do forces - first torsion */
+ +        fg1       = iprod(r1_ij, r1_kj);
+ +        hg1       = iprod(r1_kl, r1_kj);
+ +        fga1      = fg1*ra2r1*rgr1;
+ +        hgb1      = hg1*rb2r1*rgr1;
+ +        gaa1      = -ra2r1*rg1;
+ +        gbb1      = rb2r1*rg1;
+ +
+ +        for (i = 0; i < DIM; i++)
+ +        {
+ +            dtf1[i]   = gaa1 * a1[i];
+ +            dtg1[i]   = fga1 * a1[i] - hgb1 * b1[i];
+ +            dth1[i]   = gbb1 * b1[i];
+ +
+ +            f1[i]     = df1  * dtf1[i];
+ +            g1[i]     = df1  * dtg1[i];
+ +            h1[i]     = df1  * dth1[i];
+ +
+ +            f1_i[i]   =  f1[i];
+ +            f1_j[i]   = -f1[i] - g1[i];
+ +            f1_k[i]   =  h1[i] + g1[i];
+ +            f1_l[i]   = -h1[i];
+ +
+ +            f[a1i][i] = f[a1i][i] + f1_i[i]; /* f1[i] */
+ +            f[a1j][i] = f[a1j][i] + f1_j[i]; /* - f1[i] - g1[i] */
+ +            f[a1k][i] = f[a1k][i] + f1_k[i]; /* h1[i] + g1[i] */
+ +            f[a1l][i] = f[a1l][i] + f1_l[i]; /* - h1[i] */
+ +        }
+ +
+ +        /* Do forces - second torsion */
+ +        fg2       = iprod(r2_ij, r2_kj);
+ +        hg2       = iprod(r2_kl, r2_kj);
+ +        fga2      = fg2*ra2r2*rgr2;
+ +        hgb2      = hg2*rb2r2*rgr2;
+ +        gaa2      = -ra2r2*rg2;
+ +        gbb2      = rb2r2*rg2;
+ +
+ +        for (i = 0; i < DIM; i++)
+ +        {
+ +            dtf2[i]   = gaa2 * a2[i];
+ +            dtg2[i]   = fga2 * a2[i] - hgb2 * b2[i];
+ +            dth2[i]   = gbb2 * b2[i];
+ +
+ +            f2[i]     = df2  * dtf2[i];
+ +            g2[i]     = df2  * dtg2[i];
+ +            h2[i]     = df2  * dth2[i];
+ +
+ +            f2_i[i]   =  f2[i];
+ +            f2_j[i]   = -f2[i] - g2[i];
+ +            f2_k[i]   =  h2[i] + g2[i];
+ +            f2_l[i]   = -h2[i];
+ +
+ +            f[a2i][i] = f[a2i][i] + f2_i[i]; /* f2[i] */
+ +            f[a2j][i] = f[a2j][i] + f2_j[i]; /* - f2[i] - g2[i] */
+ +            f[a2k][i] = f[a2k][i] + f2_k[i]; /* h2[i] + g2[i] */
+ +            f[a2l][i] = f[a2l][i] + f2_l[i]; /* - h2[i] */
+ +        }
+ +
+ +        /* Shift forces: determine the shift index of atoms i, k and l
+ +         * relative to atom j of the same torsion.
+ +         */
+ +        if (g)
+ +        {
+ +            copy_ivec(SHIFT_IVEC(g, a1j), jt1);
+ +            ivec_sub(SHIFT_IVEC(g, a1i),  jt1, dt1_ij);
+ +            ivec_sub(SHIFT_IVEC(g, a1k),  jt1, dt1_kj);
+ +            ivec_sub(SHIFT_IVEC(g, a1l),  jt1, dt1_lj);
+ +            t11 = IVEC2IS(dt1_ij);
+ +            t21 = IVEC2IS(dt1_kj);
+ +            t31 = IVEC2IS(dt1_lj);
+ +
+ +            copy_ivec(SHIFT_IVEC(g, a2j), jt2);
+ +            ivec_sub(SHIFT_IVEC(g, a2i),  jt2, dt2_ij);
+ +            ivec_sub(SHIFT_IVEC(g, a2k),  jt2, dt2_kj);
+ +            ivec_sub(SHIFT_IVEC(g, a2l),  jt2, dt2_lj);
+ +            t12 = IVEC2IS(dt2_ij);
+ +            t22 = IVEC2IS(dt2_kj);
+ +            t32 = IVEC2IS(dt2_lj);
+ +        }
+ +        else if (pbc)
+ +        {
+ +            /* h1 and h2 are only scratch space here, the forces were applied above */
+ +            t31 = pbc_rvec_sub(pbc, x[a1l], x[a1j], h1);
+ +            t32 = pbc_rvec_sub(pbc, x[a2l], x[a2j], h2);
+ +        }
+ +        else
+ +        {
+ +            t31 = CENTRAL;
+ +            t32 = CENTRAL;
+ +        }
+ +
+ +        rvec_inc(fshift[t11], f1_i);
+ +        rvec_inc(fshift[CENTRAL], f1_j);
+ +        rvec_inc(fshift[t21], f1_k);
+ +        rvec_inc(fshift[t31], f1_l);
+ +
+ +        /* Atom i of the second torsion uses the second torsion's own i-j
+ +         * shift index t12, not t21 which belongs to the first torsion.
+ +         */
+ +        rvec_inc(fshift[t12], f2_i);
+ +        rvec_inc(fshift[CENTRAL], f2_j);
+ +        rvec_inc(fshift[t22], f2_k);
+ +        rvec_inc(fshift[t32], f2_l);
+ +    }
+ +    return vtot;
+ +}
+ +
+ +
+ +
+ +/***********************************************************
+ + *
+ + *   G R O M O S  9 6   F U N C T I O N S
+ + *
+ + ***********************************************************/
+ +/* Harmonic term in the generic coordinate x, with the force constant and the
+ + * reference value linearly interpolated between states A and B by lambda.
+ + *
+ + * On return *V holds 0.5*k*(x-x0)^2 and *F holds -k*(x-x0); the return value
+ + * is the derivative of that energy with respect to lambda.
+ + * The callers in this file pass the squared bond length (g96bonds) or the
+ + * cosine of the angle (g96angles) as x.
+ + */
+ +real g96harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ +                 real *V, real *F)
+ +{
+ +    const real half = 0.5;
+ +    real       wA, k_mix, x_ref, delta, delta_sq;
+ +    real       dvdl;
+ +
+ +    /* State A carries weight (1-lambda), state B carries weight lambda */
+ +    wA    = 1.0-lambda;
+ +    k_mix = wA*kA+lambda*kB;
+ +    x_ref = wA*xA+lambda*xB;
+ +
+ +    /* Deviation from the interpolated reference value */
+ +    delta    = x-x_ref;
+ +    delta_sq = delta*delta;
+ +
+ +    dvdl = half*(kB-kA)*delta_sq + (xA-xB)*k_mix*delta;
+ +
+ +    *F = -k_mix*delta;
+ +    *V = half*k_mix*delta_sq;
+ +
+ +    return dvdl;
+ +
+ +    /* That was 21 flops */
+ +}
+ +
+ +real g96bonds(int nbonds,
+ +              const t_iatom forceatoms[], const t_iparams forceparams[],
+ +              const rvec x[], rvec f[], rvec fshift[],
+ +              const t_pbc *pbc, const t_graph *g,
+ +              real lambda, real *dvdlambda,
+ +              const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +              int gmx_unused *global_atom_index)
+ +{
+ +    int  i, m, ki, ai, aj, type;
+ +    real dr2, fbond, vbond, fij, vtot;
+ +    rvec dx;
+ +    ivec dt;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +
+ +        *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ +                                  forceparams[type].harmonic.krB,
+ +                                  forceparams[type].harmonic.rA,
+ +                                  forceparams[type].harmonic.rB,
+ +                                  dr2, lambda, &vbond, &fbond);
+ +
+ +        vtot  += 0.5*vbond;                         /* 1*/
+ +#ifdef DEBUG
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "G96-BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
+ +                    sqrt(dr2), vbond, fbond);
+ +        }
+ +#endif
+ +
+ +        if (g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +            ki = IVEC2IS(dt);
+ +        }
+ +        for (m = 0; (m < DIM); m++)     /*  15                */
+ +        {
+ +            fij                 = fbond*dx[m];
+ +            f[ai][m]           += fij;
+ +            f[aj][m]           -= fij;
+ +            fshift[ki][m]      += fij;
+ +            fshift[CENTRAL][m] -= fij;
+ +        }
+ +    }               /* 44 TOTAL       */
+ +    return vtot;
+ +}
+ +
+ +/* Fills r_ij and r_kj with the vectors computed by pbc_rvec_sub for the
+ + * pairs (xi, xj) and (xk, xj), stores the values returned by pbc_rvec_sub in
+ + * *t1 and *t2, and returns the COSINE of the angle between the bonds i-j and
+ + * j-k (the value of cos_angle), not the angle itself.
+ + */
+ +real g96bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ +                   rvec r_ij, rvec r_kj,
+ +                   int *t1, int *t2)
+ +{
+ +    *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /*  3              */
+ +    *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /*  3              */
+ +
+ +    /* 25 for cos_angle, 41 TOTAL */
+ +    return cos_angle(r_ij, r_kj);
+ +}
+ +
+ +real g96angles(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph *g,
+ +               real lambda, real *dvdlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +               int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ai, aj, ak, type, m, t1, t2;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, dVdt, va, vtot;
+ +    real rij_1, rij_2, rkj_1, rkj_2, rijrkj_1;
+ +    rvec f_i, f_j, f_k;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        cos_theta  = g96bond_angle(x[ai], x[aj], x[ak], pbc, r_ij, r_kj, &t1, &t2);
+ +
+ +        *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ +                                  forceparams[type].harmonic.krB,
+ +                                  forceparams[type].harmonic.rA,
+ +                                  forceparams[type].harmonic.rB,
+ +                                  cos_theta, lambda, &va, &dVdt);
+ +        vtot    += va;
+ +
+ +        rij_1    = gmx_invsqrt(iprod(r_ij, r_ij));
+ +        rkj_1    = gmx_invsqrt(iprod(r_kj, r_kj));
+ +        rij_2    = rij_1*rij_1;
+ +        rkj_2    = rkj_1*rkj_1;
+ +        rijrkj_1 = rij_1*rkj_1;                 /* 23 */
+ +
+ +#ifdef DEBUG
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "G96ANGLES: costheta = %10g  vth = %10g  dV/dct = %10g\n",
+ +                    cos_theta, va, dVdt);
+ +        }
+ +#endif
+ +        for (m = 0; (m < DIM); m++)     /*  42        */
+ +        {
+ +            f_i[m]    = dVdt*(r_kj[m]*rijrkj_1 - r_ij[m]*rij_2*cos_theta);
+ +            f_k[m]    = dVdt*(r_ij[m]*rijrkj_1 - r_kj[m]*rkj_2*cos_theta);
+ +            f_j[m]    = -f_i[m]-f_k[m];
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] += f_j[m];
+ +            f[ak][m] += f_k[m];
+ +        }
+ +
+ +        if (g)
+ +        {
+ +            copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +            t1 = IVEC2IS(dt_ij);
+ +            t2 = IVEC2IS(dt_kj);
+ +        }
+ +        rvec_inc(fshift[t1], f_i);
+ +        rvec_inc(fshift[CENTRAL], f_j);
+ +        rvec_inc(fshift[t2], f_k);          /* 9 */
+ +        /* 163 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +/* Bond-bond cross term: V = krr*(|r_ij| - r1e)*(|r_kj| - r2e).
+ + *
+ + * forceatoms holds nbonds entries laid out as (type, i, j, k) quadruples with
+ + * j the shared atom. Forces are added to f and to the matching fshift
+ + * entries; lambda and dvdlambda are not used. Returns the summed energy.
+ + */
+ +real cross_bond_bond(int nbonds,
+ +                     const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                     const rvec x[], rvec f[], rvec fshift[],
+ +                     const t_pbc *pbc, const t_graph *g,
+ +                     real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused  *fcd,
+ +                     int gmx_unused *global_atom_index)
+ +{
+ +    /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
+ +     * pp. 842-847
+ +     */
+ +    int  idx, d, ai, aj, ak, type, si, sk;
+ +    rvec rij, rkj;
+ +    real vsum, epot, dev_ij, dev_kj, len_ij, len_kj, ref_ij, ref_kj, kcross;
+ +    rvec fi, fj, fk;
+ +    ivec shift_j, dshift_ij, dshift_kj;
+ +
+ +    vsum = 0.0;
+ +    idx  = 0;
+ +    while (idx < nbonds)
+ +    {
+ +        /* Unpack one (type, i, j, k) entry and its parameters */
+ +        type   = forceatoms[idx];
+ +        ai     = forceatoms[idx+1];
+ +        aj     = forceatoms[idx+2];
+ +        ak     = forceatoms[idx+3];
+ +        idx   += 4;
+ +        ref_ij = forceparams[type].cross_bb.r1e;
+ +        ref_kj = forceparams[type].cross_bb.r2e;
+ +        kcross = forceparams[type].cross_bb.krr;
+ +
+ +        /* Distance vectors from the shared atom j and their lengths */
+ +        si     = pbc_rvec_sub(pbc, x[ai], x[aj], rij);
+ +        sk     = pbc_rvec_sub(pbc, x[ak], x[aj], rkj);
+ +        len_ij = norm(rij);
+ +        len_kj = norm(rkj);
+ +
+ +        /* Deviations of both bond lengths from their reference values */
+ +        dev_ij = len_ij-ref_ij;
+ +        dev_kj = len_kj-ref_kj;
+ +
+ +        /* The energy is a product of deviations, so it can be negative */
+ +        epot  = kcross*dev_ij*dev_kj;
+ +        vsum += epot;
+ +
+ +        /* Each outer atom is pushed along its own bond, scaled by the
+ +         * deviation of the other bond
+ +         */
+ +        svmul(-kcross*dev_kj/len_ij, rij, fi);
+ +        svmul(-kcross*dev_ij/len_kj, rkj, fk);
+ +
+ +        for (d = 0; d < DIM; d++)       /*  12        */
+ +        {
+ +            fj[d]     = -fi[d] - fk[d];
+ +            f[ai][d] += fi[d];
+ +            f[aj][d] += fj[d];
+ +            f[ak][d] += fk[d];
+ +        }
+ +
+ +        /* Shift forces for the virial */
+ +        if (g)
+ +        {
+ +            copy_ivec(SHIFT_IVEC(g, aj), shift_j);
+ +
+ +            ivec_sub(SHIFT_IVEC(g, ai), shift_j, dshift_ij);
+ +            ivec_sub(SHIFT_IVEC(g, ak), shift_j, dshift_kj);
+ +            si = IVEC2IS(dshift_ij);
+ +            sk = IVEC2IS(dshift_kj);
+ +        }
+ +        rvec_inc(fshift[si], fi);
+ +        rvec_inc(fshift[CENTRAL], fj);
+ +        rvec_inc(fshift[sk], fk);           /* 9 */
+ +        /* 163 TOTAL  */
+ +    }
+ +    return vsum;
+ +}
+ +
+ +/* Bond-angle cross term:
+ + * V = krt*(|r_ik| - r3e)*((|r_ij| - r1e) + (|r_kj| - r2e)).
+ + *
+ + * forceatoms holds nbonds entries laid out as (type, i, j, k) quadruples with
+ + * j the central atom. Forces are added to f and to the matching fshift
+ + * entries; lambda and dvdlambda are not used. Returns the summed energy.
+ + */
+ +real cross_bond_angle(int nbonds,
+ +                      const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                      const rvec x[], rvec f[], rvec fshift[],
+ +                      const t_pbc *pbc, const t_graph *g,
+ +                      real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                      const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                      int gmx_unused *global_atom_index)
+ +{
+ +    /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
+ +     * pp. 842-847
+ +     */
+ +    int  idx, d, ai, aj, ak, type, si, sk;
+ +    rvec rij, rkj, rik;
+ +    real vsum, epot, dev_ij, dev_kj, dev_ik, len_ij, len_kj, len_ik;
+ +    real ref_ij, ref_kj, ref_ik, kcross, c_ij, c_kj, c_ik;
+ +    rvec fi, fj, fk;
+ +    ivec shift_j, dshift_ij, dshift_kj;
+ +
+ +    vsum = 0.0;
+ +    idx  = 0;
+ +    while (idx < nbonds)
+ +    {
+ +        /* Unpack one (type, i, j, k) entry and its parameters */
+ +        type   = forceatoms[idx];
+ +        ai     = forceatoms[idx+1];
+ +        aj     = forceatoms[idx+2];
+ +        ak     = forceatoms[idx+3];
+ +        idx   += 4;
+ +        ref_ij = forceparams[type].cross_ba.r1e;
+ +        ref_kj = forceparams[type].cross_ba.r2e;
+ +        ref_ik = forceparams[type].cross_ba.r3e;
+ +        kcross = forceparams[type].cross_ba.krt;
+ +
+ +        /* Distance vectors; the return value for the i-k pair is not needed */
+ +        si = pbc_rvec_sub(pbc, x[ai], x[aj], rij);
+ +        sk = pbc_rvec_sub(pbc, x[ak], x[aj], rkj);
+ +        pbc_rvec_sub(pbc, x[ai], x[ak], rik);
+ +
+ +        /* Lengths of the three vectors */
+ +        len_ij = norm(rij);
+ +        len_kj = norm(rkj);
+ +        len_ik = norm(rik);
+ +
+ +        /* Deviations from the reference lengths */
+ +        dev_ij = len_ij-ref_ij;
+ +        dev_kj = len_kj-ref_kj;
+ +        dev_ik = len_ik-ref_ik;
+ +
+ +        /* The energy is a product of deviations, so it can be negative */
+ +        epot  = kcross*dev_ik*(dev_ij+dev_kj);
+ +        vsum += epot;
+ +
+ +        /* Scalar prefactors of the force along each distance vector */
+ +        c_ij = -kcross*(dev_ik/len_ij);
+ +        c_kj = -kcross*(dev_ik/len_kj);
+ +        c_ik = -kcross*(dev_ij+dev_kj)/len_ik;
+ +
+ +        for (d = 0; d < DIM; d++)       /*  12        */
+ +        {
+ +            fi[d] = c_ij*rij[d] + c_ik*rik[d];
+ +            fk[d] = c_kj*rkj[d] - c_ik*rik[d];
+ +            fj[d] = -fi[d] - fk[d];
+ +
+ +            f[ai][d] += fi[d];
+ +            f[aj][d] += fj[d];
+ +            f[ak][d] += fk[d];
+ +        }
+ +
+ +        /* Shift forces for the virial */
+ +        if (g)
+ +        {
+ +            copy_ivec(SHIFT_IVEC(g, aj), shift_j);
+ +
+ +            ivec_sub(SHIFT_IVEC(g, ai), shift_j, dshift_ij);
+ +            ivec_sub(SHIFT_IVEC(g, ak), shift_j, dshift_kj);
+ +            si = IVEC2IS(dshift_ij);
+ +            sk = IVEC2IS(dshift_kj);
+ +        }
+ +        rvec_inc(fshift[si], fi);
+ +        rvec_inc(fshift[CENTRAL], fj);
+ +        rvec_inc(fshift[sk], fk);           /* 9 */
+ +        /* 163 TOTAL  */
+ +    }
+ +    return vsum;
+ +}
+ +
+ +/* Evaluates a tabulated bonded interaction at coordinate r.
+ + *
+ + * The table stores four coefficients per point; r is scaled by table->scale,
+ + * the integer part selects the table point and the fractional part eps is
+ + * used for the polynomial interpolation. The result is multiplied by the
+ + * force constant interpolated between kA and kB with lambda.
+ + *
+ + * Writes the energy to *V and the (negative) derivative to *F and returns the
+ + * derivative of the energy with respect to lambda. Calls gmx_fatal when the
+ + * table index is at or beyond table->n; type and table_nr are only used in
+ + * that error message.
+ + */
+ +static real bonded_tab(const char *type, int table_nr,
+ +                       const bondedtable_t *table, real kA, real kB, real r,
+ +                       real lambda, real *V, real *F)
+ +{
+ +    const real *coef;
+ +    real        kmix, scale, rscaled, eps, eps2;
+ +    real        Yt, Ft, Geps, Heps2, Fp, VV, FF;
+ +    int         ipoint;
+ +
+ +    kmix = (1.0 - lambda)*kA + lambda*kB;
+ +
+ +    scale   = table->scale;
+ +    rscaled = r*scale;
+ +    /* Truncation towards zero gives the table point */
+ +    ipoint  = (int)rscaled;
+ +    if (ipoint >= table->n)
+ +    {
+ +        gmx_fatal(FARGS, "A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
+ +                  type, table_nr, r, ipoint, ipoint+1, table->n);
+ +    }
+ +
+ +    /* Four consecutive coefficients belong to this table point */
+ +    coef  = table->data + 4*ipoint;
+ +    eps   = rscaled - ipoint;
+ +    eps2  = eps*eps;
+ +    Yt    = coef[0];
+ +    Ft    = coef[1];
+ +    Geps  = coef[2]*eps;
+ +    Heps2 = coef[3]*eps2;
+ +    Fp    = Ft + Geps + Heps2;
+ +    VV    = Yt + Fp*eps;
+ +    FF    = Fp + Geps + 2.0*Heps2;
+ +
+ +    *F = -kmix*FF*scale;
+ +    *V = kmix*VV;
+ +
+ +    return (kB - kA)*VV;
+ +
+ +    /* That was 22 flops */
+ +}
+ +
+ +/* Tabulated bonds.
+ + *
+ + * forceatoms holds nbonds entries laid out as (type, atom i, atom j) triples.
+ + * For every triple the bond length is looked up in the bond table selected by
+ + * forceparams[type].tab.table (taken from fcd->bondtab); the energy is summed
+ + * and the force is applied along the distance vector to f and fshift.
+ + * *dvdlambda is incremented with the value returned by bonded_tab.
+ + * Returns the summed energy.
+ + */
+ +real tab_bonds(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph *g,
+ +               real lambda, real *dvdlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ +               int gmx_unused  *global_atom_index)
+ +{
+ +    int  i, m, ki, ai, aj, type, table;
+ +    real dr, dr2, fbond, vbond, fij, vtot;
+ +    rvec dx;
+ +    ivec dt;
+ +
+ +    vtot = 0.0;
+ +    /* i is advanced inside the body, one (type, ai, aj) triple at a time */
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +        /* Bond length, obtained as dr2/sqrt(dr2) */
+ +        dr   = dr2*gmx_invsqrt(dr2);                /*  10            */
+ +
+ +        table = forceparams[type].tab.table;
+ +
+ +        /* NOTE(review): the table lookup (and the *dvdlambda update) happens
+ +         * before the dr2 == 0.0 check below, so for a zero-length bond it is
+ +         * evaluated with dr derived from gmx_invsqrt(0) - confirm that this
+ +         * ordering is intended.
+ +         */
+ +        *dvdlambda += bonded_tab("bond", table,
+ +                                 &fcd->bondtab[table],
+ +                                 forceparams[type].tab.kA,
+ +                                 forceparams[type].tab.kB,
+ +                                 dr, lambda, &vbond, &fbond); /*  22 */
+ +
+ +        /* Zero-length bonds contribute neither energy nor force */
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +
+ +        vtot  += vbond;            /* 1*/
+ +        /* Divide by the bond length so that fbond*dx is the force vector */
+ +        fbond *= gmx_invsqrt(dr2); /*   6             */
+ +#ifdef DEBUG
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "TABBONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
+ +                    dr, vbond, fbond);
+ +        }
+ +#endif
+ +        /* With a graph, the shift index comes from the graph shift vectors */
+ +        if (g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +            ki = IVEC2IS(dt);
+ +        }
+ +        for (m = 0; (m < DIM); m++)     /*  15                */
+ +        {
+ +            fij                 = fbond*dx[m];
+ +            f[ai][m]           += fij;
+ +            f[aj][m]           -= fij;
+ +            fshift[ki][m]      += fij;
+ +            fshift[CENTRAL][m] -= fij;
+ +        }
+ +    }               /* 62 TOTAL       */
+ +    return vtot;
+ +}
+ +
+ +real tab_angles(int nbonds,
+ +                const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                const rvec x[], rvec f[], rvec fshift[],
+ +                const t_pbc *pbc, const t_graph *g,
+ +                real lambda, real *dvdlambda,
+ +                const t_mdatoms gmx_unused  *md, t_fcdata *fcd,
+ +                int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ai, aj, ak, t1, t2, type, table;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2); /*  41         */
+ +
+ +        table = forceparams[type].tab.table;
+ +
+ +        *dvdlambda += bonded_tab("angle", table,
+ +                                 &fcd->angletab[table],
+ +                                 forceparams[type].tab.kA,
+ +                                 forceparams[type].tab.kB,
+ +                                 theta, lambda, &va, &dVdt); /*  22  */
+ +        vtot += va;
+ +
+ +        cos_theta2 = sqr(cos_theta);            /*   1                */
+ +        if (cos_theta2 < 1)
+ +        {
+ +            int  m;
+ +            real snt, st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st  = dVdt*gmx_invsqrt(1 - cos_theta2); /*  12            */
+ +            sth = st*cos_theta;                     /*   1            */
+ +#ifdef DEBUG
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
+ +                        theta*RAD2DEG, va, dVdt);
+ +            }
+ +#endif
+ +            nrkj2 = iprod(r_kj, r_kj);  /*   5                */
+ +            nrij2 = iprod(r_ij, r_ij);
+ +
+ +            cik = st*gmx_invsqrt(nrkj2*nrij2); /*  12         */
+ +            cii = sth/nrij2;                   /*  10         */
+ +            ckk = sth/nrkj2;                   /*  10         */
+ +
+ +            for (m = 0; (m < DIM); m++)        /*  39         */
+ +            {
+ +                f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
+ +                f_j[m]    = -f_i[m]-f_k[m];
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +        }                                       /* 169 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +real tab_dihs(int nbonds,
+ +              const t_iatom forceatoms[], const t_iparams forceparams[],
+ +              const rvec x[], rvec f[], rvec fshift[],
+ +              const t_pbc *pbc, const t_graph *g,
+ +              real lambda, real *dvdlambda,
+ +              const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ +              int gmx_unused *global_atom_index)
+ +{
+ +    int  i, type, ai, aj, ak, al, table;
+ +    int  t1, t2, t3;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +    real phi, sign, ddphi, vpd, vtot;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &sign, &t1, &t2, &t3);  /*  84  */
+ +
+ +        table = forceparams[type].tab.table;
+ +
+ +        /* Hopefully phi+M_PI never results in values < 0 */
+ +        *dvdlambda += bonded_tab("dihedral", table,
+ +                                 &fcd->dihtab[table],
+ +                                 forceparams[type].tab.kA,
+ +                                 forceparams[type].tab.kB,
+ +                                 phi+M_PI, lambda, &vpd, &ddphi);
+ +
+ +        vtot += vpd;
+ +        do_dih_fup(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
+ +                   f, fshift, pbc, g, x, t1, t2, t3); /* 112  */
+ +
+ +#ifdef DEBUG
+ +        fprintf(debug, "pdih: (%d,%d,%d,%d) phi=%g\n",
+ +                ai, aj, ak, al, phi);
+ +#endif
+ +    } /* 227 TOTAL  */
+ +
+ +    return vtot;
+ +}
+ +
+ +/* Returns a bit mask of the force-buffer blocks that thread t (out of nt)
+ + * touches in its share of the bonded interactions.
+ + *
+ + * Bit (atom >> shift) is set for every atom of every interaction assigned to
+ + * thread t. Only interaction types flagged IF_BOND are considered, excluding
+ + * F_CONNBONDS, F_POSRES and the range F_GB12..F_GB14.
+ + */
+ +static unsigned
+ +calc_bonded_reduction_mask(const t_idef *idef,
+ +                           int shift,
+ +                           int t, int nt)
+ +{
+ +    unsigned       blocks;
+ +    int            ftype, nentries, stride, ninter, first, last, e, a;
+ +    const t_iatom *ia;
+ +
+ +    blocks = 0;
+ +
+ +    for (ftype = 0; ftype < F_NRE; ftype++)
+ +    {
+ +        if (!(interaction_function[ftype].flags & IF_BOND))
+ +        {
+ +            continue;
+ +        }
+ +        if (ftype == F_CONNBONDS || ftype == F_POSRES)
+ +        {
+ +            continue;
+ +        }
+ +        if (ftype >= F_GB12 && ftype <= F_GB14)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        nentries = idef->il[ftype].nr;
+ +        if (nentries <= 0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        /* One list entry is the parameter type followed by the atoms */
+ +        stride = interaction_function[ftype].nratoms + 1;
+ +        ninter = nentries/stride;
+ +
+ +        /* Divide this interaction equally over the threads.
+ +         * This is not stored: should match division in calc_bonds.
+ +         */
+ +        first = ((ninter* t   )/nt)*stride;
+ +        last  = ((ninter*(t+1))/nt)*stride;
+ +
+ +        ia = idef->il[ftype].iatoms;
+ +        for (e = first; e < last; e += stride)
+ +        {
+ +            /* Skip slot 0 (the type) and flag the block of each atom */
+ +            for (a = 1; a < stride; a++)
+ +            {
+ +                blocks |= (1U << (ia[e+a]>>shift));
+ +            }
+ +        }
+ +    }
+ +
+ +    return blocks;
+ +}
+ +
+ +/* Sets up the block-wise reduction of the per-thread bonded force buffers.
+ + *
+ + * With at most one thread fr->red_nblock is set to 0 and nothing else is
+ + * done. Otherwise this chooses fr->red_ashift so that the force array fits in
+ + * at most MAX_BLOCK_BITS blocks of 2^red_ashift atoms, computes the block
+ + * mask of each thread t >= 1 and stores in fr->red_nblock the highest block
+ + * number in use plus one.
+ + */
+ +void init_bonded_thread_force_reduction(t_forcerec   *fr,
+ +                                        const t_idef *idef)
+ +{
+ +/* Number of bits in the per-thread block mask */
+ +#define MAX_BLOCK_BITS 32
+ +    int t;
+ +    int ctot, c, b;
+ +
+ +    if (fr->nthreads <= 1)
+ +    {
+ +        fr->red_nblock = 0;
+ +
+ +        return;
+ +    }
+ +
+ +    /* We divide the force array in a maximum of 32 blocks.
+ +     * Minimum force block reduction size is 2^6=64.
+ +     */
+ +    fr->red_ashift = 6;
+ +    /* Double the block size until all atoms fit in MAX_BLOCK_BITS blocks */
+ +    while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<<fr->red_ashift)))
+ +    {
+ +        fr->red_ashift++;
+ +    }
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "bonded force buffer block atom shift %d bits\n",
+ +                fr->red_ashift);
+ +    }
+ +
+ +    /* Determine to which blocks each thread's bonded force calculation
+ +     * contributes. Store this as a mask for each thread.
+ +     * NOTE(review): the loop starts at t = 1, so fr->f_t[0].red_mask is not
+ +     * written here although the counting loop below reads it - presumably it
+ +     * is zero-initialized elsewhere; confirm.
+ +     */
+ +#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
+ +    for (t = 1; t < fr->nthreads; t++)
+ +    {
+ +        fr->f_t[t].red_mask =
+ +            calc_bonded_reduction_mask(idef, fr->red_ashift, t, fr->nthreads);
+ +    }
+ +
+ +    /* Determine the maximum number of blocks we need to reduce over */
+ +    fr->red_nblock = 0;
+ +    /* ctot counts the set mask bits summed over all threads (debug only) */
+ +    ctot           = 0;
+ +    for (t = 0; t < fr->nthreads; t++)
+ +    {
+ +        c = 0;
+ +        for (b = 0; b < MAX_BLOCK_BITS; b++)
+ +        {
+ +            if (fr->f_t[t].red_mask & (1U<<b))
+ +            {
+ +                fr->red_nblock = max(fr->red_nblock, b+1);
+ +                c++;
+ +            }
+ +        }
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "thread %d flags %x count %d\n",
+ +                    t, fr->f_t[t].red_mask, c);
+ +        }
+ +        ctot += c;
+ +    }
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
+ +                fr->red_nblock, 1<<fr->red_ashift);
+ +        fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
+ +                ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
+ +                ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
+ +    }
+ +}
+ +
+ +/* Prepares the output buffers of one bonded thread for a new force
+ + * calculation.
+ + *
+ + * Grows f_t->f to hold at least n atoms when needed, clears the force blocks
+ + * (of blocksize atoms each, nblock in total) that are flagged in
+ + * f_t->red_mask, and zeroes the shift forces, the energy terms, the group
+ + * pair energies and the dvdl entries.
+ + */
+ +static void zero_thread_forces(f_thread_t *f_t, int n,
+ +                               int nblock, int blocksize)
+ +{
+ +    int b, a0, a1, a, i, j;
+ +
+ +    if (n > f_t->f_nalloc)
+ +    {
+ +        f_t->f_nalloc = over_alloc_large(n);
+ +        srenew(f_t->f, f_t->f_nalloc);
+ +    }
+ +
+ +    if (f_t->red_mask != 0)
+ +    {
+ +        for (b = 0; b < nblock; b++)
+ +        {
+ +            /* Bitwise test of bit b: only the blocks this thread writes to
+ +             * (the same bits reduce_thread_force_buffer tests) need to be
+ +             * cleared. A logical && here would be true for every block.
+ +             */
+ +            if (f_t->red_mask & (1U<<b))
+ +            {
+ +                a0 = b*blocksize;
+ +                /* The last block may be only partially filled */
+ +                a1 = min((b+1)*blocksize, n);
+ +                for (a = a0; a < a1; a++)
+ +                {
+ +                    clear_rvec(f_t->f[a]);
+ +                }
+ +            }
+ +        }
+ +    }
+ +    for (i = 0; i < SHIFTS; i++)
+ +    {
+ +        clear_rvec(f_t->fshift[i]);
+ +    }
+ +    for (i = 0; i < F_NRE; i++)
+ +    {
+ +        f_t->ener[i] = 0;
+ +    }
+ +    for (i = 0; i < egNR; i++)
+ +    {
+ +        for (j = 0; j < f_t->grpp.nener; j++)
+ +        {
+ +            f_t->grpp.ener[i][j] = 0;
+ +        }
+ +    }
+ +    for (i = 0; i < efptNR; i++)
+ +    {
+ +        f_t->dvdl[i] = 0;
+ +    }
+ +}
+ +
+ +static void reduce_thread_force_buffer(int n, rvec *f,
+ +                                       int nthreads, f_thread_t *f_t,
+ +                                       int nblock, int block_size)
+ +{
+ +    /* The max thread number is arbitrary,
+ +     * we used a fixed number to avoid memory management.
+ +     * Using more than 16 threads is probably never useful performance wise.
+ +     */
+ +#define MAX_BONDED_THREADS 256
+ +    int b;
+ +
+ +    if (nthreads > MAX_BONDED_THREADS)
+ +    {
+ +        gmx_fatal(FARGS, "Can not reduce bonded forces on more than %d threads",
+ +                  MAX_BONDED_THREADS);
+ +    }
+ +
+ +    /* This reduction can run on any number of threads,
+ +     * independently of nthreads.
+ +     */
+ +#pragma omp parallel for num_threads(nthreads) schedule(static)
+ +    for (b = 0; b < nblock; b++)
+ +    {
+ +        rvec *fp[MAX_BONDED_THREADS];
+ +        int   nfb, ft, fb;
+ +        int   a0, a1, a;
+ +
+ +        /* Determine which threads contribute to this block */
+ +        nfb = 0;
+ +        for (ft = 1; ft < nthreads; ft++)
+ +        {
+ +            if (f_t[ft].red_mask & (1U<<b))
+ +            {
+ +                fp[nfb++] = f_t[ft].f;
+ +            }
+ +        }
+ +        if (nfb > 0)
+ +        {
+ +            /* Reduce force buffers for threads that contribute */
+ +            a0 =  b   *block_size;
+ +            a1 = (b+1)*block_size;
+ +            a1 = min(a1, n);
+ +            for (a = a0; a < a1; a++)
+ +            {
+ +                for (fb = 0; fb < nfb; fb++)
+ +                {
+ +                    rvec_inc(f[a], fp[fb][a]);
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Adds the output of bonded threads 1..nthreads-1 to the main buffers.
+ + *
+ + * Forces are reduced block-wise when nblock > 0. Shift forces, energy terms
+ + * and group pair energies are only reduced when bCalcEnerVir is set, and the
+ + * dvdl entries only when bDHDL is set as well.
+ + */
+ +static void reduce_thread_forces(int n, rvec *f, rvec *fshift,
+ +                                 real *ener, gmx_grppairener_t *grpp, real *dvdl,
+ +                                 int nthreads, f_thread_t *f_t,
+ +                                 int nblock, int block_size,
+ +                                 gmx_bool bCalcEnerVir,
+ +                                 gmx_bool bDHDL)
+ +{
+ +    int th, k, pair;
+ +
+ +    if (nblock > 0)
+ +    {
+ +        /* Reduce the bonded force buffer */
+ +        reduce_thread_force_buffer(n, f, nthreads, f_t, nblock, block_size);
+ +    }
+ +
+ +    /* When necessary, reduce energy and virial using one thread only */
+ +    if (!bCalcEnerVir)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Each target element receives the thread contributions in the order
+ +     * th = 1, 2, ..., nthreads-1.
+ +     */
+ +    for (th = 1; th < nthreads; th++)
+ +    {
+ +        for (k = 0; k < SHIFTS; k++)
+ +        {
+ +            rvec_inc(fshift[k], f_t[th].fshift[k]);
+ +        }
+ +        for (k = 0; k < F_NRE; k++)
+ +        {
+ +            ener[k] += f_t[th].ener[k];
+ +        }
+ +        for (k = 0; k < egNR; k++)
+ +        {
+ +            /* The pair count of thread 1 is used for all threads */
+ +            for (pair = 0; pair < f_t[1].grpp.nener; pair++)
+ +            {
+ +                grpp->ener[k][pair] += f_t[th].grpp.ener[k][pair];
+ +            }
+ +        }
+ +        if (bDHDL)
+ +        {
+ +            for (k = 0; k < efptNR; k++)
+ +            {
+ +                dvdl[k] += f_t[th].dvdl[k];
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +static real calc_one_bond(FILE *fplog, int thread,
+ +                          int ftype, const t_idef *idef,
+ +                          rvec x[], rvec f[], rvec fshift[],
+ +                          t_forcerec *fr,
+ +                          const t_pbc *pbc, const t_graph *g,
+ +                          gmx_enerdata_t gmx_unused *enerd, gmx_grppairener_t *grpp,
+ +                          t_nrnb *nrnb,
+ +                          real *lambda, real *dvdl,
+ +                          const t_mdatoms *md, t_fcdata *fcd,
+ +                          gmx_bool bCalcEnerVir,
+ +                          int *global_atom_index, gmx_bool bPrintSepPot)
+ +{
+ +    int      ind, nat1, nbonds, efptFTYPE;
+ +    real     v = 0;
+ +    t_iatom *iatoms;
+ +    int      nb0, nbn;
+ +
+ +    if (IS_RESTRAINT_TYPE(ftype))
+ +    {
+ +        efptFTYPE = efptRESTRAINT;
+ +    }
+ +    else
+ +    {
+ +        efptFTYPE = efptBONDED;
+ +    }
+ +
+ +    if (interaction_function[ftype].flags & IF_BOND &&
+ +        !(ftype == F_CONNBONDS || ftype == F_POSRES))
+ +    {
+ +        ind       = interaction_function[ftype].nrnb_ind;
+ +        nat1      = interaction_function[ftype].nratoms + 1;
+ +        nbonds    = idef->il[ftype].nr/nat1;
+ +        iatoms    = idef->il[ftype].iatoms;
+ +
+ +        nb0 = ((nbonds* thread   )/(fr->nthreads))*nat1;
+ +        nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0;
+ +
+ +        if (!IS_LISTED_LJ_C(ftype))
+ +        {
+ +            if (ftype == F_CMAP)
+ +            {
+ +                v = cmap_dihs(nbn, iatoms+nb0,
+ +                              idef->iparams, &idef->cmap_grid,
+ +                              (const rvec*)x, f, fshift,
+ +                              pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+ +                              md, fcd, global_atom_index);
+ +            }
+ +#ifdef SIMD_BONDEDS
+ +            else if (ftype == F_ANGLES &&
+ +                     !bCalcEnerVir && fr->efep == efepNO)
+ +            {
+ +                /* No energies, shift forces, dvdl */
+ +                angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+ +                                   idef->iparams,
+ +                                   (const rvec*)x, f,
+ +                                   pbc, g, lambda[efptFTYPE], md, fcd,
+ +                                   global_atom_index);
+ +                v = 0;
+ +            }
+ +#endif
+ +            else if (ftype == F_PDIHS &&
+ +                     !bCalcEnerVir && fr->efep == efepNO)
+ +            {
+ +                /* No energies, shift forces, dvdl */
+ +#ifndef SIMD_BONDEDS
+ +                pdihs_noener
+ +#else
+ +                pdihs_noener_simd
+ +#endif
+ +                    (nbn, idef->il[ftype].iatoms+nb0,
+ +                    idef->iparams,
+ +                    (const rvec*)x, f,
+ +                    pbc, g, lambda[efptFTYPE], md, fcd,
+ +                    global_atom_index);
+ +                v = 0;
+ +            }
+ +            else
+ +            {
+ +                v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
+ +                                                      idef->iparams,
+ +                                                      (const rvec*)x, f, fshift,
+ +                                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+ +                                                      md, fcd, global_atom_index);
+ +            }
+ +            if (bPrintSepPot)
+ +            {
+ +                fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
+ +                        interaction_function[ftype].longname,
+ +                        nbonds/nat1, v, lambda[efptFTYPE]);
+ +            }
+ +        }
+ +        else
+ +        {
+ +            v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
+ +                                    pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
+ +
+ +            if (bPrintSepPot)
+ +            {
+ +                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+ +                        interaction_function[ftype].longname,
+ +                        interaction_function[F_LJ14].longname, nbonds/nat1, dvdl[efptVDW]);
+ +                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+ +                        interaction_function[ftype].longname,
+ +                        interaction_function[F_COUL14].longname, nbonds/nat1, dvdl[efptCOUL]);
+ +            }
+ +        }
+ +        if (ind != -1 && thread == 0)
+ +        {
+ +            inc_nrnb(nrnb, ind, nbonds);
+ +        }
+ +    }
+ +
+ +    return v;
+ +}
+ +
+ +/* Evaluates one bonded interaction type for the free-energy (foreign
+ +   lambda) path and returns the value produced by the dispatched
+ +   interaction function, or 0 when nothing is evaluated.
+ +
+ +   Only the tail of the interaction list is used: the list is entered
+ +   at offset nr_nonperturbed and spans nr - nr_nonperturbed entries
+ +   (presumably the perturbed interactions - confirm against the code
+ +   that fills nr_nonperturbed).  Types in the range F_GB12..F_GB14,
+ +   types without IF_BOND, and F_CONNBONDS, F_POSRES and F_FBPOSRES
+ +   are skipped.  Shift forces are written to fr->fshift.  fplog and
+ +   bPrintSepPot are marked unused.
+ +
+ +   WARNING!  THIS FUNCTION MUST EXACTLY TRACK THE calc_one_bond
+ +   function, or horrible things will happen when doing free energy
+ +   calculations!  In a good coding world, this would not be a
+ +   different function, but for speed reasons, it needs to be made a
+ +   separate function.  TODO for 5.0 - figure out a way to reorganize
+ +   to reduce duplication.
+ + */
+ +
+ +static real calc_one_bond_foreign(FILE gmx_unused *fplog, int ftype, const t_idef *idef,
+ +                                  rvec x[], rvec f[], t_forcerec *fr,
+ +                                  const t_pbc *pbc, const t_graph *g,
+ +                                  gmx_grppairener_t *grpp, t_nrnb *nrnb,
+ +                                  real *lambda, real *dvdl,
+ +                                  const t_mdatoms *md, t_fcdata *fcd,
+ +                                  int *global_atom_index, gmx_bool gmx_unused bPrintSepPot)
+ +{
+ +    int      ind, nat1, nbonds, efptFTYPE, nbonds_np;
+ +    real     v = 0;
+ +    t_iatom *iatoms;
+ +
+ +    if (IS_RESTRAINT_TYPE(ftype))
+ +    {
+ +        efptFTYPE = efptRESTRAINT;
+ +    }
+ +    else
+ +    {
+ +        efptFTYPE = efptBONDED;
+ +    }
+ +
+ +    if (ftype < F_GB12 || ftype > F_GB14)
+ +    {
+ +        if (interaction_function[ftype].flags & IF_BOND &&
+ +            !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES))
+ +        {
+ +            ind       = interaction_function[ftype].nrnb_ind;
+ +            nat1      = interaction_function[ftype].nratoms+1;
+ +            nbonds_np = idef->il[ftype].nr_nonperturbed;
+ +            nbonds    = idef->il[ftype].nr - nbonds_np;
+ +            iatoms    = idef->il[ftype].iatoms + nbonds_np;
+ +            if (nbonds > 0)
+ +            {
+ +                if (!IS_LISTED_LJ_C(ftype))
+ +                {
+ +                    if (ftype == F_CMAP)
+ +                    {
+ +                        v = cmap_dihs(nbonds, iatoms,
+ +                                      idef->iparams, &idef->cmap_grid,
+ +                                      (const rvec*)x, f, fr->fshift,
+ +                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), md, fcd,
+ +                                      global_atom_index);
+ +                    }
+ +                    else
+ +                    {
+ +                        v =     interaction_function[ftype].ifunc(nbonds, iatoms,
+ +                                                                  idef->iparams,
+ +                                                                  (const rvec*)x, f, fr->fshift,
+ +                                                                  pbc, g, lambda[efptFTYPE], &dvdl[efptFTYPE],
+ +                                                                  md, fcd, global_atom_index);
+ +                    }
+ +                }
+ +                else
+ +                {
+ +                    v = do_nonbonded_listed(ftype, nbonds, iatoms,
+ +                                            idef->iparams,
+ +                                            (const rvec*)x, f, fr->fshift,
+ +                                            pbc, g, lambda, dvdl,
+ +                                            md, fr, grpp, global_atom_index);
+ +                }
+ +                if (ind != -1)
+ +                {
+ +                    inc_nrnb(nrnb, ind, nbonds/nat1);
+ +                }
+ +            }
+ +        }
+ +    }
+ +    return v;
+ +}
+ +
+ +void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
+ +                const t_idef *idef,
+ +                rvec x[], history_t *hist,
+ +                rvec f[], t_forcerec *fr,
+ +                const t_pbc *pbc, const t_graph *g,
+ +                gmx_enerdata_t *enerd, t_nrnb *nrnb,
+ +                real *lambda,
+ +                const t_mdatoms *md,
+ +                t_fcdata *fcd, int *global_atom_index,
+ +                t_atomtypes gmx_unused *atype, gmx_genborn_t gmx_unused *born,
+ +                int force_flags,
+ +                gmx_bool bPrintSepPot, gmx_large_int_t step)
+ +{
+ +    gmx_bool      bCalcEnerVir;
+ +    int           i;
+ +    real          v, dvdl[efptNR], dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
+ +                                                        of lambda, which will be thrown away in the end*/
+ +    const  t_pbc *pbc_null;
+ +    char          buf[22];
+ +    int           thread;
+ +
+ +    bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
+ +
+ +    for (i = 0; i < efptNR; i++)
+ +    {
+ +        dvdl[i] = 0.0;
+ +    }
+ +    if (fr->bMolPBC)
+ +    {
+ +        pbc_null = pbc;
+ +    }
+ +    else
+ +    {
+ +        pbc_null = NULL;
+ +    }
+ +    if (bPrintSepPot)
+ +    {
+ +        fprintf(fplog, "Step %s: bonded V and dVdl for this node\n",
+ +                gmx_step_str(step, buf));
+ +    }
+ +
+ +#ifdef DEBUG
+ +    if (g && debug)
+ +    {
+ +        p_graph(debug, "Bondage is fun", g);
+ +    }
+ +#endif
+ +
+ +    /* Do pre force calculation stuff which might require communication */
+ +    if (idef->il[F_ORIRES].nr)
+ +    {
+ +        enerd->term[F_ORIRESDEV] =
+ +            calc_orires_dev(ms, idef->il[F_ORIRES].nr,
+ +                            idef->il[F_ORIRES].iatoms,
+ +                            idef->iparams, md, (const rvec*)x,
+ +                            pbc_null, fcd, hist);
+ +    }
+ +    if (idef->il[F_DISRES].nr)
+ +    {
+ +        calc_disres_R_6(ms, idef->il[F_DISRES].nr,
+ +                        idef->il[F_DISRES].iatoms,
+ +                        idef->iparams, (const rvec*)x, pbc_null,
+ +                        fcd, hist);
+ +    }
+ +
+ +#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
+ +    for (thread = 0; thread < fr->nthreads; thread++)
+ +    {
+ +        int                ftype, nbonds, ind, nat1;
+ +        real              *epot, v;
+ +        /* thread stuff */
+ +        rvec              *ft, *fshift;
+ +        real              *dvdlt;
+ +        gmx_grppairener_t *grpp;
+ +        int                nb0, nbn;
+ +
+ +        if (thread == 0)
+ +        {
+ +            ft     = f;
+ +            fshift = fr->fshift;
+ +            epot   = enerd->term;
+ +            grpp   = &enerd->grpp;
+ +            dvdlt  = dvdl;
+ +        }
+ +        else
+ +        {
+ +            zero_thread_forces(&fr->f_t[thread], fr->natoms_force,
+ +                               fr->red_nblock, 1<<fr->red_ashift);
+ +
+ +            ft     = fr->f_t[thread].f;
+ +            fshift = fr->f_t[thread].fshift;
+ +            epot   = fr->f_t[thread].ener;
+ +            grpp   = &fr->f_t[thread].grpp;
+ +            dvdlt  = fr->f_t[thread].dvdl;
+ +        }
+ +        /* Loop over all bonded force types to calculate the bonded forces */
+ +        for (ftype = 0; (ftype < F_NRE); ftype++)
+ +        {
+ +            if (idef->il[ftype].nr > 0 &&
+ +                (interaction_function[ftype].flags & IF_BOND) &&
+ +                (ftype < F_GB12 || ftype > F_GB14) &&
+ +                !(ftype == F_CONNBONDS || ftype == F_POSRES))
+ +            {
+ +                v = calc_one_bond(fplog, thread, ftype, idef, x,
+ +                                  ft, fshift, fr, pbc_null, g, enerd, grpp,
+ +                                  nrnb, lambda, dvdlt,
+ +                                  md, fcd, bCalcEnerVir,
+ +                                  global_atom_index, bPrintSepPot);
+ +                epot[ftype]        += v;
+ +            }
+ +        }
+ +    }
+ +    if (fr->nthreads > 1)
+ +    {
+ +        reduce_thread_forces(fr->natoms_force, f, fr->fshift,
+ +                             enerd->term, &enerd->grpp, dvdl,
+ +                             fr->nthreads, fr->f_t,
+ +                             fr->red_nblock, 1<<fr->red_ashift,
+ +                             bCalcEnerVir,
+ +                             force_flags & GMX_FORCE_DHDL);
+ +    }
+ +    if (force_flags & GMX_FORCE_DHDL)
+ +    {
+ +        for (i = 0; i < efptNR; i++)
+ +        {
+ +            enerd->dvdl_nonlin[i] += dvdl[i];
+ +        }
+ +    }
+ +
+ +    /* Copy the sum of violations for the distance restraints from fcd */
+ +    if (fcd)
+ +    {
+ +        enerd->term[F_DISRESVIOL] = fcd->disres.sumviol;
+ +
+ +    }
+ +}
+ +
+ +/* Evaluates the bonded energies for the lambda values in lambda and
+ + * adds them, per interaction type, to epot by calling
+ + * calc_one_bond_foreign for every type.
+ + *
+ + * Forces are computed into a scratch array that is freed again, and
+ + * fr->fshift is replaced by a scratch array for the duration of the
+ + * loop and restored afterwards, so only epot (plus whatever the
+ + * callee updates through grpp and nrnb) carries results out.
+ + * The dV/dlambda output of the callee is collected in a local array
+ + * and discarded.
+ + */
+ +void calc_bonds_lambda(FILE *fplog,
+ +                       const t_idef *idef,
+ +                       rvec x[],
+ +                       t_forcerec *fr,
+ +                       const t_pbc *pbc, const t_graph *g,
+ +                       gmx_grppairener_t *grpp, real *epot, t_nrnb *nrnb,
+ +                       real *lambda,
+ +                       const t_mdatoms *md,
+ +                       t_fcdata *fcd,
+ +                       int *global_atom_index)
+ +{
+ +    int           i, ftype;
+ +    real          v;
+ +    real          dvdl_dum[efptNR];
+ +    rvec         *f, *fshift_orig;
+ +    const  t_pbc *pbc_null;
+ +
+ +    if (fr->bMolPBC)
+ +    {
+ +        pbc_null = pbc;
+ +    }
+ +    else
+ +    {
+ +        pbc_null = NULL;
+ +    }
+ +
+ +    /* The callee receives pointers into dvdl_dum; start from defined
+ +     * values, as calc_bonds does for its dvdl array, so that nothing
+ +     * indeterminate is ever read even though the result is dropped.
+ +     */
+ +    for (i = 0; i < efptNR; i++)
+ +    {
+ +        dvdl_dum[i] = 0.0;
+ +    }
+ +
+ +    snew(f, fr->natoms_force);
+ +    /* We want to preserve the fshift array in forcerec */
+ +    fshift_orig = fr->fshift;
+ +    snew(fr->fshift, SHIFTS);
+ +
+ +    /* Loop over all bonded force types to calculate the bonded forces */
+ +    for (ftype = 0; (ftype < F_NRE); ftype++)
+ +    {
+ +        v = calc_one_bond_foreign(fplog, ftype, idef, x,
+ +                                  f, fr, pbc_null, g, grpp, nrnb, lambda, dvdl_dum,
+ +                                  md, fcd, global_atom_index, FALSE);
+ +        epot[ftype] += v;
+ +    }
+ +
+ +    /* Drop the scratch shift forces and put the original array back */
+ +    sfree(fr->fshift);
+ +    fr->fshift = fshift_orig;
+ +    sfree(f);
+ +}
diff --cc src/gromacs/gmxpreprocess/calc_verletbuf.c

index f0cba0732e8869f8c2cbbaa51399e3c96d6dd2e8,0000000000000000000000000000000000000000..385ba89b1cc059ac55a0cd04b90e0f9157b30e93

mode 100644,000000..100644
--- 1/src/gromacs/gmxpreprocess/calc_verletbuf.c
--- /dev/null
+++ b/src/gromacs/gmxpreprocess/calc_verletbuf.c
@@@ -1,713 -1,0 +1,723 @@@
-         list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+ +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.03
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <assert.h>
+ +
+ +#include <sys/types.h>
+ +#include <math.h>
+ +#include "typedefs.h"
+ +#include "physics.h"
+ +#include "smalloc.h"
+ +#include "gmx_fatal.h"
+ +#include "macros.h"
+ +#include "vec.h"
+ +#include "coulomb.h"
+ +#include "calc_verletbuf.h"
+ +#include "../mdlib/nbnxn_consts.h"
+ +
++#ifdef GMX_NBNXN_SIMD
++/* The include below sets the SIMD instruction type (precision+width)
++ * for all nbnxn SIMD search and non-bonded kernel code.
++ */
++#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
++#define GMX_USE_HALF_WIDTH_SIMD_HERE
++#endif
++#include "gmx_simd_macros.h"
++#endif
++
+ +/* Struct for unique atom type for calculating the energy drift.
+ + * The atom displacement depends on mass and constraints.
+ + * The energy jump for a given distance depends on LJ type and q.
+ + * Entries are considered equal when mass, type, q and con all match.
+ + */
+ +typedef struct
+ +{
+ +    real     mass; /* mass */
+ +    int      type; /* type (used for LJ parameters) */
+ +    real     q;    /* charge */
+ +    int      con;  /* 1 if treated as constrained (#DOF=2 instead of 3), else 0 */
+ +    int      n;    /* total #atoms of this type in the system */
+ +} verletbuf_atomtype_t;
+ +
+ +
+ +void verletbuf_get_list_setup(gmx_bool                bGPU,
+ +                              verletbuf_list_setup_t *list_setup)
+ +{
+ +    list_setup->cluster_size_i     = NBNXN_CPU_CLUSTER_I_SIZE;
+ +
+ +    if (bGPU)
+ +    {
+ +        list_setup->cluster_size_j = NBNXN_GPU_CLUSTER_SIZE;
+ +    }
+ +    else
+ +    {
+ +#ifndef GMX_NBNXN_SIMD
+ +        list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
+ +#else
++        list_setup->cluster_size_j = GMX_SIMD_WIDTH_HERE;
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +        /* We assume the smallest cluster size to be on the safe side:
+ +         * when the 2xNN setup is compiled in, the j-cluster size is
+ +         * taken as half the SIMD width set just above.
+ +         */
+ +        list_setup->cluster_size_j /= 2;
+ +#endif
+ +#endif
+ +    }
+ +}
+ +
+ +/* Accounts for nmol atoms with the given mass, LJ type, charge and
+ + * constraint flag in the list *att_p of *natt_p entries: the count of
+ + * an identical existing entry is increased, otherwise the list is
+ + * grown by one entry. Particles with zero mass are not recorded.
+ + */
+ +static void add_at(verletbuf_atomtype_t **att_p, int *natt_p,
+ +                   real mass, int type, real q, int con, int nmol)
+ +{
+ +    verletbuf_atomtype_t *known, *entry;
+ +    int                   count, idx;
+ +
+ +    /* Massless particles are skipped altogether */
+ +    if (mass == 0)
+ +    {
+ +        return;
+ +    }
+ +
+ +    known = *att_p;
+ +    count = *natt_p;
+ +
+ +    /* Linear search for an entry with exactly the same properties */
+ +    for (idx = 0; idx < count; idx++)
+ +    {
+ +        if (mass == known[idx].mass &&
+ +            type == known[idx].type &&
+ +            q    == known[idx].q &&
+ +            con  == known[idx].con)
+ +        {
+ +            break;
+ +        }
+ +    }
+ +
+ +    if (idx == count)
+ +    {
+ +        /* Not present yet: append a new entry at the end */
+ +        *natt_p = count + 1;
+ +        srenew(*att_p, *natt_p);
+ +        entry       = &(*att_p)[idx];
+ +        entry->mass = mass;
+ +        entry->type = type;
+ +        entry->q    = q;
+ +        entry->con  = con;
+ +        entry->n    = nmol;
+ +    }
+ +    else
+ +    {
+ +        known[idx].n += nmol;
+ +    }
+ +}
+ +
+ +/* Collects the distinct atom kinds (mass, LJ type, charge, constrained
+ + * flag) of the topology together with their total counts.
+ + *
+ + * On return *att_p points to a newly allocated list of *natt_p entries,
+ + * which the caller has to free. When n_nonlin_vsite is not NULL it
+ + * receives the number of virtual sites handled by the approximate
+ + * (default) branch below, summed over all molecules.
+ + *
+ + * Per molecule block:
+ + *  - con_m[a] sums the masses of the atoms that atom a is connected to
+ + *    through constraints or SETTLE,
+ + *  - vsite_m[a] holds an effective mass for virtual site a, which is
+ + *    consulted when a later virtual site is built from it,
+ + *  - every atom is passed to add_at(), which ignores zero-mass ones.
+ + * A virtual site built from a particle that has neither a mass nor an
+ + * effective mass yet is a fatal error.
+ + */
+ +static void get_verlet_buffer_atomtypes(const gmx_mtop_t      *mtop,
+ +                                        verletbuf_atomtype_t **att_p,
+ +                                        int                   *natt_p,
+ +                                        int                   *n_nonlin_vsite)
+ +{
+ +    verletbuf_atomtype_t *att;
+ +    int                   natt;
+ +    int                   mb, nmol, ft, i, j, a1, a2, a3, a;
+ +    const t_atoms        *atoms;
+ +    const t_ilist        *il;
+ +    const t_atom         *at;
+ +    const t_iparams      *ip;
+ +    real                 *con_m, *vsite_m, cam[5];
+ +
+ +    att  = NULL;
+ +    natt = 0;
+ +
+ +    if (n_nonlin_vsite != NULL)
+ +    {
+ +        *n_nonlin_vsite = 0;
+ +    }
+ +
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        nmol = mtop->molblock[mb].nmol;
+ +
+ +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +
+ +        /* Check for constraints, as they affect the kinetic energy */
+ +        snew(con_m, atoms->nr);
+ +        snew(vsite_m, atoms->nr);
+ +
+ +        for (ft = F_CONSTR; ft <= F_CONSTRNC; ft++)
+ +        {
+ +            il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
+ +
+ +            for (i = 0; i < il->nr; i += 1+NRAL(ft))
+ +            {
+ +                a1         = il->iatoms[i+1];
+ +                a2         = il->iatoms[i+2];
+ +                con_m[a1] += atoms->atom[a2].m;
+ +                con_m[a2] += atoms->atom[a1].m;
+ +            }
+ +        }
+ +
+ +        il = &mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE];
+ +
+ +        for (i = 0; i < il->nr; i += 1+NRAL(F_SETTLE))
+ +        {
+ +            a1         = il->iatoms[i+1];
+ +            a2         = il->iatoms[i+2];
+ +            a3         = il->iatoms[i+3];
+ +            con_m[a1] += atoms->atom[a2].m + atoms->atom[a3].m;
+ +            con_m[a2] += atoms->atom[a1].m + atoms->atom[a3].m;
+ +            con_m[a3] += atoms->atom[a1].m + atoms->atom[a2].m;
+ +        }
+ +
+ +        /* Check for virtual sites, determine mass from constructing atoms */
+ +        for (ft = 0; ft < F_NRE; ft++)
+ +        {
+ +            if (IS_VSITE(ft))
+ +            {
+ +                il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
+ +
+ +                for (i = 0; i < il->nr; i += 1+NRAL(ft))
+ +                {
+ +                    ip = &mtop->ffparams.iparams[il->iatoms[i]];
+ +
+ +                    a1 = il->iatoms[i+1];
+ +
+ +                    /* cam[j] is the (effective) mass of constructing
+ +                     * atom j; index 0 is not used.
+ +                     */
+ +                    for (j = 1; j < NRAL(ft); j++)
+ +                    {
+ +                        cam[j] = atoms->atom[il->iatoms[i+1+j]].m;
+ +                        if (cam[j] == 0)
+ +                        {
+ +                            cam[j] = vsite_m[il->iatoms[i+1+j]];
+ +                        }
+ +                        if (cam[j] == 0)
+ +                        {
+ +                            gmx_fatal(FARGS, "In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or high complexity. This is not supported.",
+ +                                      *mtop->moltype[mtop->molblock[mb].type].name,
+ +                                      interaction_function[ft].longname,
+ +                                      il->iatoms[i+1+j]+1);
+ +                        }
+ +                    }
+ +
+ +                    switch (ft)
+ +                    {
+ +                        case F_VSITE2:
+ +                            /* Exact except for ignoring constraints.
+ +                             * For a linear combination with weights w_k
+ +                             * the displacement variance scales as
+ +                             * sum_k w_k^2/m_k, so the effective mass is
+ +                             * the inverse of that sum. vsite_m is read
+ +                             * back as a mass above, hence the mass, not
+ +                             * its inverse, has to be stored here.
+ +                             */
+ +                            vsite_m[a1] = (cam[1]*cam[2])/(cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a));
+ +                            break;
+ +                        case F_VSITE3:
+ +                            /* Exact except for ignoring constraints,
+ +                             * same reasoning as for F_VSITE2.
+ +                             */
+ +                            vsite_m[a1] = (cam[1]*cam[2]*cam[3])/(cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b));
+ +                            break;
+ +                        default:
+ +                            /* Use the mass of the lightest constructing atom.
+ +                             * This is an approximation.
+ +                             * If the distance of the virtual site to the
+ +                             * constructing atom is less than all distances
+ +                             * between constructing atoms, this is a safe
+ +                             * over-estimate of the displacement of the vsite.
+ +                             * This condition holds for all H mass
+ +                             * replacement vsite constructions, except for SP2/3
+ +                             * groups. In SP3 groups one H will have a F_VSITE3
+ +                             * construction, so even there the total drift
+ +                             * estimation shouldn't be far off.
+ +                             */
+ +                            assert(j >= 1);
+ +                            vsite_m[a1] = cam[1];
+ +                            for (j = 2; j < NRAL(ft); j++)
+ +                            {
+ +                                vsite_m[a1] = min(vsite_m[a1], cam[j]);
+ +                            }
+ +                            if (n_nonlin_vsite != NULL)
+ +                            {
+ +                                *n_nonlin_vsite += nmol;
+ +                            }
+ +                            break;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        for (a = 0; a < atoms->nr; a++)
+ +        {
+ +            at = &atoms->atom[a];
+ +            /* We consider an atom constrained, #DOF=2, when it is
+ +             * connected with constraints to one or more atoms with
+ +             * total mass larger than 1.5 that of the atom itself.
+ +             */
+ +            add_at(&att, &natt,
+ +                   at->m, at->type, at->q, con_m[a] > 1.5*at->m, nmol);
+ +        }
+ +
+ +        sfree(vsite_m);
+ +        sfree(con_m);
+ +    }
+ +
+ +    if (gmx_debug_at)
+ +    {
+ +        for (a = 0; a < natt; a++)
+ +        {
+ +            fprintf(debug, "type %d: m %5.2f t %d q %6.3f con %d n %d\n",
+ +                    a, att[a].mass, att[a].type, att[a].q, att[a].con, att[a].n);
+ +        }
+ +    }
+ +
+ +    *att_p  = att;
+ +    *natt_p = natt;
+ +}
+ +
+ +static void approx_2dof(real s2, real x,
+ +                        real *shift, real *scale)
+ +{
+ +    /* A particle with 1 DOF constrained has 2 DOFs instead of 3.
+ +     * This code is also used for particles with multiple constraints,
+ +     * in which case we overestimate the displacement.
+ +     * The 2DOF distribution is sqrt(pi/2)*erfc(r/(sqrt(2)*s))/(2*s).
+ +     * We approximate this with scale*Gaussian(s,r+shift),
+ +     * by matching the distribution value and derivative at x.
+ +     * This is a tight overestimate for all r>=0 at any s and x.
+ +     *
+ +     * Input: s2 is used as s squared, x is the matching point.
+ +     * Output: *shift and *scale, both always written.
+ +     * Below, ex is exp(-x^2/(2*s2)) and er is erfc(x/sqrt(2*s2));
+ +     * both results divide by er.
+ +     */
+ +    real ex, er;
+ +
+ +    ex = exp(-x*x/(2*s2));
+ +    er = gmx_erfc(x/sqrt(2*s2));
+ +
+ +    *shift = -x + sqrt(2*s2/M_PI)*ex/er;
+ +    *scale = 0.5*M_PI*exp(ex*ex/(M_PI*er*er))*er;
+ +}
+ +
+ +static real ener_drift(const verletbuf_atomtype_t *att, int natt,
+ +                       const gmx_ffparams_t *ffp,
+ +                       real kT_fac,
+ +                       real md_ljd, real md_ljr, real md_el, real dd_el,
+ +                       real r_buffer,
+ +                       real rlist, real boxvol)
+ +{
+ +    double drift_tot, pot1, pot2, pot;
+ +    int    i, j;
+ +    real   s2i, s2j, s2, s;
+ +    int    ti, tj;
+ +    real   md, dd;
+ +    real   sc_fac, rsh;
+ +    double c_exp, c_erfc;
+ +
+ +    drift_tot = 0;
+ +
+ +    /* Loop over the different atom type pairs, each unordered pair
+ +     * once, including i == j.
+ +     * The function returns the sum over these pairs of the absolute
+ +     * value of the per-pair drift estimate computed below.
+ +     * kT_fac/mass is used as the per-type squared displacement width;
+ +     * the pair value s2 is the sum of the two.
+ +     */
+ +    for (i = 0; i < natt; i++)
+ +    {
+ +        s2i = kT_fac/att[i].mass;
+ +        ti  = att[i].type;
+ +
+ +        for (j = i; j < natt; j++)
+ +        {
+ +            s2j = kT_fac/att[j].mass;
+ +            tj  = att[j].type;
+ +
+ +            /* Note that attractive and repulsive potentials for individual
+ +             * pairs will partially cancel.
+ +             */
+ +            /* -dV/dr at the cut-off for LJ + Coulomb:
+ +             * the md_* prefactors are scaled by the pair's c6, c12
+ +             * and charge product.
+ +             */
+ +            md =
+ +                md_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 +
+ +                md_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 +
+ +                md_el*att[i].q*att[j].q;
+ +
+ +            /* d2V/dr2 at the cut-off for Coulomb, we neglect LJ */
+ +            dd = dd_el*att[i].q*att[j].q;
+ +
+ +            s2  = s2i + s2j;
+ +
+ +            rsh    = r_buffer;
+ +            sc_fac = 1.0;
+ +            /* For constraints: adapt r and scaling for the Gaussian.
+ +             * Each constrained partner contributes its own shift
+ +             * and scale, evaluated at its share of the buffer,
+ +             * r_buffer*s2i/s2 or r_buffer*s2j/s2.
+ +             */
+ +            if (att[i].con)
+ +            {
+ +                real sh, sc;
+ +                approx_2dof(s2i, r_buffer*s2i/s2, &sh, &sc);
+ +                rsh    += sh;
+ +                sc_fac *= sc;
+ +            }
+ +            if (att[j].con)
+ +            {
+ +                real sh, sc;
+ +                approx_2dof(s2j, r_buffer*s2j/s2, &sh, &sc);
+ +                rsh    += sh;
+ +                sc_fac *= sc;
+ +            }
+ +
+ +            /* Exact contribution of an atom pair with Gaussian displacement
+ +             * with sigma s to the energy drift for a potential with
+ +             * derivative -md and second derivative dd at the cut-off.
+ +             * The only catch is that for potentials that change sign
+ +             * near the cut-off there could be an unlucky compensation
+ +             * of positive and negative energy drift.
+ +             * Such potentials are extremely rare though.
+ +             *
+ +             * Note that pot has unit energy*length, as the linear
+ +             * atom density still needs to be put in.
+ +             */
+ +            c_exp  = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI);
+ +            c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2)));
+ +            s      = sqrt(s2);
+ +
+ +            pot1 = sc_fac*
+ +                md/2*((rsh*rsh + s2)*c_erfc - rsh*s*c_exp);
+ +            pot2 = sc_fac*
+ +                dd/6*(s*(rsh*rsh + 2*s2)*c_exp - rsh*(rsh*rsh + 3*s2)*c_erfc);
+ +            pot = pot1 + pot2;
+ +
+ +            if (gmx_debug_at)
+ +            {
+ +                fprintf(debug, "n %d %d d s %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n",
+ +                        att[i].n, att[j].n, sqrt(s2i), sqrt(s2j),
+ +                        att[i].con+att[j].con,
+ +                        md, dd, pot1, pot2, pot);
+ +            }
+ +
+ +            /* Multiply by the number of atom pairs:
+ +             * n*(n-1)/2 within one type, n_i*n_j between two types.
+ +             */
+ +            if (j == i)
+ +            {
+ +                pot *= (double)att[i].n*(att[i].n - 1)/2;
+ +            }
+ +            else
+ +            {
+ +                pot *= (double)att[i].n*att[j].n;
+ +            }
+ +            /* We need the line density to get the energy drift of the system.
+ +             * The effective average r^2 is close to (rlist+sigma)^2.
+ +             */
+ +            pot *= 4*M_PI*sqr(rlist + s)/boxvol;
+ +
+ +            /* Add the unsigned drift to avoid cancellation of errors */
+ +            drift_tot += fabs(pot);
+ +        }
+ +    }
+ +
+ +    return drift_tot;
+ +}
+ +
+ +/* Returns, per particle of a cluster of cluster_size particles, the area
+ + * of the surface at distance rlist from the closest particle, relative
+ + * to the surface of a single sphere of radius rlist.
+ + * Cluster sizes 1, 2 and 4 are supported; any other size is reported
+ + * through gmx_incons.
+ + */
+ +static real surface_frac(int cluster_size, real particle_distance, real rlist)
+ +{
+ +    real hd, rel_area;
+ +
+ +    /* Spheres that do not overlap need no correction at all */
+ +    if (rlist < 0.5*particle_distance)
+ +    {
+ +        return 1.0;
+ +    }
+ +
+ +    /* hd: half the inter-particle distance in units of rlist */
+ +    hd = 0.5*particle_distance/rlist;
+ +
+ +    /* The expressions below assume close to cubic cells for the pair
+ +     * search grid, which the pair search code tries to achieve.
+ +     * In practice particle distances are not delta distributed but
+ +     * spread out, often towards shorter distances (e.g. O-H bonds in
+ +     * a water molecule), so these estimates will usually be slightly
+ +     * too high and thus conservative.
+ +     */
+ +    if (cluster_size == 1)
+ +    {
+ +        /* A single particle: the plain sphere */
+ +        rel_area = 1.0;
+ +    }
+ +    else if (cluster_size == 2)
+ +    {
+ +        /* Two spheres whose centers are 2*hd apart (in units of rlist) */
+ +        rel_area = 1.0 + hd;
+ +    }
+ +    else if (cluster_size == 4)
+ +    {
+ +        /* A perfect, symmetric tetrahedron is assumed. Its enclosing
+ +         * surface has no simple closed form, so a Taylor expansion
+ +         * in hd is used.
+ +         */
+ +        rel_area = (1.0 + 1/M_PI*(6*acos(1/sqrt(3))*hd +
+ +                                  sqrt(3)*hd*hd*(1.0 +
+ +                                                 5.0/18.0*hd*hd +
+ +                                                 7.0/45.0*hd*hd*hd*hd +
+ +                                                 83.0/756.0*hd*hd*hd*hd*hd*hd)));
+ +    }
+ +    else
+ +    {
+ +        gmx_incons("surface_frac called with unsupported cluster_size");
+ +        rel_area = 1.0;
+ +    }
+ +
+ +    return rel_area/cluster_size;
+ +}
+ +
+ +void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
+ +                             const t_inputrec *ir, real drift_target,
+ +                             const verletbuf_list_setup_t *list_setup,
+ +                             int *n_nonlin_vsite,
+ +                             real *rlist)
+ +{
+ +    double                resolution;
+ +    char                 *env;
+ +
+ +    real                  particle_distance;
+ +    real                  nb_clust_frac_pairs_not_in_list_at_cutoff;
+ +
+ +    verletbuf_atomtype_t *att  = NULL;
+ +    int                   natt = -1, i;
+ +    double                reppow;
+ +    real                  md_ljd, md_ljr, md_el, dd_el;
+ +    real                  elfac;
+ +    real                  kT_fac, mass_min;
+ +    int                   ib0, ib1, ib;
+ +    real                  rb, rl;
+ +    real                  drift;
+ +
+ +    /* Resolution of the buffer size */
+ +    resolution = 0.001;
+ +
+ +    env = getenv("GMX_VERLET_BUFFER_RES");
+ +    if (env != NULL)
+ +    {
+ +        sscanf(env, "%lf", &resolution);
+ +    }
+ +
+ +    /* In an atom wise pair-list there would be no pairs in the list
+ +     * beyond the pair-list cut-off.
+ +     * However, we use a pair-list of groups vs groups of atoms.
+ +     * For groups of 4 atoms, the parallelism of SSE instructions, only
+ +     * 10% of the atoms pairs are not in the list just beyond the cut-off.
+ +     * As this percentage increases slowly compared to the decrease of the
+ +     * Gaussian displacement distribution over this range, we can simply
+ +     * reduce the drift by this fraction.
+ +     * For larger groups, e.g. of 8 atoms, this fraction will be lower,
+ +     * so then buffer size will be on the conservative (large) side.
+ +     *
+ +     * Note that the formulas used here do not take into account
+ +     * cancellation of errors which could occur by missing both
+ +     * attractive and repulsive interactions.
+ +     *
+ +     * The only major assumption is homogeneous particle distribution.
+ +     * For an inhomogeneous system, such as a liquid-vapor system,
+ +     * the buffer will be underestimated. The actual energy drift
+ +     * will be higher by the factor: local/homogeneous particle density.
+ +     *
+ +     * The results of this estimate have been checked against simulations.
+ +     * In most cases the real drift differs by less than a factor 2.
+ +     */
+ +
+ +    /* Worst case assumption: HCP packing of particles gives largest distance */
+ +    particle_distance = pow(boxvol*sqrt(2)/mtop->natoms, 1.0/3.0);
+ +
+ +    get_verlet_buffer_atomtypes(mtop, &att, &natt, n_nonlin_vsite);
+ +    assert(att != NULL && natt >= 0);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "particle distance assuming HCP packing: %f nm\n",
+ +                particle_distance);
+ +        fprintf(debug, "energy drift atom types: %d\n", natt);
+ +    }
+ +
+ +    reppow = mtop->ffparams.reppow;
+ +    md_ljd = 0;
+ +    md_ljr = 0;
+ +    if (ir->vdwtype == evdwCUT)
+ +    {
+ +        /* -dV/dr of -r^-6 and r^-reppow */
+ +        md_ljd = -6*pow(ir->rvdw, -7.0);
+ +        md_ljr = reppow*pow(ir->rvdw, -(reppow+1));
+ +        /* The contribution of the second derivative is negligible */
+ +    }
+ +    else
+ +    {
+ +        gmx_fatal(FARGS, "Energy drift calculation is only implemented for plain cut-off Lennard-Jones interactions");
+ +    }
+ +
+ +    elfac = ONE_4PI_EPS0/ir->epsilon_r;
+ +
+ +    /* Determine md=-dV/dr and dd=d^2V/dr^2 */
+ +    md_el = 0;
+ +    dd_el = 0;
+ +    if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype))
+ +    {
+ +        real eps_rf, k_rf;
+ +
+ +        if (ir->coulombtype == eelCUT)
+ +        {
+ +            eps_rf = 1;
+ +            k_rf   = 0;
+ +        }
+ +        else
+ +        {
+ +            eps_rf = ir->epsilon_rf/ir->epsilon_r;
+ +            if (eps_rf != 0)
+ +            {
+ +                k_rf = pow(ir->rcoulomb, -3.0)*(eps_rf - ir->epsilon_r)/(2*eps_rf + ir->epsilon_r);
+ +            }
+ +            else
+ +            {
+ +                /* epsilon_rf = infinity */
+ +                k_rf = 0.5*pow(ir->rcoulomb, -3.0);
+ +            }
+ +        }
+ +
+ +        if (eps_rf > 0)
+ +        {
+ +            md_el = elfac*(pow(ir->rcoulomb, -2.0) - 2*k_rf*ir->rcoulomb);
+ +        }
+ +        dd_el = elfac*(2*pow(ir->rcoulomb, -3.0) + 2*k_rf);
+ +    }
+ +    else if (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD)
+ +    {
+ +        real b, rc, br;
+ +
+ +        b     = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ +        rc    = ir->rcoulomb;
+ +        br    = b*rc;
+ +        md_el = elfac*(b*exp(-br*br)*M_2_SQRTPI/rc + gmx_erfc(br)/(rc*rc));
+ +        dd_el = elfac/(rc*rc)*(2*b*(1 + br*br)*exp(-br*br)*M_2_SQRTPI + 2*gmx_erfc(br)/rc);
+ +    }
+ +    else
+ +    {
+ +        gmx_fatal(FARGS, "Energy drift calculation is only implemented for Reaction-Field and Ewald electrostatics");
+ +    }
+ +
+ +    /* Determine the variance of the atomic displacement
+ +     * over nstlist-1 steps: kT_fac
+ +     * For inertial dynamics (not Brownian dynamics) the mass factor
+ +     * is not included in kT_fac, it is added later.
+ +     */
+ +    if (ir->eI == eiBD)
+ +    {
+ +        /* Get the displacement distribution from the random component only.
+ +         * With accurate integration the systematic (force) displacement
+ +         * should be negligible (unless nstlist is extremely large, which
+ +         * you wouldn't do anyhow).
+ +         */
+ +        kT_fac = 2*BOLTZ*ir->opts.ref_t[0]*(ir->nstlist-1)*ir->delta_t;
+ +        if (ir->bd_fric > 0)
+ +        {
+ +            /* This is directly sigma^2 of the displacement */
+ +            kT_fac /= ir->bd_fric;
+ +
+ +            /* Set the masses to 1 as kT_fac is the full sigma^2,
+ +             * but we divide by m in ener_drift().
+ +             */
+ +            for (i = 0; i < natt; i++)
+ +            {
+ +                att[i].mass = 1;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            real tau_t;
+ +
+ +            /* Per group tau_t is not implemented yet, use the maximum */
+ +            tau_t = ir->opts.tau_t[0];
+ +            for (i = 1; i < ir->opts.ngtc; i++)
+ +            {
+ +                tau_t = max(tau_t, ir->opts.tau_t[i]);
+ +            }
+ +
+ +            kT_fac *= tau_t;
+ +            /* This kT_fac needs to be divided by the mass to get sigma^2 */
+ +        }
+ +    }
+ +    else
+ +    {
+ +        kT_fac = BOLTZ*ir->opts.ref_t[0]*sqr((ir->nstlist-1)*ir->delta_t);
+ +    }
+ +
+ +    mass_min = att[0].mass;
+ +    for (i = 1; i < natt; i++)
+ +    {
+ +        mass_min = min(mass_min, att[i].mass);
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "md_ljd %e md_ljr %e\n", md_ljd, md_ljr);
+ +        fprintf(debug, "md_el %e dd_el %e\n", md_el, dd_el);
+ +        fprintf(debug, "sqrt(kT_fac) %f\n", sqrt(kT_fac));
+ +        fprintf(debug, "mass_min %f\n", mass_min);
+ +    }
+ +
+ +    /* Search using bisection */
+ +    ib0 = -1;
+ +    /* The drift will be negligible at 5 times the max sigma */
+ +    ib1 = (int)(5*2*sqrt(kT_fac/mass_min)/resolution) + 1;
+ +    while (ib1 - ib0 > 1)
+ +    {
+ +        ib = (ib0 + ib1)/2;
+ +        rb = ib*resolution;
+ +        rl = max(ir->rvdw, ir->rcoulomb) + rb;
+ +
+ +        /* Calculate the average energy drift at the last step
+ +         * of the nstlist steps at which the pair-list is used.
+ +         */
+ +        drift = ener_drift(att, natt, &mtop->ffparams,
+ +                           kT_fac,
+ +                           md_ljd, md_ljr, md_el, dd_el, rb,
+ +                           rl, boxvol);
+ +
+ +        /* Correct for the fact that we are using a Ni x Nj particle pair list
+ +         * and not a 1 x 1 particle pair list. This reduces the drift.
+ +         */
+ +        /* We don't have a formula for 8 (yet), use 4 which is conservative */
+ +        nb_clust_frac_pairs_not_in_list_at_cutoff =
+ +            surface_frac(min(list_setup->cluster_size_i, 4),
+ +                         particle_distance, rl)*
+ +            surface_frac(min(list_setup->cluster_size_j, 4),
+ +                         particle_distance, rl);
+ +        drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
+ +
+ +        /* Convert the drift to drift per unit time per atom */
+ +        drift /= ir->nstlist*ir->delta_t*mtop->natoms;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "ib %3d %3d %3d rb %.3f %dx%d fac %.3f drift %f\n",
+ +                    ib0, ib, ib1, rb,
+ +                    list_setup->cluster_size_i, list_setup->cluster_size_j,
+ +                    nb_clust_frac_pairs_not_in_list_at_cutoff,
+ +                    drift);
+ +        }
+ +
+ +        if (fabs(drift) > drift_target)
+ +        {
+ +            ib0 = ib;
+ +        }
+ +        else
+ +        {
+ +            ib1 = ib;
+ +        }
+ +    }
+ +
+ +    sfree(att);
+ +
+ +    *rlist = max(ir->rvdw, ir->rcoulomb) + ib1*resolution;
+ +}
diff --cc src/gromacs/gmxpreprocess/gen_vsite.c

index a8d5ea9d1fbe502245592806d8ac1bcffe9d0345,0000000000000000000000000000000000000000..ea9f64930936429d7240cbb187361a5f3ae4113d

mode 100644,000000..100644
--- 1/src/gromacs/gmxpreprocess/gen_vsite.c
--- /dev/null
+++ b/src/gromacs/gmxpreprocess/gen_vsite.c
@@@ -1,2233 -1,0 +1,2239 @@@
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include "string2.h"
+ +#include <stdio.h>
+ +#include <math.h>
+ +#include <string.h>
+ +#include "gen_vsite.h"
+ +#include "smalloc.h"
+ +#include "resall.h"
+ +#include "add_par.h"
+ +#include "vec.h"
+ +#include "toputil.h"
+ +#include "physics.h"
+ +#include "index.h"
+ +#include "names.h"
+ +#include "futil.h"
+ +#include "gpp_atomtype.h"
+ +#include "fflibutil.h"
+ +#include "macros.h"
+ +
+ +#define MAXNAME 32 /* size of the fixed-length name buffers used in this file */
+ +#define OPENDIR     '[' /* character that opens a directive line, e.g. "[ CH3 ]" */
+ +#define CLOSEDIR    ']' /* character that closes a directive line          */
+ +
+ +typedef struct { /* One CH3/NH3/NH2 line of the vsite database, as filled in by read_vsite_database() */
+ +    char       atomtype[MAXNAME];  /* Type for the XH3/XH2 atom */
+ +    gmx_bool   isplanar;           /* If true, the atomtype above and the three connected
+ +                                    * ones are in a planar geometry. The two next entries
+ +                                    * are undefined in that case
+ +                                    */
+ +    int    nhydrogens;             /* number of connected hydrogens; read_vsite_database()
+ +                                    * stores 2 for planar and NH2 entries and 3 otherwise
+ +                                    */
+ +    char   nextheavytype[MAXNAME]; /* Type for the heavy atom bonded to XH2/XH3 (empty string for planar entries) */
+ +    char   dummymass[MAXNAME];     /* The type of MNH* or MCH3* dummy mass to use (empty string for planar entries) */
+ +} t_vsiteconf;
+ +
+ +
+ +/* Structure to represent average bond and angles values in vsite aromatic
+ + * residues. Note that these are NOT necessarily the bonds and angles from the
+ + * forcefield; many forcefields (like Amber, OPLS) have some inherent strain in
+ + * 5-rings (i.e. the sum of angles is !=540, but impropers keep it planar)
+ + *
+ + * One entry exists per ring-residue directive found in the database; the
+ + * bond and angle arrays are grown one element at a time while reading.
+ + */
+ +typedef struct {
+ +    char resname[MAXNAME]; /* directive name this entry was read under (e.g. PHE, HISA) */
+ +    int  nbonds;           /* number of valid elements in bond[] */
+ +    int  nangles;          /* number of valid elements in angle[] */
+ +    struct vsitetop_bond {
+ +        char   atom1[MAXNAME]; /* name of the first atom of the bond */
+ +        char   atom2[MAXNAME]; /* name of the second atom of the bond */
+ +        float  value;          /* third field of the database line, parsed with strtod() */
+ +    } *bond; /* list of bonds */
+ +    struct vsitetop_angle {
+ +        char   atom1[MAXNAME]; /* name of the first outer atom */
+ +        char   atom2[MAXNAME]; /* name of the middle atom (lookups match it in the middle position only) */
+ +        char   atom3[MAXNAME]; /* name of the second outer atom */
+ +        float  value;          /* fourth field of the database line, parsed with strtod() */
+ +    } *angle; /* list of angles */
+ +} t_vsitetop;
+ +
+ +
+ +enum { /* Directive indices; the order must match ddb_dirnames below */
+ +    DDB_CH3, DDB_NH3, DDB_NH2, DDB_PHE, DDB_TYR,
+ +    DDB_TRP, DDB_HISA, DDB_HISB, DDB_HISH, DDB_DIR_NR /* DDB_DIR_NR is the number of directives, not a directive */
+ +};
+ +
+ +typedef char t_dirname[STRLEN]; /* fixed-size buffer holding one directive name */
+ +
+ +static const t_dirname ddb_dirnames[DDB_DIR_NR] = { /* directive names, indexed by the DDB_* enum values */
+ +    "CH3",
+ +    "NH3",
+ +    "NH2",
+ +    "PHE",
+ +    "TYR",
+ +    "TRP",
+ +    "HISA",
+ +    "HISB",
+ +    "HISH"
+ +};
+ +
+ +static int ddb_name2dir(char *name)
+ +{
+ +    /* Look up a directive name, ignoring case, in ddb_dirnames.
+ +     * Returns the matching DDB_* index, or -1 when the name is not one
+ +     * of the known directives. No alias translation is done here: the
+ +     * caller has to pass one of the names listed in ddb_dirnames.
+ +     */
+ +    int dir;
+ +
+ +    for (dir = 0; dir < DDB_DIR_NR; dir++)
+ +    {
+ +        if (gmx_strcasecmp(name, ddb_dirnames[dir]) == 0)
+ +        {
+ +            /* First match wins */
+ +            return dir;
+ +        }
+ +    }
+ +
+ +    return -1;
+ +}
+ +
+ +
+ +/* Copy a database token into a MAXNAME-sized name field.
+ + * The result is always NUL-terminated; tokens longer than MAXNAME-1
+ + * characters are truncated.
+ + */
+ +static void vsite_copy_name(char *dest, const char *src)
+ +{
+ +    strncpy(dest, src, MAXNAME-1);
+ +    dest[MAXNAME-1] = 0;
+ +}
+ +
+ +static void read_vsite_database(const char *ddbname,
+ +                                t_vsiteconf **pvsiteconflist, int *nvsiteconf,
+ +                                t_vsitetop **pvsitetoplist, int *nvsitetop)
+ +{
+ +    /* This routine is a quick hack to fix the problem with hardcoded atomtypes
+ +     * and aromatic vsite parameters by reading them from a ff???.vsd file.
+ +     *
+ +     * The file can contain sections [ NH3 ], [ CH3 ], [ NH2 ], and ring residue names.
+ +     * For the NH3 and CH3 section each line has three fields. The first is the atomtype
+ +     * (nb: not bonded type) of the N/C atom to be replaced, the second field is
+ +     * the type of the next heavy atom it is bonded to, and the third field the type
+ +     * of dummy mass that will be used for this group.
+ +     *
+ +     * If the NH2 group planar (sp2 N) a different vsite construct is used, so in this
+ +     * case the second field should just be the word planar.
+ +     *
+ +     * Ring residue sections hold either three fields (two atom names and a
+ +     * bond value) or four fields (three atom names and an angle value).
+ +     *
+ +     * ddbname          name of the database file, opened with libopen()
+ +     * pvsiteconflist,
+ +     * nvsiteconf       in/out: list of CH3/NH3/NH2 entries and its length
+ +     * pvsitetoplist,
+ +     * nvsitetop        in/out: list of ring-residue entries and its length
+ +     *
+ +     * Entries read from this file are appended to the lists passed in, so
+ +     * the routine can be called for several database files in a row.
+ +     * Any malformed line ends in gmx_fatal().
+ +     */
+ +
+ +    FILE        *ddb;
+ +    char         dirstr[STRLEN];
+ +    char         pline[STRLEN];
+ +    int          i, n, k, nvsite, ntop, curdir;
+ +    t_vsiteconf *vsiteconflist;
+ +    t_vsitetop  *vsitetoplist;
+ +    char        *ch;
+ +    /* Token buffers are as large as the line buffer, so an unbounded %s
+ +     * conversion of pline can never overflow them. The tokens are truncated
+ +     * to MAXNAME-1 characters when they are stored.
+ +     */
+ +    char         s1[STRLEN], s2[STRLEN], s3[STRLEN], s4[STRLEN];
+ +
+ +    ddb = libopen(ddbname);
+ +
+ +    nvsite        = *nvsiteconf;
+ +    vsiteconflist = *pvsiteconflist;
+ +    ntop          = *nvsitetop;
+ +    vsitetoplist  = *pvsitetoplist;
+ +
+ +    curdir = -1;
+ +
+ +    /* Only start a fresh list when the caller has no entries yet. With
+ +     * entries present the incoming list must be kept, otherwise the count
+ +     * would refer to data that is no longer there.
+ +     */
+ +    if (nvsite == 0)
+ +    {
+ +        snew(vsiteconflist, 1);
+ +    }
+ +    if (ntop == 0)
+ +    {
+ +        snew(vsitetoplist, 1);
+ +    }
+ +
+ +    while (fgets2(pline, STRLEN-2, ddb) != NULL)
+ +    {
+ +        strip_comment(pline);
+ +        trim(pline);
+ +        if (strlen(pline) > 0)
+ +        {
+ +            if (pline[0] == OPENDIR)
+ +            {
+ +                strncpy(dirstr, pline+1, STRLEN-2);
+ +                if ((ch = strchr (dirstr, CLOSEDIR)) != NULL)
+ +                {
+ +                    (*ch) = 0;
+ +                }
+ +                trim (dirstr);
+ +
+ +                /* Map alternative histidine names onto the names in ddb_dirnames */
+ +                if (!gmx_strcasecmp(dirstr, "HID") ||
+ +                    !gmx_strcasecmp(dirstr, "HISD"))
+ +                {
+ +                    sprintf(dirstr, "HISA");
+ +                }
+ +                else if (!gmx_strcasecmp(dirstr, "HIE") ||
+ +                         !gmx_strcasecmp(dirstr, "HISE"))
+ +                {
+ +                    sprintf(dirstr, "HISB");
+ +                }
+ +                else if (!gmx_strcasecmp(dirstr, "HIP"))
+ +                {
+ +                    sprintf(dirstr, "HISH");
+ +                }
+ +
+ +                curdir = ddb_name2dir(dirstr);
+ +                if (curdir < 0)
+ +                {
+ +                    gmx_fatal(FARGS, "Invalid directive %s in vsite database %s",
+ +                              dirstr, ddbname);
+ +                }
+ +            }
+ +            else
+ +            {
+ +                switch (curdir)
+ +                {
+ +                    case -1:
+ +                        gmx_fatal(FARGS, "First entry in vsite database must be a directive.\n");
+ +                        break;
+ +                    case DDB_CH3:
+ +                    case DDB_NH3:
+ +                    case DDB_NH2:
+ +                        n = sscanf(pline, "%s%s%s", s1, s2, s3);
+ +                        /* s2 is only valid when two fields were read; with a
+ +                         * single field it would be uninitialized or left
+ +                         * over from a previous line.
+ +                         */
+ +                        if (n == 2 && !gmx_strcasecmp(s2, "planar"))
+ +                        {
+ +                            srenew(vsiteconflist, nvsite+1);
+ +                            vsite_copy_name(vsiteconflist[nvsite].atomtype, s1);
+ +                            vsiteconflist[nvsite].isplanar         = TRUE;
+ +                            vsiteconflist[nvsite].nextheavytype[0] = 0;
+ +                            vsiteconflist[nvsite].dummymass[0]     = 0;
+ +                            vsiteconflist[nvsite].nhydrogens       = 2;
+ +                            nvsite++;
+ +                        }
+ +                        else if (n == 3)
+ +                        {
+ +                            srenew(vsiteconflist, (nvsite+1));
+ +                            vsite_copy_name(vsiteconflist[nvsite].atomtype, s1);
+ +                            vsiteconflist[nvsite].isplanar = FALSE;
+ +                            vsite_copy_name(vsiteconflist[nvsite].nextheavytype, s2);
+ +                            vsite_copy_name(vsiteconflist[nvsite].dummymass, s3);
+ +                            if (curdir == DDB_NH2)
+ +                            {
+ +                                vsiteconflist[nvsite].nhydrogens = 2;
+ +                            }
+ +                            else
+ +                            {
+ +                                vsiteconflist[nvsite].nhydrogens = 3;
+ +                            }
+ +                            nvsite++;
+ +                        }
+ +                        else
+ +                        {
+ +                            gmx_fatal(FARGS, "Not enough directives in vsite database line: %s\n", pline);
+ +                        }
+ +                        break;
+ +                    case DDB_PHE:
+ +                    case DDB_TYR:
+ +                    case DDB_TRP:
+ +                    case DDB_HISA:
+ +                    case DDB_HISB:
+ +                    case DDB_HISH:
+ +                        i = 0;
+ +                        while ((i < ntop) && gmx_strcasecmp(dirstr, vsitetoplist[i].resname))
+ +                        {
+ +                            i++;
+ +                        }
+ +                        /* Allocate a new topology entry if this is a new residue */
+ +                        if (i == ntop)
+ +                        {
+ +                            srenew(vsitetoplist, ntop+1);
+ +                            ntop++; /* i still points to current vsite topology entry */
+ +                            vsite_copy_name(vsitetoplist[i].resname, dirstr);
+ +                            vsitetoplist[i].nbonds = vsitetoplist[i].nangles = 0;
+ +                            snew(vsitetoplist[i].bond, 1);
+ +                            snew(vsitetoplist[i].angle, 1);
+ +                        }
+ +                        n = sscanf(pline, "%s%s%s%s", s1, s2, s3, s4);
+ +                        if (n == 3)
+ +                        {
+ +                            /* bond */
+ +                            k = vsitetoplist[i].nbonds++;
+ +                            srenew(vsitetoplist[i].bond, k+1);
+ +                            vsite_copy_name(vsitetoplist[i].bond[k].atom1, s1);
+ +                            vsite_copy_name(vsitetoplist[i].bond[k].atom2, s2);
+ +                            vsitetoplist[i].bond[k].value = strtod(s3, NULL);
+ +                        }
+ +                        else if (n == 4)
+ +                        {
+ +                            /* angle */
+ +                            k = vsitetoplist[i].nangles++;
+ +                            srenew(vsitetoplist[i].angle, k+1);
+ +                            vsite_copy_name(vsitetoplist[i].angle[k].atom1, s1);
+ +                            vsite_copy_name(vsitetoplist[i].angle[k].atom2, s2);
+ +                            vsite_copy_name(vsitetoplist[i].angle[k].atom3, s3);
+ +                            vsitetoplist[i].angle[k].value = strtod(s4, NULL);
+ +                        }
+ +                        else
+ +                        {
+ +                            gmx_fatal(FARGS, "Need 3 or 4 values to specify bond/angle values in %s: %s\n", ddbname, pline);
+ +                        }
+ +                        break;
+ +                    default:
+ +                        gmx_fatal(FARGS, "Didnt find a case for directive %s in read_vsite_database\n", dirstr);
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    *pvsiteconflist = vsiteconflist;
+ +    *pvsitetoplist  = vsitetoplist;
+ +    *nvsiteconf     = nvsite;
+ +    *nvsitetop      = ntop;
+ +
+ +    ffclose(ddb);
+ +}
+ +
+ +static int nitrogen_is_planar(t_vsiteconf vsiteconflist[], int nvsiteconf, char atomtype[])
+ +{
+ +    /* Search the database list for the first entry with two hydrogens
+ +     * whose atom type equals atomtype (case-insensitive comparison).
+ +     * Returns 1 when that entry is marked planar, 0 when it is not,
+ +     * and -1 when no such entry exists.
+ +     */
+ +    int idx;
+ +
+ +    for (idx = 0; idx < nvsiteconf; idx++)
+ +    {
+ +        if (!gmx_strcasecmp(vsiteconflist[idx].atomtype, atomtype) &&
+ +            (vsiteconflist[idx].nhydrogens == 2))
+ +        {
+ +            return (vsiteconflist[idx].isplanar == TRUE);
+ +        }
+ +    }
+ +
+ +    return -1;
+ +}
+ +
+ +static char *get_dummymass_name(t_vsiteconf vsiteconflist[], int nvsiteconf, char atom[], char nextheavy[])
+ +{
+ +    /* Find the first database entry whose atom type matches atom and whose
+ +     * next-heavy-atom type matches nextheavy (both compared ignoring case).
+ +     * Returns a pointer to the dummy mass name stored in that entry, or
+ +     * NULL when the database has no such entry.
+ +     */
+ +    int idx;
+ +
+ +    for (idx = 0; idx < nvsiteconf; idx++)
+ +    {
+ +        if (!gmx_strcasecmp(vsiteconflist[idx].atomtype, atom) &&
+ +            !gmx_strcasecmp(vsiteconflist[idx].nextheavytype, nextheavy))
+ +        {
+ +            return vsiteconflist[idx].dummymass;
+ +        }
+ +    }
+ +
+ +    return NULL;
+ +}
+ +
+ +
+ +
+ +static real get_ddb_bond(t_vsitetop *vsitetop, int nvsitetop,
+ +                         const char res[],
+ +                         const char atom1[], const char atom2[])
+ +{
+ +    /* Return the database bond value for the bond atom1-atom2 in residue res.
+ +     * The residue name is compared ignoring case, the atom names are
+ +     * compared exactly, and the two atoms may be given in either order.
+ +     * A missing residue or bond is reported through gmx_fatal().
+ +     */
+ +    int ires, ibond;
+ +
+ +    /* Locate the residue entry */
+ +    for (ires = 0; ires < nvsitetop; ires++)
+ +    {
+ +        if (gmx_strcasecmp(res, vsitetop[ires].resname) == 0)
+ +        {
+ +            break;
+ +        }
+ +    }
+ +    if (ires == nvsitetop)
+ +    {
+ +        gmx_fatal(FARGS, "No vsite information for residue %s found in vsite database.\n", res);
+ +    }
+ +
+ +    /* Locate the bond, accepting both atom orders */
+ +    for (ibond = 0; ibond < vsitetop[ires].nbonds; ibond++)
+ +    {
+ +        const char *db1 = vsitetop[ires].bond[ibond].atom1;
+ +        const char *db2 = vsitetop[ires].bond[ibond].atom2;
+ +
+ +        if ((strcmp(atom1, db1) == 0 && strcmp(atom2, db2) == 0) ||
+ +            (strcmp(atom2, db1) == 0 && strcmp(atom1, db2) == 0))
+ +        {
+ +            break;
+ +        }
+ +    }
+ +    if (ibond == vsitetop[ires].nbonds)
+ +    {
+ +        gmx_fatal(FARGS, "Couldnt find bond %s-%s for residue %s in vsite database.\n", atom1, atom2, res);
+ +    }
+ +
+ +    return vsitetop[ires].bond[ibond].value;
+ +}
+ +
+ +
+ +static real get_ddb_angle(t_vsitetop *vsitetop, int nvsitetop,
+ +                          const char res[], const char atom1[],
+ +                          const char atom2[], const char atom3[])
+ +{
+ +    /* Return the database angle value for atom1-atom2-atom3 in residue res.
+ +     * The residue name is compared ignoring case and the atom names are
+ +     * compared exactly. atom2 must be the middle atom; the outer atoms
+ +     * may be given in either order.
+ +     * A missing residue or angle is reported through gmx_fatal().
+ +     */
+ +    int ires, iangle;
+ +
+ +    /* Locate the residue entry */
+ +    for (ires = 0; ires < nvsitetop; ires++)
+ +    {
+ +        if (gmx_strcasecmp(res, vsitetop[ires].resname) == 0)
+ +        {
+ +            break;
+ +        }
+ +    }
+ +    if (ires == nvsitetop)
+ +    {
+ +        gmx_fatal(FARGS, "No vsite information for residue %s found in vsite database.\n", res);
+ +    }
+ +
+ +    /* Locate the angle, accepting both orders of the outer atoms */
+ +    for (iangle = 0; iangle < vsitetop[ires].nangles; iangle++)
+ +    {
+ +        const char *db1 = vsitetop[ires].angle[iangle].atom1;
+ +        const char *db2 = vsitetop[ires].angle[iangle].atom2;
+ +        const char *db3 = vsitetop[ires].angle[iangle].atom3;
+ +
+ +        if ((strcmp(atom1, db1) == 0 &&
+ +             strcmp(atom2, db2) == 0 &&
+ +             strcmp(atom3, db3) == 0) ||
+ +            (strcmp(atom3, db1) == 0 &&
+ +             strcmp(atom2, db2) == 0 &&
+ +             strcmp(atom1, db3) == 0))
+ +        {
+ +            break;
+ +        }
+ +    }
+ +    if (iangle == vsitetop[ires].nangles)
+ +    {
+ +        gmx_fatal(FARGS, "Couldnt find angle %s-%s-%s for residue %s in vsite database.\n", atom1, atom2, atom3, res);
+ +    }
+ +
+ +    return vsitetop[ires].angle[iangle].value;
+ +}
+ +
+ +
+ +static void count_bonds(int atom, t_params *psb, char ***atomname,
+ +                        int *nrbonds, int *nrHatoms, int Hatoms[], int *Heavy,
+ +                        int *nrheavies, int heavies[])
+ +{
+ +    /* Find the atom bonded to the hydrogen "atom" (the first bond in psb
+ +     * that contains it) and classify everything bonded to that atom.
+ +     *
+ +     * Outputs:
+ +     *   *Heavy      the atom the hydrogen is bonded to
+ +     *   *nrbonds    total number of bonds of that atom
+ +     *   Hatoms[]    its bonded partners for which is_hydrogen() is true,
+ +     *               *nrHatoms of them
+ +     *   heavies[]   its remaining bonded partners, *nrheavies of them
+ +     *
+ +     * The caller has to provide Hatoms and heavies arrays that are large
+ +     * enough; no bounds are checked here.
+ +     */
+ +    int ib, center, partner;
+ +    int nbonded, nhyd, nheavy;
+ +
+ +    /* Step 1: the partner of the hydrogen in the first bond that holds it */
+ +    center = NOTSET;
+ +    for (ib = 0; ib < psb->nr; ib++)
+ +    {
+ +        if (psb->param[ib].AI == atom)
+ +        {
+ +            center = psb->param[ib].AJ;
+ +        }
+ +        else if (psb->param[ib].AJ == atom)
+ +        {
+ +            center = psb->param[ib].AI;
+ +        }
+ +        if (center != NOTSET)
+ +        {
+ +            break;
+ +        }
+ +    }
+ +    if (center == NOTSET)
+ +    {
+ +        gmx_fatal(FARGS, "unbound hydrogen atom %d", atom+1);
+ +    }
+ +
+ +    /* Step 2: walk over all bonds and sort the partners of that atom */
+ +    nbonded = 0;
+ +    nhyd    = 0;
+ +    nheavy  = 0;
+ +    for (ib = 0; ib < psb->nr; ib++)
+ +    {
+ +        partner = NOTSET;
+ +        if (psb->param[ib].AI == center)
+ +        {
+ +            partner = psb->param[ib].AJ;
+ +        }
+ +        else if (psb->param[ib].AJ == center)
+ +        {
+ +            partner = psb->param[ib].AI;
+ +        }
+ +        if (partner == NOTSET)
+ +        {
+ +            /* This bond does not involve the central atom */
+ +            continue;
+ +        }
+ +
+ +        nbonded++;
+ +        if (is_hydrogen(*(atomname[partner])))
+ +        {
+ +            Hatoms[nhyd] = partner;
+ +            nhyd++;
+ +        }
+ +        else
+ +        {
+ +            heavies[nheavy] = partner;
+ +            nheavy++;
+ +        }
+ +    }
+ +
+ +    *Heavy     = center;
+ +    *nrbonds   = nbonded;
+ +    *nrHatoms  = nhyd;
+ +    *nrheavies = nheavy;
+ +}
+ +
+ +static void print_bonds(FILE *fp, int o2n[],
+ +                        int nrHatoms, int Hatoms[], int Heavy,
+ +                        int nrheavies, int heavies[])
+ +{
+ +    /* Write a one-line summary of a hydrogen group to fp: the hydrogens,
+ +     * then the central heavy atom followed by the other heavy atoms.
+ +     * Every atom index is mapped through o2n and printed one-based.
+ +     */
+ +    int k;
+ +
+ +    fprintf(fp, "Found: %d Hatoms: ", nrHatoms);
+ +    k = 0;
+ +    while (k < nrHatoms)
+ +    {
+ +        fprintf(fp, " %d", o2n[Hatoms[k]]+1);
+ +        k++;
+ +    }
+ +
+ +    /* The heavy-atom count includes the central atom itself */
+ +    fprintf(fp, "; %d Heavy atoms: %d", nrheavies+1, o2n[Heavy]+1);
+ +    k = 0;
+ +    while (k < nrheavies)
+ +    {
+ +        fprintf(fp, " %d", o2n[heavies[k]]+1);
+ +        k++;
+ +    }
+ +
+ +    fputs("\n", fp);
+ +}
+ +
+ +static int get_atype(int atom, t_atoms *at, int nrtp, t_restp rtp[],
+ +                     gmx_residuetype_t rt)
+ +{
+ +    /* Return the atom type of "atom".
+ +     * When the atom has a non-zero mass its own type field is used.
+ +     * Otherwise the type is taken from the residue database entry of the
+ +     * residue the atom belongs to.
+ +     */
+ +    char    *resname;
+ +    gmx_bool bNterm;
+ +    int      jtype;
+ +    t_restp *rtpp;
+ +
+ +    if (at->atom[atom].m)
+ +    {
+ +        return at->atom[atom].type;
+ +    }
+ +
+ +    /* get type from rtp */
+ +    resname = *(at->resinfo[at->atom[atom].resind].name);
+ +    rtpp    = get_restp(resname, nrtp, rtp);
+ +    /* The flag passed to search_jtype: a protein residue with residue index 0 */
+ +    bNterm = gmx_residuetype_is_protein(rt, resname) &&
+ +        (at->atom[atom].resind == 0);
+ +    jtype = search_jtype(rtpp, *(at->atomname[atom]), bNterm);
+ +
+ +    return rtpp->atom[jtype].type;
+ +}
+ +
+ +static int vsite_nm2type(const char *name, gpp_atomtype_t atype)
+ +{
+ +    /* Translate a dummy mass type name into its atom type number.
+ +     * An unknown name is reported through gmx_fatal().
+ +     */
+ +    int tp = get_atomtype_type(name, atype);
+ +
+ +    if (tp != NOTSET)
+ +    {
+ +        return tp;
+ +    }
+ +
+ +    gmx_fatal(FARGS, "Dummy mass type (%s) not found in atom type database",
+ +              name);
+ +
+ +    return tp;
+ +}
+ +
+ +static real get_amass(int atom, t_atoms *at, int nrtp, t_restp rtp[],
+ +                      gmx_residuetype_t rt)
+ +{
+ +    /* Return the mass of "atom".
+ +     * A non-zero mass stored on the atom itself is returned directly.
+ +     * Otherwise the mass is taken from the residue database entry of the
+ +     * residue the atom belongs to.
+ +     */
+ +    char    *resname;
+ +    gmx_bool bNterm;
+ +    int      jtype;
+ +    t_restp *rtpp;
+ +
+ +    if (at->atom[atom].m)
+ +    {
+ +        return at->atom[atom].m;
+ +    }
+ +
+ +    /* get mass from rtp */
+ +    resname = *(at->resinfo[at->atom[atom].resind].name);
+ +    rtpp    = get_restp(resname, nrtp, rtp);
+ +    /* The flag passed to search_jtype: a protein residue with residue index 0 */
+ +    bNterm = gmx_residuetype_is_protein(rt, resname) &&
+ +        (at->atom[atom].resind == 0);
+ +    jtype = search_jtype(rtpp, *(at->atomname[atom]), bNterm);
+ +
+ +    return rtpp->atom[jtype].m;
+ +}
+ +
+ +static void my_add_param(t_params *plist, int ai, int aj, real b)
+ +{
+ +    /* Append a two-atom parameter entry for ai-aj to plist, with b as the
+ +     * first force parameter. The parameter array is static: its first slot
+ +     * is overwritten on every call and the next five slots stay NOTSET.
+ +     */
+ +    static real forceparams[MAXFORCEPARAM] =
+ +    { NOTSET, NOTSET, NOTSET, NOTSET, NOTSET, NOTSET };
+ +
+ +    forceparams[0] = b;
+ +    add_param(plist, ai, aj, forceparams, NULL);
+ +}
+ +
+ +static void add_vsites(t_params plist[], int vsite_type[],
+ +                       int Heavy, int nrHatoms, int Hatoms[],
+ +                       int nrheavies, int heavies[])
+ +{
+ +    /* Add the construction (or constraint) for every hydrogen in Hatoms.
+ +     *
+ +     * vsite_type[] holds the interaction type chosen for each atom; a
+ +     * negative value requests swapped parity. The entry of every processed
+ +     * hydrogen is replaced by its absolute value. Heavy is the atom the
+ +     * hydrogens are bonded to and heavies[] its other heavy neighbours.
+ +     */
+ +    int      ih, ib, hatom, ftype, partner, third;
+ +    gmx_bool bSwapParity;
+ +
+ +    for (ih = 0; ih < nrHatoms; ih++)
+ +    {
+ +        hatom = Hatoms[ih];
+ +        ftype = vsite_type[hatom];
+ +        /* A wrong vsite_type should really have been caught earlier,
+ +         * as no useful error message can be given at this point.
+ +         * Still, a message is better than a segfault.
+ +         */
+ +        if (ftype == NOTSET)
+ +        {
+ +            gmx_incons("Undetected error in setting up virtual sites");
+ +        }
+ +        /* The sign carries the parity, the magnitude the function type */
+ +        bSwapParity       = (ftype < 0);
+ +        ftype             = abs(ftype);
+ +        vsite_type[hatom] = ftype;
+ +
+ +        switch (ftype)
+ +        {
+ +            case F_BONDS:
+ +                /* NOTE(review): this only fails when both counts differ
+ +                 * from 1 - confirm that '&&' rather than '||' is intended.
+ +                 */
+ +                if ( (nrheavies != 1) && (nrHatoms != 1) )
+ +                {
+ +                    gmx_fatal(FARGS, "cannot make constraint in add_vsites for %d heavy "
+ +                              "atoms and %d hydrogen atoms", nrheavies, nrHatoms);
+ +                }
+ +                my_add_param(&(plist[F_CONSTRNC]), hatom, heavies[0], NOTSET);
+ +                break;
+ +
+ +            case F_VSITE3:
+ +            case F_VSITE3FD:
+ +            case F_VSITE3OUT:
+ +                if (nrheavies < 2)
+ +                {
+ +                    gmx_fatal(FARGS, "Not enough heavy atoms (%d) for %s (min 3)",
+ +                              nrheavies+1,
+ +                              interaction_function[ftype].name);
+ +                }
+ +                add_vsite3_atoms(&plist[ftype], hatom, Heavy, heavies[0], heavies[1],
+ +                                 bSwapParity);
+ +                break;
+ +
+ +            case F_VSITE3FAD:
+ +                if (nrheavies > 1)
+ +                {
+ +                    third = heavies[1];
+ +                }
+ +                else
+ +                {
+ +                    /* Look one bond further: take the first bonded partner
+ +                     * of heavies[0] that is not Heavy itself.
+ +                     */
+ +                    partner = third = NOTSET;
+ +                    for (ib = 0; (ib < plist[F_BONDS].nr) && (third == NOTSET); ib++)
+ +                    {
+ +                        if (plist[F_BONDS].param[ib].AI == heavies[0])
+ +                        {
+ +                            partner = plist[F_BONDS].param[ib].AJ;
+ +                        }
+ +                        else if (plist[F_BONDS].param[ib].AJ == heavies[0])
+ +                        {
+ +                            partner = plist[F_BONDS].param[ib].AI;
+ +                        }
+ +                        if ( (partner != NOTSET) && (partner != Heavy) )
+ +                        {
+ +                            third = partner;
+ +                        }
+ +                    }
+ +                    if (third == NOTSET)
+ +                    {
+ +                        gmx_fatal(FARGS, "Unbound molecule part %d-%d", Heavy+1, Hatoms[0]+1);
+ +                    }
+ +                }
+ +                add_vsite3_atoms(&plist[ftype], hatom, Heavy, heavies[0], third,
+ +                                 bSwapParity);
+ +                break;
+ +
+ +            case F_VSITE4FD:
+ +            case F_VSITE4FDN:
+ +                if (nrheavies < 3)
+ +                {
+ +                    gmx_fatal(FARGS, "Not enough heavy atoms (%d) for %s (min 4)",
+ +                              nrheavies+1,
+ +                              interaction_function[ftype].name);
+ +                }
+ +                /* NOTE(review): the first hydrogen is used here, not the
+ +                 * current one - confirm this is intended.
+ +                 */
+ +                add_vsite4_atoms(&plist[ftype],
+ +                                 Hatoms[0], Heavy, heavies[0], heavies[1], heavies[2]);
+ +                break;
+ +
+ +            default:
+ +                gmx_fatal(FARGS, "can't use add_vsites for interaction function %s",
+ +                          interaction_function[ftype].name);
+ +        } /* switch ftype */
+ +    }     /* for ih */
+ +}
+ +
+ +/* Interior angle of a regular six-membered ring, in radians */
+ +#define ANGLE_6RING (DEG2RAD*120)
+ +
+ +/* cosine rule: a^2 = b^2 + c^2 - 2 b c cos(alpha) */
+ +/* get a^2 when b, c and alpha are given.
+ + * The arguments are parenthesized in the product so that compound
+ + * expressions (e.g. x+y) can be passed safely; each argument is still
+ + * expanded more than once, so it must not have side effects.
+ + */
+ +#define cosrule(b, c, alpha) ( sqr(b) + sqr(c) - 2*(b)*(c)*cos(alpha) )
+ +/* get cos(alpha) when a, b and c are given (same argument rules): */
+ +#define acosrule(a, b, c) ( (sqr(b)+sqr(c)-sqr(a))/(2*(b)*(c)) )
+ +
+ +/* Convert a planar six-membered aromatic ring to virtual sites.
+ + *
+ + * CG, CE1 and CE2 keep mass and are tied together by three constraints;
+ + * every other ring atom handled here is turned into a 3-atom virtual site.
+ + *
+ + * at          atom data; m and mB of the ring atoms are rewritten
+ + * vsite_type  per-atom construction type; virtualized atoms get F_VSITE3
+ + * plist       parameter lists; F_CONSTRNC and F_VSITE3 entries are appended
+ + * nrfound     number of ring atoms found by the caller (checked only if bDoZ)
+ + * ats         atom indices, ordered as in the enum below
+ + * bond_cc     ring C-C bond length
+ + * bond_ch     C-H bond length
+ + * xcom        x coordinate of the ring center of mass in the local frame
+ + *             described further down
+ + * bDoZ        if set, CZ and HZ are virtualized here as well; otherwise
+ + *             neither gets a construction from this routine
+ + *
+ + * Returns the number of atoms that were turned into virtual sites.
+ + */
+ +static int gen_vsites_6ring(t_atoms *at, int *vsite_type[], t_params plist[],
+ +                            int nrfound, int *ats, real bond_cc, real bond_ch,
+ +                            real xcom, gmx_bool bDoZ)
+ +{
+ +    /* these MUST correspond to the atnms array in do_vsite_aromatics! */
+ +    enum {
+ +        atCG, atCD1, atHD1, atCD2, atHD2, atCE1, atHE1, atCE2, atHE2,
+ +        atCZ, atHZ, atNR
+ +    };
+ +
+ +    int  i, nvsite;
+ +    real a, b, dCGCE, tmp1, tmp2, mtot, mG, mrest;
+ +    real xCG;
+ +    /* CG, CE1 and CE2 stay and each get a part of the total mass,
+ +     * so the c-o-m stays the same.
+ +     */
+ +
+ +    if (bDoZ)
+ +    {
+ +        if (atNR != nrfound)
+ +        {
+ +            gmx_incons("Generating vsites on 6-rings");
+ +        }
+ +    }
+ +
+ +    /* constraints between CG, CE1 and CE2: */
+ +    dCGCE = sqrt( cosrule(bond_cc, bond_cc, ANGLE_6RING) );
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atCE1], dCGCE);
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atCE2], dCGCE);
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCE1], ats[atCE2], dCGCE);
+ +
+ +    /* rest will be vsite3 */
+ +    mtot   = 0;
+ +    nvsite = 0;
+ +    /* Without bDoZ the loop stops before HZ, so CZ still contributes its
+ +     * mass to the total but is not virtualized here.
+ +     */
+ +    for (i = 0; i <  (bDoZ ? atNR : atHZ); i++)
+ +    {
+ +        mtot += at->atom[ats[i]].m;
+ +        if (i != atCG && i != atCE1 && i != atCE2 && (bDoZ || (i != atHZ && i != atCZ) ) )
+ +        {
+ +            at->atom[ats[i]].m    = at->atom[ats[i]].mB = 0;
+ +            (*vsite_type)[ats[i]] = F_VSITE3;
+ +            nvsite++;
+ +        }
+ +    }
+ +    /* Distribute mass so center-of-mass stays the same.
+ +     * The center-of-mass in the call is defined with x=0 at
+ +     * the CE1-CE2 bond and y=0 at the line from CG to the middle of CE1-CE2 bond.
+ +     * Only the x coordinate of CG is needed: CE1 and CE2 both sit at x=0
+ +     * and share the remaining mass equally.
+ +     */
+ +    xCG  = -bond_cc+bond_cc*cos(ANGLE_6RING);
+ +
+ +    mG                             = at->atom[ats[atCG]].m = at->atom[ats[atCG]].mB = xcom*mtot/xCG;
+ +    mrest                          = mtot-mG;
+ +    at->atom[ats[atCE1]].m         = at->atom[ats[atCE1]].mB =
+ +            at->atom[ats[atCE2]].m = at->atom[ats[atCE2]].mB = mrest / 2;
+ +
+ +    /* vsite3 construction: r_d = r_i + a r_ij + b r_ik */
+ +    tmp1  = dCGCE*sin(ANGLE_6RING*0.5);
+ +    tmp2  = bond_cc*cos(0.5*ANGLE_6RING) + tmp1;
+ +    tmp1 *= 2;
+ +    a     = b = -bond_ch / tmp1;
+ +    /* HE1 and HE2: */
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atHE1], ats[atCE1], ats[atCE2], ats[atCG], a, b);
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atHE2], ats[atCE2], ats[atCE1], ats[atCG], a, b);
+ +    /* CD1, CD2 and CZ: */
+ +    a = b = tmp2 / tmp1;
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atCD1], ats[atCE2], ats[atCE1], ats[atCG], a, b);
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atCD2], ats[atCE1], ats[atCE2], ats[atCG], a, b);
+ +    if (bDoZ)
+ +    {
+ +        add_vsite3_param(&plist[F_VSITE3],
+ +                         ats[atCZ], ats[atCG], ats[atCE1], ats[atCE2], a, b);
+ +    }
+ +    /* HD1, HD2 and HZ: */
+ +    a = b = ( bond_ch + tmp2 ) / tmp1;
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atHD1], ats[atCE2], ats[atCE1], ats[atCG], a, b);
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atHD2], ats[atCE1], ats[atCE2], ats[atCG], a, b);
+ +    if (bDoZ)
+ +    {
+ +        add_vsite3_param(&plist[F_VSITE3],
+ +                         ats[atHZ], ats[atCG], ats[atCE1], ats[atCE2], a, b);
+ +    }
+ +
+ +    return nvsite;
+ +}
+ +
+ +/* Virtual-site setup for a PHE ring.
+ + *
+ + * Looks up the C-C and C-H bond lengths in the vsite database, lays the
+ + * eleven ring atoms out in a local planar frame, computes the x coordinate
+ + * of their center of mass from the current atom masses and hands the rest
+ + * of the work to gen_vsites_6ring (with CZ and HZ included).
+ + *
+ + * Returns whatever gen_vsites_6ring returns.
+ + */
+ +static int gen_vsites_phe(t_atoms *at, int *vsite_type[], t_params plist[],
+ +                          int nrfound, int *ats, t_vsitetop *vsitetop, int nvsitetop)
+ +{
+ +    /* these MUST correspond to the atnms array in do_vsite_aromatics! */
+ +    enum {
+ +        atCG, atCD1, atHD1, atCD2, atHD2, atCE1, atHE1, atCE2, atHE2,
+ +        atCZ, atHZ, atNR
+ +    };
+ +    real ringx[atNR], ringy[atNR];
+ +    real cc, ch;
+ +    real comx, msum;
+ +    int  k;
+ +
+ +    /* The ring is treated as six-fold symmetric, so one C-C and one C-H
+ +     * bond length are enough; all angles are taken as ANGLE_6RING.
+ +     */
+ +    cc = get_ddb_bond(vsitetop, nvsitetop, "PHE", "CD1", "CE1");
+ +    ch = get_ddb_bond(vsitetop, nvsitetop, "PHE", "CD1", "HD1");
+ +
+ +    /* atoms on the ring axis (y=0) */
+ +    ringx[atCG]  = -cc+cc*cos(ANGLE_6RING);
+ +    ringy[atCG]  = 0;
+ +    ringx[atCZ]  = cc*cos(0.5*ANGLE_6RING);
+ +    ringy[atCZ]  = 0;
+ +    ringx[atHZ]  = ringx[atCZ]+ch;
+ +    ringy[atHZ]  = 0;
+ +
+ +    /* upper half of the ring (positive y) */
+ +    ringx[atCD1] = -cc;
+ +    ringy[atCD1] = cc*sin(0.5*ANGLE_6RING);
+ +    ringx[atHD1] = ringx[atCD1]+ch*cos(ANGLE_6RING);
+ +    ringy[atHD1] = ringy[atCD1]+ch*sin(ANGLE_6RING);
+ +    ringx[atCE1] = 0;
+ +    ringy[atCE1] = ringy[atCD1];
+ +    ringx[atHE1] = ringx[atCE1]-ch*cos(ANGLE_6RING);
+ +    ringy[atHE1] = ringy[atCE1]+ch*sin(ANGLE_6RING);
+ +
+ +    /* lower half is the mirror image in the x-axis */
+ +    ringx[atCD2] = ringx[atCD1];
+ +    ringy[atCD2] = -ringy[atCD1];
+ +    ringx[atHD2] = ringx[atHD1];
+ +    ringy[atHD2] = -ringy[atHD1];
+ +    ringx[atCE2] = ringx[atCE1];
+ +    ringy[atCE2] = -ringy[atCE1];
+ +    ringx[atHE2] = ringx[atHE1];
+ +    ringy[atHE2] = -ringy[atHE1];
+ +
+ +    /* mass-weighted mean of x over all ring atoms */
+ +    msum = 0;
+ +    comx = 0;
+ +    for (k = 0; k < atNR; k++)
+ +    {
+ +        comx += ringx[k]*at->atom[ats[k]].m;
+ +        msum += at->atom[ats[k]].m;
+ +    }
+ +    comx /= msum;
+ +
+ +    return gen_vsites_6ring(at, vsite_type, plist, nrfound, ats, cc, ch, comx, TRUE);
+ +}
+ +
+ +/* Compute the vsite3 weights for a site with known in-plane coordinates.
+ + *
+ + * Solves the 2x2 linear system
+ + *     (xd, yd) = (xi, yi) + a * ((xj, yj) - (xi, yi)) + b * ((xk, yk) - (xi, yi))
+ + * for a and b, which are returned through the pointer arguments.
+ + *
+ + * Both unknowns are obtained with Cramer's rule, so the only requirement is
+ + * that i, j and k are not collinear (non-zero determinant); atoms i and k
+ + * sharing the same y coordinate is fine.
+ + */
+ +static void calc_vsite3_param(real xd, real yd, real xi, real yi, real xj, real yj,
+ +                              real xk, real yk, real *a, real *b)
+ +{
+ +    /* determine parameters by solving the equation system, since we know the
+ +     * virtual site coordinates here.
+ +     */
+ +    real dx_ij, dx_ik, dy_ij, dy_ik;
+ +    real dx_id, dy_id, det;
+ +
+ +    dx_ij = xj-xi;
+ +    dy_ij = yj-yi;
+ +    dx_ik = xk-xi;
+ +    dy_ik = yk-yi;
+ +    dx_id = xd-xi;
+ +    dy_id = yd-yi;
+ +
+ +    /* determinant of the matrix with columns r_ij and r_ik */
+ +    det = dx_ij*dy_ik - dx_ik*dy_ij;
+ +
+ +    *a = ( dx_id*dy_ik - dx_ik*dy_id ) / det;
+ +    *b = ( dx_ij*dy_id - dx_id*dy_ij ) / det;
+ +}
+ +
+ +
+ +static int gen_vsites_trp(gpp_atomtype_t atype, rvec *newx[],
+ +                          t_atom *newatom[], char ***newatomname[],
+ +                          int *o2n[], int *newvsite_type[], int *newcgnr[],
+ +                          t_symtab *symtab, int *nadd, rvec x[], int *cgnr[],
+ +                          t_atoms *at, int *vsite_type[], t_params plist[],
+ +                          int nrfound, int *ats, int add_shift,
+ +                          t_vsitetop *vsitetop, int nvsitetop)
+ +{
+ +#define NMASS 2
+ +    /* these MUST correspond to the atnms array in do_vsite_aromatics! */
+ +    enum {
+ +        atCB,  atCG,  atCD1, atHD1, atCD2, atNE1, atHE1, atCE2, atCE3, atHE3,
+ +        atCZ2, atHZ2, atCZ3, atHZ3, atCH2, atHH2, atNR
+ +    };
+ +    /* weights for determining the COM's of both rings (M1 and M2): */
+ +    real mw[NMASS][atNR] = {
+ +        {   0,     1,     1,     1,   0.5,     1,     1,   0.5,     0,     0,
+ +            0,     0,     0,     0,     0,     0 },
+ +        {   0,     0,     0,     0,   0.5,     0,     0,   0.5,     1,     1,
+ +            1,     1,     1,     1,     1,     1 }
+ +    };
+ +
+ +    real xi[atNR], yi[atNR];
+ +    real xcom[NMASS], ycom[NMASS], I, alpha;
+ +    real lineA, lineB, dist;
+ +    real b_CD2_CE2, b_NE1_CE2, b_CG_CD2, b_CH2_HH2, b_CE2_CZ2;
+ +    real b_NE1_HE1, b_CD2_CE3, b_CE3_CZ3, b_CB_CG;
+ +    real b_CZ2_CH2, b_CZ2_HZ2, b_CD1_HD1, b_CE3_HE3;
+ +    real b_CG_CD1, b_CZ3_HZ3;
+ +    real a_NE1_CE2_CD2, a_CE2_CD2_CG, a_CB_CG_CD2, a_CE2_CD2_CE3;
+ +    real a_CB_CG_CD1, a_CD2_CG_CD1, a_CE2_CZ2_HZ2, a_CZ2_CH2_HH2;
+ +    real a_CD2_CE2_CZ2, a_CD2_CE3_CZ3, a_CE3_CZ3_HZ3, a_CG_CD1_HD1;
+ +    real a_CE2_CZ2_CH2, a_HE1_NE1_CE2, a_CD2_CE3_HE3;
+ +    real xM[NMASS];
+ +    int  atM[NMASS], tpM, i, i0, j, nvsite;
+ +    real mwtot, mtot, mM[NMASS], dCBM1, dCBM2, dM1M2;
+ +    real a, b, c[MAXFORCEPARAM];
+ +    rvec r_ij, r_ik, t1, t2;
+ +    char name[10];
+ +
+ +    if (atNR != nrfound)
+ +    {
+ +        gmx_incons("atom types in gen_vsites_trp");
+ +    }
+ +    /* Get geometry from database */
+ +    b_CD2_CE2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CD2", "CE2");
+ +    b_NE1_CE2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "NE1", "CE2");
+ +    b_CG_CD1  = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CG", "CD1");
+ +    b_CG_CD2  = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CG", "CD2");
+ +    b_CB_CG   = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CB", "CG");
+ +    b_CE2_CZ2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CE2", "CZ2");
+ +    b_CD2_CE3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CD2", "CE3");
+ +    b_CE3_CZ3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CE3", "CZ3");
+ +    b_CZ2_CH2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CZ2", "CH2");
+ +
+ +    b_CD1_HD1 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CD1", "HD1");
+ +    b_CZ2_HZ2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CZ2", "HZ2");
+ +    b_NE1_HE1 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "NE1", "HE1");
+ +    b_CH2_HH2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CH2", "HH2");
+ +    b_CE3_HE3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CE3", "HE3");
+ +    b_CZ3_HZ3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CZ3", "HZ3");
+ +
+ +    a_NE1_CE2_CD2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "NE1", "CE2", "CD2");
+ +    a_CE2_CD2_CG  = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CD2", "CG");
+ +    a_CB_CG_CD2   = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CB", "CG", "CD2");
+ +    a_CD2_CG_CD1  = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CG", "CD1");
+ +    a_CB_CG_CD1   = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CB", "CG", "CD1");
+ +
+ +    a_CE2_CD2_CE3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CD2", "CE3");
+ +    a_CD2_CE2_CZ2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CE2", "CZ2");
+ +    a_CD2_CE3_CZ3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CE3", "CZ3");
+ +    a_CE3_CZ3_HZ3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE3", "CZ3", "HZ3");
+ +    a_CZ2_CH2_HH2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CZ2", "CH2", "HH2");
+ +    a_CE2_CZ2_HZ2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CZ2", "HZ2");
+ +    a_CE2_CZ2_CH2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CZ2", "CH2");
+ +    a_CG_CD1_HD1  = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CG", "CD1", "HD1");
+ +    a_HE1_NE1_CE2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "HE1", "NE1", "CE2");
+ +    a_CD2_CE3_HE3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CE3", "HE3");
+ +
+ +    /* Calculate local coordinates.
+ +     * y-axis (x=0) is the bond CD2-CE2.
+ +     * x-axis (y=0) is perpendicular to the bond CD2-CE2 and
+ +     * intersects the middle of the bond.
+ +     */
+ +    xi[atCD2] = 0;
+ +    yi[atCD2] = -0.5*b_CD2_CE2;
+ +
+ +    xi[atCE2] = 0;
+ +    yi[atCE2] = 0.5*b_CD2_CE2;
+ +
+ +    xi[atNE1] = -b_NE1_CE2*sin(a_NE1_CE2_CD2);
+ +    yi[atNE1] = yi[atCE2]-b_NE1_CE2*cos(a_NE1_CE2_CD2);
+ +
+ +    xi[atCG] = -b_CG_CD2*sin(a_CE2_CD2_CG);
+ +    yi[atCG] = yi[atCD2]+b_CG_CD2*cos(a_CE2_CD2_CG);
+ +
+ +    alpha    = a_CE2_CD2_CG + M_PI - a_CB_CG_CD2;
+ +    xi[atCB] = xi[atCG]-b_CB_CG*sin(alpha);
+ +    yi[atCB] = yi[atCG]+b_CB_CG*cos(alpha);
+ +
+ +    alpha     = a_CE2_CD2_CG + a_CD2_CG_CD1 - M_PI;
+ +    xi[atCD1] = xi[atCG]-b_CG_CD1*sin(alpha);
+ +    yi[atCD1] = yi[atCG]+b_CG_CD1*cos(alpha);
+ +
+ +    xi[atCE3] = b_CD2_CE3*sin(a_CE2_CD2_CE3);
+ +    yi[atCE3] = yi[atCD2]+b_CD2_CE3*cos(a_CE2_CD2_CE3);
+ +
+ +    xi[atCZ2] = b_CE2_CZ2*sin(a_CD2_CE2_CZ2);
+ +    yi[atCZ2] = yi[atCE2]-b_CE2_CZ2*cos(a_CD2_CE2_CZ2);
+ +
+ +    alpha     = a_CE2_CD2_CE3 + a_CD2_CE3_CZ3 - M_PI;
+ +    xi[atCZ3] = xi[atCE3]+b_CE3_CZ3*sin(alpha);
+ +    yi[atCZ3] = yi[atCE3]+b_CE3_CZ3*cos(alpha);
+ +
+ +    alpha     = a_CD2_CE2_CZ2 + a_CE2_CZ2_CH2 - M_PI;
+ +    xi[atCH2] = xi[atCZ2]+b_CZ2_CH2*sin(alpha);
+ +    yi[atCH2] = yi[atCZ2]-b_CZ2_CH2*cos(alpha);
+ +
+ +    /* hydrogens */
+ +    alpha     = a_CE2_CD2_CG + a_CD2_CG_CD1 - a_CG_CD1_HD1;
+ +    xi[atHD1] = xi[atCD1]-b_CD1_HD1*sin(alpha);
+ +    yi[atHD1] = yi[atCD1]+b_CD1_HD1*cos(alpha);
+ +
+ +    alpha     = a_NE1_CE2_CD2 + M_PI - a_HE1_NE1_CE2;
+ +    xi[atHE1] = xi[atNE1]-b_NE1_HE1*sin(alpha);
+ +    yi[atHE1] = yi[atNE1]-b_NE1_HE1*cos(alpha);
+ +
+ +    alpha     = a_CE2_CD2_CE3 + M_PI - a_CD2_CE3_HE3;
+ +    xi[atHE3] = xi[atCE3]+b_CE3_HE3*sin(alpha);
+ +    yi[atHE3] = yi[atCE3]+b_CE3_HE3*cos(alpha);
+ +
+ +    alpha     = a_CD2_CE2_CZ2 + M_PI - a_CE2_CZ2_HZ2;
+ +    xi[atHZ2] = xi[atCZ2]+b_CZ2_HZ2*sin(alpha);
+ +    yi[atHZ2] = yi[atCZ2]-b_CZ2_HZ2*cos(alpha);
+ +
+ +    alpha     = a_CD2_CE2_CZ2 + a_CE2_CZ2_CH2 - a_CZ2_CH2_HH2;
+ +    xi[atHZ3] = xi[atCZ3]+b_CZ3_HZ3*sin(alpha);
+ +    yi[atHZ3] = yi[atCZ3]+b_CZ3_HZ3*cos(alpha);
+ +
+ +    alpha     = a_CE2_CD2_CE3 + a_CD2_CE3_CZ3 - a_CE3_CZ3_HZ3;
+ +    xi[atHH2] = xi[atCH2]+b_CH2_HH2*sin(alpha);
+ +    yi[atHH2] = yi[atCH2]-b_CH2_HH2*cos(alpha);
+ +
+ +    /* Determine coeff. for the line CB-CG */
+ +    lineA = (yi[atCB]-yi[atCG])/(xi[atCB]-xi[atCG]);
+ +    lineB = yi[atCG]-lineA*xi[atCG];
+ +
+ +    /* Calculate masses for each ring and put it on the dummy masses */
+ +    for (j = 0; j < NMASS; j++)
+ +    {
+ +        mM[j] = xcom[j] = ycom[j] = 0;
+ +    }
+ +    for (i = 0; i < atNR; i++)
+ +    {
+ +        if (i != atCB)
+ +        {
+ +            for (j = 0; j < NMASS; j++)
+ +            {
+ +                mM[j]   += mw[j][i] * at->atom[ats[i]].m;
+ +                xcom[j] += xi[i] * mw[j][i] * at->atom[ats[i]].m;
+ +                ycom[j] += yi[i] * mw[j][i] * at->atom[ats[i]].m;
+ +            }
+ +        }
+ +    }
+ +    for (j = 0; j < NMASS; j++)
+ +    {
+ +        xcom[j] /= mM[j];
+ +        ycom[j] /= mM[j];
+ +    }
+ +
+ +    /* get dummy mass type */
+ +    tpM = vsite_nm2type("MW", atype);
+ +    /* make space for 2 masses: shift all atoms starting with CB */
+ +    i0 = ats[atCB];
+ +    for (j = 0; j < NMASS; j++)
+ +    {
+ +        atM[j] = i0+*nadd+j;
+ +    }
+ +    if (debug)
+ +    {
+ +        fprintf(stderr, "Inserting %d dummy masses at %d\n", NMASS, (*o2n)[i0]+1);
+ +    }
+ +    *nadd += NMASS;
+ +    for (j = i0; j < at->nr; j++)
+ +    {
+ +        (*o2n)[j] = j+*nadd;
+ +    }
+ +    srenew(*newx, at->nr+*nadd);
+ +    srenew(*newatom, at->nr+*nadd);
+ +    srenew(*newatomname, at->nr+*nadd);
+ +    srenew(*newvsite_type, at->nr+*nadd);
+ +    srenew(*newcgnr, at->nr+*nadd);
+ +    for (j = 0; j < NMASS; j++)
+ +    {
+ +        (*newatomname)[at->nr+*nadd-1-j] = NULL;
+ +    }
+ +
+ +    /* Dummy masses will be placed at the center-of-mass in each ring. */
+ +
+ +    /* calc initial position for dummy masses in real (non-local) coordinates.
+ +     * Cheat by using the routine to calculate virtual site parameters. It is
+ +     * much easier when we have the coordinates expressed in terms of
+ +     * CB, CG, CD2.
+ +     */
+ +    rvec_sub(x[ats[atCB]], x[ats[atCG]], r_ij);
+ +    rvec_sub(x[ats[atCD2]], x[ats[atCG]], r_ik);
+ +    calc_vsite3_param(xcom[0], ycom[0], xi[atCG], yi[atCG], xi[atCB], yi[atCB],
+ +                      xi[atCD2], yi[atCD2], &a, &b);
+ +    svmul(a, r_ij, t1);
+ +    svmul(b, r_ik, t2);
+ +    rvec_add(t1, t2, t1);
+ +    rvec_add(t1, x[ats[atCG]], (*newx)[atM[0]]);
+ +
+ +    calc_vsite3_param(xcom[1], ycom[1], xi[atCG], yi[atCG], xi[atCB], yi[atCB],
+ +                      xi[atCD2], yi[atCD2], &a, &b);
+ +    svmul(a, r_ij, t1);
+ +    svmul(b, r_ik, t2);
+ +    rvec_add(t1, t2, t1);
+ +    rvec_add(t1, x[ats[atCG]], (*newx)[atM[1]]);
+ +
+ +    /* set parameters for the masses */
+ +    for (j = 0; j < NMASS; j++)
+ +    {
+ +        sprintf(name, "MW%d", j+1);
+ +        (*newatomname)  [atM[j]]        = put_symtab(symtab, name);
+ +        (*newatom)      [atM[j]].m      = (*newatom)[atM[j]].mB    = mM[j];
+ +        (*newatom)      [atM[j]].q      = (*newatom)[atM[j]].qB    = 0.0;
+ +        (*newatom)      [atM[j]].type   = (*newatom)[atM[j]].typeB = tpM;
+ +        (*newatom)      [atM[j]].ptype  = eptAtom;
+ +        (*newatom)      [atM[j]].resind = at->atom[i0].resind;
++        (*newatom)      [atM[j]].elem[0] = 'M';
++        (*newatom)      [atM[j]].elem[1] = '\0';
+ +        (*newvsite_type)[atM[j]]        = NOTSET;
+ +        (*newcgnr)      [atM[j]]        = (*cgnr)[i0];
+ +    }
+ +    /* renumber cgnr: */
+ +    for (i = i0; i < at->nr; i++)
+ +    {
+ +        (*cgnr)[i]++;
+ +    }
+ +
+ +    /* constraints between CB, M1 and M2 */
+ +    /* 'add_shift' says which atoms won't be renumbered afterwards */
+ +    dCBM1 = sqrt( sqr(xcom[0]-xi[atCB]) + sqr(ycom[0]-yi[atCB]) );
+ +    dM1M2 = sqrt( sqr(xcom[0]-xcom[1]) + sqr(ycom[0]-ycom[1]) );
+ +    dCBM2 = sqrt( sqr(xcom[1]-xi[atCB]) + sqr(ycom[1]-yi[atCB]) );
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCB],       add_shift+atM[0], dCBM1);
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCB],       add_shift+atM[1], dCBM2);
+ +    my_add_param(&(plist[F_CONSTRNC]), add_shift+atM[0], add_shift+atM[1], dM1M2);
+ +
+ +    /* rest will be vsite3 */
+ +    nvsite = 0;
+ +    for (i = 0; i < atNR; i++)
+ +    {
+ +        if (i != atCB)
+ +        {
+ +            at->atom[ats[i]].m    = at->atom[ats[i]].mB = 0;
+ +            (*vsite_type)[ats[i]] = F_VSITE3;
+ +            nvsite++;
+ +        }
+ +    }
+ +
+ +    /* now define all vsites from M1, M2, CB, ie:
+ +       r_d = r_M1 + a r_M1_M2 + b r_M1_CB */
+ +    for (i = 0; i < atNR; i++)
+ +    {
+ +        if ( (*vsite_type)[ats[i]] == F_VSITE3)
+ +        {
+ +            calc_vsite3_param(xi[i], yi[i], xcom[0], ycom[0], xcom[1], ycom[1], xi[atCB], yi[atCB], &a, &b);
+ +            add_vsite3_param(&plist[F_VSITE3],
+ +                             ats[i], add_shift+atM[0], add_shift+atM[1], ats[atCB], a, b);
+ +        }
+ +    }
+ +    return nvsite;
+ +#undef NMASS
+ +}
+ +
+ +
+ +/* Virtual-site setup for a TYR side chain.
+ + *
+ + * The ring is handled by gen_vsites_6ring (without CZ/HZ). CZ is then
+ + * constructed as a 3-atom virtual site from OH, CE1 and CE2, and OH is
+ + * constrained to CE1 and CE2. The hydroxyl hydrogen HH becomes a 2-atom
+ + * virtual site between OH and one extra dummy mass (named MW1), which is
+ + * inserted at the atom index of HH and constrained to CG and OH.
+ + *
+ + * The new* arrays are grown with srenew to at->nr + *nadd entries, *nadd is
+ + * increased by one, o2n is updated for all atoms from HH onwards and cgnr is
+ + * incremented for all atoms from HH onwards.
+ + *
+ + * Returns the count reported by gen_vsites_6ring plus one for HH.
+ + */
+ +static int gen_vsites_tyr(gpp_atomtype_t atype, rvec *newx[],
+ +                          t_atom *newatom[], char ***newatomname[],
+ +                          int *o2n[], int *newvsite_type[], int *newcgnr[],
+ +                          t_symtab *symtab, int *nadd, rvec x[], int *cgnr[],
+ +                          t_atoms *at, int *vsite_type[], t_params plist[],
+ +                          int nrfound, int *ats, int add_shift,
+ +                          t_vsitetop *vsitetop, int nvsitetop)
+ +{
+ +    int  nvsite, i, i0, j, atM, tpM;
+ +    real dCGCE, dCEOH, dCGM, tmp1, a, b;
+ +    real bond_cc, bond_ch, bond_co, bond_oh, angle_coh;
+ +    real xcom, mtot;
+ +    real vdist, mM;
+ +    rvec r1;
+ +    char name[10];
+ +
+ +    /* these MUST correspond to the atnms array in do_vsite_aromatics! */
+ +    enum {
+ +        atCG, atCD1, atHD1, atCD2, atHD2, atCE1, atHE1, atCE2, atHE2,
+ +        atCZ, atOH, atHH, atNR
+ +    };
+ +    real xi[atNR], yi[atNR];
+ +    /* CG, CE1, CE2 (as in general 6-ring) and OH and HH stay,
+ +       rest gets virtualized.
+ +       Now we have two linked triangles with one improper keeping them flat */
+ +    if (atNR != nrfound)
+ +    {
+ +        gmx_incons("Number of atom types in gen_vsites_tyr");
+ +    }
+ +
+ +    /* Aromatic rings have 6-fold symmetry, so we only need one bond length
+ +     * for the ring part (angle is always 120 degrees).
+ +     */
+ +    bond_cc   = get_ddb_bond(vsitetop, nvsitetop, "TYR", "CD1", "CE1");
+ +    bond_ch   = get_ddb_bond(vsitetop, nvsitetop, "TYR", "CD1", "HD1");
+ +    bond_co   = get_ddb_bond(vsitetop, nvsitetop, "TYR", "CZ", "OH");
+ +    bond_oh   = get_ddb_bond(vsitetop, nvsitetop, "TYR", "OH", "HH");
+ +    angle_coh = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TYR", "CZ", "OH", "HH");
+ +
+ +    xi[atCG]  = -bond_cc+bond_cc*cos(ANGLE_6RING);
+ +    yi[atCG]  = 0;
+ +    xi[atCD1] = -bond_cc;
+ +    yi[atCD1] = bond_cc*sin(0.5*ANGLE_6RING);
+ +    xi[atHD1] = xi[atCD1]+bond_ch*cos(ANGLE_6RING);
+ +    yi[atHD1] = yi[atCD1]+bond_ch*sin(ANGLE_6RING);
+ +    xi[atCE1] = 0;
+ +    yi[atCE1] = yi[atCD1];
+ +    xi[atHE1] = xi[atCE1]-bond_ch*cos(ANGLE_6RING);
+ +    yi[atHE1] = yi[atCE1]+bond_ch*sin(ANGLE_6RING);
+ +    xi[atCD2] = xi[atCD1];
+ +    yi[atCD2] = -yi[atCD1];
+ +    xi[atHD2] = xi[atHD1];
+ +    yi[atHD2] = -yi[atHD1];
+ +    xi[atCE2] = xi[atCE1];
+ +    yi[atCE2] = -yi[atCE1];
+ +    xi[atHE2] = xi[atHE1];
+ +    yi[atHE2] = -yi[atHE1];
+ +    xi[atCZ]  = bond_cc*cos(0.5*ANGLE_6RING);
+ +    yi[atCZ]  = 0;
+ +    xi[atOH]  = xi[atCZ]+bond_co;
+ +    yi[atOH]  = 0;
+ +
+ +    /* center of mass of the ring atoms only: the loop stops before OH */
+ +    xcom = mtot = 0;
+ +    for (i = 0; i < atOH; i++)
+ +    {
+ +        xcom += xi[i]*at->atom[ats[i]].m;
+ +        mtot += at->atom[ats[i]].m;
+ +    }
+ +    xcom /= mtot;
+ +
+ +    /* first do 6 ring as default,
+ +       except CZ (we'll do that different) and HZ (we don't have that): */
+ +    nvsite = gen_vsites_6ring(at, vsite_type, plist, nrfound, ats, bond_cc, bond_ch, xcom, FALSE);
+ +
+ +    /* then construct CZ from the 2nd triangle */
+ +    /* vsite3 construction: r_d = r_i + a r_ij + b r_ik */
+ +    a = b = 0.5 * bond_co / ( bond_co - bond_cc*cos(ANGLE_6RING) );
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atCZ], ats[atOH], ats[atCE1], ats[atCE2], a, b);
+ +    at->atom[ats[atCZ]].m = at->atom[ats[atCZ]].mB = 0;
+ +
+ +    /* constraints between CE1, CE2 and OH */
+ +    dCGCE = sqrt( cosrule(bond_cc, bond_cc, ANGLE_6RING) );
+ +    dCEOH = sqrt( cosrule(bond_cc, bond_co, ANGLE_6RING) );
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCE1], ats[atOH], dCEOH);
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCE2], ats[atOH], dCEOH);
+ +
+ +    /* We also want to constrain the angle C-O-H, but since CZ is constructed
+ +     * we need to introduce a constraint to CG.
+ +     * CG is much further away, so that will lead to instabilities in LINCS
+ +     * when we constrain both CG-HH and OH-HH distances. Instead of requiring
+ +     * the use of lincs_order=8 we introduce a dummy mass twice as far
+ +     * away from OH as HH. The mass is accordingly half of the hydrogen mass,
+ +     * with the remaining half moved to OH. This shouldn't cause any problems
+ +     * since the forces will apply to the HH constructed atom and not directly
+ +     * on the virtual mass.
+ +     */
+ +
+ +    vdist                   = 2.0*bond_oh;
+ +    mM                      = at->atom[ats[atHH]].m/2.0;
+ +    at->atom[ats[atOH]].m  += mM; /* add 1/2 of original H mass */
+ +    at->atom[ats[atOH]].mB += mM; /* add 1/2 of original H mass */
+ +    at->atom[ats[atHH]].m   = at->atom[ats[atHH]].mB = 0;
+ +
+ +    /* get dummy mass type */
+ +    tpM = vsite_nm2type("MW", atype);
+ +    /* make space for 1 mass: shift HH only */
+ +    i0  = ats[atHH];
+ +    atM = i0+*nadd;
+ +    if (debug)
+ +    {
+ +        fprintf(stderr, "Inserting 1 dummy mass at %d\n", (*o2n)[i0]+1);
+ +    }
+ +    (*nadd)++;
+ +    for (j = i0; j < at->nr; j++)
+ +    {
+ +        (*o2n)[j] = j+*nadd;
+ +    }
+ +    srenew(*newx, at->nr+*nadd);
+ +    srenew(*newatom, at->nr+*nadd);
+ +    srenew(*newatomname, at->nr+*nadd);
+ +    srenew(*newvsite_type, at->nr+*nadd);
+ +    srenew(*newcgnr, at->nr+*nadd);
+ +    (*newatomname)[at->nr+*nadd-1] = NULL;
+ +
+ +    /* Calc the dummy mass initial position: OH + 2*(HH - OH).
+ +     * This matches the OH-M constraint length (vdist = 2*bond_oh) and puts
+ +     * HH exactly halfway between OH and M, as the vsite2 weight of 1/2
+ +     * below requires.
+ +     */
+ +    rvec_sub(x[ats[atHH]], x[ats[atOH]], r1);
+ +    svmul(2.0, r1, r1);
+ +    rvec_add(r1, x[ats[atOH]], (*newx)[atM]);
+ +
+ +    strcpy(name, "MW1");
+ +    (*newatomname)  [atM]        = put_symtab(symtab, name);
+ +    (*newatom)      [atM].m      = (*newatom)[atM].mB    = mM;
+ +    (*newatom)      [atM].q      = (*newatom)[atM].qB    = 0.0;
+ +    (*newatom)      [atM].type   = (*newatom)[atM].typeB = tpM;
+ +    (*newatom)      [atM].ptype  = eptAtom;
+ +    (*newatom)      [atM].resind = at->atom[i0].resind;
++    (*newatom)      [atM].elem[0] = 'M';
++    (*newatom)      [atM].elem[1] = '\0';
+ +    (*newvsite_type)[atM]        = NOTSET;
+ +    (*newcgnr)      [atM]        = (*cgnr)[i0];
+ +    /* renumber cgnr: */
+ +    for (i = i0; i < at->nr; i++)
+ +    {
+ +        (*cgnr)[i]++;
+ +    }
+ +
+ +    (*vsite_type)[ats[atHH]] = F_VSITE2;
+ +    nvsite++;
+ +    /* assume we also want the COH angle constrained: */
+ +    tmp1 = bond_cc*cos(0.5*ANGLE_6RING) + dCGCE*sin(ANGLE_6RING*0.5) + bond_co;
+ +    dCGM = sqrt( cosrule(tmp1, vdist, angle_coh) );
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCG], add_shift+atM, dCGM);
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atOH], add_shift+atM, vdist);
+ +
+ +    add_vsite2_param(&plist[F_VSITE2],
+ +                     ats[atHH], ats[atOH], add_shift+atM, 1.0/2.0);
+ +    return nvsite;
+ +}
+ +
+ +static int gen_vsites_his(t_atoms *at, int *vsite_type[], t_params plist[],
+ +                          int nrfound, int *ats, t_vsitetop *vsitetop, int nvsitetop)
+ +{
+ +    int  nvsite, i;
+ +    real a, b, alpha, dCGCE1, dCGNE2;
+ +    real sinalpha, cosalpha;
+ +    real xcom, ycom, mtot;
+ +    real mG, mrest, mCE1, mNE2;
+ +    real b_CG_ND1, b_ND1_CE1, b_CE1_NE2, b_CG_CD2, b_CD2_NE2;
+ +    real b_ND1_HD1, b_NE2_HE2, b_CE1_HE1, b_CD2_HD2;
+ +    real a_CG_ND1_CE1, a_CG_CD2_NE2, a_ND1_CE1_NE2, a_CE1_NE2_CD2;
+ +    real a_NE2_CE1_HE1, a_NE2_CD2_HD2, a_CE1_ND1_HD1, a_CE1_NE2_HE2;
+ +    char resname[10];
+ +
+ +    /* these MUST correspond to the atnms array in do_vsite_aromatics! */
+ +    enum {
+ +        atCG, atND1, atHD1, atCD2, atHD2, atCE1, atHE1, atNE2, atHE2, atNR
+ +    };
+ +    real x[atNR], y[atNR];
+ +
+ +    /* CG, CE1 and NE2 stay, each gets part of the total mass,
+ +       rest gets virtualized */
+ +    /* check number of atoms, 3 hydrogens may be missing: */
+ +    /* assert( nrfound >= atNR-3 || nrfound <= atNR );
+ +     * Don't understand the above logic. Shouldn't it be && rather than || ???
+ +     */
+ +    if ((nrfound < atNR-3) || (nrfound > atNR))
+ +    {
+ +        gmx_incons("Generating vsites for HIS");
+ +    }
+ +
+ +    /* avoid warnings about uninitialized variables */
+ +    b_ND1_HD1 = b_NE2_HE2 = b_CE1_HE1 = b_CD2_HD2 = a_NE2_CE1_HE1 =
+ +                        a_NE2_CD2_HD2 = a_CE1_ND1_HD1 = a_CE1_NE2_HE2 = 0;
+ +
+ +    if (ats[atHD1] != NOTSET)
+ +    {
+ +        if (ats[atHE2] != NOTSET)
+ +        {
+ +            sprintf(resname, "HISH");
+ +        }
+ +        else
+ +        {
+ +            sprintf(resname, "HISA");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        sprintf(resname, "HISB");
+ +    }
+ +
+ +    /* Get geometry from database */
+ +    b_CG_ND1      = get_ddb_bond(vsitetop, nvsitetop, resname, "CG", "ND1");
+ +    b_ND1_CE1     = get_ddb_bond(vsitetop, nvsitetop, resname, "ND1", "CE1");
+ +    b_CE1_NE2     = get_ddb_bond(vsitetop, nvsitetop, resname, "CE1", "NE2");
+ +    b_CG_CD2      = get_ddb_bond(vsitetop, nvsitetop, resname, "CG", "CD2");
+ +    b_CD2_NE2     = get_ddb_bond(vsitetop, nvsitetop, resname, "CD2", "NE2");
+ +    a_CG_ND1_CE1  = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CG", "ND1", "CE1");
+ +    a_CG_CD2_NE2  = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CG", "CD2", "NE2");
+ +    a_ND1_CE1_NE2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "ND1", "CE1", "NE2");
+ +    a_CE1_NE2_CD2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CE1", "NE2", "CD2");
+ +
+ +    if (ats[atHD1] != NOTSET)
+ +    {
+ +        b_ND1_HD1     = get_ddb_bond(vsitetop, nvsitetop, resname, "ND1", "HD1");
+ +        a_CE1_ND1_HD1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CE1", "ND1", "HD1");
+ +    }
+ +    if (ats[atHE2] != NOTSET)
+ +    {
+ +        b_NE2_HE2     = get_ddb_bond(vsitetop, nvsitetop, resname, "NE2", "HE2");
+ +        a_CE1_NE2_HE2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CE1", "NE2", "HE2");
+ +    }
+ +    if (ats[atHD2] != NOTSET)
+ +    {
+ +        b_CD2_HD2     = get_ddb_bond(vsitetop, nvsitetop, resname, "CD2", "HD2");
+ +        a_NE2_CD2_HD2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "NE2", "CD2", "HD2");
+ +    }
+ +    if (ats[atHE1] != NOTSET)
+ +    {
+ +        b_CE1_HE1     = get_ddb_bond(vsitetop, nvsitetop, resname, "CE1", "HE1");
+ +        a_NE2_CE1_HE1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "NE2", "CE1", "HE1");
+ +    }
+ +
+ +    /* constraints between CG, CE1 and NE2 */
+ +    dCGCE1   = sqrt( cosrule(b_CG_ND1, b_ND1_CE1, a_CG_ND1_CE1) );
+ +    dCGNE2   = sqrt( cosrule(b_CG_CD2, b_CD2_NE2, a_CG_CD2_NE2) );
+ +
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atCE1], dCGCE1);
+ +    my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atNE2], dCGNE2);
+ +    /* we already have a constraint CE1-NE2, so we don't add it again */
+ +
+ +    /* calculate the positions in a local frame of reference.
+ +     * The x-axis is the line from CG that makes a right angle
+ +     * with the bond CE1-NE2, and the y-axis the bond CE1-NE2.
+ +     */
+ +    /* First calculate the x-axis intersection with y-axis (=yCE1).
+ +     * Get cos(angle CG-CE1-NE2) :
+ +     */
+ +    cosalpha = acosrule(dCGNE2, dCGCE1, b_CE1_NE2);
+ +    x[atCE1] = 0;
+ +    y[atCE1] = cosalpha*dCGCE1;
+ +    x[atNE2] = 0;
+ +    y[atNE2] = y[atCE1]-b_CE1_NE2;
+ +    sinalpha = sqrt(1-cosalpha*cosalpha);
+ +    x[atCG]  = -sinalpha*dCGCE1;
+ +    y[atCG]  = 0;
+ +    x[atHE1] = x[atHE2] = x[atHD1] = x[atHD2] = 0;
+ +    y[atHE1] = y[atHE2] = y[atHD1] = y[atHD2] = 0;
+ +
+ +    /* calculate ND1 and CD2 positions from CE1 and NE2 */
+ +
+ +    x[atND1] = -b_ND1_CE1*sin(a_ND1_CE1_NE2);
+ +    y[atND1] = y[atCE1]-b_ND1_CE1*cos(a_ND1_CE1_NE2);
+ +
+ +    x[atCD2] = -b_CD2_NE2*sin(a_CE1_NE2_CD2);
+ +    y[atCD2] = y[atNE2]+b_CD2_NE2*cos(a_CE1_NE2_CD2);
+ +
+ +    /* And finally the hydrogen positions */
+ +    if (ats[atHE1] != NOTSET)
+ +    {
+ +        x[atHE1] = x[atCE1] + b_CE1_HE1*sin(a_NE2_CE1_HE1);
+ +        y[atHE1] = y[atCE1] - b_CE1_HE1*cos(a_NE2_CE1_HE1);
+ +    }
+ +    /* HD2 - first get (ccw) angle from (positive) y-axis */
+ +    if (ats[atHD2] != NOTSET)
+ +    {
+ +        alpha    = a_CE1_NE2_CD2 + M_PI - a_NE2_CD2_HD2;
+ +        x[atHD2] = x[atCD2] - b_CD2_HD2*sin(alpha);
+ +        y[atHD2] = y[atCD2] + b_CD2_HD2*cos(alpha);
+ +    }
+ +    if (ats[atHD1] != NOTSET)
+ +    {
+ +        /* HD1 - first get (cw) angle from (positive) y-axis */
+ +        alpha    = a_ND1_CE1_NE2 + M_PI - a_CE1_ND1_HD1;
+ +        x[atHD1] = x[atND1] - b_ND1_HD1*sin(alpha);
+ +        y[atHD1] = y[atND1] - b_ND1_HD1*cos(alpha);
+ +    }
+ +    if (ats[atHE2] != NOTSET)
+ +    {
+ +        x[atHE2] = x[atNE2] + b_NE2_HE2*sin(a_CE1_NE2_HE2);
+ +        y[atHE2] = y[atNE2] + b_NE2_HE2*cos(a_CE1_NE2_HE2);
+ +    }
+ +    /* Have all coordinates now */
+ +
+ +    /* calc center-of-mass; keep atoms CG, CE1, NE2 and
+ +     * set the rest to vsite3
+ +     */
+ +    mtot   = xcom = ycom = 0;
+ +    nvsite = 0;
+ +    for (i = 0; i < atNR; i++)
+ +    {
+ +        if (ats[i] != NOTSET)
+ +        {
+ +            mtot += at->atom[ats[i]].m;
+ +            xcom += x[i]*at->atom[ats[i]].m;
+ +            ycom += y[i]*at->atom[ats[i]].m;
+ +            if (i != atCG && i != atCE1 && i != atNE2)
+ +            {
+ +                at->atom[ats[i]].m    = at->atom[ats[i]].mB = 0;
+ +                (*vsite_type)[ats[i]] = F_VSITE3;
+ +                nvsite++;
+ +            }
+ +        }
+ +    }
+ +    if (nvsite+3 != nrfound)
+ +    {
+ +        gmx_incons("Generating vsites for HIS");
+ +    }
+ +
+ +    xcom /= mtot;
+ +    ycom /= mtot;
+ +
+ +    /* distribute mass so that com stays the same */
+ +    mG    = xcom*mtot/x[atCG];
+ +    mrest = mtot-mG;
+ +    mCE1  = (ycom-y[atNE2])*mrest/(y[atCE1]-y[atNE2]);
+ +    mNE2  = mrest-mCE1;
+ +
+ +    at->atom[ats[atCG]].m  = at->atom[ats[atCG]].mB = mG;
+ +    at->atom[ats[atCE1]].m = at->atom[ats[atCE1]].mB = mCE1;
+ +    at->atom[ats[atNE2]].m = at->atom[ats[atNE2]].mB = mNE2;
+ +
+ +    /* HE1 */
+ +    if (ats[atHE1] != NOTSET)
+ +    {
+ +        calc_vsite3_param(x[atHE1], y[atHE1], x[atCE1], y[atCE1], x[atNE2], y[atNE2],
+ +                          x[atCG], y[atCG], &a, &b);
+ +        add_vsite3_param(&plist[F_VSITE3],
+ +                         ats[atHE1], ats[atCE1], ats[atNE2], ats[atCG], a, b);
+ +    }
+ +    /* HE2 */
+ +    if (ats[atHE2] != NOTSET)
+ +    {
+ +        calc_vsite3_param(x[atHE2], y[atHE2], x[atNE2], y[atNE2], x[atCE1], y[atCE1],
+ +                          x[atCG], y[atCG], &a, &b);
+ +        add_vsite3_param(&plist[F_VSITE3],
+ +                         ats[atHE2], ats[atNE2], ats[atCE1], ats[atCG], a, b);
+ +    }
+ +
+ +    /* ND1 */
+ +    calc_vsite3_param(x[atND1], y[atND1], x[atNE2], y[atNE2], x[atCE1], y[atCE1],
+ +                      x[atCG], y[atCG], &a, &b);
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atND1], ats[atNE2], ats[atCE1], ats[atCG], a, b);
+ +
+ +    /* CD2 */
+ +    calc_vsite3_param(x[atCD2], y[atCD2], x[atCE1], y[atCE1], x[atNE2], y[atNE2],
+ +                      x[atCG], y[atCG], &a, &b);
+ +    add_vsite3_param(&plist[F_VSITE3],
+ +                     ats[atCD2], ats[atCE1], ats[atNE2], ats[atCG], a, b);
+ +
+ +    /* HD1 */
+ +    if (ats[atHD1] != NOTSET)
+ +    {
+ +        calc_vsite3_param(x[atHD1], y[atHD1], x[atNE2], y[atNE2], x[atCE1], y[atCE1],
+ +                          x[atCG], y[atCG], &a, &b);
+ +        add_vsite3_param(&plist[F_VSITE3],
+ +                         ats[atHD1], ats[atNE2], ats[atCE1], ats[atCG], a, b);
+ +    }
+ +    /* HD2 */
+ +    if (ats[atHD2] != NOTSET)
+ +    {
+ +        calc_vsite3_param(x[atHD2], y[atHD2], x[atCE1], y[atCE1], x[atNE2], y[atNE2],
+ +                          x[atCG], y[atCG], &a, &b);
+ +        add_vsite3_param(&plist[F_VSITE3],
+ +                         ats[atHD2], ats[atCE1], ats[atNE2], ats[atCG], a, b);
+ +    }
+ +    return nvsite;
+ +}
+ +
+ +/* Report whether vsite_type names one of the virtual-site constructions
+ + * (3, 3FD, 3OUT, 3FAD, 4FD or 4FDN).  Only the magnitude is inspected,
+ + * because do_vsites stores negated types for some hydrogens.  NOTSET is
+ + * tested first and is never a virtual site.
+ + */
+ +static gmx_bool is_vsite(int vsite_type)
+ +{
+ +    int ftype;
+ +
+ +    if (vsite_type == NOTSET)
+ +    {
+ +        return FALSE;
+ +    }
+ +
+ +    ftype = abs(vsite_type);
+ +    if (ftype == F_VSITE3    ||
+ +        ftype == F_VSITE3FD  ||
+ +        ftype == F_VSITE3OUT ||
+ +        ftype == F_VSITE3FAD ||
+ +        ftype == F_VSITE4FD  ||
+ +        ftype == F_VSITE4FDN)
+ +    {
+ +        return TRUE;
+ +    }
+ +
+ +    return FALSE;
+ +}
+ +
+ +static char atomnamesuffix[] = "1234"; /* character appended to generated dummy mass atom names, indexed by mass number */
+ +
+ +void do_vsites(int nrtp, t_restp rtp[], gpp_atomtype_t atype,
+ +               t_atoms *at, t_symtab *symtab, rvec *x[],
+ +               t_params plist[], int *vsite_type[], int *cgnr[],
+ +               real mHmult, gmx_bool bVsiteAromatics,
+ +               const char *ffdir)
+ +{
+ +#define MAXATOMSPERRESIDUE 16
+ +    int               i, j, k, m, i0, ni0, whatres, resind, add_shift, ftype, nvsite, nadd;
+ +    int               ai, aj, ak, al;
+ +    int               nrfound = 0, needed, nrbonds, nrHatoms, Heavy, nrheavies, tpM, tpHeavy;
+ +    int               Hatoms[4], heavies[4], bb;
+ +    gmx_bool          bWARNING, bAddVsiteParam, bFirstWater;
+ +    matrix            tmpmat;
+ +    gmx_bool         *bResProcessed;
+ +    real              mHtot, mtot, fact, fact2;
+ +    rvec              rpar, rperp, temp;
+ +    char              name[10], tpname[32], nexttpname[32], *ch;
+ +    rvec             *newx;
+ +    int              *o2n, *newvsite_type, *newcgnr, ats[MAXATOMSPERRESIDUE];
+ +    t_atom           *newatom;
+ +    t_params         *params;
+ +    char           ***newatomname;
+ +    char             *resnm = NULL;
+ +    int               ndb, f;
+ +    char            **db;
+ +    int               nvsiteconf, nvsitetop, cmplength;
+ +    gmx_bool          isN, planarN, bFound;
+ +    gmx_residuetype_t rt;
+ +
+ +    t_vsiteconf      *vsiteconflist;
+ +    /* pointer to a list of CH3/NH3/NH2 configuration entries.
+ +     * See comments in read_vsite_database. It isnt beautiful,
+ +     * but it had to be fixed, and I dont even want to try to
+ +     * maintain this part of the code...
+ +     */
+ +    t_vsitetop *vsitetop;
+ +    /* Pointer to a list of geometry (bond/angle) entries for
+ +     * residues like PHE, TRP, TYR, HIS, etc., where we need
+ +     * to know the geometry to construct vsite aromatics.
+ +     * Note that equilibrium geometry isnt necessarily the same
+ +     * as the individual bond and angle values given in the
+ +     * force field (rings can be strained).
+ +     */
+ +
+ +    /* if bVsiteAromatics=TRUE do_vsites will specifically convert atoms in
+ +       PHE, TRP, TYR and HIS to a construction of virtual sites */
+ +    enum                    {
+ +        resPHE, resTRP, resTYR, resHIS, resNR
+ +    };
+ +    const char *resnms[resNR]   = {   "PHE",  "TRP",  "TYR",  "HIS" };
+ +    /* Amber03 alternative names for termini */
+ +    const char *resnmsN[resNR]  = {  "NPHE", "NTRP", "NTYR", "NHIS" };
+ +    const char *resnmsC[resNR]  = {  "CPHE", "CTRP", "CTYR", "CHIS" };
+ +    /* HIS can be known as HISH, HIS1, HISA, HID, HIE, HIP, etc. too */
+ +    gmx_bool    bPartial[resNR]  = {  FALSE,  FALSE,  FALSE,   TRUE  };
+ +    /* the atnms for every residue MUST correspond to the enums in the
+ +       gen_vsites_* (one for each residue) routines! */
+ +    /* also the atom names in atnms MUST be in the same order as in the .rtp! */
+ +    const char *atnms[resNR][MAXATOMSPERRESIDUE+1] = {
+ +        { "CG", /* PHE */
+ +          "CD1", "HD1", "CD2", "HD2",
+ +          "CE1", "HE1", "CE2", "HE2",
+ +          "CZ", "HZ", NULL },
+ +        { "CB", /* TRP */
+ +          "CG",
+ +          "CD1", "HD1", "CD2",
+ +          "NE1", "HE1", "CE2", "CE3", "HE3",
+ +          "CZ2", "HZ2", "CZ3", "HZ3",
+ +          "CH2", "HH2", NULL },
+ +        { "CG", /* TYR */
+ +          "CD1", "HD1", "CD2", "HD2",
+ +          "CE1", "HE1", "CE2", "HE2",
+ +          "CZ", "OH", "HH", NULL },
+ +        { "CG", /* HIS */
+ +          "ND1", "HD1", "CD2", "HD2",
+ +          "CE1", "HE1", "NE2", "HE2", NULL }
+ +    };
+ +
+ +    if (debug)
+ +    {
+ +        printf("Searching for atoms to make virtual sites ...\n");
+ +        fprintf(debug, "# # # VSITES # # #\n");
+ +    }
+ +
+ +    ndb           = fflib_search_file_end(ffdir, ".vsd", FALSE, &db);
+ +    nvsiteconf    = 0;
+ +    vsiteconflist = NULL;
+ +    nvsitetop     = 0;
+ +    vsitetop      = NULL;
+ +    for (f = 0; f < ndb; f++)
+ +    {
+ +        read_vsite_database(db[f], &vsiteconflist, &nvsiteconf, &vsitetop, &nvsitetop);
+ +        sfree(db[f]);
+ +    }
+ +    sfree(db);
+ +
+ +    bFirstWater = TRUE;
+ +    nvsite      = 0;
+ +    nadd        = 0;
+ +    /* we need a marker for which atoms should *not* be renumbered afterwards */
+ +    add_shift = 10*at->nr;
+ +    /* make arrays where masses can be inserted into */
+ +    snew(newx, at->nr);
+ +    snew(newatom, at->nr);
+ +    snew(newatomname, at->nr);
+ +    snew(newvsite_type, at->nr);
+ +    snew(newcgnr, at->nr);
+ +    /* make index array to tell where the atoms go to when masses are inserted */
+ +    snew(o2n, at->nr);
+ +    for (i = 0; i < at->nr; i++)
+ +    {
+ +        o2n[i] = i;
+ +    }
+ +    /* make index to tell which residues were already processed */
+ +    snew(bResProcessed, at->nres);
+ +
+ +    gmx_residuetype_init(&rt);
+ +
+ +    /* generate vsite constructions */
+ +    /* loop over all atoms */
+ +    resind = -1;
+ +    for (i = 0; (i < at->nr); i++)
+ +    {
+ +        if (at->atom[i].resind != resind)
+ +        {
+ +            resind = at->atom[i].resind;
+ +            resnm  = *(at->resinfo[resind].name);
+ +        }
+ +        /* first check for aromatics to virtualize */
+ +        /* don't waste our effort on DNA, water etc. */
+ +        /* Only do the vsite aromatic stuff when we reach the
+ +         * CA atom, since there might be an X2/X3 group on the
+ +         * N-terminus that must be treated first.
+ +         */
+ +        if (bVsiteAromatics &&
+ +            !strcmp(*(at->atomname[i]), "CA") &&
+ +            !bResProcessed[resind] &&
+ +            gmx_residuetype_is_protein(rt, *(at->resinfo[resind].name)) )
+ +        {
+ +            /* mark this residue */
+ +            bResProcessed[resind] = TRUE;
+ +            /* find out if this residue needs converting */
+ +            whatres = NOTSET;
+ +            for (j = 0; j < resNR && whatres == NOTSET; j++)
+ +            {
+ +
+ +                cmplength = bPartial[j] ? strlen(resnm)-1 : strlen(resnm);
+ +
+ +                bFound = ((gmx_strncasecmp(resnm, resnms[j], cmplength) == 0) ||
+ +                          (gmx_strncasecmp(resnm, resnmsN[j], cmplength) == 0) ||
+ +                          (gmx_strncasecmp(resnm, resnmsC[j], cmplength) == 0));
+ +
+ +                if (bFound)
+ +                {
+ +                    whatres = j;
+ +                    /* get atoms we will be needing for the conversion */
+ +                    nrfound = 0;
+ +                    for (k = 0; atnms[j][k]; k++)
+ +                    {
+ +                        ats[k] = NOTSET;
+ +                        for (m = i; m < at->nr && at->atom[m].resind == resind && ats[k] == NOTSET; m++)
+ +                        {
+ +                            if (gmx_strcasecmp(*(at->atomname[m]), atnms[j][k]) == 0)
+ +                            {
+ +                                ats[k] = m;
+ +                                nrfound++;
+ +                            }
+ +                        }
+ +                    }
+ +
+ +                    /* now k is number of atom names in atnms[j] */
+ +                    if (j == resHIS)
+ +                    {
+ +                        needed = k-3;
+ +                    }
+ +                    else
+ +                    {
+ +                        needed = k;
+ +                    }
+ +                    if (nrfound < needed)
+ +                    {
+ +                        gmx_fatal(FARGS, "not enough atoms found (%d, need %d) in "
+ +                                  "residue %s %d while\n             "
+ +                                  "generating aromatics virtual site construction",
+ +                                  nrfound, needed, resnm, at->resinfo[resind].nr);
+ +                    }
+ +                    /* Advance overall atom counter */
+ +                    i++;
+ +                }
+ +            }
+ +            /* the enums for every residue MUST correspond to atnms[residue] */
+ +            switch (whatres)
+ +            {
+ +                case resPHE:
+ +                    if (debug)
+ +                    {
+ +                        fprintf(stderr, "PHE at %d\n", o2n[ats[0]]+1);
+ +                    }
+ +                    nvsite += gen_vsites_phe(at, vsite_type, plist, nrfound, ats, vsitetop, nvsitetop);
+ +                    break;
+ +                case resTRP:
+ +                    if (debug)
+ +                    {
+ +                        fprintf(stderr, "TRP at %d\n", o2n[ats[0]]+1);
+ +                    }
+ +                    nvsite += gen_vsites_trp(atype, &newx, &newatom, &newatomname, &o2n,
+ +                                             &newvsite_type, &newcgnr, symtab, &nadd, *x, cgnr,
+ +                                             at, vsite_type, plist, nrfound, ats, add_shift, vsitetop, nvsitetop);
+ +                    break;
+ +                case resTYR:
+ +                    if (debug)
+ +                    {
+ +                        fprintf(stderr, "TYR at %d\n", o2n[ats[0]]+1);
+ +                    }
+ +                    nvsite += gen_vsites_tyr(atype, &newx, &newatom, &newatomname, &o2n,
+ +                                             &newvsite_type, &newcgnr, symtab, &nadd, *x, cgnr,
+ +                                             at, vsite_type, plist, nrfound, ats, add_shift, vsitetop, nvsitetop);
+ +                    break;
+ +                case resHIS:
+ +                    if (debug)
+ +                    {
+ +                        fprintf(stderr, "HIS at %d\n", o2n[ats[0]]+1);
+ +                    }
+ +                    nvsite += gen_vsites_his(at, vsite_type, plist, nrfound, ats, vsitetop, nvsitetop);
+ +                    break;
+ +                case NOTSET:
+ +                    /* this means this residue won't be processed */
+ +                    break;
+ +                default:
+ +                    gmx_fatal(FARGS, "DEATH HORROR in do_vsites (%s:%d)",
+ +                              __FILE__, __LINE__);
+ +            } /* switch whatres */
+ +              /* skip back to beginning of residue */
+ +            while (i > 0 && at->atom[i-1].resind == resind)
+ +            {
+ +                i--;
+ +            }
+ +        } /* if bVsiteAromatics & is protein */
+ +
+ +        /* now process the rest of the hydrogens */
+ +        /* only process hydrogen atoms which are not already set */
+ +        if ( ((*vsite_type)[i] == NOTSET) && is_hydrogen(*(at->atomname[i])))
+ +        {
+ +            /* find heavy atom, count #bonds from it and #H atoms bound to it
+ +               and return H atom numbers (Hatoms) and heavy atom numbers (heavies) */
+ +            count_bonds(i, &plist[F_BONDS], at->atomname,
+ +                        &nrbonds, &nrHatoms, Hatoms, &Heavy, &nrheavies, heavies);
+ +            /* get Heavy atom type */
+ +            tpHeavy = get_atype(Heavy, at, nrtp, rtp, rt);
+ +            strcpy(tpname, get_atomtype_name(tpHeavy, atype));
+ +
+ +            bWARNING       = FALSE;
+ +            bAddVsiteParam = TRUE;
+ +            /* nested if's which check nrHatoms, nrbonds and atomname */
+ +            if (nrHatoms == 1)
+ +            {
+ +                switch (nrbonds)
+ +                {
+ +                    case 2: /* -O-H */
+ +                        (*vsite_type)[i] = F_BONDS;
+ +                        break;
+ +                    case 3: /* =CH-, -NH- or =NH+- */
+ +                        (*vsite_type)[i] = F_VSITE3FD;
+ +                        break;
+ +                    case 4: /* --CH- (tert) */
+ +                        /* The old type 4FD had stability issues, so
+ +                         * all new constructs should use 4FDN
+ +                         */
+ +                        (*vsite_type)[i] = F_VSITE4FDN;
+ +
+ +                        /* Check parity of heavy atoms from coordinates */
+ +                        ai = Heavy;
+ +                        aj = heavies[0];
+ +                        ak = heavies[1];
+ +                        al = heavies[2];
+ +                        rvec_sub((*x)[aj], (*x)[ai], tmpmat[0]);
+ +                        rvec_sub((*x)[ak], (*x)[ai], tmpmat[1]);
+ +                        rvec_sub((*x)[al], (*x)[ai], tmpmat[2]);
+ +
+ +                        if (det(tmpmat) > 0)
+ +                        {
+ +                            /* swap parity */
+ +                            heavies[1] = aj;
+ +                            heavies[0] = ak;
+ +                        }
+ +
+ +                        break;
+ +                    default: /* nrbonds != 2, 3 or 4 */
+ +                        bWARNING = TRUE;
+ +                }
+ +
+ +            }
+ +            else if ( /*(nrHatoms == 2) && (nrbonds == 2) && REMOVED this test
+ +                         DvdS 19-01-04 */
+ +                (gmx_strncasecmp(*at->atomname[Heavy], "OW", 2) == 0) )
+ +            {
+ +                bAddVsiteParam = FALSE; /* this is water: skip these hydrogens */
+ +                if (bFirstWater)
+ +                {
+ +                    bFirstWater = FALSE;
+ +                    if (debug)
+ +                    {
+ +                        fprintf(debug,
+ +                                "Not converting hydrogens in water to virtual sites\n");
+ +                    }
+ +                }
+ +            }
+ +            else if ( (nrHatoms == 2) && (nrbonds == 4) )
+ +            {
+ +                /* -CH2- , -NH2+- */
+ +                (*vsite_type)[Hatoms[0]] = F_VSITE3OUT;
+ +                (*vsite_type)[Hatoms[1]] = -F_VSITE3OUT;
+ +            }
+ +            else
+ +            {
+ +                /* 2 or 3 hydrogen atom, with 3 or 4 bonds in total to the heavy atom.
+ +                 * If it is a nitrogen, first check if it is planar.
+ +                 */
+ +                isN = planarN = FALSE;
+ +                if ((nrHatoms == 2) && ((*at->atomname[Heavy])[0] == 'N'))
+ +                {
+ +                    isN = TRUE;
+ +                    j   = nitrogen_is_planar(vsiteconflist, nvsiteconf, tpname);
+ +                    if (j < 0)
+ +                    {
+ +                        gmx_fatal(FARGS, "No vsite database NH2 entry for type %s\n", tpname);
+ +                    }
+ +                    planarN = (j == 1);
+ +                }
+ +                if ( (nrHatoms == 2) && (nrbonds == 3) && ( !isN || planarN ) )
+ +                {
+ +                    /* =CH2 or, if it is a nitrogen NH2, it is a planar one */
+ +                    (*vsite_type)[Hatoms[0]] = F_VSITE3FAD;
+ +                    (*vsite_type)[Hatoms[1]] = -F_VSITE3FAD;
+ +                }
+ +                else if ( ( (nrHatoms == 2) && (nrbonds == 3) &&
+ +                            ( isN && !planarN ) ) ||
+ +                          ( (nrHatoms == 3) && (nrbonds == 4) ) )
+ +                {
+ +                    /* CH3, NH3 or non-planar NH2 group */
+ +                    int      Hat_vsite_type[3] = { F_VSITE3, F_VSITE3OUT, F_VSITE3OUT };
+ +                    gmx_bool Hat_SwapParity[3] = { FALSE,    TRUE,        FALSE };
+ +
+ +                    if (debug)
+ +                    {
+ +                        fprintf(stderr, "-XH3 or nonplanar NH2 group at %d\n", i+1);
+ +                    }
+ +                    bAddVsiteParam = FALSE; /* we'll do this ourselves! */
+ +                    /* -NH2 (umbrella), -NH3+ or -CH3 */
+ +                    (*vsite_type)[Heavy]       = F_VSITE3;
+ +                    for (j = 0; j < nrHatoms; j++)
+ +                    {
+ +                        (*vsite_type)[Hatoms[j]] = Hat_vsite_type[j];
+ +                    }
+ +                    /* get dummy mass type from first char of heavy atom type (N or C) */
+ +
+ +                    strcpy(nexttpname, get_atomtype_name(get_atype(heavies[0], at, nrtp, rtp, rt), atype));
+ +                    ch = get_dummymass_name(vsiteconflist, nvsiteconf, tpname, nexttpname);
+ +
+ +                    if (ch == NULL)
+ +                    {
+ +                        if (ndb > 0)
+ +                        {
+ +                            gmx_fatal(FARGS, "Can't find dummy mass for type %s bonded to type %s in the virtual site database (.vsd files). Add it to the database!\n", tpname, nexttpname);
+ +                        }
+ +                        else
+ +                        {
+ +                            gmx_fatal(FARGS, "A dummy mass for type %s bonded to type %s is required, but no virtual site database (.vsd) files where found.\n", tpname, nexttpname);
+ +                        }
+ +                    }
+ +                    else
+ +                    {
+ +                        strcpy(name, ch);
+ +                    }
+ +
+ +                    tpM = vsite_nm2type(name, atype);
+ +                    /* make space for 2 masses: shift all atoms starting with 'Heavy' */
+ +#define NMASS 2
+ +                    i0  = Heavy;
+ +                    ni0 = i0+nadd;
+ +                    if (debug)
+ +                    {
+ +                        fprintf(stderr, "Inserting %d dummy masses at %d\n", NMASS, o2n[i0]+1);
+ +                    }
+ +                    nadd += NMASS;
+ +                    for (j = i0; j < at->nr; j++)
+ +                    {
+ +                        o2n[j] = j+nadd;
+ +                    }
+ +
+ +                    srenew(newx, at->nr+nadd);
+ +                    srenew(newatom, at->nr+nadd);
+ +                    srenew(newatomname, at->nr+nadd);
+ +                    srenew(newvsite_type, at->nr+nadd);
+ +                    srenew(newcgnr, at->nr+nadd);
+ +
+ +                    for (j = 0; j < NMASS; j++)
+ +                    {
+ +                        newatomname[at->nr+nadd-1-j] = NULL;
+ +                    }
+ +
+ +                    /* calculate starting position for the masses */
+ +                    mHtot = 0;
+ +                    /* get atom masses, and set Heavy and Hatoms mass to zero */
+ +                    for (j = 0; j < nrHatoms; j++)
+ +                    {
+ +                        mHtot                += get_amass(Hatoms[j], at, nrtp, rtp, rt);
+ +                        at->atom[Hatoms[j]].m = at->atom[Hatoms[j]].mB = 0;
+ +                    }
+ +                    mtot              = mHtot + get_amass(Heavy, at, nrtp, rtp, rt);
+ +                    at->atom[Heavy].m = at->atom[Heavy].mB = 0;
+ +                    if (mHmult != 1.0)
+ +                    {
+ +                        mHtot *= mHmult;
+ +                    }
+ +                    fact2 = mHtot/mtot;
+ +                    fact  = sqrt(fact2);
+ +                    /* generate vectors parallel and perpendicular to rotational axis:
+ +                     * rpar  = Heavy -> Hcom
+ +                     * rperp = Hcom  -> H1   */
+ +                    clear_rvec(rpar);
+ +                    for (j = 0; j < nrHatoms; j++)
+ +                    {
+ +                        rvec_inc(rpar, (*x)[Hatoms[j]]);
+ +                    }
+ +                    svmul(1.0/nrHatoms, rpar, rpar); /* rpar = ( H1+H2+H3 ) / 3 */
+ +                    rvec_dec(rpar, (*x)[Heavy]);     /*        - Heavy          */
+ +                    rvec_sub((*x)[Hatoms[0]], (*x)[Heavy], rperp);
+ +                    rvec_dec(rperp, rpar);           /* rperp = H1 - Heavy - rpar */
+ +                    /* calc mass positions */
+ +                    svmul(fact2, rpar, temp);
+ +                    for (j = 0; (j < NMASS); j++) /* xM = xN + fact2 * rpar +/- fact * rperp */
+ +                    {
+ +                        rvec_add((*x)[Heavy], temp, newx[ni0+j]);
+ +                    }
+ +                    svmul(fact, rperp, temp);
+ +                    rvec_inc(newx[ni0  ], temp);
+ +                    rvec_dec(newx[ni0+1], temp);
+ +                    /* set atom parameters for the masses */
+ +                    for (j = 0; (j < NMASS); j++)
+ +                    {
+ +                        /* make name: "M??#" or "M?#" (? is atomname, # is number) */
+ +                        name[0] = 'M';
+ +                        for (k = 0; (*at->atomname[Heavy])[k] && ( k < NMASS ); k++)
+ +                        {
+ +                            name[k+1] = (*at->atomname[Heavy])[k];
+ +                        }
+ +                        name[k+1]             = atomnamesuffix[j];
+ +                        name[k+2]             = '\0';
+ +                        newatomname[ni0+j]    = put_symtab(symtab, name);
+ +                        newatom[ni0+j].m      = newatom[ni0+j].mB    = mtot/NMASS;
+ +                        newatom[ni0+j].q      = newatom[ni0+j].qB    = 0.0;
+ +                        newatom[ni0+j].type   = newatom[ni0+j].typeB = tpM;
+ +                        newatom[ni0+j].ptype  = eptAtom;
+ +                        newatom[ni0+j].resind = at->atom[i0].resind;
++                        newatom[ni0+j].elem[0] = 'M';
++                        newatom[ni0+j].elem[1] = '\0';
+ +                        newvsite_type[ni0+j]  = NOTSET;
+ +                        newcgnr[ni0+j]        = (*cgnr)[i0];
+ +                    }
+ +                    /* add constraints between dummy masses and to heavies[0] */
+ +                    /* 'add_shift' says which atoms won't be renumbered afterwards */
+ +                    my_add_param(&(plist[F_CONSTRNC]), heavies[0],  add_shift+ni0,  NOTSET);
+ +                    my_add_param(&(plist[F_CONSTRNC]), heavies[0],  add_shift+ni0+1, NOTSET);
+ +                    my_add_param(&(plist[F_CONSTRNC]), add_shift+ni0, add_shift+ni0+1, NOTSET);
+ +
+ +                    /* generate Heavy, H1, H2 and H3 from M1, M2 and heavies[0] */
+ +                    /* note that vsite_type cannot be NOTSET, because we just set it */
+ +                    add_vsite3_atoms  (&plist[(*vsite_type)[Heavy]],
+ +                                       Heavy,     heavies[0], add_shift+ni0, add_shift+ni0+1,
+ +                                       FALSE);
+ +                    for (j = 0; j < nrHatoms; j++)
+ +                    {
+ +                        add_vsite3_atoms(&plist[(*vsite_type)[Hatoms[j]]],
+ +                                         Hatoms[j], heavies[0], add_shift+ni0, add_shift+ni0+1,
+ +                                         Hat_SwapParity[j]);
+ +                    }
+ +#undef NMASS
+ +                }
+ +                else
+ +                {
+ +                    bWARNING = TRUE;
+ +                }
+ +
+ +            }
+ +            if (bWARNING)
+ +            {
+ +                fprintf(stderr,
+ +                        "Warning: cannot convert atom %d %s (bound to a heavy atom "
+ +                        "%s with \n"
+ +                        "         %d bonds and %d bound hydrogens atoms) to virtual site\n",
+ +                        i+1, *(at->atomname[i]), tpname, nrbonds, nrHatoms);
+ +            }
+ +            if (bAddVsiteParam)
+ +            {
+ +                /* add vsite parameters to topology,
+ +                   also get rid of negative vsite_types */
+ +                add_vsites(plist, (*vsite_type), Heavy, nrHatoms, Hatoms,
+ +                           nrheavies, heavies);
+ +                /* transfer mass of virtual site to Heavy atom */
+ +                for (j = 0; j < nrHatoms; j++)
+ +                {
+ +                    if (is_vsite((*vsite_type)[Hatoms[j]]))
+ +                    {
+ +                        at->atom[Heavy].m    += at->atom[Hatoms[j]].m;
+ +                        at->atom[Heavy].mB    = at->atom[Heavy].m;
+ +                        at->atom[Hatoms[j]].m = at->atom[Hatoms[j]].mB = 0;
+ +                    }
+ +                }
+ +            }
+ +            nvsite += nrHatoms;
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "atom %d: ", o2n[i]+1);
+ +                print_bonds(debug, o2n, nrHatoms, Hatoms, Heavy, nrheavies, heavies);
+ +            }
+ +        } /* if vsite NOTSET & is hydrogen */
+ +
+ +    }     /* for i < at->nr */
+ +
+ +    gmx_residuetype_destroy(rt);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Before inserting new atoms:\n");
+ +        for (i = 0; i < at->nr; i++)
+ +        {
+ +            fprintf(debug, "%4d %4d %4s %4d %4s %6d %-10s\n", i+1, o2n[i]+1,
+ +                    at->atomname[i] ? *(at->atomname[i]) : "(NULL)",
+ +                    at->resinfo[at->atom[i].resind].nr,
+ +                    at->resinfo[at->atom[i].resind].name ?
+ +                    *(at->resinfo[at->atom[i].resind].name) : "(NULL)",
+ +                    (*cgnr)[i],
+ +                    ((*vsite_type)[i] == NOTSET) ?
+ +                    "NOTSET" : interaction_function[(*vsite_type)[i]].name);
+ +        }
+ +        fprintf(debug, "new atoms to be inserted:\n");
+ +        for (i = 0; i < at->nr+nadd; i++)
+ +        {
+ +            if (newatomname[i])
+ +            {
+ +                fprintf(debug, "%4d %4s %4d %6d %-10s\n", i+1,
+ +                        newatomname[i] ? *(newatomname[i]) : "(NULL)",
+ +                        newatom[i].resind, newcgnr[i],
+ +                        (newvsite_type[i] == NOTSET) ?
+ +                        "NOTSET" : interaction_function[newvsite_type[i]].name);
+ +            }
+ +        }
+ +    }
+ +
+ +    /* add all original atoms to the new arrays, using o2n index array */
+ +    for (i = 0; i < at->nr; i++)
+ +    {
+ +        newatomname  [o2n[i]] = at->atomname [i];
+ +        newatom      [o2n[i]] = at->atom     [i];
+ +        newvsite_type[o2n[i]] = (*vsite_type)[i];
+ +        newcgnr      [o2n[i]] = (*cgnr)      [i];
+ +        copy_rvec((*x)[i], newx[o2n[i]]);
+ +    }
+ +    /* throw away old atoms */
+ +    sfree(at->atom);
+ +    sfree(at->atomname);
+ +    sfree(*vsite_type);
+ +    sfree(*cgnr);
+ +    sfree(*x);
+ +    /* put in the new ones */
+ +    at->nr      += nadd;
+ +    at->atom     = newatom;
+ +    at->atomname = newatomname;
+ +    *vsite_type  = newvsite_type;
+ +    *cgnr        = newcgnr;
+ +    *x           = newx;
+ +    if (at->nr > add_shift)
+ +    {
+ +        gmx_fatal(FARGS, "Added impossible amount of dummy masses "
+ +                  "(%d on a total of %d atoms)\n", nadd, at->nr-nadd);
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "After inserting new atoms:\n");
+ +        for (i = 0; i < at->nr; i++)
+ +        {
+ +            fprintf(debug, "%4d %4s %4d %4s %6d %-10s\n", i+1,
+ +                    at->atomname[i] ? *(at->atomname[i]) : "(NULL)",
+ +                    at->resinfo[at->atom[i].resind].nr,
+ +                    at->resinfo[at->atom[i].resind].name ?
+ +                    *(at->resinfo[at->atom[i].resind].name) : "(NULL)",
+ +                    (*cgnr)[i],
+ +                    ((*vsite_type)[i] == NOTSET) ?
+ +                    "NOTSET" : interaction_function[(*vsite_type)[i]].name);
+ +        }
+ +    }
+ +
+ +    /* now renumber all the interactions because of the added atoms */
+ +    for (ftype = 0; ftype < F_NRE; ftype++)
+ +    {
+ +        params = &(plist[ftype]);
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Renumbering %d %s\n", params->nr,
+ +                    interaction_function[ftype].longname);
+ +        }
+ +        for (i = 0; i < params->nr; i++)
+ +        {
+ +            for (j = 0; j < NRAL(ftype); j++)
+ +            {
+ +                if (params->param[i].a[j] >= add_shift)
+ +                {
+ +                    if (debug)
+ +                    {
+ +                        fprintf(debug, " [%u -> %u]", params->param[i].a[j],
+ +                                params->param[i].a[j]-add_shift);
+ +                    }
+ +                    params->param[i].a[j] = params->param[i].a[j]-add_shift;
+ +                }
+ +                else
+ +                {
+ +                    if (debug)
+ +                    {
+ +                        fprintf(debug, " [%u -> %d]", params->param[i].a[j],
+ +                                o2n[params->param[i].a[j]]);
+ +                    }
+ +                    params->param[i].a[j] = o2n[params->param[i].a[j]];
+ +                }
+ +            }
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "\n");
+ +            }
+ +        }
+ +    }
+ +    /* now check if atoms in the added constraints are in increasing order */
+ +    params = &(plist[F_CONSTRNC]);
+ +    for (i = 0; i < params->nr; i++)
+ +    {
+ +        if (params->param[i].AI > params->param[i].AJ)
+ +        {
+ +            j                   = params->param[i].AJ;
+ +            params->param[i].AJ = params->param[i].AI;
+ +            params->param[i].AI = j;
+ +        }
+ +    }
+ +
+ +    /* clean up */
+ +    sfree(o2n);
+ +
+ +    /* tell the user what we did */
+ +    fprintf(stderr, "Marked %d virtual sites\n", nvsite);
+ +    fprintf(stderr, "Added %d dummy masses\n", nadd);
+ +    fprintf(stderr, "Added %d new constraints\n", plist[F_CONSTRNC].nr);
+ +}
+ +
+ +/* Multiply the mass (A and B state) of every hydrogen that is not a
+ + * virtual site by mHmult.
+ + *
+ + * For each such hydrogen the bond list psb is searched for the first bond
+ + * whose other atom is not a virtual site.  Unless bDeuterate is set, that
+ + * atom's m and mB are both reduced by (mHmult-1) times the hydrogen's
+ + * current m.  A hydrogen without such a partner triggers gmx_fatal,
+ + * regardless of bDeuterate.
+ + */
+ +void do_h_mass(t_params *psb, int vsite_type[], t_atoms *at, real mHmult,
+ +               gmx_bool bDeuterate)
+ +{
+ +    int hyd, bond, heavy;
+ +
+ +    for (hyd = 0; hyd < at->nr; hyd++)
+ +    {
+ +        /* virtual sites and non-hydrogens keep their mass */
+ +        if (is_vsite(vsite_type[hyd]) || !is_hydrogen(*(at->atomname[hyd])))
+ +        {
+ +            continue;
+ +        }
+ +
+ +        /* scan the bonds until a partner that is not a virtual site shows up */
+ +        heavy = NOTSET;
+ +        bond  = 0;
+ +        while ((bond < psb->nr) && (heavy == NOTSET))
+ +        {
+ +            if ( (psb->param[bond].AI == hyd) &&
+ +                 !is_vsite(vsite_type[psb->param[bond].AJ]) )
+ +            {
+ +                heavy = psb->param[bond].AJ;
+ +            }
+ +            else if ( (psb->param[bond].AJ == hyd) &&
+ +                      !is_vsite(vsite_type[psb->param[bond].AI]) )
+ +            {
+ +                heavy = psb->param[bond].AI;
+ +            }
+ +            bond++;
+ +        }
+ +        if (heavy == NOTSET)
+ +        {
+ +            gmx_fatal(FARGS, "Unbound hydrogen atom (%d) found while adjusting mass",
+ +                      hyd+1);
+ +        }
+ +
+ +        /* when not deuterating, the extra hydrogen mass is taken from the
+ +         * bonded atom (both states use the hydrogen's m) */
+ +        if (!bDeuterate)
+ +        {
+ +            at->atom[heavy].m  -= (mHmult-1.0)*at->atom[hyd].m;
+ +            at->atom[heavy].mB -= (mHmult-1.0)*at->atom[hyd].m;
+ +        }
+ +        at->atom[hyd].m  *= mHmult;
+ +        at->atom[hyd].mB *= mHmult;
+ +    }
+ +}
diff --cc src/gromacs/legacyheaders/gmx_simd_macros.h

index 4a769efda4615d320744f2d449407c2caa476253,0000000000000000000000000000000000000000..9cd3767d5ed51f0ee6db9abf0496ed18a9417863

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/gmx_simd_macros.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/gmx_simd_macros.h
@@@ -1,463 -1,0 +1,587 @@@
- /* Undefine all defines used below so we can include this file multiple times
-  * with different settings from the same source file.
-  */
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS Development Team
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +/* The macros in this file are intended to be used for writing
+ + * architecture-independent SIMD intrinsics code.
+ + * To support a new architecture, adding macros here should be (nearly)
+ + * all that is needed.
+ + */
+ +
- #undef GMX_SIMD_WIDTH_HERE
++#ifdef _gmx_simd_macros_h_
++#error "gmx_simd_macros.h included twice"
++#else
++#define _gmx_simd_macros_h_
+ +
+ +/* NOTE: SSE2 acceleration does not include floor or blendv */
+ +
- /* float/double SIMD register type */
- #undef gmx_mm_pr
+ +
- /* integer SIMD register type, only used in the tabulated PME kernels */
- #undef gmx_epi32
++/* Uncomment the next line, without other SIMD active, for testing plain-C */
++/* #define GMX_SIMD_REFERENCE_PLAIN_C */
++#ifdef GMX_SIMD_REFERENCE_PLAIN_C
++/* Plain C SIMD reference implementation, also serves as documentation */
++#define GMX_HAVE_SIMD_MACROS
+ +
- #undef gmx_load_pr
- #undef gmx_load1_pr
- #undef gmx_set1_pr
- #undef gmx_setzero_pr
- #undef gmx_store_pr
++/* In general the reference SIMD supports any SIMD width, including 1.
++ * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
++ * The nbnxn 2xnn kernels are currently not supported.
++ */
++#define GMX_SIMD_REF_WIDTH  4
+ +
- #undef gmx_add_pr
- #undef gmx_sub_pr
- #undef gmx_mul_pr
++/* Include plain-C reference implementation, also serves as documentation */
++#include "gmx_simd_ref.h"
+ +
- /* d = gmx_madd_pr(a,b,c): d = a*b + c, could use FMA3 or FMA4 */
- #undef gmx_madd_pr
- /* d = gmx_nmsub_pr(a,b,c): d = -a*b + c, could use FMA3 or FMA4 */
- #undef gmx_nmsub_pr
- #undef gmx_max_pr
- #undef gmx_cmplt_pr
- /* gmx_blendzero_pr(real a, boolean b) does: (b ? a : 0) */
- #undef gmx_blendzero_pr
- /* Logical operations on SIMD booleans */
- #undef gmx_and_pr
- #undef gmx_or_pr
- #undef gmx_andnot_pr
++#define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
++
++/* float/double SIMD register type */
++#define gmx_mm_pr  gmx_simd_ref_pr
++
++/* boolean SIMD register type */
++#define gmx_mm_pb  gmx_simd_ref_pb
++
++/* integer SIMD register type, only for table indexing and exclusion masks */
++#define gmx_epi32  gmx_simd_ref_epi32
++#define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
++
++/* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
++#define gmx_load_pr       gmx_simd_ref_load_pr
++/* Set all SIMD register elements to *r */
++#define gmx_load1_pr      gmx_simd_ref_load1_pr
++#define gmx_set1_pr       gmx_simd_ref_set1_pr
++#define gmx_setzero_pr    gmx_simd_ref_setzero_pr
++#define gmx_store_pr      gmx_simd_ref_store_pr
++
++#define gmx_add_pr        gmx_simd_ref_add_pr
++#define gmx_sub_pr        gmx_simd_ref_sub_pr
++#define gmx_mul_pr        gmx_simd_ref_mul_pr
+ +/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
- /* Only used for PBC in bonded interactions, can be avoided */
- #undef gmx_round_pr
++#define gmx_madd_pr       gmx_simd_ref_madd_pr
++#define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
++
++#define gmx_max_pr        gmx_simd_ref_max_pr
++#define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr
++
++#define gmx_round_pr      gmx_simd_ref_round_pr
+ +
- #undef GMX_HAVE_SIMD_FLOOR
- #undef gmx_floor_pr
+ +/* Not required, only used to speed up the nbnxn tabulated PME kernels */
- #undef GMX_HAVE_SIMD_BLENDV
- #undef gmx_blendv_pr
- /* Not required, gmx_anytrue(x) returns if any of the boolean is x is True.
++#define GMX_SIMD_HAVE_FLOOR
++#ifdef GMX_SIMD_HAVE_FLOOR
++#define gmx_floor_pr      gmx_simd_ref_floor_pr
++#endif
+ +
+ +/* Not required, only used when blendv is faster than comparison */
- #undef GMX_HAVE_SIMD_ANYTRUE
- #undef gmx_anytrue_pr
++#define GMX_SIMD_HAVE_BLENDV
++#ifdef GMX_SIMD_HAVE_BLENDV
++#define gmx_blendv_pr     gmx_simd_ref_blendv_pr
++#endif
++
++/* Copy the sign of a to b, assumes b >= 0 for efficiency */
++#define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr
++
++/* Very specific operation required in the non-bonded kernels */
++#define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
++
++/* Comparison */
++#define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr
++
++/* Logical operations on SIMD booleans */
++#define gmx_and_pb        gmx_simd_ref_and_pb
++#define gmx_or_pb         gmx_simd_ref_or_pb
++
++/* Not required, gmx_anytrue_pb(x) returns whether any of the booleans in x is True.
+ + * If this is not present, define GMX_SIMD_IS_TRUE(real x),
+ + * which should return x==True, where True is True as defined in SIMD.
+ + */
- /* Integer set and cast are only used for nbnxn exclusion masks */
- #undef gmx_set1_epi32
- #undef gmx_castsi_pr
++#define GMX_SIMD_HAVE_ANYTRUE
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
++#else
++/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
++#define gmx_store_pb      gmx_simd_ref_store_pb
++#endif
+ +
- #undef gmx_load_si
- /* If the same bit is set in both input masks, return all bits 1, otherwise 0 */
- #undef gmx_checkbitmask_epi32
+ +/* For topology exclusion pair checking we need: ((a & b) ? True : False)
+ + * when we do a bit-wise and between a and b.
+ + * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b)
+ + * Otherwise we do all operations, except for the set1, in reals.
+ + */
- #undef gmx_checkbitmask_pr
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#define gmx_set1_epi32          gmx_simd_ref_set1_epi32
++#define gmx_load_si             gmx_simd_ref_load_si
++#define gmx_checkbitmask_epi32  gmx_simd_ref_checkbitmask_epi32
++#endif
++
++/* #define GMX_SIMD_HAVE_CHECKBITMASK_PR */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
++#define gmx_castsi_pr           gmx_simd_ref_castsi_pr
+ +/* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
+ + * identical 32-bit masks are set in one double and one or both can be used.
+ + */
- #undef gmx_cvttpr_epi32
- #undef gmx_cvtepi32_pr
- 
- #undef gmx_invsqrt_pr
- /* sqrt+inv+sin+cos+acos+atan2 are only used for bonded potentials */
- #undef gmx_sqrt_pr
- #undef gmx_inv_pr
- #undef gmx_sincos_pr
- #undef gmx_acos_pr
- #undef gmx_atan_pr
- 
- #undef gmx_calc_rsq_pr
- #undef gmx_sum4_pr
- 
- /* Only required for nbnxn analytical PME kernels */
- #undef gmx_pmecorrF_pr
- #undef gmx_pmecorrV_pr
- 
++#define gmx_checkbitmask_pr     gmx_simd_ref_checkbitmask_pr
++#endif
+ +
+ +/* Conversions only used for PME table lookup */
- /* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
- #undef gmx_mm_hpr
- 
- #undef gmx_load_hpr
- #undef gmx_load1_hpr
- #undef gmx_store_hpr
- #undef gmx_add_hpr
- #undef gmx_sub_hpr
++#define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
++#define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr
+ +
- #undef gmx_sum4_hpr
++/* These two functions only need to be approximate; Newton-Raphson iteration
++ * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
++ */
++#define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
++#define gmx_rcp_pr        gmx_simd_ref_rcp_pr
+ +
- #undef gmx_2hpr_to_pr
++/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
++#define GMX_SIMD_HAVE_EXP
++#ifdef GMX_SIMD_HAVE_EXP
++#define gmx_exp_pr        gmx_simd_ref_exp_pr
++#endif
++#define GMX_SIMD_HAVE_TRIGONOMETRIC
++#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
++#define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
++#define gmx_sincos_pr     gmx_simd_ref_sincos_pr
++#define gmx_acos_pr       gmx_simd_ref_acos_pr
++#define gmx_atan2_pr      gmx_simd_ref_atan2_pr
++#endif
+ +
- /* Generic macros for obtaining a SIMD aligned pointer from pointer x */
- #undef gmx_simd_align_real
- #undef gmx_simd_align_int
- 
- 
++#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
+ +
+ +
+ +/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
+ + * to instructions for) different SIMD width and float precision.
+ + *
+ + * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
+ + * The _pr suffix is replaced by _ps or _pd (for single or double precision).
+ + * Compiler settings will decide if 128-bit intrinsics will
+ + * be translated into SSE or AVX instructions.
+ + */
+ +
+ +
- #include "gmx_x86_simd_single.h"
- 
+ +#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
+ +#if defined GMX_X86_AVX_256
+ +/* We have half SIMD width support, continue */
+ +#else
+ +#error "half SIMD width intrinsics are not supported"
+ +#endif
+ +#endif
+ +
+ +
+ +#ifdef GMX_X86_SSE2
++/* This is for general x86 SIMD instruction sets that also support SSE2 */
++#define GMX_HAVE_SIMD_MACROS
++
++/* Include the highest supported x86 SIMD intrinsics + math functions */
++#ifdef GMX_X86_AVX_256
++#include "gmx_x86_avx_256.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_avx_256_double.h"
++#else
++#include "gmx_math_x86_avx_256_single.h"
++#endif
++#else
++#ifdef GMX_X86_AVX_128_FMA
++#include "gmx_x86_avx_128_fma.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_avx_128_fma_double.h"
++#else
++#include "gmx_math_x86_avx_128_fma_single.h"
++#endif
++#else
++#ifdef GMX_X86_SSE4_1
++#include "gmx_x86_sse4_1.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_sse4_1_double.h"
++#else
++#include "gmx_math_x86_sse4_1_single.h"
++#endif
++#else
++#ifdef GMX_X86_SSE2
++#include "gmx_x86_sse2.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_sse2_double.h"
++#else
++#include "gmx_math_x86_sse2_single.h"
++#endif
++#else
++#error No x86 acceleration defined
++#endif
++#endif
++#endif
++#endif
++/* exp and trigonometric functions are included above */
++#define GMX_SIMD_HAVE_EXP
++#define GMX_SIMD_HAVE_TRIGONOMETRIC
+ +
+ +#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
+ +
+ +#ifndef GMX_DOUBLE
+ +
- #define gmx_cmplt_pr      _mm_cmplt_ps
+ +#define GMX_SIMD_WIDTH_HERE  4
+ +
+ +#define gmx_mm_pr  __m128
+ +
++#define gmx_mm_pb  __m128
++
+ +#define gmx_epi32  __m128i
++#define GMX_SIMD_EPI32_WIDTH  4
+ +
+ +#define gmx_load_pr       _mm_load_ps
+ +#define gmx_load1_pr      _mm_load1_ps
+ +#define gmx_set1_pr       _mm_set1_ps
+ +#define gmx_setzero_pr    _mm_setzero_ps
+ +#define gmx_store_pr      _mm_store_ps
+ +
+ +#define gmx_add_pr        _mm_add_ps
+ +#define gmx_sub_pr        _mm_sub_ps
+ +#define gmx_mul_pr        _mm_mul_ps
+ +#ifdef GMX_X86_AVX_128_FMA
+ +#define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
+ +#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
+ +#else
+ +#define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
+ +#define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
+ +#endif
+ +#define gmx_max_pr        _mm_max_ps
- #define gmx_and_pr        _mm_and_ps
- #define gmx_or_pr         _mm_or_ps
- #define gmx_andnot_pr     _mm_andnot_ps
+ +#define gmx_blendzero_pr  _mm_and_ps
- #define GMX_HAVE_SIMD_FLOOR
++
++#define gmx_cmplt_pr      _mm_cmplt_ps
++#define gmx_and_pb        _mm_and_ps
++#define gmx_or_pb         _mm_or_ps
+ +
+ +#ifdef GMX_X86_SSE4_1
+ +#define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+ +#define gmx_floor_pr      _mm_floor_ps
+ +#else
+ +#define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
+ +#endif
+ +
+ +#ifdef GMX_X86_SSE4_1
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr    _mm_movemask_ps
++#define GMX_SIMD_HAVE_BLENDV
+ +#define gmx_blendv_pr     _mm_blendv_ps
+ +#endif
+ +
- #define gmx_castsi_pr     gmx_mm_castsi128_ps
++/* Copy the sign of a to b, assuming b >= 0: the sign bit of each element
++ * of a is isolated with a mask and OR-ed into b, so a sign bit already set
++ * in b would be kept rather than replaced.
++ */
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++    /* The value -0.0 has only the sign-bit set */
++    gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
++    return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
++}
+ +
++/* Returns b + (c AND NOT a). With a mask a whose elements have either all
++ * bits set or all bits clear, c is added to b only in the elements where
++ * a is not set; zero is added elsewhere.
++ */
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
++{
++    return _mm_add_ps(b, _mm_andnot_ps(a, c));
++}
++
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb    _mm_movemask_ps
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ +#define gmx_set1_epi32    _mm_set1_epi32
- #define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
+ +#define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
- #define gmx_invsqrt_pr    gmx_mm_invsqrt_ps
++#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
+ +
+ +#define gmx_cvttpr_epi32  _mm_cvttps_epi32
+ +#define gmx_cvtepi32_pr   _mm_cvtepi32_ps
+ +
- #define gmx_inv_pr        gmx_mm_inv_ps
++#define gmx_rsqrt_pr      _mm_rsqrt_ps
++#define gmx_rcp_pr        _mm_rcp_ps
++
++#define gmx_exp_pr        gmx_mm_exp_ps
+ +#define gmx_sqrt_pr       gmx_mm_sqrt_ps
- #define gmx_calc_rsq_pr   gmx_mm_calc_rsq_ps
- #define gmx_sum4_pr       gmx_mm_sum4_ps
- 
- #define gmx_pmecorrF_pr   gmx_mm_pmecorrF_ps
- #define gmx_pmecorrV_pr   gmx_mm_pmecorrV_ps
- 
+ +#define gmx_sincos_pr     gmx_mm_sincos_ps
+ +#define gmx_acos_pr       gmx_mm_acos_ps
+ +#define gmx_atan2_pr      gmx_mm_atan2_ps
+ +
- #include "gmx_x86_simd_double.h"
- 
+ +#else /* ifndef GMX_DOUBLE */
+ +
- #define gmx_cmplt_pr      _mm_cmplt_pd
+ +#define GMX_SIMD_WIDTH_HERE  2
+ +
+ +#define gmx_mm_pr  __m128d
+ +
++#define gmx_mm_pb  __m128d
++
+ +#define gmx_epi32  __m128i
++#define GMX_SIMD_EPI32_WIDTH  4
+ +
+ +#define gmx_load_pr       _mm_load_pd
+ +#define gmx_load1_pr      _mm_load1_pd
+ +#define gmx_set1_pr       _mm_set1_pd
+ +#define gmx_setzero_pr    _mm_setzero_pd
+ +#define gmx_store_pr      _mm_store_pd
+ +
+ +#define gmx_add_pr        _mm_add_pd
+ +#define gmx_sub_pr        _mm_sub_pd
+ +#define gmx_mul_pr        _mm_mul_pd
+ +#ifdef GMX_X86_AVX_128_FMA
+ +#define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
+ +#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
+ +#else
+ +#define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
+ +#define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
+ +#endif
+ +#define gmx_max_pr        _mm_max_pd
- #define gmx_and_pr        _mm_and_pd
- #define gmx_or_pr         _mm_or_pd
- #define gmx_andnot_pr     _mm_andnot_pd
+ +#define gmx_blendzero_pr  _mm_and_pd
- #define GMX_HAVE_SIMD_FLOOR
+ +
+ +#ifdef GMX_X86_SSE4_1
+ +#define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+ +#define gmx_floor_pr      _mm_floor_pd
+ +#else
+ +#define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
+ +/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
+ +#endif
+ +
+ +#ifdef GMX_X86_SSE4_1
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr    _mm_movemask_pd
++#define GMX_SIMD_HAVE_BLENDV
+ +#define gmx_blendv_pr     _mm_blendv_pd
+ +#endif
+ +
- #define gmx_castsi_pr     gmx_mm_castsi128_pd
++/* Copy the sign of a to b, assuming b >= 0: the sign bit of each element
++ * of a is isolated with a mask and OR-ed into b, so a sign bit already set
++ * in b would be kept rather than replaced.
++ */
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++    /* The value -0.0 has only the sign-bit set */
++    gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
++    return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
++}
++
++/* Returns b + (c AND NOT a). With a mask a whose elements have either all
++ * bits set or all bits clear, c is added to b only in the elements where
++ * a is not set; zero is added elsewhere.
++ */
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
++{
++    return _mm_add_pd(b, _mm_andnot_pd(a, c));
++}
+ +
++#define gmx_cmplt_pr      _mm_cmplt_pd
++
++#define gmx_and_pb        _mm_and_pd
++#define gmx_or_pb         _mm_or_pd
++
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb    _mm_movemask_pd
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ +#define gmx_set1_epi32    _mm_set1_epi32
- #define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
+ +#define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
- #define gmx_invsqrt_pr    gmx_mm_invsqrt_pd
++#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
+ +
+ +#define gmx_cvttpr_epi32  _mm_cvttpd_epi32
+ +#define gmx_cvtepi32_pr   _mm_cvtepi32_pd
+ +
- #define gmx_inv_pr        gmx_mm_inv_pd
++#define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
++#define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
++
++#define gmx_exp_pr        gmx_mm_exp_pd
+ +#define gmx_sqrt_pr       gmx_mm_sqrt_pd
- #define gmx_calc_rsq_pr   gmx_mm_calc_rsq_pd
- #define gmx_sum4_pr       gmx_mm_sum4_pd
- 
- #define gmx_pmecorrF_pr   gmx_mm_pmecorrF_pd
- #define gmx_pmecorrV_pr   gmx_mm_pmecorrV_pd
- 
+ +#define gmx_sincos_pr     gmx_mm_sincos_pd
+ +#define gmx_acos_pr       gmx_mm_acos_pd
+ +#define gmx_atan2_pr      gmx_mm_atan2_pd
+ +
- #include "gmx_x86_simd_single.h"
- 
+ +#endif /* ifndef GMX_DOUBLE */
+ +
+ +#else
+ +/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
+ + * so we use 256-bit SIMD.
+ + */
+ +
+ +#ifndef GMX_DOUBLE
+ +
- /* Less-than (we use ordered, non-signaling, but that's not required) */
- #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
+ +#define GMX_SIMD_WIDTH_HERE  8
+ +
+ +#define gmx_mm_pr  __m256
+ +
++#define gmx_mm_pb  __m256
++
+ +#define gmx_epi32  __m256i
++#define GMX_SIMD_EPI32_WIDTH  8
+ +
+ +#define gmx_load_pr       _mm256_load_ps
+ +#define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
+ +#define gmx_set1_pr       _mm256_set1_ps
+ +#define gmx_setzero_pr    _mm256_setzero_ps
+ +#define gmx_store_pr      _mm256_store_ps
+ +
+ +#define gmx_add_pr        _mm256_add_ps
+ +#define gmx_sub_pr        _mm256_sub_ps
+ +#define gmx_mul_pr        _mm256_mul_ps
+ +#define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
+ +#define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
+ +#define gmx_max_pr        _mm256_max_ps
- #define gmx_and_pr        _mm256_and_ps
- #define gmx_or_pr         _mm256_or_ps
- #define gmx_andnot_pr     _mm256_andnot_ps
+ +#define gmx_blendzero_pr  _mm256_and_ps
- #define GMX_HAVE_SIMD_FLOOR
+ +
+ +#define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+ +#define gmx_floor_pr      _mm256_floor_ps
+ +
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr    _mm256_movemask_ps
++#define GMX_SIMD_HAVE_BLENDV
+ +#define gmx_blendv_pr     _mm256_blendv_ps
+ +
- #define gmx_invsqrt_pr    gmx_mm256_invsqrt_ps
++/* Copy the sign of a to b, assuming b >= 0: the sign bit of each element
++ * of a is isolated with a mask and OR-ed into b, so a sign bit already set
++ * in b would be kept rather than replaced.
++ */
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++    /* The value -0.0 has only the sign-bit set */
++    gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
++    return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
++}
++
++/* Returns b + (c AND NOT a). With a mask a whose elements have either all
++ * bits set or all bits clear, c is added to b only in the elements where
++ * a is not set; zero is added elsewhere.
++ */
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
++{
++    return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
++}
++
++/* Less-than (we use ordered, non-signaling, but that's not required) */
++#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
++#define gmx_and_pb        _mm256_and_ps
++#define gmx_or_pb         _mm256_or_ps
+ +
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb    _mm256_movemask_ps
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_PR
+ +#define gmx_set1_epi32    _mm256_set1_epi32
+ +#define gmx_castsi_pr     _mm256_castsi256_ps
+ +/* With <= 16 bits used the cast and conversion should not be required,
+ + * since only mantissa bits are set and that would give a non-zero float,
+ + * but with the Intel compiler this does not work correctly.
+ + */
+ +#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c)
+ +
+ +#define gmx_cvttpr_epi32  _mm256_cvttps_epi32
+ +
- #define gmx_inv_pr        gmx_mm256_inv_ps
++#define gmx_rsqrt_pr      _mm256_rsqrt_ps
++#define gmx_rcp_pr        _mm256_rcp_ps
++
++#define gmx_exp_pr        gmx_mm256_exp_ps
+ +#define gmx_sqrt_pr       gmx_mm256_sqrt_ps
- #define gmx_calc_rsq_pr   gmx_mm256_calc_rsq_ps
- #define gmx_sum4_pr       gmx_mm256_sum4_ps
- 
- #define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_ps
- #define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_ps
- 
+ +#define gmx_sincos_pr     gmx_mm256_sincos_ps
+ +#define gmx_acos_pr       gmx_mm256_acos_ps
+ +#define gmx_atan2_pr      gmx_mm256_atan2_ps
+ +
- #include "gmx_x86_simd_double.h"
- 
+ +#else
+ +
- /* Less-than (we use ordered, non-signaling, but that's not required) */
- #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
+ +#define GMX_SIMD_WIDTH_HERE  4
+ +
+ +#define gmx_mm_pr  __m256d
+ +
++#define gmx_mm_pb  __m256d
++
+ +/* We use 128-bit integer registers because of missing 256-bit operations */
+ +#define gmx_epi32  __m128i
++#define GMX_SIMD_EPI32_WIDTH  4
+ +
+ +#define gmx_load_pr       _mm256_load_pd
+ +#define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
+ +#define gmx_set1_pr       _mm256_set1_pd
+ +#define gmx_setzero_pr    _mm256_setzero_pd
+ +#define gmx_store_pr      _mm256_store_pd
+ +
+ +#define gmx_add_pr        _mm256_add_pd
+ +#define gmx_sub_pr        _mm256_sub_pd
+ +#define gmx_mul_pr        _mm256_mul_pd
+ +#define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
+ +#define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
+ +#define gmx_max_pr        _mm256_max_pd
- #define gmx_and_pr        _mm256_and_pd
- #define gmx_or_pr         _mm256_or_pd
- #define gmx_andnot_pr     _mm256_andnot_pd
+ +#define gmx_blendzero_pr  _mm256_and_pd
- #define GMX_HAVE_SIMD_FLOOR
+ +
+ +#define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+ +#define gmx_floor_pr      _mm256_floor_pd
+ +
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr    _mm256_movemask_pd
++#define GMX_SIMD_HAVE_BLENDV
+ +#define gmx_blendv_pr     _mm256_blendv_pd
+ +
- #define gmx_invsqrt_pr    gmx_mm256_invsqrt_pd
++/* Copy the sign of a to b, assuming b >= 0: the sign bit of each element
++ * of a is isolated with a mask and OR-ed into b, so a sign bit already set
++ * in b would be kept rather than replaced.
++ */
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++    /* The value -0.0 has only the sign-bit set */
++    gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
++    return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
++}
++
++/* Returns b + (c AND NOT a). With a mask a whose elements have either all
++ * bits set or all bits clear, c is added to b only in the elements where
++ * a is not set; zero is added elsewhere.
++ */
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
++{
++    return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
++}
++
++/* Less-than (we use ordered, non-signaling, but that's not required) */
++#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
++
++#define gmx_and_pb        _mm256_and_pd
++#define gmx_or_pb         _mm256_or_pd
+ +
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb    _mm256_movemask_pd
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_PR
+ +#define gmx_set1_epi32    _mm256_set1_epi32
+ +#define gmx_castsi_pr     _mm256_castsi256_pd
+ +/* With <= 16 bits used the cast and conversion should not be required,
+ + * since only mantissa bits are set and that would give a non-zero float,
+ + * but with the Intel compiler this does not work correctly.
+ + * Because AVX does not have int->double conversion, we convert via float.
+ + */
+ +#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c)
+ +
+ +#define gmx_cvttpr_epi32  _mm256_cvttpd_epi32
+ +
- #define gmx_inv_pr        gmx_mm256_inv_pd
++#define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
++#define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
++
++#define gmx_exp_pr        gmx_mm256_exp_pd
+ +#define gmx_sqrt_pr       gmx_mm256_sqrt_pd
- #define gmx_calc_rsq_pr   gmx_mm256_calc_rsq_pd
- #define gmx_sum4_pr       gmx_mm256_sum4_pd
- 
- #define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_pd
- #define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_pd
- 
+ +#define gmx_sincos_pr     gmx_mm256_sincos_pd
+ +#define gmx_acos_pr       gmx_mm256_acos_pd
+ +#define gmx_atan2_pr      gmx_mm256_atan2_pd
+ +
- /* Generic macros to extract a SIMD aligned pointer from a pointer x.
+ +#endif /* GMX_DOUBLE */
+ +
+ +#endif /* 128- or 256-bit x86 SIMD */
+ +
+ +#endif /* GMX_X86_SSE2 */
+ +
+ +
- #define gmx_simd_align_real(x)  (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))))
++#ifdef GMX_HAVE_SIMD_MACROS
++/* Generic functions to extract a SIMD aligned pointer from a pointer x.
+ + * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
+ + * to how many you want to use, to avoid indexing outside the aligned region.
+ + */
+ +
- #define gmx_simd_align_int(x)   (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))))
++/* Advance x by GMX_SIMD_WIDTH_HERE reals and clear the low address bits
++ * selected by GMX_SIMD_WIDTH_HERE*sizeof(real)-1. When that byte count is a
++ * power of two, this rounds the advanced address down to a multiple of it.
++ * The const qualifier of x is cast away in the returned pointer.
++ */
++static gmx_inline real *
++gmx_simd_align_real(const real *x)
++{
++    const size_t align_mask = ~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1));
++    const size_t upper_addr = (size_t)(x + GMX_SIMD_WIDTH_HERE);
++
++    return (real *)(upper_addr & align_mask);
++}
++
++/* Advance x by GMX_SIMD_WIDTH_HERE ints and clear the low address bits
++ * selected by GMX_SIMD_WIDTH_HERE*sizeof(int)-1. When that byte count is a
++ * power of two, this rounds the advanced address down to a multiple of it.
++ * The const qualifier of x is cast away in the returned pointer.
++ */
++static gmx_inline int *
++gmx_simd_align_int(const int *x)
++{
++    const size_t align_mask = ~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1));
++    const size_t upper_addr = (size_t)(x + GMX_SIMD_WIDTH_HERE);
++
++    return (int  *)(upper_addr & align_mask);
++}
++
++
++/* Include the math functions which only need the above macros,
++ * generally these are the ones that don't need masking operations.
++ */
++#ifdef GMX_DOUBLE
++#include "gmx_simd_math_double.h"
++#else
++#include "gmx_simd_math_single.h"
++#endif
++
++#endif /* GMX_HAVE_SIMD_MACROS */
+ +
++#endif /* _gmx_simd_macros_h_ */
diff --cc src/gromacs/legacyheaders/gmx_simd_math_double.h

index 0000000000000000000000000000000000000000,6117dd61497c06764b51f54676602328dda9702e..6117dd61497c06764b51f54676602328dda9702e

mode 000000,100644..100644
--- /dev/null
--- 2/include/gmx_simd_math_double.h
+++ b/src/gromacs/legacyheaders/gmx_simd_math_double.h
diff --cc src/gromacs/legacyheaders/gmx_simd_math_single.h

index 0000000000000000000000000000000000000000,28feeaa07519c1a4fdc468cb421344d32b530751..28feeaa07519c1a4fdc468cb421344d32b530751

mode 000000,100644..100644
--- /dev/null
--- 2/include/gmx_simd_math_single.h
+++ b/src/gromacs/legacyheaders/gmx_simd_math_single.h
diff --cc src/gromacs/legacyheaders/gmx_simd_ref.h

index 0000000000000000000000000000000000000000,b20ab2f055309813a9b7cfa21865b7761c35689c..b20ab2f055309813a9b7cfa21865b7761c35689c

mode 000000,100644..100644
--- /dev/null
--- 2/include/gmx_simd_ref.h
+++ b/src/gromacs/legacyheaders/gmx_simd_ref.h
diff --cc src/gromacs/legacyheaders/gmx_simd_vec.h

index 0000000000000000000000000000000000000000,95dc0d611490cbd5999465d2ac4117d2fed471ab..95dc0d611490cbd5999465d2ac4117d2fed471ab

mode 000000,100644..100644
--- /dev/null
--- 2/include/gmx_simd_vec.h
+++ b/src/gromacs/legacyheaders/gmx_simd_vec.h
diff --cc src/gromacs/legacyheaders/types/nb_verlet.h

index 22e9297686918691a18278e77c3f23c74be9ebd1,0000000000000000000000000000000000000000..6caa7094222aa3f6e903d173b6b62d33f923c570

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/types/nb_verlet.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/types/nb_verlet.h
@@@ -1,139 -1,0 +1,145 @@@
- #if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_NBNXN_SIMD_BITWIDTH  256
- #else
- #define GMX_NBNXN_SIMD_BITWIDTH  128
- #endif
- 
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#ifndef NB_VERLET_H
+ +#define NB_VERLET_H
+ +
+ +#include "nbnxn_pairlist.h"
+ +#include "nbnxn_cuda_types_ext.h"
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
++
++/* For testing the reference plain-C SIMD kernels, uncomment the next lines,
++ * as well as the GMX_SIMD_REFERENCE_PLAIN_C define in gmx_simd_macros.h
++ * The actual SIMD width is set in gmx_simd_macros.h
++ * The 4xN reference kernels support 2-, 4- and 8-way SIMD.
++ * The 2x(N+N) reference kernels support 8- and 16-way SIMD.
++ */
++/* #define GMX_NBNXN_SIMD */
++/* #define GMX_NBNXN_SIMD_4XN */
++/* #define GMX_NBNXN_SIMD_2XNN */
++
++
+ +#ifdef GMX_X86_SSE2
+ +/* Use SIMD accelerated nbnxn search and kernels */
+ +#define GMX_NBNXN_SIMD
+ +
+ +/* Uncomment the next line to use, slower, 128-bit SIMD with AVX-256 */
+ +/* #define GMX_NBNXN_HALF_WIDTH_SIMD */
+ +
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
+ +/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ + * Currently the 2xNN SIMD kernels only make sense with:
+ + *  8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ + * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
+ + */
+ +#define GMX_NBNXN_SIMD_4XN
++#if defined GMX_X86_AVX_256 && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD)
+ +#define GMX_NBNXN_SIMD_2XNN
+ +#endif
+ +
+ +#endif
+ +
+ +
+ +/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation.
+ + *  nbnxnkNotSet is explicitly 0 and nbnxnkNR is the last enumerator, so the
+ + *  value of nbnxnkNR equals the number of entries listed before it.
+ + */
+ +typedef enum
+ +{
+ +    nbnxnkNotSet = 0,
+ +    nbnxnk4x4_PlainC,
+ +    nbnxnk4xN_SIMD_4xN,
+ +    nbnxnk4xN_SIMD_2xNN,
+ +    nbnxnk8x8x8_CUDA,
+ +    nbnxnk8x8x8_PlainC,
+ +    nbnxnkNR
+ +} nbnxn_kernel_type;
+ +
+ +/*! Return a string identifying the kernel type */
+ +const char *lookup_nbnxn_kernel_name(int kernel_type);
+ +
+ +enum {
+ +    ewaldexclTable, ewaldexclAnalytical
+ +};
+ +
+ +/* Atom locality indicator: local, non-local, all, used for calls to:
+ +   gridding, pair-search, force calculation, x/f buffer operations */
+ +enum {
+ +    eatLocal = 0, eatNonlocal = 1, eatAll
+ +};
+ +
+ +#define LOCAL_A(x)               ((x) == eatLocal)
+ +#define NONLOCAL_A(x)            ((x) == eatNonlocal)
+ +#define LOCAL_OR_NONLOCAL_A(x)   (LOCAL_A(x) || NONLOCAL_A(x))
+ +
+ +/* Interaction locality indicator (used in pair-list search/calculations):
+ +    - local interactions require local atom data and affect local output only;
+ +    - non-local interactions require both local and non-local atom data and
+ +      affect both local- and non-local output. */
+ +enum {
+ +    eintLocal = 0, eintNonlocal = 1
+ +};
+ +
+ +#define LOCAL_I(x)               ((x) == eintLocal)
+ +#define NONLOCAL_I(x)            ((x) == eintNonlocal)
+ +
+ +enum {
+ +    enbvClearFNo, enbvClearFYes
+ +};
+ +
+ +/* Data for one non-bonded interaction group: its pair list(s) and atom
+ + * data, plus the kernel type and Ewald exclusion setting stored as ints.
+ + * NOTE(review): kernel_type presumably holds an nbnxn_kernel_type value and
+ + * ewald_excl one of ewaldexclTable/ewaldexclAnalytical - confirm at the users.
+ + */
+ +typedef struct {
+ +    nbnxn_pairlist_set_t  nbl_lists;   /* pair list(s)                       */
+ +    nbnxn_atomdata_t     *nbat;        /* atom data                          */
+ +    int                   kernel_type; /* non-bonded kernel - see enum above */
+ +    int                   ewald_excl;  /* Ewald exclusion - see enum above   */
+ +} nonbonded_verlet_group_t;
+ +
+ +/* non-bonded data structure with Verlet-type cut-off:
+ + * search data, up to two interaction groups (ngrp gives the number in use)
+ + * and the GPU-related settings.
+ + * NOTE(review): grp is presumably indexed with eintLocal/eintNonlocal - confirm.
+ + */
+ +typedef struct {
+ +    nbnxn_search_t           nbs;             /* n vs n atom pair searching data       */
+ +    int                      ngrp;            /* number of interaction groups          */
+ +    nonbonded_verlet_group_t grp[2];          /* local and non-local interaction group */
+ +
+ +    gmx_bool                 bUseGPU;         /* TRUE when GPU acceleration is used */
+ +    nbnxn_cuda_ptr_t         cu_nbv;          /* pointer to CUDA nb verlet data     */
+ +    int                      min_ci_balanced; /* pair list balancing parameter
+ +                                                 used for the 8x8x8 CUDA kernels    */
+ +} nonbonded_verlet_t;
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif /* NB_VERLET_H */
diff --cc src/gromacs/legacyheaders/types/nbnxn_pairlist.h

index 569cec073992632ec6d201794af1883ba14c372e,0000000000000000000000000000000000000000..77d269a16075df018cc6ec6eeb4feb3c0a5eb195

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/types/nbnxn_pairlist.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h
@@@ -1,252 -1,0 +1,269 @@@
-     int      cj;    /* The j-cluster                    */
-     unsigned excl;  /* The exclusion (interaction) bits */
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#ifndef _nbnxn_pairlist_h
+ +#define _nbnxn_pairlist_h
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
+ +/* A buffer data structure of 64 bytes
+ + * to be placed at the beginning and end of structs
+ + * to avoid cache invalidation of the real contents
+ + * of the struct by writes to neighboring memory.
+ + */
+ +typedef struct {
+ +    int dummy[16];
+ +} gmx_cache_protect_t;
+ +
+ +/* Abstract type for pair searching data */
+ +typedef struct nbnxn_search * nbnxn_search_t;
+ +
+ +/* Function that should return a pointer *ptr to memory
+ + * of size nbytes.
+ + * Error handling should be done within this function.
+ + */
+ +typedef void nbnxn_alloc_t (void **ptr, size_t nbytes);
+ +
+ +/* Function that should free the memory pointed to by *ptr.
+ + * NULL should not be passed to this function.
+ + */
+ +typedef void nbnxn_free_t (void *ptr);
+ +
++/* This is the actual cluster-pair list j-entry.
++ * cj is the j-cluster.
++ * The interaction bits in excl are indexed i-major, j-minor.
++ * The cj entries are sorted such that ones with exclusions come first.
++ * This means that once a full mask (=NBNXN_INTERACTION_MASK_ALL)
++ * is found, all subsequent j-entries in the i-entry also have full masks.
++ */
+ +typedef struct {
-     unsigned pair[32];     /* Exclusion bits for one warp,                *
-                             * each unsigned has bit for 4*8 i clusters    */
++    int      cj;    /* The j-cluster                             */
++    unsigned excl;  /* The topology exclusion (interaction) bits */
+ +} nbnxn_cj_t;
+ +
+ +/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ + * The upper bits contain information for non-bonded kernel optimization.
+ + * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ + * But three flags can be used to skip interactions, currently only for subc=0
+ + * !(shift & NBNXN_CI_DO_LJ(subc))   => we can skip LJ for all pairs
+ + * shift & NBNXN_CI_HALF_LJ(subc)    => we can skip LJ for the second half of i
+ + * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ + */
+ +#define NBNXN_CI_SHIFT          127
+ +#define NBNXN_CI_DO_LJ(subc)    (1<<(7+3*(subc)))
+ +#define NBNXN_CI_HALF_LJ(subc)  (1<<(8+3*(subc)))
+ +#define NBNXN_CI_DO_COUL(subc)  (1<<(9+3*(subc)))
+ +
+ +/* Simple pair-list i-unit */
+ +typedef struct {
+ +    int ci;             /* i-cluster             */
+ +    int shift;          /* Shift vector index plus possible flags, see above */
+ +    int cj_ind_start;   /* Start index into cj   */
+ +    int cj_ind_end;     /* End index into cj     */
+ +} nbnxn_ci_t;
+ +
+ +/* Grouped pair-list i-unit */
+ +typedef struct {
+ +    int sci;            /* i-super-cluster       */
+ +    int shift;          /* Shift vector index plus possible flags */
+ +    int cj4_ind_start;  /* Start index into cj4  */
+ +    int cj4_ind_end;    /* End index into cj4    */
+ +} nbnxn_sci_t;
+ +
+ +typedef struct {
+ +    unsigned imask;        /* The i-cluster interactions mask for 1 warp  */
+ +    int      excl_ind;     /* Index into the exclusion array for 1 warp   */
+ +} nbnxn_im_ei_t;
+ +
+ +typedef struct {
+ +    int           cj[4];   /* The 4 j-clusters                            */
+ +    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
+ +} nbnxn_cj4_t;
+ +
+ +typedef struct {
-     real                    *simd_4xn_diag;   /* indices to set the SIMD 4xN diagonal masks    */
-     real                    *simd_2xnn_diag;  /* indices to set the SIMD 2x(N+N)diagonal masks */
-     unsigned                *simd_excl_mask;  /* exclusion masks for SIMD topology exclusions  */
++    unsigned pair[32];     /* Topology exclusion interaction bits for one warp,
++                            * each unsigned has bitS for 4*8 i clusters
++                            */
+ +} nbnxn_excl_t;
+ +
+ +typedef struct {
+ +    gmx_cache_protect_t cp0;
+ +
+ +    nbnxn_alloc_t      *alloc;
+ +    nbnxn_free_t       *free;
+ +
+ +    gmx_bool            bSimple;         /* Simple list has na_sc=na_s and uses cj   *
+ +                                          * Complex list uses cj4                    */
+ +
+ +    int                     na_ci;       /* The number of atoms per i-cluster        */
+ +    int                     na_cj;       /* The number of atoms per j-cluster        */
+ +    int                     na_sc;       /* The number of atoms per super cluster    */
+ +    real                    rlist;       /* The radius for constructing the list     */
+ +    int                     nci;         /* The number of i-clusters in the list     */
+ +    nbnxn_ci_t             *ci;          /* The i-cluster list, size nci             */
+ +    int                     ci_nalloc;   /* The allocation size of ci                */
+ +    int                     nsci;        /* The number of i-super-clusters in the list */
+ +    nbnxn_sci_t            *sci;         /* The i-super-cluster list                 */
+ +    int                     sci_nalloc;  /* The allocation size of sci               */
+ +
+ +    int                     ncj;         /* The number of j-clusters in the list     */
+ +    nbnxn_cj_t             *cj;          /* The j-cluster list, size ncj             */
+ +    int                     cj_nalloc;   /* The allocation size of cj                */
+ +
+ +    int                     ncj4;        /* The total number of 4*j clusters         */
+ +    nbnxn_cj4_t            *cj4;         /* The 4*j cluster list, size ncj4          */
+ +    int                     cj4_nalloc;  /* The allocation size of cj4               */
+ +    int                     nexcl;       /* The count for excl                       */
+ +    nbnxn_excl_t           *excl;        /* Atom interaction bits (non-exclusions)   */
+ +    int                     excl_nalloc; /* The allocation size for excl             */
+ +    int                     nci_tot;     /* The total number of i clusters           */
+ +
+ +    struct nbnxn_list_work *work;
+ +
+ +    gmx_cache_protect_t     cp1;
+ +} nbnxn_pairlist_t;
+ +
+ +typedef struct {
+ +    int                nnbl;        /* number of lists */
+ +    nbnxn_pairlist_t **nbl;         /* lists */
+ +    gmx_bool           bCombined;   /* TRUE if lists get combined into one (the 1st) */
+ +    gmx_bool           bSimple;     /* TRUE if the list is of type "simple"
+ +                                       (na_sc=na_s, no super-clusters used) */
+ +    int                natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */
+ +    int                natpair_lj;  /* Total number of atom pairs for LJ kernel   */
+ +    int                natpair_q;   /* Total number of atom pairs for Q kernel    */
+ +} nbnxn_pairlist_set_t;
+ +
+ +enum {
+ +    nbatXYZ, nbatXYZQ, nbatX4, nbatX8
+ +};
+ +
+ +typedef struct {
+ +    real *f;      /* f, size natoms*fstride                             */
+ +    real *fshift; /* Shift force array, size SHIFTS*DIM                 */
+ +    int   nV;     /* The size of *Vvdw and *Vc                          */
+ +    real *Vvdw;   /* Temporary Van der Waals group energy storage       */
+ +    real *Vc;     /* Temporary Coulomb group energy storage             */
+ +    int   nVS;    /* The size of *VSvdw and *VSc                        */
+ +    real *VSvdw;  /* Temporary SIMD Van der Waals group energy storage  */
+ +    real *VSc;    /* Temporary SIMD Coulomb group energy storage        */
+ +} nbnxn_atomdata_output_t;
+ +
+ +/* Block size in atoms for the non-bonded thread force-buffer reduction,
+ + * should be a multiple of all cell and x86 SIMD sizes (i.e. 2, 4 and 8).
+ + * Should be small to reduce the reduction and zeroing cost,
+ + * but too small will result in overhead.
+ + * Currently the block size is NBNXN_BUFFERFLAG_SIZE*3*sizeof(real)=192 bytes.
+ + */
+ +#ifdef GMX_DOUBLE
+ +#define NBNXN_BUFFERFLAG_SIZE   8
+ +#else
+ +#define NBNXN_BUFFERFLAG_SIZE  16
+ +#endif
+ +
+ +/* We currently store the reduction flags as bits in an unsigned int.
+ + * In most cases this limits the number of flags to 32.
+ + * The reduction will automatically disable the flagging and do a full
+ + * reduction when the flags won't fit, but this will lead to very slow
+ + * reduction. As we anyhow don't expect reasonable performance with
+ + * more than 32 threads, we put in this hard limit.
+ + * You can increase this number, but the reduction will be very slow.
+ + */
+ +#define NBNXN_BUFFERFLAG_MAX_THREADS  32
+ +
+ +/* Flags for telling if threads write to force output buffers */
+ +typedef struct {
+ +    int       nflag;       /* The number of flag blocks                         */
+ +    unsigned *flag;        /* Bit i is set when thread i writes to a cell-block */
+ +    int       flag_nalloc; /* Allocation size of flag                           */
+ +} nbnxn_buffer_flags_t;
+ +
+ +/* LJ combination rules: geometric, Lorentz-Berthelot, none */
+ +enum {
+ +    ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR
+ +};
+ +
+ +typedef struct {
+ +    nbnxn_alloc_t           *alloc;
+ +    nbnxn_free_t            *free;
+ +    int                      ntype;           /* The number of different atom types                 */
+ +    real                    *nbfp;            /* Lennard-Jones 6*C6 and 12*C12 params, size ntype^2*2 */
+ +    int                      comb_rule;       /* Combination rule, see enum above                   */
+ +    real                    *nbfp_comb;       /* LJ parameter per atom type, size ntype*2           */
+ +    real                    *nbfp_s4;         /* As nbfp, but with stride 4, size ntype^2*4. This
+ +                                               * might suit 4-wide SIMD loads of two values (e.g.
+ +                                               * two floats in single precision on x86).            */
+ +    int                      natoms;          /* Number of atoms                                    */
+ +    int                      natoms_local;    /* Number of local atoms                           */
+ +    int                     *type;            /* Atom types                                         */
+ +    real                    *lj_comb;         /* LJ parameters per atom for combining for pairs     */
+ +    int                      XFormat;         /* The format of x (and q), enum                      */
+ +    int                      FFormat;         /* The format of f, enum                              */
+ +    real                    *q;               /* Charges, can be NULL if incorporated in x          */
+ +    int                      na_c;            /* The number of atoms per cluster                    */
+ +    int                      nenergrp;        /* The number of energy groups                        */
+ +    int                      neg_2log;        /* Log2 of nenergrp                                   */
+ +    int                     *energrp;         /* The energy groups per cluster, can be NULL         */
+ +    gmx_bool                 bDynamicBox;     /* Do we need to update shift_vec every step?    */
+ +    rvec                    *shift_vec;       /* Shift vectors, copied from t_forcerec              */
+ +    int                      xstride;         /* stride for a coordinate in x (usually 3 or 4)      */
+ +    int                      fstride;         /* stride for a coordinate in f (usually 3 or 4)      */
+ +    real                    *x;               /* x and possibly q, size natoms*xstride              */
++
++    /* j-atom minus i-atom index for generating self and Newton exclusions
++     * for cluster-cluster pairs on the diagonal, for 4xn and 2xnn kernels.
++     */
++    real                    *simd_4xn_diagonal_j_minus_i;
++    real                    *simd_2xnn_diagonal_j_minus_i;
++    /* Filters for topology exclusion masks for the SIMD kernels.
++     * filter2 is the same as filter1, but with each element duplicated.
++     */
++    unsigned                *simd_exclusion_filter1;
++    unsigned                *simd_exclusion_filter2;
++
+ +    int                      nout;            /* The number of force arrays                         */
+ +    nbnxn_atomdata_output_t *out;             /* Output data structures               */
+ +    int                      nalloc;          /* Allocation size of all arrays (for x/f *x/fstride) */
+ +    gmx_bool                 bUseBufferFlags; /* Use the flags or operate on all atoms     */
+ +    nbnxn_buffer_flags_t     buffer_flags;    /* Flags for buffer zeroing+reduc.  */
+ +} nbnxn_atomdata_t;
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif
diff --cc src/gromacs/linearalgebra/CMakeLists.txt

index d1e4c23fe9cfee0749adcb06af3c37e085fa5341,0000000000000000000000000000000000000000..f7ec12ed8e69217b25ae407a9ec97a68e5052f5e

mode 100644,000000..100644
--- 1/src/gromacs/linearalgebra/CMakeLists.txt
--- /dev/null
+++ b/src/gromacs/linearalgebra/CMakeLists.txt
@@@ -1,24 -1,0 +1,24 @@@
- endif (NOT GMX_EXTERNAL_BLAS)
+ +file(GLOB LINEARALGEBRA_SOURCES *.c)
+ +
+ +if (NOT GMX_EXTERNAL_BLAS)
+ +    file(GLOB BLAS_SOURCES gmx_blas/*.c)
- endif (NOT GMX_EXTERNAL_LAPACK)
++endif()
+ +
+ +if (NOT GMX_EXTERNAL_LAPACK)
+ +    file(GLOB LAPACK_SOURCES gmx_lapack/*.c)
++endif()
+ +
+ +set(LINEARALGEBRA_SOURCES
+ +    ${LINEARALGEBRA_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES})
+ +
+ +set(LIBGROMACS_SOURCES
+ +    ${LIBGROMACS_SOURCES} ${LINEARALGEBRA_SOURCES} PARENT_SCOPE)
+ +
+ +set(LINEARALGEBRA_PUBLIC_HEADERS
+ +    eigensolver.h
+ +    matrix.h
+ +    mtxio.h
+ +    sparsematrix.h)
+ +install(FILES ${LINEARALGEBRA_PUBLIC_HEADERS}
+ +        DESTINATION ${INCL_INSTALL_DIR}/gromacs/linearalgebra
+ +        COMPONENT development)
diff --cc src/gromacs/mdlib/forcerec.c

index 9cb2caa0a64479c627714f3cb00906ae8d5c0029,0000000000000000000000000000000000000000..83da939f397af67a521273e35041c7a1ec6ff56d

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/forcerec.c
--- /dev/null
+++ b/src/gromacs/mdlib/forcerec.c
@@@ -1,2965 -1,0 +1,2972 @@@
-         case nbnxnkNotSet: returnvalue     = "not set"; break;
-         case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
- #ifndef GMX_NBNXN_SIMD
-         case nbnxnk4xN_SIMD_4xN: returnvalue  = "not available"; break;
-         case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
- #else
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROwing Monsters And Cloning Shrimps
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +#include <string.h>
+ +#include <assert.h>
+ +#include "sysstuff.h"
+ +#include "typedefs.h"
+ +#include "vec.h"
+ +#include "maths.h"
+ +#include "macros.h"
+ +#include "smalloc.h"
+ +#include "macros.h"
+ +#include "gmx_fatal.h"
+ +#include "gmx_fatal_collective.h"
+ +#include "physics.h"
+ +#include "force.h"
+ +#include "tables.h"
+ +#include "nonbonded.h"
+ +#include "invblock.h"
+ +#include "names.h"
+ +#include "network.h"
+ +#include "pbc.h"
+ +#include "ns.h"
+ +#include "mshift.h"
+ +#include "txtdump.h"
+ +#include "coulomb.h"
+ +#include "md_support.h"
+ +#include "md_logging.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +#include "qmmm.h"
+ +#include "copyrite.h"
+ +#include "mtop_util.h"
+ +#include "nbnxn_search.h"
+ +#include "nbnxn_atomdata.h"
+ +#include "nbnxn_consts.h"
+ +#include "statutil.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "gmx_detect_hardware.h"
+ +
+ +#ifdef _MSC_VER
+ +/* MSVC definition for __cpuid() */
+ +#include <intrin.h>
+ +#endif
+ +
+ +#include "types/nbnxn_cuda_types_ext.h"
+ +#include "gpu_utils.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +#include "pmalloc_cuda.h"
+ +
+ +t_forcerec *mk_forcerec(void)
+ +{
+ +    t_forcerec *fr;
+ +
+ +    snew(fr, 1);
+ +
+ +    return fr;
+ +}
+ +
+ +#ifdef DEBUG
+ +static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
+ +{
+ +    int i, j;
+ +
+ +    for (i = 0; (i < atnr); i++)
+ +    {
+ +        for (j = 0; (j < atnr); j++)
+ +        {
+ +            fprintf(fp, "%2d - %2d", i, j);
+ +            if (bBHAM)
+ +            {
+ +                fprintf(fp, "  a=%10g, b=%10g, c=%10g\n", BHAMA(nbfp, atnr, i, j),
+ +                        BHAMB(nbfp, atnr, i, j), BHAMC(nbfp, atnr, i, j)/6.0);
+ +            }
+ +            else
+ +            {
+ +                fprintf(fp, "  c6=%10g, c12=%10g\n", C6(nbfp, atnr, i, j)/6.0,
+ +                        C12(nbfp, atnr, i, j)/12.0);
+ +            }
+ +        }
+ +    }
+ +}
+ +#endif
+ +
+ +static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
+ +{
+ +    real *nbfp;
+ +    int   i, j, k, atnr;
+ +
+ +    atnr = idef->atnr;
+ +    if (bBHAM)
+ +    {
+ +        snew(nbfp, 3*atnr*atnr);
+ +        for (i = k = 0; (i < atnr); i++)
+ +        {
+ +            for (j = 0; (j < atnr); j++, k++)
+ +            {
+ +                BHAMA(nbfp, atnr, i, j) = idef->iparams[k].bham.a;
+ +                BHAMB(nbfp, atnr, i, j) = idef->iparams[k].bham.b;
+ +                /* nbfp now includes the 6.0 derivative prefactor */
+ +                BHAMC(nbfp, atnr, i, j) = idef->iparams[k].bham.c*6.0;
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        snew(nbfp, 2*atnr*atnr);
+ +        for (i = k = 0; (i < atnr); i++)
+ +        {
+ +            for (j = 0; (j < atnr); j++, k++)
+ +            {
+ +                /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ +                C6(nbfp, atnr, i, j)   = idef->iparams[k].lj.c6*6.0;
+ +                C12(nbfp, atnr, i, j)  = idef->iparams[k].lj.c12*12.0;
+ +            }
+ +        }
+ +    }
+ +
+ +    return nbfp;
+ +}
+ +
+ +/* This routine sets fr->solvent_opt to the most common solvent in the
+ + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
+ + * the fr->solvent_type array with the correct type (or esolNO).
+ + *
+ + * Charge groups that fulfill the conditions but are not identical to the
+ + * most common one will be marked as esolNO in the solvent_type array.
+ + *
+ + * TIP3p is identical to SPC for these purposes, so we call it
+ + * SPC in the arrays (Apologies to Bill Jorgensen ;-)
+ + *
+ + * NOTE: QM particles should not
+ + * become an optimized solvent, not even if there is only one charge
+ + * group in the QM region.
+ + */
+ +
+ +typedef struct
+ +{
+ +    int    model;
+ +    int    count;
+ +    int    vdwtype[4];
+ +    real   charge[4];
+ +} solvent_parameters_t;
+ +
+ +static void
+ +check_solvent_cg(const gmx_moltype_t    *molt,
+ +                 int                     cg0,
+ +                 int                     nmol,
+ +                 const unsigned char    *qm_grpnr,
+ +                 const t_grps           *qm_grps,
+ +                 t_forcerec   *          fr,
+ +                 int                    *n_solvent_parameters,
+ +                 solvent_parameters_t  **solvent_parameters_p,
+ +                 int                     cginfo,
+ +                 int                    *cg_sp)
+ +{
+ +    const t_blocka     *  excl;
+ +    t_atom               *atom;
+ +    int                   j, k;
+ +    int                   j0, j1, nj;
+ +    gmx_bool              perturbed;
+ +    gmx_bool              has_vdw[4];
+ +    gmx_bool              match;
+ +    real                  tmp_charge[4];
+ +    int                   tmp_vdwtype[4];
+ +    int                   tjA;
+ +    gmx_bool              qm;
+ +    solvent_parameters_t *solvent_parameters;
+ +
+ +    /* We use a list with parameters for each solvent type.
+ +     * Every time we discover a new molecule that fulfills the basic
+ +     * conditions for a solvent we compare with the previous entries
+ +     * in these lists. If the parameters are the same we just increment
+ +     * the counter for that type, and otherwise we create a new type
+ +     * based on the current molecule.
+ +     *
+ +     * Once we've finished going through all molecules we check which
+ +     * solvent is most common, and mark all those molecules while we
+ +     * clear the flag on all others.
+ +     */
+ +
+ +    solvent_parameters = *solvent_parameters_p;
+ +
+ +    /* Mark the cg first as non optimized */
+ +    *cg_sp = -1;
+ +
+ +    /* Check if this cg has no exclusions with atoms in other charge groups
+ +     * and all atoms inside the charge group excluded.
+ +     * We only have 3 or 4 atom solvent loops.
+ +     */
+ +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
+ +        !GET_CGINFO_EXCL_INTRA(cginfo))
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Get the atom index range [j0, j1) of this charge group */
+ +    j0     = molt->cgs.index[cg0];
+ +    j1     = molt->cgs.index[cg0+1];
+ +
+ +    /* Number of atoms in this charge group */
+ +    nj     = j1 - j0;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug,
+ +                "Moltype '%s': there are %d atoms in this charge group\n",
+ +                *molt->name, nj);
+ +    }
+ +
+ +    /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
+ +     * otherwise skip it.
+ +     */
+ +    if (nj < 3 || nj > 4)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Check if we are doing QM on this group */
+ +    qm = FALSE;
+ +    if (qm_grpnr != NULL)
+ +    {
+ +        for (j = j0; j < j1 && !qm; j++)
+ +        {
+ +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
+ +        }
+ +    }
+ +    /* Cannot use solvent optimization with QM */
+ +    if (qm)
+ +    {
+ +        return;
+ +    }
+ +
+ +    atom = molt->atoms.atom;
+ +
+ +    /* Still looks like a solvent, time to check parameters */
+ +
+ +    /* If it is perturbed (free energy) we can't use the solvent loops,
+ +     * so then we just skip to the next molecule.
+ +     */
+ +    perturbed = FALSE;
+ +
+ +    for (j = j0; j < j1 && !perturbed; j++)
+ +    {
+ +        perturbed = PERTURBED(atom[j]);
+ +    }
+ +
+ +    if (perturbed)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Now it's only a question if the VdW and charge parameters
+ +     * are OK. Before doing the check we compare and see if they are
+ +     * identical to a possible previous solvent type.
+ +     * First we assign the current types and charges.
+ +     */
+ +    for (j = 0; j < nj; j++)
+ +    {
+ +        tmp_vdwtype[j] = atom[j0+j].type;
+ +        tmp_charge[j]  = atom[j0+j].q;
+ +    }
+ +
+ +    /* Does it match any previous solvent type? */
+ +    for (k = 0; k < *n_solvent_parameters; k++)
+ +    {
+ +        match = TRUE;
+ +
+ +
+ +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
+ +        if ( (solvent_parameters[k].model == esolSPC   && nj != 3)  ||
+ +             (solvent_parameters[k].model == esolTIP4P && nj != 4) )
+ +        {
+ +            match = FALSE;
+ +        }
+ +
+ +        /* Check that types & charges match for all atoms in molecule */
+ +        for (j = 0; j < nj && match == TRUE; j++)
+ +        {
+ +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
+ +            {
+ +                match = FALSE;
+ +            }
+ +            if (tmp_charge[j] != solvent_parameters[k].charge[j])
+ +            {
+ +                match = FALSE;
+ +            }
+ +        }
+ +        if (match == TRUE)
+ +        {
+ +            /* Congratulations! We have a matched solvent.
+ +             * Flag it with this type for later processing.
+ +             */
+ +            *cg_sp = k;
+ +            solvent_parameters[k].count += nmol;
+ +
+ +            /* We are done with this charge group */
+ +            return;
+ +        }
+ +    }
+ +
+ +    /* If we get here, we have a tentative new solvent type.
+ +     * Before we add it we must check that it fulfills the requirements
+ +     * of the solvent optimized loops. First determine which atoms have
+ +     * VdW interactions.
+ +     */
+ +    for (j = 0; j < nj; j++)
+ +    {
+ +        has_vdw[j] = FALSE;
+ +        tjA        = tmp_vdwtype[j];
+ +
+ +        /* Go through all other types and see if any have non-zero
+ +         * VdW parameters when combined with this one.
+ +         */
+ +        for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
+ +        {
+ +            /* We already checked that the atoms weren't perturbed,
+ +             * so we only need to check state A now.
+ +             */
+ +            if (fr->bBHAM)
+ +            {
+ +                has_vdw[j] = (has_vdw[j] ||
+ +                              (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ +                              (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ +                              (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ +            }
+ +            else
+ +            {
+ +                /* Standard LJ */
+ +                has_vdw[j] = (has_vdw[j] ||
+ +                              (C6(fr->nbfp, fr->ntype, tjA, k)  != 0.0) ||
+ +                              (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Now we know all we need to make the final check and assignment. */
+ +    if (nj == 3)
+ +    {
+ +        /* So, is it an SPC?
+ +         * For this we require that all atoms have charge,
+ +         * the charges on atom 2 & 3 should be the same, and only
+ +         * atom 1 might have VdW.
+ +         */
+ +        if (has_vdw[1] == FALSE &&
+ +            has_vdw[2] == FALSE &&
+ +            tmp_charge[0]  != 0 &&
+ +            tmp_charge[1]  != 0 &&
+ +            tmp_charge[2]  == tmp_charge[1])
+ +        {
+ +            srenew(solvent_parameters, *n_solvent_parameters+1);
+ +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
+ +            solvent_parameters[*n_solvent_parameters].count = nmol;
+ +            for (k = 0; k < 3; k++)
+ +            {
+ +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
+ +            }
+ +
+ +            *cg_sp = *n_solvent_parameters;
+ +            (*n_solvent_parameters)++;
+ +        }
+ +    }
+ +    else if (nj == 4)
+ +    {
+ +        /* Or could it be a TIP4P?
+ +         * For this we require that atoms 2,3,4 have charge, but not atom 1.
+ +         * Only atom 1 might have VdW.
+ +         */
+ +        if (has_vdw[1] == FALSE &&
+ +            has_vdw[2] == FALSE &&
+ +            has_vdw[3] == FALSE &&
+ +            tmp_charge[0]  == 0 &&
+ +            tmp_charge[1]  != 0 &&
+ +            tmp_charge[2]  == tmp_charge[1] &&
+ +            tmp_charge[3]  != 0)
+ +        {
+ +            srenew(solvent_parameters, *n_solvent_parameters+1);
+ +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
+ +            solvent_parameters[*n_solvent_parameters].count = nmol;
+ +            for (k = 0; k < 4; k++)
+ +            {
+ +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
+ +            }
+ +
+ +            *cg_sp = *n_solvent_parameters;
+ +            (*n_solvent_parameters)++;
+ +        }
+ +    }
+ +
+ +    *solvent_parameters_p = solvent_parameters;
+ +}
+ +
+ +static void
+ +check_solvent(FILE  *                fp,
+ +              const gmx_mtop_t  *    mtop,
+ +              t_forcerec  *          fr,
+ +              cginfo_mb_t           *cginfo_mb)
+ +{
+ +    const t_block     *   cgs;
+ +    const t_block     *   mols;
+ +    const gmx_moltype_t  *molt;
+ +    int                   mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
+ +    int                   n_solvent_parameters;
+ +    solvent_parameters_t *solvent_parameters;
+ +    int                 **cg_sp;
+ +    int                   bestsp, bestsol;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Going to determine what solvent types we have.\n");
+ +    }
+ +
+ +    mols = &mtop->mols;
+ +
+ +    n_solvent_parameters = 0;
+ +    solvent_parameters   = NULL;
+ +    /* Allocate temporary array for solvent type */
+ +    snew(cg_sp, mtop->nmolblock);
+ +
+ +    cg_offset = 0;
+ +    at_offset = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        molt = &mtop->moltype[mtop->molblock[mb].type];
+ +        cgs  = &molt->cgs;
+ +        /* Here we have to loop over all individual molecules
+ +         * because we need to check for QMMM particles.
+ +         */
+ +        snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
+ +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
+ +        nmol    = mtop->molblock[mb].nmol/nmol_ch;
+ +        for (mol = 0; mol < nmol_ch; mol++)
+ +        {
+ +            cgm = mol*cgs->nr;
+ +            am  = mol*cgs->index[cgs->nr];
+ +            for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
+ +            {
+ +                check_solvent_cg(molt, cg_mol, nmol,
+ +                                 mtop->groups.grpnr[egcQMMM] ?
+ +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
+ +                                 &mtop->groups.grps[egcQMMM],
+ +                                 fr,
+ +                                 &n_solvent_parameters, &solvent_parameters,
+ +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
+ +                                 &cg_sp[mb][cgm+cg_mol]);
+ +            }
+ +        }
+ +        cg_offset += cgs->nr;
+ +        at_offset += cgs->index[cgs->nr];
+ +    }
+ +
+ +    /* Puh! We finished going through all charge groups.
+ +     * Now find the most common solvent model.
+ +     */
+ +
+ +    /* Most common solvent this far */
+ +    bestsp = -2;
+ +    for (i = 0; i < n_solvent_parameters; i++)
+ +    {
+ +        if (bestsp == -2 ||
+ +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
+ +        {
+ +            bestsp = i;
+ +        }
+ +    }
+ +
+ +    if (bestsp >= 0)
+ +    {
+ +        bestsol = solvent_parameters[bestsp].model;
+ +    }
+ +    else
+ +    {
+ +        bestsol = esolNO;
+ +    }
+ +
+ +#ifdef DISABLE_WATER_NLIST
+ +    bestsol = esolNO;
+ +#endif
+ +
+ +    fr->nWatMol = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        cgs  = &mtop->moltype[mtop->molblock[mb].type].cgs;
+ +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
+ +        for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
+ +        {
+ +            if (cg_sp[mb][i] == bestsp)
+ +            {
+ +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
+ +                fr->nWatMol += nmol;
+ +            }
+ +            else
+ +            {
+ +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
+ +            }
+ +        }
+ +        sfree(cg_sp[mb]);
+ +    }
+ +    sfree(cg_sp);
+ +
+ +    if (bestsol != esolNO && fp != NULL)
+ +    {
+ +        fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
+ +                esol_names[bestsol],
+ +                solvent_parameters[bestsp].count);
+ +    }
+ +
+ +    sfree(solvent_parameters);
+ +    fr->solvent_opt = bestsol;
+ +}
+ +
+ +/* Per-atom constraint classification used for the a_con array in
+ + * init_cginfo_mb(): no constraint, part of a constraint interaction,
+ + * or part of a SETTLE interaction.
+ + */
+ +enum {
+ +    acNONE = 0, acCONSTRAINT, acSETTLE
+ +};
+ +
+ +/* Build the per-molecule-block charge-group info (cginfo) arrays.
+ + *
+ + * For every charge group one int is filled with the energy group id,
+ + * the intra/inter exclusion flags, constraint/SETTLE flags, VdW and
+ + * charge presence flags and the charge-group size. When all molecules of
+ + * a block have identical energy-group and QMMM group assignments, only
+ + * one molecule's worth of entries is stored (cg_mod); otherwise one entry
+ + * per charge group of every molecule is stored. Afterwards
+ + * check_solvent() is run and solvent optimization is switched off when
+ + * requested through bNoSolvOpt or the GMX_NO_SOLV_OPT environment
+ + * variable.
+ + *
+ + * fplog       log file, may be NULL
+ + * mtop        global topology
+ + * fr          force record; ntype, bBHAM and nbfp are read,
+ + *             solvent_opt is written
+ + * bNoSolvOpt  when TRUE, solvent optimization is disabled
+ + * bExcl_IntraCGAll_InterCGNone
+ + *             output: TRUE only when, in every charge group, all
+ + *             intra-group pairs are excluded and there are no
+ + *             exclusions reaching outside the group
+ + *
+ + * Returns a newly allocated array with mtop->nmolblock entries.
+ + */
+ +static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
+ +                                   t_forcerec *fr, gmx_bool bNoSolvOpt,
+ +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
+ +{
+ +    const t_block        *cgs;
+ +    const t_blocka       *excl;
+ +    const gmx_moltype_t  *molt;
+ +    const gmx_molblock_t *molb;
+ +    cginfo_mb_t          *cginfo_mb;
+ +    gmx_bool             *type_VDW;
+ +    int                  *cginfo;
+ +    int                   cg_offset, a_offset, cgm, am;
+ +    int                   mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
+ +    int                  *a_con;
+ +    int                   ftype;
+ +    int                   ia;
+ +    gmx_bool              bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
+ +
+ +    ncg_tot = ncg_mtop(mtop);
+ +    snew(cginfo_mb, mtop->nmolblock);
+ +
+ +    /* Determine per atom type whether it has any VdW interaction:
+ +     * always with Buckingham, otherwise when any C6 or C12 is non-zero.
+ +     */
+ +    snew(type_VDW, fr->ntype);
+ +    for (ai = 0; ai < fr->ntype; ai++)
+ +    {
+ +        type_VDW[ai] = FALSE;
+ +        for (j = 0; j < fr->ntype; j++)
+ +        {
+ +            type_VDW[ai] = type_VDW[ai] ||
+ +                fr->bBHAM ||
+ +                C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
+ +                C12(fr->nbfp, fr->ntype, ai, j) != 0;
+ +        }
+ +    }
+ +
+ +    *bExcl_IntraCGAll_InterCGNone = TRUE;
+ +
+ +    excl_nalloc = 10;
+ +    snew(bExcl, excl_nalloc);
+ +    cg_offset = 0;
+ +    a_offset  = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        molb = &mtop->molblock[mb];
+ +        molt = &mtop->moltype[molb->type];
+ +        cgs  = &molt->cgs;
+ +        excl = &molt->excls;
+ +
+ +        /* Check if the cginfo is identical for all molecules in this block.
+ +         * If so, we only need an array of the size of one molecule.
+ +         * Otherwise we make an array of #mol times #cgs per molecule.
+ +         */
+ +        bId = TRUE;
+ +        am  = 0;
+ +        for (m = 0; m < molb->nmol; m++)
+ +        {
+ +            am = m*cgs->index[cgs->nr];
+ +            for (cg = 0; cg < cgs->nr; cg++)
+ +            {
+ +                a0 = cgs->index[cg];
+ +                a1 = cgs->index[cg+1];
+ +                /* The energy group is compared for the first atom of
+ +                 * the charge group only.
+ +                 */
+ +                if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
+ +                    ggrpnr(&mtop->groups, egcENER, a_offset   +a0))
+ +                {
+ +                    bId = FALSE;
+ +                }
+ +                if (mtop->groups.grpnr[egcQMMM] != NULL)
+ +                {
+ +                    for (ai = a0; ai < a1; ai++)
+ +                    {
+ +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
+ +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
+ +                        {
+ +                            bId = FALSE;
+ +                        }
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        cginfo_mb[mb].cg_start = cg_offset;
+ +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
+ +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
+ +        snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
+ +        cginfo = cginfo_mb[mb].cginfo;
+ +
+ +        /* Set constraints flags for constrained atoms */
+ +        snew(a_con, molt->atoms.nr);
+ +        for (ftype = 0; ftype < F_NRE; ftype++)
+ +        {
+ +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
+ +            {
+ +                int nral;
+ +
+ +                nral = NRAL(ftype);
+ +                for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
+ +                {
+ +                    int a;
+ +
+ +                    for (a = 0; a < nral; a++)
+ +                    {
+ +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
+ +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        for (m = 0; m < (bId ? 1 : molb->nmol); m++)
+ +        {
+ +            cgm = m*cgs->nr;
+ +            am  = m*cgs->index[cgs->nr];
+ +            for (cg = 0; cg < cgs->nr; cg++)
+ +            {
+ +                a0 = cgs->index[cg];
+ +                a1 = cgs->index[cg+1];
+ +
+ +                /* Store the energy group in cginfo */
+ +                gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
+ +                SET_CGINFO_GID(cginfo[cgm+cg], gid);
+ +
+ +                /* Check the intra/inter charge group exclusions */
+ +                if (a1-a0 > excl_nalloc)
+ +                {
+ +                    excl_nalloc = a1 - a0;
+ +                    srenew(bExcl, excl_nalloc);
+ +                }
+ +                /* bExclIntraAll: all intra cg interactions excluded
+ +                 * bExclInter:    any inter cg interactions excluded
+ +                 */
+ +                bExclIntraAll = TRUE;
+ +                bExclInter    = FALSE;
+ +                bHaveVDW      = FALSE;
+ +                bHaveQ        = FALSE;
+ +                for (ai = a0; ai < a1; ai++)
+ +                {
+ +                    /* Check VDW and electrostatic interactions */
+ +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
+ +                                            type_VDW[molt->atoms.atom[ai].typeB]);
+ +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
+ +                                            molt->atoms.atom[ai].qB != 0);
+ +
+ +                    /* Clear the exclusion list for atom ai */
+ +                    for (aj = a0; aj < a1; aj++)
+ +                    {
+ +                        bExcl[aj-a0] = FALSE;
+ +                    }
+ +                    /* Loop over all the exclusions of atom ai */
+ +                    for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
+ +                    {
+ +                        aj = excl->a[j];
+ +                        if (aj < a0 || aj >= a1)
+ +                        {
+ +                            bExclInter = TRUE;
+ +                        }
+ +                        else
+ +                        {
+ +                            bExcl[aj-a0] = TRUE;
+ +                        }
+ +                    }
+ +                    /* Check if ai excludes a0 to a1 */
+ +                    for (aj = a0; aj < a1; aj++)
+ +                    {
+ +                        if (!bExcl[aj-a0])
+ +                        {
+ +                            bExclIntraAll = FALSE;
+ +                        }
+ +                    }
+ +
+ +                    switch (a_con[ai])
+ +                    {
+ +                        case acCONSTRAINT:
+ +                            SET_CGINFO_CONSTR(cginfo[cgm+cg]);
+ +                            break;
+ +                        case acSETTLE:
+ +                            SET_CGINFO_SETTLE(cginfo[cgm+cg]);
+ +                            break;
+ +                        default:
+ +                            break;
+ +                    }
+ +                }
+ +                if (bExclIntraAll)
+ +                {
+ +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
+ +                }
+ +                if (bExclInter)
+ +                {
+ +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
+ +                }
+ +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
+ +                {
+ +                    /* The size in cginfo is currently only read with DD */
+ +                    gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
+ +                }
+ +                if (bHaveVDW)
+ +                {
+ +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
+ +                }
+ +                if (bHaveQ)
+ +                {
+ +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
+ +                }
+ +                /* Store the charge group size */
+ +                SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
+ +
+ +                if (!bExclIntraAll || bExclInter)
+ +                {
+ +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
+ +                }
+ +            }
+ +        }
+ +
+ +        sfree(a_con);
+ +
+ +        cg_offset += molb->nmol*cgs->nr;
+ +        a_offset  += molb->nmol*cgs->index[cgs->nr];
+ +    }
+ +    sfree(bExcl);
+ +    /* The per-type VdW table is only needed while filling cginfo */
+ +    sfree(type_VDW);
+ +
+ +    /* the solvent optimizer is called after the QM is initialized,
+ +     * because we don't want to have the QM subsystem to become an
+ +     * optimized solvent
+ +     */
+ +
+ +    check_solvent(fplog, mtop, fr, cginfo_mb);
+ +
+ +    if (getenv("GMX_NO_SOLV_OPT"))
+ +    {
+ +        if (fplog)
+ +        {
+ +            fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
+ +                    "Disabling all solvent optimization\n");
+ +        }
+ +        fr->solvent_opt = esolNO;
+ +    }
+ +    if (bNoSolvOpt)
+ +    {
+ +        fr->solvent_opt = esolNO;
+ +    }
+ +    if (!fr->solvent_opt)
+ +    {
+ +        /* Solvent optimization is off: clear the per-group tags too */
+ +        for (mb = 0; mb < mtop->nmolblock; mb++)
+ +        {
+ +            for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
+ +            {
+ +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
+ +            }
+ +        }
+ +    }
+ +
+ +    return cginfo_mb;
+ +}
+ +
+ +/* Expand the compact per-molblock cginfo storage into one flat array
+ + * with an entry for every charge group, indexed by global charge-group
+ + * number. The total count is taken from cg_end of the last block
+ + * (so nmb must be at least 1). Returns the newly allocated array.
+ + */
+ +static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
+ +{
+ +    int  total = cgi_mb[nmb-1].cg_end;
+ +    int  block = 0;
+ +    int  icg;
+ +    int *flat;
+ +
+ +    snew(flat, total);
+ +    for (icg = 0; icg < total; icg++)
+ +    {
+ +        int local;
+ +
+ +        /* Move on to the block that contains charge group icg */
+ +        for (; icg >= cgi_mb[block].cg_end; block++)
+ +        {
+ +        }
+ +
+ +        /* Entries repeat with period cg_mod within a block */
+ +        local     = (icg - cgi_mb[block].cg_start) % cgi_mb[block].cg_mod;
+ +        flat[icg] = cgi_mb[block].cginfo[local];
+ +    }
+ +
+ +    return flat;
+ +}
+ +
+ +/* Compute the total charge and the sum of squared charges of the system
+ + * and store them in fr->qsum[] and fr->q2sum[].
+ + *
+ + * Index 0 uses the q field of every atom. Index 1 uses the qB field
+ + * when fr->efep != efepNO and is otherwise a copy of index 0.
+ + * When log is non-NULL the total charge(s) are printed.
+ + */
+ +static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
+ +{
+ +    double         qsum, q2sum, q;
+ +    int            mb, nmol, i;
+ +    const t_atoms *atoms;
+ +
+ +    qsum  = 0;
+ +    q2sum = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        nmol  = mtop->molblock[mb].nmol;
+ +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +        for (i = 0; i < atoms->nr; i++)
+ +        {
+ +            q      = atoms->atom[i].q;
+ +            qsum  += nmol*q;
+ +            q2sum += nmol*q*q;
+ +        }
+ +    }
+ +    fr->qsum[0]  = qsum;
+ +    fr->q2sum[0] = q2sum;
+ +    if (fr->efep != efepNO)
+ +    {
+ +        qsum  = 0;
+ +        q2sum = 0;
+ +        for (mb = 0; mb < mtop->nmolblock; mb++)
+ +        {
+ +            nmol  = mtop->molblock[mb].nmol;
+ +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +            for (i = 0; i < atoms->nr; i++)
+ +            {
+ +                q      = atoms->atom[i].qB;
+ +                qsum  += nmol*q;
+ +                q2sum += nmol*q*q;
+ +            }
+ +        }
+ +        /* Store after the loop so the B-state sums are always written,
+ +         * also when there are no molecule blocks at all.
+ +         */
+ +        fr->qsum[1]  = qsum;
+ +        fr->q2sum[1] = q2sum;
+ +    }
+ +    else
+ +    {
+ +        fr->qsum[1]  = fr->qsum[0];
+ +        fr->q2sum[1] = fr->q2sum[0];
+ +    }
+ +    if (log)
+ +    {
+ +        if (fr->efep == efepNO)
+ +        {
+ +            fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
+ +        }
+ +        else
+ +        {
+ +            fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
+ +                    fr->qsum[0], fr->qsum[1]);
+ +        }
+ +    }
+ +}
+ +
+ +/* Recompute the reaction-field constants (kappa, k_rf, c_rf) for the
+ + * given box. Only acts when fr->eeltype is eelGRF; otherwise this is
+ + * a no-op.
+ + */
+ +void update_forcerec(t_forcerec *fr, matrix box)
+ +{
+ +    if (fr->eeltype != eelGRF)
+ +    {
+ +        return;
+ +    }
+ +
+ +    calc_rffac(NULL, fr->eeltype,
+ +               fr->epsilon_r, fr->epsilon_rf,
+ +               fr->rcoulomb, fr->temp, fr->zsquare,
+ +               box,
+ +               &fr->kappa, &fr->k_rf, &fr->c_rf);
+ +}
+ +
+ +void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
+ +{
+ +    const t_atoms  *atoms, *atoms_tpi;
+ +    const t_blocka *excl;
+ +    int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
+ +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
+ +    long long int   npair, npair_ij, tmpi, tmpj;
+ +#else
+ +    double          npair, npair_ij, tmpi, tmpj;
+ +#endif
+ +    double          csix, ctwelve;
+ +    int             ntp, *typecount;
+ +    gmx_bool        bBHAM;
+ +    real           *nbfp;
+ +
+ +    ntp   = fr->ntype;
+ +    bBHAM = fr->bBHAM;
+ +    nbfp  = fr->nbfp;
+ +
+ +    for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
+ +    {
+ +        csix    = 0;
+ +        ctwelve = 0;
+ +        npair   = 0;
+ +        nexcl   = 0;
+ +        if (!fr->n_tpi)
+ +        {
+ +            /* Count the types so we avoid natoms^2 operations */
+ +            snew(typecount, ntp);
+ +            for (mb = 0; mb < mtop->nmolblock; mb++)
+ +            {
+ +                nmol  = mtop->molblock[mb].nmol;
+ +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +                for (i = 0; i < atoms->nr; i++)
+ +                {
+ +                    if (q == 0)
+ +                    {
+ +                        tpi = atoms->atom[i].type;
+ +                    }
+ +                    else
+ +                    {
+ +                        tpi = atoms->atom[i].typeB;
+ +                    }
+ +                    typecount[tpi] += nmol;
+ +                }
+ +            }
+ +            for (tpi = 0; tpi < ntp; tpi++)
+ +            {
+ +                for (tpj = tpi; tpj < ntp; tpj++)
+ +                {
+ +                    tmpi = typecount[tpi];
+ +                    tmpj = typecount[tpj];
+ +                    if (tpi != tpj)
+ +                    {
+ +                        npair_ij = tmpi*tmpj;
+ +                    }
+ +                    else
+ +                    {
+ +                        npair_ij = tmpi*(tmpi - 1)/2;
+ +                    }
+ +                    if (bBHAM)
+ +                    {
+ +                        /* nbfp now includes the 6.0 derivative prefactor */
+ +                        csix    += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ +                    }
+ +                    else
+ +                    {
+ +                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ +                        csix    += npair_ij*   C6(nbfp, ntp, tpi, tpj)/6.0;
+ +                        ctwelve += npair_ij*  C12(nbfp, ntp, tpi, tpj)/12.0;
+ +                    }
+ +                    npair += npair_ij;
+ +                }
+ +            }
+ +            sfree(typecount);
+ +            /* Subtract the excluded pairs.
+ +             * The main reason for substracting exclusions is that in some cases
+ +             * some combinations might never occur and the parameters could have
+ +             * any value. These unused values should not influence the dispersion
+ +             * correction.
+ +             */
+ +            for (mb = 0; mb < mtop->nmolblock; mb++)
+ +            {
+ +                nmol  = mtop->molblock[mb].nmol;
+ +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
+ +                for (i = 0; (i < atoms->nr); i++)
+ +                {
+ +                    if (q == 0)
+ +                    {
+ +                        tpi = atoms->atom[i].type;
+ +                    }
+ +                    else
+ +                    {
+ +                        tpi = atoms->atom[i].typeB;
+ +                    }
+ +                    j1  = excl->index[i];
+ +                    j2  = excl->index[i+1];
+ +                    for (j = j1; j < j2; j++)
+ +                    {
+ +                        k = excl->a[j];
+ +                        if (k > i)
+ +                        {
+ +                            if (q == 0)
+ +                            {
+ +                                tpj = atoms->atom[k].type;
+ +                            }
+ +                            else
+ +                            {
+ +                                tpj = atoms->atom[k].typeB;
+ +                            }
+ +                            if (bBHAM)
+ +                            {
+ +                                /* nbfp now includes the 6.0 derivative prefactor */
+ +                                csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ +                            }
+ +                            else
+ +                            {
+ +                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ +                                csix    -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
+ +                                ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
+ +                            }
+ +                            nexcl += nmol;
+ +                        }
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            /* Only correct for the interaction of the test particle
+ +             * with the rest of the system.
+ +             */
+ +            atoms_tpi =
+ +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
+ +
+ +            npair = 0;
+ +            for (mb = 0; mb < mtop->nmolblock; mb++)
+ +            {
+ +                nmol  = mtop->molblock[mb].nmol;
+ +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +                for (j = 0; j < atoms->nr; j++)
+ +                {
+ +                    nmolc = nmol;
+ +                    /* Remove the interaction of the test charge group
+ +                     * with itself.
+ +                     */
+ +                    if (mb == mtop->nmolblock-1)
+ +                    {
+ +                        nmolc--;
+ +
+ +                        if (mb == 0 && nmol == 1)
+ +                        {
+ +                            gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
+ +                        }
+ +                    }
+ +                    if (q == 0)
+ +                    {
+ +                        tpj = atoms->atom[j].type;
+ +                    }
+ +                    else
+ +                    {
+ +                        tpj = atoms->atom[j].typeB;
+ +                    }
+ +                    for (i = 0; i < fr->n_tpi; i++)
+ +                    {
+ +                        if (q == 0)
+ +                        {
+ +                            tpi = atoms_tpi->atom[i].type;
+ +                        }
+ +                        else
+ +                        {
+ +                            tpi = atoms_tpi->atom[i].typeB;
+ +                        }
+ +                        if (bBHAM)
+ +                        {
+ +                            /* nbfp now includes the 6.0 derivative prefactor */
+ +                            csix    += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ +                        }
+ +                        else
+ +                        {
+ +                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ +                            csix    += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
+ +                            ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
+ +                        }
+ +                        npair += nmolc;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        if (npair - nexcl <= 0 && fplog)
+ +        {
+ +            fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
+ +            csix     = 0;
+ +            ctwelve  = 0;
+ +        }
+ +        else
+ +        {
+ +            csix    /= npair - nexcl;
+ +            ctwelve /= npair - nexcl;
+ +        }
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Counted %d exclusions\n", nexcl);
+ +            fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
+ +            fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
+ +        }
+ +        fr->avcsix[q]    = csix;
+ +        fr->avctwelve[q] = ctwelve;
+ +    }
+ +    if (fplog != NULL)
+ +    {
+ +        if (fr->eDispCorr == edispcAllEner ||
+ +            fr->eDispCorr == edispcAllEnerPres)
+ +        {
+ +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+ +                    fr->avcsix[0], fr->avctwelve[0]);
+ +        }
+ +        else
+ +        {
+ +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Scan the atom types of all pairs of molecule types and determine the
+ + * smallest and largest Buckingham b parameter. The maximum is stored in
+ + * fr->bham_b_max; both extremes are printed when fplog is non-NULL.
+ + * An atom type index that is not below fr->ntype is a fatal error.
+ + */
+ +static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
+ +                           const gmx_mtop_t *mtop)
+ +{
+ +    const t_atoms *atoms_a, *atoms_b;
+ +    int            mta, mtb, ia, ib, type_a, type_b, ntypes;
+ +    real           b_cur, b_lowest;
+ +    real          *nbfp;
+ +
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
+ +    }
+ +    ntypes = fr->ntype;
+ +    nbfp   = fr->nbfp;
+ +
+ +    /* -1 marks "no minimum recorded yet" */
+ +    b_lowest       = -1;
+ +    fr->bham_b_max = 0;
+ +    for (mta = 0; mta < mtop->nmoltype; mta++)
+ +    {
+ +        atoms_a = &mtop->moltype[mta].atoms;
+ +        for (ia = 0; ia < atoms_a->nr; ia++)
+ +        {
+ +            type_a = atoms_a->atom[ia].type;
+ +            if (type_a >= ntypes)
+ +            {
+ +                gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", ia, type_a, ntypes);
+ +            }
+ +
+ +            /* Pair with every atom of this and all later molecule types */
+ +            for (mtb = mta; mtb < mtop->nmoltype; mtb++)
+ +            {
+ +                atoms_b = &mtop->moltype[mtb].atoms;
+ +                for (ib = 0; ib < atoms_b->nr; ib++)
+ +                {
+ +                    type_b = atoms_b->atom[ib].type;
+ +                    if (type_b >= ntypes)
+ +                    {
+ +                        gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", ib, type_b, ntypes);
+ +                    }
+ +                    b_cur = BHAMB(nbfp, ntypes, type_a, type_b);
+ +                    if (fr->bham_b_max < b_cur)
+ +                    {
+ +                        fr->bham_b_max = b_cur;
+ +                    }
+ +                    if (b_lowest == -1 || b_cur < b_lowest)
+ +                    {
+ +                        b_lowest = b_cur;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
+ +                b_lowest, fr->bham_b_max);
+ +    }
+ +}
+ +
+ +/* Read the non-bonded interaction table from file and fill the three
+ + * tables of nbl: the combined electrostatics+VdW table and separate
+ + * electrostatics-only and VdW-only copies.
+ + *
+ + * tabfn     table file name; when NULL nothing is done (a note is written
+ + *           to the debug file, if open)
+ + * eg1, eg2  when both are non-NULL, "_<eg1>_<eg2>" is inserted before the
+ + *           file extension of tabfn
+ + * rtab      passed on to make_tables()
+ + *
+ + * NOTE(review): the file name is built with sprintf into a STRLEN-sized
+ + * buffer without length checks, and assumes tabfn ends in
+ + * "." + ftp2ext(efXVG) - confirm callers guarantee this.
+ + */
+ +static void make_nbf_tables(FILE *fp, const output_env_t oenv,
+ +                            t_forcerec *fr, real rtab,
+ +                            const t_commrec *cr,
+ +                            const char *tabfn, char *eg1, char *eg2,
+ +                            t_nblists *nbl)
+ +{
+ +    char buf[STRLEN];
+ +    int  i, j;
+ +
+ +    if (tabfn == NULL)
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
+ +        }
+ +        return;
+ +    }
+ +
+ +    sprintf(buf, "%s", tabfn);
+ +    if (eg1 && eg2)
+ +    {
+ +        /* Append the two energy group names, overwriting the extension
+ +         * (including its dot) and re-adding it afterwards
+ +         */
+ +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
+ +                eg1, eg2, ftp2ext(efXVG));
+ +    }
+ +    nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
+ +    /* Copy the contents of the table to separate coulomb and LJ tables too,
+ +     * to improve cache performance.
+ +     */
+ +    /* For performance reasons we want the table data to be aligned;
+ +     * an alignment of 32 is requested from snew_aligned below.
+ +     * The pointers could be freed but currently aren't.
+ +     */
+ +    nbl->table_elec.interaction   = GMX_TABLE_INTERACTION_ELEC;
+ +    nbl->table_elec.format        = nbl->table_elec_vdw.format;
+ +    nbl->table_elec.r             = nbl->table_elec_vdw.r;
+ +    nbl->table_elec.n             = nbl->table_elec_vdw.n;
+ +    nbl->table_elec.scale         = nbl->table_elec_vdw.scale;
+ +    nbl->table_elec.scale_exp     = nbl->table_elec_vdw.scale_exp;
+ +    nbl->table_elec.formatsize    = nbl->table_elec_vdw.formatsize;
+ +    nbl->table_elec.ninteractions = 1;
+ +    nbl->table_elec.stride        = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
+ +    snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
+ +
+ +    nbl->table_vdw.interaction   = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
+ +    nbl->table_vdw.format        = nbl->table_elec_vdw.format;
+ +    nbl->table_vdw.r             = nbl->table_elec_vdw.r;
+ +    nbl->table_vdw.n             = nbl->table_elec_vdw.n;
+ +    nbl->table_vdw.scale         = nbl->table_elec_vdw.scale;
+ +    nbl->table_vdw.scale_exp     = nbl->table_elec_vdw.scale_exp;
+ +    nbl->table_vdw.formatsize    = nbl->table_elec_vdw.formatsize;
+ +    nbl->table_vdw.ninteractions = 2;
+ +    nbl->table_vdw.stride        = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
+ +    snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
+ +
+ +    /* Split each combined table point (12 values) into its first 4 values
+ +     * for the electrostatics table and the following 8 for the VdW table.
+ +     * NOTE(review): the 4/8/12 strides are hard-coded here and presumably
+ +     * require formatsize == 4 - confirm against make_tables().
+ +     */
+ +    for (i = 0; i <= nbl->table_elec_vdw.n; i++)
+ +    {
+ +        for (j = 0; j < 4; j++)
+ +        {
+ +            nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
+ +        }
+ +        for (j = 0; j < 8; j++)
+ +        {
+ +            nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
+ +        }
+ +    }
+ +}
+ +
+ +/* Count, per table number, how many interactions of type ftype1 or
+ + * ftype2 in all molecule types refer to that table.
+ + *
+ + * *count is grown (new entries zeroed) so that it covers the highest
+ + * table number seen, and *ncount is updated to its length. A negative
+ + * table number is a fatal error.
+ + */
+ +static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
+ +                         int *ncount, int **count)
+ +{
+ +    int imt, ft;
+ +
+ +    for (imt = 0; imt < mtop->nmoltype; imt++)
+ +    {
+ +        const gmx_moltype_t *moltype = &mtop->moltype[imt];
+ +
+ +        for (ft = 0; ft < F_NRE; ft++)
+ +        {
+ +            const t_ilist *ilist;
+ +            int            step, pos;
+ +
+ +            if (ft != ftype1 && ft != ftype2)
+ +            {
+ +                continue;
+ +            }
+ +
+ +            ilist = &moltype->ilist[ft];
+ +            /* Each entry is the parameter index followed by the atoms */
+ +            step  = 1 + NRAL(ft);
+ +            for (pos = 0; pos < ilist->nr; pos += step)
+ +            {
+ +                int table, k;
+ +
+ +                table = mtop->ffparams.iparams[ilist->iatoms[pos]].tab.table;
+ +                if (table < 0)
+ +                {
+ +                    gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", table);
+ +                }
+ +                if (table >= *ncount)
+ +                {
+ +                    /* Extend the counter array up to and including table */
+ +                    srenew(*count, table+1);
+ +                    for (k = *ncount; k <= table; k++)
+ +                    {
+ +                        (*count)[k] = 0;
+ +                    }
+ +                    *ncount = table+1;
+ +                }
+ +                (*count)[table]++;
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Create the bonded tables referenced by interactions of type ftype1 or
+ + * ftype2.
+ + *
+ + * For every table number i that is used at least once, the table is read
+ + * from a file whose name is basefn with "_<tabext><i>" inserted before
+ + * the extension. Returns an array indexed by table number, or NULL when
+ + * no tables are referenced.
+ + */
+ +static bondedtable_t *make_bonded_tables(FILE *fplog,
+ +                                         int ftype1, int ftype2,
+ +                                         const gmx_mtop_t *mtop,
+ +                                         const char *basefn, const char *tabext)
+ +{
+ +    bondedtable_t *tables;
+ +    int           *usage  = NULL;
+ +    int            ntable = 0;
+ +    int            t;
+ +    char           fname[STRLEN];
+ +
+ +    count_tables(ftype1, ftype2, mtop, &ntable, &usage);
+ +    if (ntable <= 0)
+ +    {
+ +        return NULL;
+ +    }
+ +
+ +    snew(tables, ntable);
+ +    for (t = 0; t < ntable; t++)
+ +    {
+ +        if (usage[t] <= 0)
+ +        {
+ +            /* Table number not referenced by any interaction */
+ +            continue;
+ +        }
+ +        sprintf(fname, "%s", basefn);
+ +        /* Overwrite the extension with the table suffix plus extension */
+ +        sprintf(fname + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
+ +                tabext, t, ftp2ext(efXVG));
+ +        tables[t] = make_bonded_table(fplog, fname, NRAL(ftype1)-2);
+ +    }
+ +    sfree(usage);
+ +
+ +    return tables;
+ +}
+ +
+ +/* Store the charge-group and atom ranges in the force record and grow
+ + * the force buffers that depend on them.
+ + *
+ + * f_twin is only reallocated when fr->bTwinRange is set; the
+ + * no-virial-sum buffer is only sized when fr->bF_NoVirSum is set,
+ + * otherwise its count is set to zero.
+ + */
+ +void forcerec_set_ranges(t_forcerec *fr,
+ +                         int ncg_home, int ncg_force,
+ +                         int natoms_force,
+ +                         int natoms_force_constr, int natoms_f_novirsum)
+ +{
+ +    /* Home charge groups occupy [0, ncg_home) */
+ +    fr->cg0 = 0;
+ +    fr->hcg = ncg_home;
+ +
+ +    /* fr->ncg_force is unused in the standard code,
+ +     * but it can be useful for modified code dealing with charge groups.
+ +     */
+ +    fr->ncg_force           = ncg_force;
+ +    fr->natoms_force        = natoms_force;
+ +    fr->natoms_force_constr = natoms_force_constr;
+ +
+ +    if (fr->nalloc_force < fr->natoms_force_constr)
+ +    {
+ +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
+ +
+ +        if (fr->bTwinRange)
+ +        {
+ +            srenew(fr->f_twin, fr->nalloc_force);
+ +        }
+ +    }
+ +
+ +    if (!fr->bF_NoVirSum)
+ +    {
+ +        fr->f_novirsum_n = 0;
+ +        return;
+ +    }
+ +
+ +    fr->f_novirsum_n = natoms_f_novirsum;
+ +    if (fr->f_novirsum_nalloc < fr->f_novirsum_n)
+ +    {
+ +        fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
+ +        srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
+ +    }
+ +}
+ +
+ +/* Map a cut-off of exactly 0 to GMX_CUTOFF_INF; any other value is
+ + * returned unchanged.
+ + */
+ +static real cutoff_inf(real cutoff)
+ +{
+ +    return (cutoff == 0) ? GMX_CUTOFF_INF : cutoff;
+ +}
+ +
+ +/* Load one thermodynamic-force table per AdResS tf group into
+ + * fr->atf_tabs.
+ + *
+ + * The file name of each table is tabfn with its extension replaced by
+ + * "tf_<energy group name>.<ext>". A NULL tabfn is a fatal error.
+ + * When fp is non-NULL, a line is logged for every table loaded.
+ + */
+ +static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
+ +                                  t_forcerec *fr, const t_inputrec *ir,
+ +                                  const char *tabfn, const gmx_mtop_t *mtop,
+ +                                  matrix     box)
+ +{
+ +    char fname[STRLEN];
+ +    int  itf, egrp;
+ +
+ +    if (tabfn == NULL)
+ +    {
+ +        gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
+ +        return;
+ +    }
+ +
+ +    snew(fr->atf_tabs, ir->adress->n_tf_grps);
+ +
+ +    sprintf(fname, "%s", tabfn);
+ +    for (itf = 0; itf < ir->adress->n_tf_grps; itf++)
+ +    {
+ +        /* Energy group index this tf table belongs to */
+ +        egrp = ir->adress->tf_table_index[itf];
+ +        sprintf(fname + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
+ +                *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[egrp]]), ftp2ext(efXVG));
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "loading tf table for energygrp index %d from %s\n", egrp, fname);
+ +        }
+ +        fr->atf_tabs[itf] = make_atf_table(fp, oenv, fr, fname, box);
+ +    }
+ +
+ +}
+ +
+ +/* Decide whether the input record qualifies for all-vs-all kernels.
+ + *
+ + * All of the following must hold: rlist, rcoulomb and rvdw are 0, there
+ + * is no PBC, both VdW and Coulomb are plain cut-off, no free-energy
+ + * perturbation, no implicit solvent or GBSA with the Still/HCT/OBC
+ + * algorithm, and the environment variable GMX_NO_ALLVSALL is not set.
+ + * With more than one energy monitor group the result is FALSE; a note is
+ + * then printed when bPrintNote is set (to stderr on the master, and to
+ + * fp when fp is non-NULL).
+ + */
+ +gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
+ +{
+ +    const char *multi_group_note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
+ +    gmx_bool    bNoCutoffs, bPlainInteractions, bSolventOK, bUse;
+ +
+ +    bNoCutoffs         = (ir->rlist == 0 &&
+ +                          ir->rcoulomb == 0 &&
+ +                          ir->rvdw == 0);
+ +    bPlainInteractions = (ir->ePBC == epbcNONE &&
+ +                          ir->vdwtype == evdwCUT &&
+ +                          ir->coulombtype == eelCUT &&
+ +                          ir->efep == efepNO);
+ +    bSolventOK         = (ir->implicit_solvent == eisNO ||
+ +                          (ir->implicit_solvent == eisGBSA &&
+ +                           (ir->gb_algorithm == egbSTILL ||
+ +                            ir->gb_algorithm == egbHCT ||
+ +                            ir->gb_algorithm == egbOBC)));
+ +
+ +    bUse = (bNoCutoffs && bPlainInteractions && bSolventOK &&
+ +            getenv("GMX_NO_ALLVSALL") == NULL);
+ +
+ +    if (bUse && ir->opts.ngener > 1)
+ +    {
+ +        /* Multiple energy groups rule out all-vs-all; optionally say so */
+ +        if (bPrintNote)
+ +        {
+ +            if (MASTER(cr))
+ +            {
+ +                fprintf(stderr, "\n%s\n", multi_group_note);
+ +            }
+ +            if (fp != NULL)
+ +            {
+ +                fprintf(fp, "\n%s\n", multi_group_note);
+ +            }
+ +        }
+ +        bUse = FALSE;
+ +    }
+ +
+ +    if (bUse && fp && MASTER(cr))
+ +    {
+ +        fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
+ +    }
+ +
+ +    return bUse;
+ +}
+ +
+ +
+ +/* Allocate the per-thread force/energy buffers used for bonded interactions.
+ + *
+ + * fr->nthreads is taken from gmx_omp_nthreads_get(emntBonded). Nothing is
+ + * allocated for a single thread. Otherwise fr->f_t gets one entry per
+ + * thread, and entries 1..nthreads-1 receive a shift-force array and
+ + * nenergrp*nenergrp energy terms for each of the egNR energy types.
+ + */
+ +static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
+ +{
+ +    int th, e;
+ +
+ +    /* These thread local data structures are used for bondeds only */
+ +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+ +
+ +    if (fr->nthreads <= 1)
+ +    {
+ +        return;
+ +    }
+ +
+ +    snew(fr->f_t, fr->nthreads);
+ +
+ +    /* Entry 0 is left untouched: thread 0 uses the global force and
+ +     * energy arrays.
+ +     */
+ +    for (th = 1; th < fr->nthreads; th++)
+ +    {
+ +        fr->f_t[th].f          = NULL;
+ +        fr->f_t[th].f_nalloc   = 0;
+ +        snew(fr->f_t[th].fshift, SHIFTS);
+ +        fr->f_t[th].grpp.nener = nenergrp*nenergrp;
+ +        for (e = 0; e < egNR; e++)
+ +        {
+ +            snew(fr->f_t[th].grpp.ener[e], fr->f_t[th].grpp.nener);
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Choose the CPU nbnxn kernel type and the Ewald exclusion treatment.
+ + *
+ + * The defaults are the plain-C 4x4 kernel with tabulated Ewald exclusions.
+ + * When compiled with GMX_NBNXN_SIMD, a SIMD kernel flavour is selected
+ + * from the compile-time macros and can then be overridden through the
+ + * environment variables GMX_NBNXN_SIMD_4XN and GMX_NBNXN_SIMD_2XNN; the
+ + * exclusion treatment can be overridden through GMX_NBNXN_EWALD_TABLE and
+ + * GMX_NBNXN_EWALD_ANALYTICAL. Requesting a kernel flavour that was not
+ + * compiled in is a fatal error.
+ + *
+ + * ir           input record; only read in the 4xN + AVX-256 build variant
+ + * kernel_type  output: selected kernel type
+ + * ewald_excl   output: selected Ewald exclusion treatment
+ + */
+ +static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
+ +                                  int                         *kernel_type,
+ +                                  int                         *ewald_excl)
+ +{
+ +    *kernel_type = nbnxnk4x4_PlainC;
+ +    *ewald_excl  = ewaldexclTable;
+ +
+ +#ifdef GMX_NBNXN_SIMD
+ +    {
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +        *kernel_type = nbnxnk4xN_SIMD_4xN;
+ +#endif
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +        /* We expect the 2xNN kernels to be faster in most cases */
+ +        *kernel_type = nbnxnk4xN_SIMD_2xNN;
+ +#endif
+ +
+ +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+ +        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+ +        {
+ +            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ +             * 10% with HT, 50% without HT, but extra zeros interactions
+ +             * can compensate. As we currently don't detect the actual use
+ +             * of HT, switch to 4x8 to avoid a potential performance hit.
+ +             */
+ +            *kernel_type = nbnxnk4xN_SIMD_4xN;
+ +        }
+ +#endif
+ +        /* Environment overrides; when both are set, the 2xNN request
+ +         * below is applied last.
+ +         */
+ +        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
+ +        {
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +            *kernel_type = nbnxnk4xN_SIMD_4xN;
+ +#else
+ +            gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+ +#endif
+ +        }
+ +        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
+ +        {
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +            *kernel_type = nbnxnk4xN_SIMD_2xNN;
+ +#else
+ +            gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
+ +#endif
+ +        }
+ +
+ +        /* Analytical Ewald exclusion correction is only an option in the
+ +         * x86 SIMD kernel. This is faster in single precision
+ +         * on Bulldozer and slightly faster on Sandy Bridge.
+ +         */
+ +#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+ +        *ewald_excl = ewaldexclAnalytical;
+ +#endif
+ +        /* Environment overrides; when both are set, the analytical
+ +         * request below is applied last.
+ +         */
+ +        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
+ +        {
+ +            *ewald_excl = ewaldexclTable;
+ +        }
+ +        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
+ +        {
+ +            *ewald_excl = ewaldexclAnalytical;
+ +        }
+ +
+ +    }
+ +#endif /* GMX_NBNXN_SIMD */
+ +}
+ +
+ +
+ +/* Return a human-readable name for an nbnxn kernel type.
+ + *
+ + * For the two SIMD kernel types the name is fixed at compile time from
+ + * the SIMD-related macros. nbnxnkNR and unknown values raise a fatal error.
+ + */
+ +const char *lookup_nbnxn_kernel_name(int kernel_type)
+ +{
+ +    const char *returnvalue = NULL;
+ +    switch (kernel_type)
+ +    {
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
-             /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
-              * on compiler flags. As we use nearly identical intrinsics, using an AVX
-              * compiler flag without an AVX macro effectively results in AVX kernels.
++        case nbnxnkNotSet:
++            returnvalue = "not set";
++            break;
++        case nbnxnk4x4_PlainC:
++            returnvalue = "plain C";
++            break;
++        case nbnxnk4xN_SIMD_4xN:
++        case nbnxnk4xN_SIMD_2xNN:
++#ifdef GMX_NBNXN_SIMD
+ +#ifdef GMX_X86_SSE2
- #if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
- #ifndef GMX_X86_SSE4_1
-         case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE2"; break;
-         case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
++            /* We have x86 SSE2 compatible SIMD */
++#ifdef GMX_X86_AVX_128_FMA
++            returnvalue = "AVX-128-FMA";
++#else
++#if defined GMX_X86_AVX_256 || defined __AVX__
++            /* x86 SIMD intrinsics can be converted to SSE or AVX depending
++             * on compiler flags. As we use nearly identical intrinsics,
++             * compiling for AVX without an AVX macro effectively results
++             * in AVX kernels.
+ +             * For gcc we check for __AVX__
+ +             * At least a check for icc should be added (if there is a macro)
+ +             */
-         case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE4.1"; break;
-         case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
++#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
++            returnvalue = "AVX-256";
+ +#else
-         case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-128"; break;
-         case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
- #endif
- #endif
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
-         case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-256"; break;
-         case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
++            returnvalue = "AVX-128";
+ +#endif
+ +#else
- #else   /* not GMX_X86_SSE2 */
-         case nbnxnk4xN_SIMD_4xN: returnvalue  = "SIMD"; break;
-         case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
++#ifdef GMX_X86_SSE4_1
++            returnvalue  = "SSE4.1";
++#else
++            returnvalue  = "SSE2";
+ +#endif
+ +#endif
+ +#endif
++#else /* GMX_X86_SSE2 */
++            /* not GMX_X86_SSE2, but other SIMD */
++            returnvalue  = "SIMD";
++#endif /* GMX_X86_SSE2 */
++#else /* GMX_NBNXN_SIMD */
++            returnvalue = "not available";
++#endif /* GMX_NBNXN_SIMD */
++            break;
+ +        case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
+ +        case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
+ +
+ +        case nbnxnkNR:
+ +        default:
+ +            gmx_fatal(FARGS, "Illegal kernel type selected");
+ +            returnvalue = NULL;
+ +            break;
+ +    }
+ +    return returnvalue;
+ +}
+ +
+ +/* Select the nbnxn kernel type and Ewald exclusion treatment for one
+ + * interaction group and optionally log the choice.
+ + *
+ + * Priority: GPU emulation (plain-C 8x8x8 kernel), then the CUDA kernel
+ + * when bUseGPU is set, otherwise a CPU kernel: the one chosen by
+ + * pick_nbnxn_kernel_cpu() when use_cpu_acceleration is set, else the
+ + * plain-C 4x4 kernel.
+ + *
+ + * fp            log file, may be NULL
+ + * kernel_type   output, must not be NULL
+ + * ewald_excl    output, must not be NULL
+ + * bDoNonbonded  when FALSE, neither the emulation warning nor the
+ + *               kernel line is printed
+ + */
+ +static void pick_nbnxn_kernel(FILE                *fp,
+ +                              const t_commrec     *cr,
+ +                              gmx_bool             use_cpu_acceleration,
+ +                              gmx_bool             bUseGPU,
+ +                              gmx_bool             bEmulateGPU,
+ +                              const t_inputrec    *ir,
+ +                              int                 *kernel_type,
+ +                              int                 *ewald_excl,
+ +                              gmx_bool             bDoNonbonded)
+ +{
+ +    assert(kernel_type);
+ +    /* ewald_excl is written unconditionally below, so it must be valid too */
+ +    assert(ewald_excl);
+ +
+ +    *kernel_type = nbnxnkNotSet;
+ +    *ewald_excl  = ewaldexclTable;
+ +
+ +    if (bEmulateGPU)
+ +    {
+ +        *kernel_type = nbnxnk8x8x8_PlainC;
+ +
+ +        if (bDoNonbonded)
+ +        {
+ +            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
+ +        }
+ +    }
+ +    else if (bUseGPU)
+ +    {
+ +        *kernel_type = nbnxnk8x8x8_CUDA;
+ +    }
+ +
+ +    /* Neither emulation nor a real GPU: fall back to a CPU kernel */
+ +    if (*kernel_type == nbnxnkNotSet)
+ +    {
+ +        if (use_cpu_acceleration)
+ +        {
+ +            pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
+ +        }
+ +        else
+ +        {
+ +            *kernel_type = nbnxnk4x4_PlainC;
+ +        }
+ +    }
+ +
+ +    if (bDoNonbonded && fp != NULL)
+ +    {
+ +        fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
+ +                lookup_nbnxn_kernel_name(*kernel_type),
+ +                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+ +                nbnxn_kernel_to_cj_size(*kernel_type));
+ +    }
+ +}
+ +
+ +/* Decide whether the non-bonded work runs on a real GPU or in GPU
+ + * emulation mode, and initialize the GPU when it is going to be used.
+ + *
+ + * bUseGPU      output: TRUE only after init_gpu() succeeded
+ + * bEmulateGPU  output: TRUE when GMX_EMULATE_GPU is set, or when
+ + *              non-bonded calculations are off and a GPU could be used
+ + *
+ + * A failing init_gpu() is a fatal error.
+ + */
+ +static void pick_nbnxn_resources(const t_commrec     *cr,
+ +                                 const gmx_hw_info_t *hwinfo,
+ +                                 gmx_bool             bDoNonbonded,
+ +                                 gmx_bool            *bUseGPU,
+ +                                 gmx_bool            *bEmulateGPU)
+ +{
+ +    gmx_bool bEmulateGPUEnvVarSet;
+ +    char     gpu_err_str[STRLEN];
+ +
+ +    *bUseGPU = FALSE;
+ +
+ +    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
+ +
+ +    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
+ +     * GPUs (currently) only handle non-bonded calculations, we will
+ +     * automatically switch to emulation if non-bonded calculations are
+ +     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
+ +     * way to turn off GPU initialization, data movement, and cleanup.
+ +     *
+ +     * GPU emulation can be useful to assess the performance one can expect by
+ +     * adding GPU(s) to the machine. The conditional below allows this even
+ +     * if mdrun is compiled without GPU acceleration support.
+ +     * Note that you should freeze the system, as otherwise it will explode.
+ +     */
+ +    *bEmulateGPU = (bEmulateGPUEnvVarSet ||
+ +                    (!bDoNonbonded && hwinfo->bCanUseGPU));
+ +
+ +    /* Enable GPU mode when GPUs are available and no GPU emulation is requested.
+ +     */
+ +    if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
+ +    {
+ +        /* Each PP node will use the intra-node id-th device from the
+ +         * list of detected/selected GPUs. */
+ +        if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
+ +        {
+ +            /* At this point the init should never fail as we made sure that
+ +             * we have all the GPUs we need. If it still does, we'll bail. */
+ +            gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
+ +                      cr->nodeid,
+ +                      get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
+ +                      gpu_err_str);
+ +        }
+ +
+ +        /* Here we actually turn on hardware GPU acceleration */
+ +        *bUseGPU = TRUE;
+ +    }
+ +}
+ +
+ +/* Tell whether the kernels of a cut-off scheme use "simple" tables.
+ + *
+ + * The group scheme always answers TRUE. For the Verlet scheme the answer
+ + * is nbnxn_kernel_pairlist_simple() of the kernel type of one interaction
+ + * group: group < 0 selects nbv->grp[0], any other value selects the last
+ + * group (nbv->ngrp - 1). Any other scheme is reported via gmx_incons().
+ + */
+ +gmx_bool uses_simple_tables(int                 cutoff_scheme,
+ +                            nonbonded_verlet_t *nbv,
+ +                            int                 group)
+ +{
+ +    gmx_bool bSimple = TRUE;
+ +    int      grp_index;
+ +
+ +    if (cutoff_scheme == ecutsVERLET)
+ +    {
+ +        assert(NULL != nbv && NULL != nbv->grp);
+ +        if (group < 0)
+ +        {
+ +            grp_index = 0;
+ +        }
+ +        else
+ +        {
+ +            grp_index = nbv->ngrp - 1;
+ +        }
+ +        bSimple = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
+ +    }
+ +    else if (cutoff_scheme != ecutsGROUP)
+ +    {
+ +        gmx_incons("unimplemented");
+ +    }
+ +
+ +    return bSimple;
+ +}
+ +
+ +/* (Re)build the Ewald correction tables stored in ic.
+ + *
+ + * Sets ic->tabq_scale and ic->tabq_size, frees the existing tabq_coul_*
+ + * arrays, allocates new ones and fills them with
+ + * table_spline3_fill_ewald_lr().
+ + *
+ + * bUsesSimpleTables  TRUE: scale from ewald_spline3_table_scale() and a
+ + *                    size covering max(rtab, ic->rcoulomb);
+ + *                    FALSE: fixed size GPU_EWALD_COULOMB_FORCE_TABLE_SIZE
+ + * rtab               table range, only used with simple tables
+ + */
+ +static void init_ewald_f_table(interaction_const_t *ic,
+ +                               gmx_bool             bUsesSimpleTables,
+ +                               real                 rtab)
+ +{
+ +    real table_range;
+ +
+ +    if (!bUsesSimpleTables)
+ +    {
+ +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
+ +        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
+ +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+ +    }
+ +    else
+ +    {
+ +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
+ +                                                   ic->rcoulomb);
+ +
+ +        /* The table has to cover the larger of rtab and rcoulomb */
+ +        table_range = ic->rcoulomb;
+ +        if (rtab > ic->rcoulomb)
+ +        {
+ +            table_range = rtab;
+ +        }
+ +        ic->tabq_size = (int)(table_range*ic->tabq_scale) + 2;
+ +    }
+ +
+ +    /* Drop the previous tables before allocating at the new size */
+ +    sfree_aligned(ic->tabq_coul_FDV0);
+ +    sfree_aligned(ic->tabq_coul_F);
+ +    sfree_aligned(ic->tabq_coul_V);
+ +
+ +    /* FDV0 holds four entries per table point, F and V one each */
+ +    snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
+ +    snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
+ +    snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
+ +    table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
+ +                                ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
+ +}
+ +
+ +/* Build the non-bonded Ewald correction tables when they are needed.
+ + *
+ + * Tables are only generated for Ewald and PME electrostatics; for any
+ + * other electrostatics type this function does nothing.
+ + *
+ + * fp                 log file, may be NULL
+ + * ic                 interaction constants; the table fields are updated
+ + * bUsesSimpleTables  passed on to init_ewald_f_table()
+ + * rtab               passed on to init_ewald_f_table()
+ + */
+ +void init_interaction_const_tables(FILE                *fp,
+ +                                   interaction_const_t *ic,
+ +                                   gmx_bool             bUsesSimpleTables,
+ +                                   real                 rtab)
+ +{
+ +    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
+ +    {
+ +        init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+ +
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
+ +                    1/ic->tabq_scale, ic->tabq_size);
+ +        }
+ +    }
+ +}
+ +
+ +/* Allocate an interaction_const_t, fill it from the force record and
+ + * return it through *interaction_const.
+ + *
+ + * Copies the cut-offs and electrostatics parameters from fr, derives the
+ + * potential-shift constants, logs them when fp is non-NULL, passes the
+ + * constants to the CUDA module when fr->nbv uses a GPU, and finally
+ + * builds the Ewald correction tables via init_interaction_const_tables().
+ + *
+ + * rtab  table range forwarded to init_interaction_const_tables()
+ + */
+ +void init_interaction_const(FILE                 *fp,
+ +                            interaction_const_t **interaction_const,
+ +                            const t_forcerec     *fr,
+ +                            real                  rtab)
+ +{
+ +    interaction_const_t *ic;
+ +    gmx_bool             bSimpleTables;
+ +
+ +    snew(ic, 1);
+ +
+ +    /* Dummy allocations, so the table pointers can always be freed */
+ +    snew_aligned(ic->tabq_coul_FDV0, 16, 32);
+ +    snew_aligned(ic->tabq_coul_F, 16, 32);
+ +    snew_aligned(ic->tabq_coul_V, 16, 32);
+ +
+ +    ic->rlist     = fr->rlist;
+ +    ic->rlistlong = fr->rlistlong;
+ +
+ +    /* Lennard-Jones: r^-6 at the cut-off, only with a potential shift */
+ +    ic->rvdw      = fr->rvdw;
+ +    ic->sh_invrc6 = (fr->vdw_modifier == eintmodPOTSHIFT) ? pow(ic->rvdw, -6.0) : 0;
+ +
+ +    /* Electrostatics */
+ +    ic->eeltype   = fr->eeltype;
+ +    ic->rcoulomb  = fr->rcoulomb;
+ +    ic->epsilon_r = fr->epsilon_r;
+ +    ic->epsfac    = fr->epsfac;
+ +
+ +    /* Ewald: erfc(beta*rc) as shift, only with a potential shift */
+ +    ic->ewaldcoeff = fr->ewaldcoeff;
+ +    ic->sh_ewald   = (fr->coulomb_modifier == eintmodPOTSHIFT) ? gmx_erfc(ic->ewaldcoeff*ic->rcoulomb) : 0;
+ +
+ +    /* Reaction-field */
+ +    if (!EEL_RF(ic->eeltype))
+ +    {
+ +        /* For plain cut-off we might use the reaction-field kernels */
+ +        ic->epsilon_rf = ic->epsilon_r;
+ +        ic->k_rf       = 0;
+ +        ic->c_rf       = (fr->coulomb_modifier == eintmodPOTSHIFT) ? 1/ic->rcoulomb : 0;
+ +    }
+ +    else
+ +    {
+ +        ic->epsilon_rf = fr->epsilon_rf;
+ +        ic->k_rf       = fr->k_rf;
+ +        ic->c_rf       = fr->c_rf;
+ +    }
+ +
+ +    if (fp != NULL)
+ +    {
+ +        fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
+ +                sqr(ic->sh_invrc6), ic->sh_invrc6);
+ +        if (ic->eeltype == eelCUT)
+ +        {
+ +            fprintf(fp, ", Coulomb %.3f", ic->c_rf);
+ +        }
+ +        else if (EEL_PME(ic->eeltype))
+ +        {
+ +            fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
+ +        }
+ +        fprintf(fp, "\n");
+ +    }
+ +
+ +    *interaction_const = ic;
+ +
+ +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
+ +    {
+ +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+ +    }
+ +
+ +    bSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
+ +    init_interaction_const_tables(fp, ic, bSimpleTables, rtab);
+ +}
+ +
+ +/* Allocate and set up the Verlet-scheme non-bonded data structure and
+ + * return it through *nb_verlet.
+ + *
+ + * Steps: pick GPU/emulation resources, choose a kernel per interaction
+ + * group, initialize the CUDA module and the list-balancing parameter
+ + * when a GPU is used, initialize the pair search, and finally set up a
+ + * pair-list set and atom data for each group.
+ + *
+ + * fp        log file, may be NULL
+ + * nbpu_opt  when equal to "gpu_cpu", the second (non-local) group gets a
+ + *           separately chosen CPU kernel; may be NULL
+ + */
+ +static void init_nb_verlet(FILE                *fp,
+ +                           nonbonded_verlet_t **nb_verlet,
+ +                           const t_inputrec    *ir,
+ +                           const t_forcerec    *fr,
+ +                           const t_commrec     *cr,
+ +                           const char          *nbpu_opt)
+ +{
+ +    nonbonded_verlet_t *nbv;
+ +    int                 i;
+ +    char               *env;
+ +    gmx_bool            bEmulateGPU, bHybridGPURun = FALSE;
+ +
+ +    nbnxn_alloc_t      *nb_alloc;
+ +    nbnxn_free_t       *nb_free;
+ +
+ +    snew(nbv, 1);
+ +
+ +    pick_nbnxn_resources(cr, fr->hwinfo,
+ +                         fr->bNonbonded,
+ +                         &nbv->bUseGPU,
+ +                         &bEmulateGPU);
+ +
+ +    nbv->nbs = NULL;
+ +
+ +    /* Two interaction groups (local, non-local) with domain
+ +     * decomposition, a single one otherwise.
+ +     */
+ +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
+ +    for (i = 0; i < nbv->ngrp; i++)
+ +    {
+ +        nbv->grp[i].nbl_lists.nnbl = 0;
+ +        nbv->grp[i].nbat           = NULL;
+ +        nbv->grp[i].kernel_type    = nbnxnkNotSet;
+ +
+ +        if (i == 0) /* local */
+ +        {
+ +            pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ +                              nbv->bUseGPU, bEmulateGPU, ir,
+ +                              &nbv->grp[i].kernel_type,
+ +                              &nbv->grp[i].ewald_excl,
+ +                              fr->bNonbonded);
+ +        }
+ +        else /* non-local */
+ +        {
+ +            if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
+ +            {
+ +                /* Use GPU for local, select a CPU kernel for non-local */
+ +                pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ +                                  FALSE, FALSE, ir,
+ +                                  &nbv->grp[i].kernel_type,
+ +                                  &nbv->grp[i].ewald_excl,
+ +                                  fr->bNonbonded);
+ +
+ +                bHybridGPURun = TRUE;
+ +            }
+ +            else
+ +            {
+ +                /* Use the same kernel for local and non-local interactions */
+ +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
+ +                nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
+ +            }
+ +        }
+ +    }
+ +
+ +    if (nbv->bUseGPU)
+ +    {
+ +        /* init the NxN GPU data; the last argument tells whether we'll have
+ +         * both local and non-local NB calculation on GPU */
+ +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
+ +                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
+ +                        (nbv->ngrp > 1) && !bHybridGPURun);
+ +
+ +        /* The list-balancing parameter comes from GMX_NB_MIN_CI when that
+ +         * is set, otherwise from the CUDA module.
+ +         */
+ +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+ +        {
+ +            char *end;
+ +
+ +            nbv->min_ci_balanced = strtol(env, &end, 10);
+ +            /* Reject trailing characters and non-positive values */
+ +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+ +            {
+ +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+ +            }
+ +
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+ +                        nbv->min_ci_balanced);
+ +            }
+ +        }
+ +        else
+ +        {
+ +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+ +                        nbv->min_ci_balanced);
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        nbv->min_ci_balanced = 0;
+ +    }
+ +
+ +    *nb_verlet = nbv;
+ +
+ +    nbnxn_init_search(&nbv->nbs,
+ +                      DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
+ +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
+ +                      gmx_omp_nthreads_get(emntNonbonded));
+ +
+ +    for (i = 0; i < nbv->ngrp; i++)
+ +    {
+ +        /* NOTE(review): the allocator choice tests grp[0], not grp[i], so
+ +         * in a "gpu_cpu" run the CPU non-local group also gets
+ +         * pmalloc/pfree - confirm this is intended.
+ +         */
+ +        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
+ +        {
+ +            nb_alloc = &pmalloc;
+ +            nb_free  = &pfree;
+ +        }
+ +        else
+ +        {
+ +            nb_alloc = NULL;
+ +            nb_free  = NULL;
+ +        }
+ +
+ +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
+ +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ +                                /* 8x8x8 "non-simple" lists are ATM always combined */
+ +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ +                                nb_alloc, nb_free);
+ +
+ +        /* A group gets its own atom data only when it is the first group
+ +         * or uses a different kernel type than group 0; otherwise it
+ +         * shares the atom data of group 0.
+ +         */
+ +        if (i == 0 ||
+ +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
+ +        {
+ +            snew(nbv->grp[i].nbat, 1);
+ +            nbnxn_atomdata_init(fp,
+ +                                nbv->grp[i].nbat,
+ +                                nbv->grp[i].kernel_type,
+ +                                fr->ntype, fr->nbfp,
+ +                                ir->opts.ngener,
+ +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
+ +                                nb_alloc, nb_free);
+ +        }
+ +        else
+ +        {
+ +            nbv->grp[i].nbat = nbv->grp[0].nbat;
+ +        }
+ +    }
+ +}
+ +
+ +void init_forcerec(FILE              *fp,
+ +                   const output_env_t oenv,
+ +                   t_forcerec        *fr,
+ +                   t_fcdata          *fcd,
+ +                   const t_inputrec  *ir,
+ +                   const gmx_mtop_t  *mtop,
+ +                   const t_commrec   *cr,
+ +                   matrix             box,
+ +                   const char        *tabfn,
+ +                   const char        *tabafn,
+ +                   const char        *tabpfn,
+ +                   const char        *tabbfn,
+ +                   const char        *nbpu_opt,
+ +                   gmx_bool           bNoSolvOpt,
+ +                   real               print_force)
+ +{
+ +    int            i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
+ +    real           rtab;
+ +    char          *env;
+ +    double         dbl;
+ +    rvec           box_size;
+ +    const t_block *cgs;
+ +    gmx_bool       bGenericKernelOnly;
+ +    gmx_bool       bTab, bSep14tab, bNormalnblists;
+ +    t_nblists     *nbl;
+ +    int           *nm_ind, egp_flags;
+ +
+ +    if (fr->hwinfo == NULL)
+ +    {
+ +        /* Detect hardware, gather information.
+ +         * In mdrun, hwinfo has already been set before calling init_forcerec.
+ +         * Here we ignore GPUs, as tools will not use them anyhow.
+ +         */
+ +        fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
+ +    }
+ +
+ +    /* By default we turn acceleration on, but it might be turned off further down... */
+ +    fr->use_cpu_acceleration = TRUE;
+ +
+ +    fr->bDomDec = DOMAINDECOMP(cr);
+ +
+ +    natoms = mtop->natoms;
+ +
+ +    if (check_box(ir->ePBC, box))
+ +    {
+ +        gmx_fatal(FARGS, check_box(ir->ePBC, box));
+ +    }
+ +
+ +    /* Test particle insertion ? */
+ +    if (EI_TPI(ir->eI))
+ +    {
+ +        /* Set to the size of the molecule to be inserted (the last one) */
+ +        /* Because of old style topologies, we have to use the last cg
+ +         * instead of the last molecule type.
+ +         */
+ +        cgs       = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
+ +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
+ +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
+ +        {
+ +            gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        fr->n_tpi = 0;
+ +    }
+ +
+ +    /* Copy AdResS parameters */
+ +    if (ir->bAdress)
+ +    {
+ +        fr->adress_type           = ir->adress->type;
+ +        fr->adress_const_wf       = ir->adress->const_wf;
+ +        fr->adress_ex_width       = ir->adress->ex_width;
+ +        fr->adress_hy_width       = ir->adress->hy_width;
+ +        fr->adress_icor           = ir->adress->icor;
+ +        fr->adress_site           = ir->adress->site;
+ +        fr->adress_ex_forcecap    = ir->adress->ex_forcecap;
+ +        fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
+ +
+ +
+ +        snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
+ +        for (i = 0; i < ir->adress->n_energy_grps; i++)
+ +        {
+ +            fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
+ +        }
+ +
+ +        fr->n_adress_tf_grps = ir->adress->n_tf_grps;
+ +        snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
+ +        for (i = 0; i < fr->n_adress_tf_grps; i++)
+ +        {
+ +            fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
+ +        }
+ +        copy_rvec(ir->adress->refs, fr->adress_refs);
+ +    }
+ +    else
+ +    {
+ +        fr->adress_type           = eAdressOff;
+ +        fr->adress_do_hybridpairs = FALSE;
+ +    }
+ +
+ +    /* Copy the user determined parameters */
+ +    fr->userint1  = ir->userint1;
+ +    fr->userint2  = ir->userint2;
+ +    fr->userint3  = ir->userint3;
+ +    fr->userint4  = ir->userint4;
+ +    fr->userreal1 = ir->userreal1;
+ +    fr->userreal2 = ir->userreal2;
+ +    fr->userreal3 = ir->userreal3;
+ +    fr->userreal4 = ir->userreal4;
+ +
+ +    /* Shell stuff */
+ +    fr->fc_stepsize = ir->fc_stepsize;
+ +
+ +    /* Free energy */
+ +    fr->efep        = ir->efep;
+ +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
+ +    if (ir->fepvals->bScCoul)
+ +    {
+ +        fr->sc_alphacoul  = ir->fepvals->sc_alpha;
+ +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
+ +    }
+ +    else
+ +    {
+ +        fr->sc_alphacoul  = 0;
+ +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
+ +    }
+ +    fr->sc_power      = ir->fepvals->sc_power;
+ +    fr->sc_r_power    = ir->fepvals->sc_r_power;
+ +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
+ +
+ +    env = getenv("GMX_SCSIGMA_MIN");
+ +    if (env != NULL)
+ +    {
+ +        dbl = 0;
+ +        sscanf(env, "%lf", &dbl);
+ +        fr->sc_sigma6_min = pow(dbl, 6);
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
+ +        }
+ +    }
+ +
+ +    fr->bNonbonded = TRUE;
+ +    if (getenv("GMX_NO_NONBONDED") != NULL)
+ +    {
+ +        /* turn off non-bonded calculations */
+ +        fr->bNonbonded = FALSE;
+ +        md_print_warn(cr, fp,
+ +                      "Found environment variable GMX_NO_NONBONDED.\n"
+ +                      "Disabling nonbonded calculations.\n");
+ +    }
+ +
+ +    bGenericKernelOnly = FALSE;
+ +
+ +    /* We now check in the NS code whether a particular combination of interactions
+ +     * can be used with water optimization, and disable it if that is not the case.
+ +     */
+ +
+ +    if (getenv("GMX_NB_GENERIC") != NULL)
+ +    {
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,
+ +                    "Found environment variable GMX_NB_GENERIC.\n"
+ +                    "Disabling all interaction-specific nonbonded kernels, will only\n"
+ +                    "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
+ +        }
+ +        bGenericKernelOnly = TRUE;
+ +    }
+ +
+ +    if (bGenericKernelOnly == TRUE)
+ +    {
+ +        bNoSolvOpt         = TRUE;
+ +    }
+ +
+ +    if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
+ +    {
+ +        fr->use_cpu_acceleration = FALSE;
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,
+ +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
+ +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
+ +        }
+ +    }
+ +
+ +    fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
+ +
+ +    /* Check if we can/should do all-vs-all kernels */
+ +    fr->bAllvsAll       = can_use_allvsall(ir, FALSE, NULL, NULL);
+ +    fr->AllvsAll_work   = NULL;
+ +    fr->AllvsAll_workgb = NULL;
+ +
+ +    /* All-vs-all kernels have not been implemented in 4.6, and
+ +     * the SIMD group kernels are also buggy in this case. Non-accelerated
+ +     * group kernels are OK. See Redmine #1249. */
+ +    if (fr->bAllvsAll)
+ +    {
+ +        fr->bAllvsAll            = FALSE;
+ +        fr->use_cpu_acceleration = FALSE;
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,
+ +                    "\nYour simulation settings would have triggered the efficient all-vs-all\n"
+ +                    "kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
+ +                    "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
+ +                    "of an unfixed bug. The reference C kernels are correct, though, so\n"
+ +                    "we are proceeding by disabling all CPU architecture-specific\n"
+ +                    "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
+ +                    "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
+ +        }
+ +    }
+ +
+ +    /* Neighbour searching stuff */
+ +    fr->cutoff_scheme = ir->cutoff_scheme;
+ +    fr->bGrid         = (ir->ns_type == ensGRID);
+ +    fr->ePBC          = ir->ePBC;
+ +
+ +    /* Determine if we will do PBC for distances in bonded interactions */
+ +    if (fr->ePBC == epbcNONE)
+ +    {
+ +        fr->bMolPBC = FALSE;
+ +    }
+ +    else
+ +    {
+ +        if (!DOMAINDECOMP(cr))
+ +        {
+ +            /* The group cut-off scheme and SHAKE assume charge groups
+ +             * are whole, but not using molpbc is faster in most cases.
+ +             */
+ +            if (fr->cutoff_scheme == ecutsGROUP ||
+ +                (ir->eConstrAlg == econtSHAKE &&
+ +                 (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
+ +                  gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
+ +            {
+ +                fr->bMolPBC = ir->bPeriodicMols;
+ +            }
+ +            else
+ +            {
+ +                fr->bMolPBC = TRUE;
+ +                if (getenv("GMX_USE_GRAPH") != NULL)
+ +                {
+ +                    fr->bMolPBC = FALSE;
+ +                    if (fp)
+ +                    {
+ +                        fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
+ +        }
+ +    }
+ +    fr->bGB = (ir->implicit_solvent == eisGBSA);
+ +
+ +    fr->rc_scaling = ir->refcoord_scaling;
+ +    copy_rvec(ir->posres_com, fr->posres_com);
+ +    copy_rvec(ir->posres_comB, fr->posres_comB);
+ +    fr->rlist      = cutoff_inf(ir->rlist);
+ +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
+ +    fr->eeltype    = ir->coulombtype;
+ +    fr->vdwtype    = ir->vdwtype;
+ +
+ +    fr->coulomb_modifier = ir->coulomb_modifier;
+ +    fr->vdw_modifier     = ir->vdw_modifier;
+ +
+ +    /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
+ +    switch (fr->eeltype)
+ +    {
+ +        case eelCUT:
+ +            fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
+ +            break;
+ +
+ +        case eelRF:
+ +        case eelGRF:
+ +        case eelRF_NEC:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ +            break;
+ +
+ +        case eelRF_ZERO:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ +            fr->coulomb_modifier          = eintmodEXACTCUTOFF;
+ +            break;
+ +
+ +        case eelSWITCH:
+ +        case eelSHIFT:
+ +        case eelUSER:
+ +        case eelENCADSHIFT:
+ +        case eelPMESWITCH:
+ +        case eelPMEUSER:
+ +        case eelPMEUSERSWITCH:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ +            break;
+ +
+ +        case eelPME:
+ +        case eelEWALD:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
+ +            break;
+ +
+ +        default:
+ +            gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
+ +            break;
+ +    }
+ +
+ +    /* Vdw: Translate from mdp settings to kernel format */
+ +    switch (fr->vdwtype)
+ +    {
+ +        case evdwCUT:
+ +            if (fr->bBHAM)
+ +            {
+ +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
+ +            }
+ +            else
+ +            {
+ +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
+ +            }
+ +            break;
+ +
+ +        case evdwSWITCH:
+ +        case evdwSHIFT:
+ +        case evdwUSER:
+ +        case evdwENCADSHIFT:
+ +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ +            break;
+ +
+ +        default:
+ +            gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
+ +            break;
+ +    }
+ +
+ +    /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
+ +    fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
+ +    fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
+ +
+ +    fr->bTwinRange = fr->rlistlong > fr->rlist;
+ +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
+ +
+ +    fr->reppow     = mtop->ffparams.reppow;
+ +
+ +    if (ir->cutoff_scheme == ecutsGROUP)
+ +    {
+ +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
+ +                          !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
+ +        /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
+ +        fr->bcoultab   = !(fr->eeltype == eelCUT ||
+ +                           fr->eeltype == eelEWALD ||
+ +                           fr->eeltype == eelPME ||
+ +                           fr->eeltype == eelRF ||
+ +                           fr->eeltype == eelRF_ZERO);
+ +
+ +        /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
+ +         * going to be faster to tabulate the interaction than calling the generic kernel.
+ +         */
+ +        if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
+ +        {
+ +            if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
+ +            {
+ +                fr->bcoultab = TRUE;
+ +            }
+ +        }
+ +        else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
+ +                 ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
+ +                   fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
+ +                   (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
+ +        {
+ +            if (fr->rcoulomb != fr->rvdw)
+ +            {
+ +                fr->bcoultab = TRUE;
+ +            }
+ +        }
+ +
+ +        if (getenv("GMX_REQUIRE_TABLES"))
+ +        {
+ +            fr->bvdwtab  = TRUE;
+ +            fr->bcoultab = TRUE;
+ +        }
+ +
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
+ +            fprintf(fp, "Table routines are used for vdw:     %s\n", bool_names[fr->bvdwtab ]);
+ +        }
+ +
+ +        if (fr->bvdwtab == TRUE)
+ +        {
+ +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ +            fr->nbkernel_vdw_modifier    = eintmodNONE;
+ +        }
+ +        if (fr->bcoultab == TRUE)
+ +        {
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ +            fr->nbkernel_elec_modifier    = eintmodNONE;
+ +        }
+ +    }
+ +
+ +    if (ir->cutoff_scheme == ecutsVERLET)
+ +    {
+ +        if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
+ +        {
+ +            gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
+ +        }
+ +        fr->bvdwtab  = FALSE;
+ +        fr->bcoultab = FALSE;
+ +    }
+ +
+ +    /* Tables are used for direct ewald sum */
+ +    if (fr->bEwald)
+ +    {
+ +        if (EEL_PME(ir->coulombtype))
+ +        {
+ +            if (fp)
+ +            {
+ +                fprintf(fp, "Will do PME sum in reciprocal space.\n");
+ +            }
+ +            if (ir->coulombtype == eelP3M_AD)
+ +            {
+ +                please_cite(fp, "Hockney1988");
+ +                please_cite(fp, "Ballenegger2012");
+ +            }
+ +            else
+ +            {
+ +                please_cite(fp, "Essmann95a");
+ +            }
+ +
+ +            if (ir->ewald_geometry == eewg3DC)
+ +            {
+ +                if (fp)
+ +                {
+ +                    fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
+ +                }
+ +                please_cite(fp, "In-Chul99a");
+ +            }
+ +        }
+ +        fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ +        init_ewald_tab(&(fr->ewald_table), ir, fp);
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
+ +                    1/fr->ewaldcoeff);
+ +        }
+ +    }
+ +
+ +    /* Electrostatics */
+ +    fr->epsilon_r       = ir->epsilon_r;
+ +    fr->epsilon_rf      = ir->epsilon_rf;
+ +    fr->fudgeQQ         = mtop->ffparams.fudgeQQ;
+ +    fr->rcoulomb_switch = ir->rcoulomb_switch;
+ +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
+ +
+ +    /* Parameters for generalized RF */
+ +    fr->zsquare = 0.0;
+ +    fr->temp    = 0.0;
+ +
+ +    if (fr->eeltype == eelGRF)
+ +    {
+ +        init_generalized_rf(fp, mtop, ir, fr);
+ +    }
+ +    else if (fr->eeltype == eelSHIFT)
+ +    {
+ +        for (m = 0; (m < DIM); m++)
+ +        {
+ +            box_size[m] = box[m][m];
+ +        }
+ +
+ +        if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
+ +        {
+ +            set_shift_consts(fr->rcoulomb_switch, fr->rcoulomb, box_size);
+ +        }
+ +    }
+ +
+ +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
+ +                       gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
+ +                       gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
+ +                       IR_ELEC_FIELD(*ir) ||
+ +                       (fr->adress_icor != eAdressICOff)
+ +                       );
+ +
+ +    if (fr->cutoff_scheme == ecutsGROUP &&
+ +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
+ +    {
+ +        /* Count the total number of charge groups */
+ +        fr->cg_nalloc = ncg_mtop(mtop);
+ +        srenew(fr->cg_cm, fr->cg_nalloc);
+ +    }
+ +    if (fr->shift_vec == NULL)
+ +    {
+ +        snew(fr->shift_vec, SHIFTS);
+ +    }
+ +
+ +    if (fr->fshift == NULL)
+ +    {
+ +        snew(fr->fshift, SHIFTS);
+ +    }
+ +
+ +    if (fr->nbfp == NULL)
+ +    {
+ +        fr->ntype = mtop->ffparams.atnr;
+ +        fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
+ +    }
+ +
+ +    /* Copy the energy group exclusions */
+ +    fr->egp_flags = ir->opts.egp_flags;
+ +
+ +    /* Van der Waals stuff */
+ +    fr->rvdw        = cutoff_inf(ir->rvdw);
+ +    fr->rvdw_switch = ir->rvdw_switch;
+ +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
+ +    {
+ +        if (fr->rvdw_switch >= fr->rvdw)
+ +        {
+ +            gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
+ +                      fr->rvdw_switch, fr->rvdw);
+ +        }
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
+ +                    (fr->eeltype == eelSWITCH) ? "switched" : "shifted",
+ +                    fr->rvdw_switch, fr->rvdw);
+ +        }
+ +    }
+ +
+ +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
+ +    {
+ +        gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
+ +    }
+ +
+ +    if (fp)
+ +    {
+ +        fprintf(fp, "Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
+ +                fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
+ +    }
+ +
+ +    fr->eDispCorr = ir->eDispCorr;
+ +    if (ir->eDispCorr != edispcNO)
+ +    {
+ +        set_avcsixtwelve(fp, fr, mtop);
+ +    }
+ +
+ +    if (fr->bBHAM)
+ +    {
+ +        set_bham_b_max(fp, fr, mtop);
+ +    }
+ +
+ +    fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
+ +
+ +    /* Copy the GBSA data (radius, volume and surftens for each
+ +     * atomtype) from the topology atomtype section to forcerec.
+ +     */
+ +    snew(fr->atype_radius, fr->ntype);
+ +    snew(fr->atype_vol, fr->ntype);
+ +    snew(fr->atype_surftens, fr->ntype);
+ +    snew(fr->atype_gb_radius, fr->ntype);
+ +    snew(fr->atype_S_hct, fr->ntype);
+ +
+ +    if (mtop->atomtypes.nr > 0)
+ +    {
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_radius[i] = mtop->atomtypes.radius[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
+ +        }
+ +    }
+ +
+ +    /* Generate the GB table if needed */
+ +    if (fr->bGB)
+ +    {
+ +#ifdef GMX_DOUBLE
+ +        fr->gbtabscale = 2000;
+ +#else
+ +        fr->gbtabscale = 500;
+ +#endif
+ +
+ +        fr->gbtabr = 100;
+ +        fr->gbtab  = make_gb_table(oenv, fr);
+ +
+ +        init_gb(&fr->born, cr, fr, ir, mtop, ir->gb_algorithm);
+ +
+ +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
+ +        if (!DOMAINDECOMP(cr))
+ +        {
+ +            make_local_gb(cr, fr->born, ir->gb_algorithm);
+ +        }
+ +    }
+ +
+ +    /* Set the charge scaling */
+ +    if (fr->epsilon_r != 0)
+ +    {
+ +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
+ +    }
+ +    else
+ +    {
+ +        /* eps = 0 is infinite dielectric: no coulomb interactions */
+ +        fr->epsfac = 0;
+ +    }
+ +
+ +    /* Reaction field constants */
+ +    if (EEL_RF(fr->eeltype))
+ +    {
+ +        calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
+ +                   fr->rcoulomb, fr->temp, fr->zsquare, box,
+ +                   &fr->kappa, &fr->k_rf, &fr->c_rf);
+ +    }
+ +
+ +    set_chargesum(fp, fr, mtop);
+ +
+ +    /* if we are using LR electrostatics, and they are tabulated,
+ +     * the tables will contain modified coulomb interactions.
+ +     * Since we want to use the non-shifted ones for 1-4
+ +     * coulombic interactions, we must have an extra set of tables.
+ +     */
+ +
+ +    /* Construct tables.
+ +     * A little unnecessary to make both vdw and coul tables sometimes,
+ +     * but what the heck... */
+ +
+ +    bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
+ +
+ +    bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
+ +                  fr->bBHAM || fr->bEwald) &&
+ +                 (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
+ +                  gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
+ +                  gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
+ +
+ +    negp_pp   = ir->opts.ngener - ir->nwall;
+ +    negptable = 0;
+ +    if (!bTab)
+ +    {
+ +        bNormalnblists = TRUE;
+ +        fr->nnblists   = 1;
+ +    }
+ +    else
+ +    {
+ +        bNormalnblists = (ir->eDispCorr != edispcNO);
+ +        for (egi = 0; egi < negp_pp; egi++)
+ +        {
+ +            for (egj = egi; egj < negp_pp; egj++)
+ +            {
+ +                egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ +                if (!(egp_flags & EGP_EXCL))
+ +                {
+ +                    if (egp_flags & EGP_TABLE)
+ +                    {
+ +                        negptable++;
+ +                    }
+ +                    else
+ +                    {
+ +                        bNormalnblists = TRUE;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        if (bNormalnblists)
+ +        {
+ +            fr->nnblists = negptable + 1;
+ +        }
+ +        else
+ +        {
+ +            fr->nnblists = negptable;
+ +        }
+ +        if (fr->nnblists > 1)
+ +        {
+ +            snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
+ +        }
+ +    }
+ +
+ +    if (ir->adress)
+ +    {
+ +        fr->nnblists *= 2;
+ +    }
+ +
+ +    snew(fr->nblists, fr->nnblists);
+ +
+ +    /* This code automatically gives table length tabext without cut-off's,
+ +     * in that case grompp should already have checked that we do not need
+ +     * normal tables and we only generate tables for 1-4 interactions.
+ +     */
+ +    rtab = ir->rlistlong + ir->tabext;
+ +
+ +    if (bTab)
+ +    {
+ +        /* make tables for ordinary interactions */
+ +        if (bNormalnblists)
+ +        {
+ +            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
+ +            if (ir->adress)
+ +            {
+ +                make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
+ +            }
+ +            if (!bSep14tab)
+ +            {
+ +                fr->tab14 = fr->nblists[0].table_elec_vdw;
+ +            }
+ +            m = 1;
+ +        }
+ +        else
+ +        {
+ +            m = 0;
+ +        }
+ +        if (negptable > 0)
+ +        {
+ +            /* Read the special tables for certain energy group pairs */
+ +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
+ +            for (egi = 0; egi < negp_pp; egi++)
+ +            {
+ +                for (egj = egi; egj < negp_pp; egj++)
+ +                {
+ +                    egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
+ +                    {
+ +                        nbl = &(fr->nblists[m]);
+ +                        if (fr->nnblists > 1)
+ +                        {
+ +                            fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
+ +                        }
+ +                        /* Read the table file with the two energy groups names appended */
+ +                        make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ +                                        *mtop->groups.grpname[nm_ind[egi]],
+ +                                        *mtop->groups.grpname[nm_ind[egj]],
+ +                                        &fr->nblists[m]);
+ +                        if (ir->adress)
+ +                        {
+ +                            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ +                                            *mtop->groups.grpname[nm_ind[egi]],
+ +                                            *mtop->groups.grpname[nm_ind[egj]],
+ +                                            &fr->nblists[fr->nnblists/2+m]);
+ +                        }
+ +                        m++;
+ +                    }
+ +                    else if (fr->nnblists > 1)
+ +                    {
+ +                        fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +    if (bSep14tab)
+ +    {
+ +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
+ +        fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
+ +                                GMX_MAKETABLES_14ONLY);
+ +    }
+ +
+ +    /* Read AdResS Thermo Force table if needed */
+ +    if (fr->adress_icor == eAdressICThermoForce)
+ +    {
+ +        /* old todo replace */
+ +
+ +        if (ir->adress->n_tf_grps > 0)
+ +        {
+ +            make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
+ +
+ +        }
+ +        else
+ +        {
+ +            /* load the default table */
+ +            snew(fr->atf_tabs, 1);
+ +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
+ +        }
+ +    }
+ +
+ +    /* Wall stuff */
+ +    fr->nwall = ir->nwall;
+ +    if (ir->nwall && ir->wall_type == ewtTABLE)
+ +    {
+ +        make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
+ +    }
+ +
+ +    if (fcd && tabbfn)
+ +    {
+ +        fcd->bondtab  = make_bonded_tables(fp,
+ +                                           F_TABBONDS, F_TABBONDSNC,
+ +                                           mtop, tabbfn, "b");
+ +        fcd->angletab = make_bonded_tables(fp,
+ +                                           F_TABANGLES, -1,
+ +                                           mtop, tabbfn, "a");
+ +        fcd->dihtab   = make_bonded_tables(fp,
+ +                                           F_TABDIHS, -1,
+ +                                           mtop, tabbfn, "d");
+ +    }
+ +    else
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
+ +        }
+ +    }
+ +
+ +    /* QM/MM initialization if requested
+ +     */
+ +    if (ir->bQMMM)
+ +    {
+ +        fprintf(stderr, "QM/MM calculation requested.\n");
+ +    }
+ +
+ +    fr->bQMMM      = ir->bQMMM;
+ +    fr->qr         = mk_QMMMrec();
+ +
+ +    /* Set all the static charge group info */
+ +    fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
+ +                                   &fr->bExcl_IntraCGAll_InterCGNone);
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        fr->cginfo = NULL;
+ +    }
+ +    else
+ +    {
+ +        fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
+ +    }
+ +
+ +    if (!DOMAINDECOMP(cr))
+ +    {
+ +        /* When using particle decomposition, the effect of the second argument,
+ +         * which sets fr->hcg, is corrected later in do_md and init_em.
+ +         */
+ +        forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
+ +                            mtop->natoms, mtop->natoms, mtop->natoms);
+ +    }
+ +
+ +    fr->print_force = print_force;
+ +
+ +
+ +    /* coarse load balancing vars */
+ +    fr->t_fnbf    = 0.;
+ +    fr->t_wait    = 0.;
+ +    fr->timesteps = 0;
+ +
+ +    /* Initialize neighbor search */
+ +    init_ns(fp, cr, &fr->ns, fr, mtop);
+ +
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        gmx_nonbonded_setup(fr, bGenericKernelOnly);
+ +        /*
+ +           if (ir->bAdress)
+ +            {
+ +                gmx_setup_adress_kernels(fp,bGenericKernelOnly);
+ +            }
+ +         */
+ +    }
+ +
+ +    /* Initialize the thread working data for bonded interactions */
+ +    init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
+ +
+ +    snew(fr->excl_load, fr->nthreads+1);
+ +
+ +    if (fr->cutoff_scheme == ecutsVERLET)
+ +    {
+ +        if (ir->rcoulomb != ir->rvdw)
+ +        {
+ +            gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
+ +        }
+ +
+ +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
+ +    }
+ +
+ +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
+ +    init_interaction_const(fp, &fr->ic, fr, rtab);
+ +    if (ir->eDispCorr != edispcNO)
+ +    {
+ +        calc_enervirdiff(fp, ir->eDispCorr, fr);
+ +    }
+ +}
+ +
+ +/* Debug-print helpers: each writes "<argument text>: <value>\n" to fp.
+ + * The label is the stringified macro argument (#r, #i, #b), so the exact
+ + * spelling of the argument at the call site ends up in the output.
+ + * All three parenthesize fp and the value argument in the same way.
+ + * pr_bool indexes bool_names with its argument to obtain the printed text.
+ + */
+ +#define pr_real(fp, r) fprintf((fp), "%s: %e\n",#r, (r))
+ +#define pr_int(fp, i)  fprintf((fp), "%s: %d\n",#i, (i))
+ +#define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[(b)])
+ +
+ +/* Write a short selection of force-record settings to fp for debugging.
+ + *
+ + * Every value is emitted through pr_real/pr_int/pr_bool, which label the
+ + * value with the stringified argument; the expressions below therefore
+ + * appear verbatim in the output and must not be respelled casually.
+ + * The stream is flushed before returning.
+ + */
+ +void pr_forcerec(FILE *fp, t_forcerec *fr)
+ +{
+ +    int i;
+ +
+ +    pr_real(fp, fr->rlist);
+ +    pr_real(fp, fr->rcoulomb);
+ +    pr_real(fp, fr->fudgeQQ);
+ +    pr_bool(fp, fr->bGrid);
+ +    pr_bool(fp, fr->bTwinRange);
+ +    /*pr_int(fp,fr->cg0);
+ +       pr_int(fp,fr->hcg);*/
+ +    /* One line per neighbour-list entry: the size field n of its combined
+ +     * elec+vdw table. The label is the same literal text for every i. */
+ +    for (i = 0; i < fr->nnblists; i++)
+ +    {
+ +        pr_int(fp, fr->nblists[i].table_elec_vdw.n);
+ +    }
+ +    pr_real(fp, fr->rcoulomb_switch);
+ +    /* NOTE(review): fr->rcoulomb was already printed above; this second
+ +     * print looks redundant - confirm whether another field was intended. */
+ +    pr_real(fp, fr->rcoulomb);
+ +
+ +    fflush(fp);
+ +}
+ +
+ +/* Count the exclusions of atom i that point to a higher atom index,
+ + * i.e. the entries a[ind[i]] .. a[ind[i+1]-1] that are larger than i.
+ + */
+ +static int excl_count_above(const int *ind, const int *a, int i)
+ +{
+ +    int j, count;
+ +
+ +    count = 0;
+ +    for (j = ind[i]; j < ind[i+1]; j++)
+ +    {
+ +        if (a[j] > i)
+ +        {
+ +            count++;
+ +        }
+ +    }
+ +
+ +    return count;
+ +}
+ +
+ +/* Fill fr->excl_load[0..fr->nthreads] with atom-index boundaries that split
+ + * the exclusion work over the threads.
+ + *
+ + * Only exclusion pairs with a[j] > i are counted. Thread t gets the atoms in
+ + * [excl_load[t-1], excl_load[t]), where the boundary advances until the
+ + * running count reaches (total*t)/nthreads.
+ + * With particle decomposition (cr non-NULL and PARTDECOMP(cr)) the range is
+ + * instead taken from pd_at_range and only entries 0 and 1 are written.
+ + */
+ +void forcerec_set_excl_load(t_forcerec *fr,
+ +                            const gmx_localtop_t *top, const t_commrec *cr)
+ +{
+ +    const int *excl_index, *excl_atoms;
+ +    int        thread, atom, total, assigned, target;
+ +
+ +    if (cr != NULL && PARTDECOMP(cr))
+ +    {
+ +        /* No OpenMP with particle decomposition */
+ +        pd_at_range(cr,
+ +                    &fr->excl_load[0],
+ +                    &fr->excl_load[1]);
+ +
+ +        return;
+ +    }
+ +
+ +    excl_index = top->excls.index;
+ +    excl_atoms = top->excls.a;
+ +
+ +    /* Total amount of work: all counted exclusion pairs */
+ +    total = 0;
+ +    for (atom = 0; atom < top->excls.nr; atom++)
+ +    {
+ +        total += excl_count_above(excl_index, excl_atoms, atom);
+ +    }
+ +
+ +    /* Walk the atoms once, closing a thread's range whenever the running
+ +     * count reaches that thread's cumulative target. */
+ +    fr->excl_load[0] = 0;
+ +    assigned         = 0;
+ +    atom             = 0;
+ +    for (thread = 1; thread <= fr->nthreads; thread++)
+ +    {
+ +        target = (total*thread)/fr->nthreads;
+ +        for (; atom < top->excls.nr && assigned < target; atom++)
+ +        {
+ +            assigned += excl_count_above(excl_index, excl_atoms, atom);
+ +        }
+ +        fr->excl_load[thread] = atom;
+ +    }
+ +}
diff --cc src/gromacs/mdlib/nbnxn_atomdata.c

index 10aa13a1a10a0f08276923f5964de027d9e1ac27,0000000000000000000000000000000000000000..45be26bacda3dedc14aa8b1e333b1c697ecaaf26

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_atomdata.c
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@@ -1,1362 -1,0 +1,1361 @@@
-         /* Set the diagonal cluster pair exclusion mask setup data.
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + */
+ +
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +#include <string.h>
+ +#include "smalloc.h"
+ +#include "macros.h"
+ +#include "vec.h"
+ +#include "nbnxn_consts.h"
+ +#include "nbnxn_internal.h"
+ +#include "nbnxn_search.h"
+ +#include "nbnxn_atomdata.h"
+ +#include "gmx_omp_nthreads.h"
+ +
+ +/* Default nbnxn allocator: stores in *ptr a block of nbytes bytes obtained
+ + * from save_malloc_aligned with an alignment of NBNXN_MEM_ALIGN.
+ + */
+ +void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
+ +{
+ +    void *block;
+ +
+ +    /* nbytes elements of size 1, i.e. nbytes bytes in total */
+ +    block = save_malloc_aligned("ptr", __FILE__, __LINE__,
+ +                                nbytes, 1, NBNXN_MEM_ALIGN);
+ +    *ptr  = block;
+ +}
+ +
+ +/* Free function for memory allocated with nbnxn_alloc_aligned.
+ + *
+ + * Simply forwards ptr to sfree_aligned.
+ + * NOTE(review): sfree_aligned may be a macro that uses the spelling of its
+ + * argument - confirm before renaming ptr or wrapping the call.
+ + */
+ +void nbnxn_free_aligned(void *ptr)
+ +{
+ +    sfree_aligned(ptr);
+ +}
+ +
+ +/* Reallocation wrapper for nbnxn data structures.
+ + *
+ + * Obtains a new block of nbytes_new bytes through ma, copies the first
+ + * nbytes_copy bytes of the old block *ptr into it, releases the old block
+ + * through mf (when it is not NULL) and stores the new block in *ptr.
+ + * Fatal error when a non-empty allocation yields NULL; inconsistency error
+ + * when more bytes are to be copied than the new block can hold.
+ + */
+ +void nbnxn_realloc_void(void **ptr,
+ +                        int nbytes_copy, int nbytes_new,
+ +                        nbnxn_alloc_t *ma,
+ +                        nbnxn_free_t  *mf)
+ +{
+ +    void *old_block = *ptr;
+ +    void *new_block;
+ +
+ +    ma(&new_block, nbytes_new);
+ +
+ +    if (new_block == NULL && nbytes_new > 0)
+ +    {
+ +        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
+ +    }
+ +
+ +    if (nbytes_copy > 0)
+ +    {
+ +        /* The preserved prefix must fit in the new block */
+ +        if (nbytes_copy > nbytes_new)
+ +        {
+ +            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
+ +        }
+ +        memcpy(new_block, old_block, nbytes_copy);
+ +    }
+ +
+ +    if (old_block != NULL)
+ +    {
+ +        mf(old_block);
+ +    }
+ +
+ +    *ptr = new_block;
+ +}
+ +
+ +/* Reallocate the arrays of the nbnxn_atomdata_t for a size of n atoms.
+ + *
+ + * Every array goes through nbnxn_realloc_void with nbat's own alloc/free
+ + * callbacks; the data of the current nbat->natoms atoms is carried over.
+ + * On return nbat->nalloc equals n; nbat->natoms is left unchanged.
+ + */
+ +void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
+ +{
+ +    const int      natoms_old = nbat->natoms;
+ +    nbnxn_alloc_t *ma         = nbat->alloc;
+ +    nbnxn_free_t  *mf         = nbat->free;
+ +    int            th;
+ +
+ +    /* One atom type per atom */
+ +    nbnxn_realloc_void((void **)&nbat->type,
+ +                       natoms_old*sizeof(*nbat->type),
+ +                       n*sizeof(*nbat->type),
+ +                       ma, mf);
+ +
+ +    /* Two LJ combination entries per atom */
+ +    nbnxn_realloc_void((void **)&nbat->lj_comb,
+ +                       natoms_old*2*sizeof(*nbat->lj_comb),
+ +                       n*2*sizeof(*nbat->lj_comb),
+ +                       ma, mf);
+ +
+ +    /* A separate charge array is only kept when x is not in XYZQ format */
+ +    if (nbat->XFormat != nbatXYZQ)
+ +    {
+ +        nbnxn_realloc_void((void **)&nbat->q,
+ +                           natoms_old*sizeof(*nbat->q),
+ +                           n*sizeof(*nbat->q),
+ +                           ma, mf);
+ +    }
+ +
+ +    /* Energy group data holds one entry per na_c atoms and is only needed
+ +     * with more than one energy group */
+ +    if (nbat->nenergrp > 1)
+ +    {
+ +        nbnxn_realloc_void((void **)&nbat->energrp,
+ +                           natoms_old/nbat->na_c*sizeof(*nbat->energrp),
+ +                           n/nbat->na_c*sizeof(*nbat->energrp),
+ +                           ma, mf);
+ +    }
+ +
+ +    /* Coordinates: xstride reals per atom */
+ +    nbnxn_realloc_void((void **)&nbat->x,
+ +                       natoms_old*nbat->xstride*sizeof(*nbat->x),
+ +                       n*nbat->xstride*sizeof(*nbat->x),
+ +                       ma, mf);
+ +
+ +    /* Force buffer of every output: exactly fstride reals per atom
+ +     * (no additional padding element is allocated here) */
+ +    for (th = 0; th < nbat->nout; th++)
+ +    {
+ +        nbnxn_realloc_void((void **)&nbat->out[th].f,
+ +                           natoms_old*nbat->fstride*sizeof(*nbat->out[th].f),
+ +                           n*nbat->fstride*sizeof(*nbat->out[th].f),
+ +                           ma, mf);
+ +    }
+ +
+ +    nbat->nalloc = n;
+ +}
+ +
+ +/* Initializes an nbnxn_atomdata_output_t data structure.
+ + *
+ + * The force pointer starts out NULL. The shift-force buffer and the
+ + * nenergrp*nenergrp energy buffers (Vvdw, Vc) are obtained through ma.
+ + * Only for the two SIMD kernel types are the larger VSvdw/VSc buffers
+ + * allocated as well; for every other kernel type nVS is set to 0 and those
+ + * two pointers are not touched.
+ + */
+ +static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
+ +                                       int nb_kernel_type,
+ +                                       int nenergrp, int stride,
+ +                                       nbnxn_alloc_t *ma)
+ +{
+ +    int j_cluster_size;
+ +
+ +    out->f = NULL;
+ +    ma((void **)&out->fshift, SHIFTS*DIM*sizeof(*out->fshift));
+ +
+ +    /* One energy entry per pair of energy groups */
+ +    out->nV = nenergrp*nenergrp;
+ +    ma((void **)&out->Vvdw, out->nV*sizeof(*out->Vvdw));
+ +    ma((void **)&out->Vc, out->nV*sizeof(*out->Vc  ));
+ +
+ +    if (nb_kernel_type != nbnxnk4xN_SIMD_4xN &&
+ +        nb_kernel_type != nbnxnk4xN_SIMD_2xNN)
+ +    {
+ +        /* No SIMD energy buffers for this kernel type */
+ +        out->nVS = 0;
+ +        return;
+ +    }
+ +
+ +    j_cluster_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
+ +    out->nVS       = nenergrp*nenergrp*stride*(j_cluster_size>>1)*j_cluster_size;
+ +    ma((void **)&out->VSvdw, out->nVS*sizeof(*out->VSvdw));
+ +    ma((void **)&out->VSc, out->nVS*sizeof(*out->VSc  ));
+ +}
+ +
+ +/* Gather integers into nbnxn order.
+ + *
+ + * For the first na slots, innb[k] = in[a[k]]. The remaining slots up to
+ + * na_round (the partially filled last cell) are set to fill. Nothing is
+ + * written beyond max(na, na_round) entries.
+ + */
+ +static void copy_int_to_nbat_int(const int *a, int na, int na_round,
+ +                                 const int *in, int fill, int *innb)
+ +{
+ +    int slot = 0;
+ +
+ +    /* Real atoms: look the value up through the index array */
+ +    while (slot < na)
+ +    {
+ +        innb[slot] = in[a[slot]];
+ +        slot++;
+ +    }
+ +
+ +    /* Padding entries that complete the last cell */
+ +    while (slot < na_round)
+ +    {
+ +        innb[slot] = fill;
+ +        slot++;
+ +    }
+ +}
+ +
+ +/* Zero the three coordinate components of na consecutive atoms, starting
+ + * at atom a0, in the array xnb laid out according to nbatFormat.
+ + *
+ + * nbatXYZ / nbatXYZQ: atom-major storage with a fixed stride per atom; only
+ + *   the first DIM entries of each atom are cleared.
+ + * nbatX4 / nbatX8: packed storage; the start index comes from
+ + *   X4_IND_A / X8_IND_A and is advanced atom by atom, skipping the other
+ + *   components' lanes whenever a pack of PACK_X4 / PACK_X8 atoms is complete.
+ + * Any other format value leaves xnb untouched.
+ + */
+ +static void clear_nbat_real(int na, int nbatFormat, real *xnb, int a0)
+ +{
+ +    int atom, dim, idx, lane, stride;
+ +
+ +    switch (nbatFormat)
+ +    {
+ +        case nbatXYZ:
+ +        case nbatXYZQ:
+ +            stride = (nbatFormat == nbatXYZ) ? STRIDE_XYZ : STRIDE_XYZQ;
+ +            for (atom = 0; atom < na; atom++)
+ +            {
+ +                for (dim = 0; dim < DIM; dim++)
+ +                {
+ +                    xnb[(a0+atom)*stride+dim] = 0;
+ +                }
+ +            }
+ +            break;
+ +        case nbatX4:
+ +            idx  = X4_IND_A(a0);
+ +            lane = a0 & (PACK_X4-1);
+ +            for (atom = 0; atom < na; atom++)
+ +            {
+ +                xnb[idx+XX*PACK_X4] = 0;
+ +                xnb[idx+YY*PACK_X4] = 0;
+ +                xnb[idx+ZZ*PACK_X4] = 0;
+ +                idx++;
+ +                if (++lane == PACK_X4)
+ +                {
+ +                    /* Pack complete: jump over the y and z lanes */
+ +                    idx += (DIM-1)*PACK_X4;
+ +                    lane = 0;
+ +                }
+ +            }
+ +            break;
+ +        case nbatX8:
+ +            idx  = X8_IND_A(a0);
+ +            lane = a0 & (PACK_X8-1);
+ +            for (atom = 0; atom < na; atom++)
+ +            {
+ +                xnb[idx+XX*PACK_X8] = 0;
+ +                xnb[idx+YY*PACK_X8] = 0;
+ +                xnb[idx+ZZ*PACK_X8] = 0;
+ +                idx++;
+ +                if (++lane == PACK_X8)
+ +                {
+ +                    /* Pack complete: jump over the y and z lanes */
+ +                    idx += (DIM-1)*PACK_X8;
+ +                    lane = 0;
+ +                }
+ +            }
+ +            break;
+ +    }
+ +}
+ +
+ +void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
+ +                            rvec *x, int nbatFormat, real *xnb, int a0,
+ +                            int cx, int cy, int cz)
+ +{   /* Copies the coordinates x[a[i]], i in [0, na), into xnb in layout
+ +     * nbatFormat, starting at atom slot a0, and then writes filler
+ +     * coordinates for the slots na .. na_round-1.
+ +     * cx, cy and cz are only used to compute the filler coordinates.
+ +     * A format that is not listed in the switch triggers gmx_incons().
+ +     */
+ +    int i, j, c;
+ +
+ +/* We might need to place filler particles to fill up the cell to na_round.
+ + * The coefficients (LJ and q) for such particles are zero.
+ + * But we might still get NaN as 0*NaN when distances are too small.
+ + * We hope that -107 nm is far enough away from zero
+ + * to avoid accidental short distances to particles shifted down for pbc.
+ + */
+ +#define NBAT_FAR_AWAY 107
+ +
+ +    switch (nbatFormat)
+ +    {
+ +        case nbatXYZ:
+ +            j = a0*STRIDE_XYZ;
+ +            for (i = 0; i < na; i++)
+ +            {
+ +                xnb[j++] = x[a[i]][XX];
+ +                xnb[j++] = x[a[i]][YY];
+ +                xnb[j++] = x[a[i]][ZZ];
+ +            }
+ +            /* Complete the partially filled last cell with particles far apart.
+ +             * The z coordinate depends on the loop index i, so every filler
+ +             * written by this loop gets a different position.
+ +             */
+ +            for (; i < na_round; i++)
+ +            {
+ +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
+ +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
+ +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
+ +            }
+ +            break;
+ +        case nbatXYZQ:
+ +            j = a0*STRIDE_XYZQ;
+ +            for (i = 0; i < na; i++)
+ +            {
+ +                xnb[j++] = x[a[i]][XX];
+ +                xnb[j++] = x[a[i]][YY];
+ +                xnb[j++] = x[a[i]][ZZ];
+ +                j++; /* skip the entry following z; it is not written here */
+ +            }
+ +            /* Complete the partially filled last cell with particles far apart */
+ +            for (; i < na_round; i++)
+ +            {
+ +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
+ +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
+ +                xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
+ +                j++; /* skip the entry following z; it is not written here */
+ +            }
+ +            break;
+ +        case nbatX4:
+ +            j = X4_IND_A(a0);     /* presumably the index of the x entry of atom a0 - confirm against the macro */
+ +            c = a0 & (PACK_X4-1); /* position of atom a0 within its pack */
+ +            for (i = 0; i < na; i++)
+ +            {
+ +                xnb[j+XX*PACK_X4] = x[a[i]][XX];
+ +                xnb[j+YY*PACK_X4] = x[a[i]][YY];
+ +                xnb[j+ZZ*PACK_X4] = x[a[i]][ZZ];
+ +                j++;
+ +                c++;
+ +                if (c == PACK_X4)
+ +                {
+ +                    j += (DIM-1)*PACK_X4; /* pack is full: jump over the remaining DIM-1 sub-blocks */
+ +                    c  = 0;
+ +                }
+ +            }
+ +            /* Complete the partially filled last cell with particles far apart */
+ +            for (; i < na_round; i++)
+ +            {
+ +                xnb[j+XX*PACK_X4] = -NBAT_FAR_AWAY*(1 + cx);
+ +                xnb[j+YY*PACK_X4] = -NBAT_FAR_AWAY*(1 + cy);
+ +                xnb[j+ZZ*PACK_X4] = -NBAT_FAR_AWAY*(1 + cz + i);
+ +                j++;
+ +                c++;
+ +                if (c == PACK_X4)
+ +                {
+ +                    j += (DIM-1)*PACK_X4;
+ +                    c  = 0;
+ +                }
+ +            }
+ +            break;
+ +        case nbatX8:
+ +            j = X8_IND_A(a0);       /* presumably the index of the x entry of atom a0 - confirm against the macro */
+ +            c = a0 & (PACK_X8 - 1); /* position of atom a0 within its pack */
+ +            for (i = 0; i < na; i++)
+ +            {
+ +                xnb[j+XX*PACK_X8] = x[a[i]][XX];
+ +                xnb[j+YY*PACK_X8] = x[a[i]][YY];
+ +                xnb[j+ZZ*PACK_X8] = x[a[i]][ZZ];
+ +                j++;
+ +                c++;
+ +                if (c == PACK_X8)
+ +                {
+ +                    j += (DIM-1)*PACK_X8; /* pack is full: jump over the remaining DIM-1 sub-blocks */
+ +                    c  = 0;
+ +                }
+ +            }
+ +            /* Complete the partially filled last cell with particles far apart */
+ +            for (; i < na_round; i++)
+ +            {
+ +                xnb[j+XX*PACK_X8] = -NBAT_FAR_AWAY*(1 + cx);
+ +                xnb[j+YY*PACK_X8] = -NBAT_FAR_AWAY*(1 + cy);
+ +                xnb[j+ZZ*PACK_X8] = -NBAT_FAR_AWAY*(1 + cz + i);
+ +                j++;
+ +                c++;
+ +                if (c == PACK_X8)
+ +                {
+ +                    j += (DIM-1)*PACK_X8;
+ +                    c  = 0;
+ +                }
+ +            }
+ +            break;
+ +        default:
+ +            gmx_incons("Unsupported nbnxn_atomdata_t format");
+ +    }
+ +}
+ +
+ +/* Sets the LJ parameter data required by the combination rule that is
+ + * already stored in nbat->comb_rule:
+ + *  - ljcrGEOM and ljcrLB fill the per-type array nbat->nbfp_comb
+ + *    from the diagonal of nbat->nbfp,
+ + *  - ljcrNONE allocates nbat->nbfp_s4 and fills it from the full
+ + *    nbat->nbfp matrix.
+ + * Any other value of nbat->comb_rule triggers gmx_incons().
+ + */
+ +static void set_combination_rule_data(nbnxn_atomdata_t *nbat)
+ +{
+ +    int  nt, i, j;
+ +    real c6, c12;
+ +
+ +    nt = nbat->ntype;
+ +
+ +    switch (nbat->comb_rule)
+ +    {
+ +        case  ljcrGEOM:
+ +            nbat->comb_rule = ljcrGEOM; /* redundant: this case is only reached when comb_rule is ljcrGEOM */
+ +
+ +            for (i = 0; i < nt; i++)
+ +            {
+ +                /* Store the square roots of the diagonal entries of the nbfp matrix */
+ +                nbat->nbfp_comb[i*2  ] = sqrt(nbat->nbfp[(i*nt+i)*2  ]);
+ +                nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
+ +            }
+ +            break;
+ +        case ljcrLB:
+ +            for (i = 0; i < nt; i++)
+ +            {
+ +                /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
+ +                c6  = nbat->nbfp[(i*nt+i)*2  ];
+ +                c12 = nbat->nbfp[(i*nt+i)*2+1];
+ +                if (c6 > 0 && c12 > 0)
+ +                {
+ +                    /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
+ +                     * so we get 6*C6 and 12*C12 after combining.
+ +                     */
+ +                    nbat->nbfp_comb[i*2  ] = 0.5*pow(c12/c6, 1.0/6.0);
+ +                    nbat->nbfp_comb[i*2+1] = sqrt(c6*c6/c12);
+ +                }
+ +                else
+ +                {
+ +                    /* Either parameter is <= 0: store zeros for this type */
+ +                    nbat->nbfp_comb[i*2  ] = 0;
+ +                    nbat->nbfp_comb[i*2+1] = 0;
+ +                }
+ +            }
+ +            break;
+ +        case ljcrNONE:
+ +            /* nbfp_s4 stores two parameters using a stride of 4,
+ +             * because this would suit x86 SIMD single-precision
+ +             * quad-load intrinsics. There's a slight inefficiency in
+ +             * allocating and initializing nbfp_s4 when it might not
+ +             * be used, but introducing the conditional code is not
+ +             * really worth it.
+ +             * Entries 2 and 3 of every group of 4 are padding set to zero. */
+ +            nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
+ +            for (i = 0; i < nt; i++)
+ +            {
+ +                for (j = 0; j < nt; j++)
+ +                {
+ +                    nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
+ +                    nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
+ +                    nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
+ +                    nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
+ +                }
+ +            }
+ +            break;
+ +        default:
+ +            gmx_incons("Unknown combination rule");
+ +            break;
+ +    }
+ +}
+ +
+ +/* Initializes an nbnxn_atomdata_t data structure.
+ + *
+ + * fp:             if non-NULL, receives notes on the LJ combination rule
+ + *                 in use and on energy group support
+ + * nbat:           the structure to initialize
+ + * nb_kernel_type: decides between the simple and the non-simple pairlist
+ + *                 setup and selects the coordinate and force formats
+ + * ntype, nbfp:    the number of atom types and the ntype x ntype matrix
+ + *                 of (c6, c12) pairs; one extra type with all-zero
+ + *                 parameters is appended in nbat->nbfp
+ + * n_energygroups: the requested number of energy groups; this is reset
+ + *                 to 1 when the pairlist is not simple and more than 64
+ + *                 groups is a fatal error
+ + * nout:           the number of output buffers to set up in nbat->out
+ + * alloc, free:    allocation callbacks; NULL selects nbnxn_alloc_aligned
+ + *                 and nbnxn_free_aligned respectively
+ + */
+ +void nbnxn_atomdata_init(FILE *fp,
+ +                         nbnxn_atomdata_t *nbat,
+ +                         int nb_kernel_type,
+ +                         int ntype, const real *nbfp,
+ +                         int n_energygroups,
+ +                         int nout,
+ +                         nbnxn_alloc_t *alloc,
+ +                         nbnxn_free_t  *free)
+ +{
+ +    int      i, j;
+ +    real     c6, c12, tol;
+ +    char    *ptr;
+ +    gmx_bool simple, bCombGeom, bCombLB;
+ +
+ +    if (alloc == NULL)
+ +    {
+ +        nbat->alloc = nbnxn_alloc_aligned;
+ +    }
+ +    else
+ +    {
+ +        nbat->alloc = alloc;
+ +    }
+ +    if (free == NULL)
+ +    {
+ +        nbat->free = nbnxn_free_aligned;
+ +    }
+ +    else
+ +    {
+ +        nbat->free = free;
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
+ +    }
+ +    nbat->ntype = ntype + 1;
+ +    nbat->alloc((void **)&nbat->nbfp,
+ +                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
+ +    nbat->alloc((void **)&nbat->nbfp_comb, nbat->ntype*2*sizeof(*nbat->nbfp_comb));
+ +
+ +    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
+ +     * force-field floating point parameters.
+ +     * The environment variable GMX_LJCOMB_TOL overrides this value.
+ +     */
+ +    tol = 1e-5;
+ +    ptr = getenv("GMX_LJCOMB_TOL");
+ +    if (ptr != NULL)
+ +    {
+ +        double dbl;
+ +
+ +        sscanf(ptr, "%lf", &dbl); /* NOTE(review): result unchecked; dbl is unset if parsing fails */
+ +        tol = dbl;
+ +    }
+ +    bCombGeom = TRUE;
+ +    bCombLB   = TRUE;
+ +
+ +    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
+ +     * to check for the LB rule.
+ +     * Only the first ntype entries are filled here; an entry is left
+ +     * unset when bCombLB is set to FALSE for that type.
+ +     */
+ +    for (i = 0; i < ntype; i++)
+ +    {
+ +        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
+ +        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
+ +        if (c6 > 0 && c12 > 0)
+ +        {
+ +            nbat->nbfp_comb[i*2  ] = pow(c12/c6, 1.0/6.0);
+ +            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
+ +        }
+ +        else if (c6 == 0 && c12 == 0)
+ +        {
+ +            nbat->nbfp_comb[i*2  ] = 0;
+ +            nbat->nbfp_comb[i*2+1] = 0;
+ +        }
+ +        else
+ +        {
+ +            /* Can not use LB rule with only dispersion or repulsion */
+ +            bCombLB = FALSE;
+ +        }
+ +    }
+ +
+ +    for (i = 0; i < nbat->ntype; i++)
+ +    {
+ +        for (j = 0; j < nbat->ntype; j++)
+ +        {
+ +            if (i < ntype && j < ntype)
+ +            {
+ +                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
+ +                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
+ +                 */
+ +                c6  = nbfp[(i*ntype+j)*2  ];
+ +                c12 = nbfp[(i*ntype+j)*2+1];
+ +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
+ +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;
+ +
+ +                /* Compare 6*C6 and 12*C12 for geometric combination rule */
+ +                bCombGeom = bCombGeom &&
+ +                    gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ], tol) &&
+ +                    gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol);
+ +
+ +                /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
+ +                c6     /= 6.0;
+ +                c12    /= 12.0;
+ +                bCombLB = bCombLB &&
+ +                    ((c6 == 0 && c12 == 0 &&
+ +                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
+ +                     (c6 > 0 && c12 > 0 &&
+ +                      gmx_within_tol(pow(c12/c6, 1.0/6.0), 0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) &&
+ +                      gmx_within_tol(0.25*c6*c6/c12, sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol)));
+ +            }
+ +            else
+ +            {
+ +                /* Add zero parameters for the additional dummy atom type */
+ +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
+ +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
+ +            }
+ +        }
+ +    }
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Combination rules: geometric %d Lorentz-Berthelot %d\n",
+ +                bCombGeom, bCombLB);
+ +    }
+ +
+ +    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
+ +
+ +    if (simple)
+ +    {
+ +        /* We prefer the geometric combination rule,
+ +         * as that gives a slightly faster kernel than the LB rule.
+ +         */
+ +        if (bCombGeom)
+ +        {
+ +            nbat->comb_rule = ljcrGEOM;
+ +        }
+ +        else if (bCombLB)
+ +        {
+ +            nbat->comb_rule = ljcrLB;
+ +        }
+ +        else
+ +        {
+ +            nbat->comb_rule = ljcrNONE;
+ +
+ +            nbat->free(nbat->nbfp_comb); /* NOTE(review): the pointer is not reset after the free */
+ +        }
+ +
+ +        if (fp)
+ +        {
+ +            if (nbat->comb_rule == ljcrNONE)
+ +            {
+ +                fprintf(fp, "Using full Lennard-Jones parameter combination matrix\n\n");
+ +            }
+ +            else
+ +            {
+ +                fprintf(fp, "Using %s Lennard-Jones combination rule\n\n",
+ +                        nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
+ +            }
+ +        }
+ +
+ +        set_combination_rule_data(nbat);
+ +    }
+ +    else
+ +    {
+ +        nbat->comb_rule = ljcrNONE;
+ +
+ +        nbat->free(nbat->nbfp_comb); /* NOTE(review): the pointer is not reset after the free */
+ +    }
+ +
+ +    nbat->natoms  = 0;
+ +    nbat->type    = NULL;
+ +    nbat->lj_comb = NULL;
+ +    if (simple)
+ +    {
+ +        int pack_x;
+ +
+ +        switch (nb_kernel_type)
+ +        {
+ +            case nbnxnk4xN_SIMD_4xN:
+ +            case nbnxnk4xN_SIMD_2xNN:
+ +                pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+ +                             nbnxn_kernel_to_cj_size(nb_kernel_type));
+ +                switch (pack_x)
+ +                {
+ +                    case 4:
+ +                        nbat->XFormat = nbatX4;
+ +                        break;
+ +                    case 8:
+ +                        nbat->XFormat = nbatX8;
+ +                        break;
+ +                    default:
+ +                        gmx_incons("Unsupported packing width");
+ +                }
+ +                break;
+ +            default:
+ +                nbat->XFormat = nbatXYZ;
+ +                break;
+ +        }
+ +
+ +        nbat->FFormat = nbat->XFormat;
+ +    }
+ +    else
+ +    {
+ +        nbat->XFormat = nbatXYZQ;
+ +        nbat->FFormat = nbatXYZ;
+ +    }
+ +    nbat->q        = NULL;
+ +    nbat->nenergrp = n_energygroups;
+ +    if (!simple)
+ +    {
+ +        /* Energy groups not supported yet for super-sub lists */
+ +        if (n_energygroups > 1 && fp != NULL)
+ +        {
+ +            fprintf(fp, "\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
+ +        }
+ +        nbat->nenergrp = 1;
+ +    }
+ +    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
+ +    if (nbat->nenergrp > 64)
+ +    {
+ +        gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
+ +    }
+ +    nbat->neg_2log = 1; /* ends as the smallest exponent >= 1 with nenergrp <= 2^neg_2log */
+ +    while (nbat->nenergrp > (1<<nbat->neg_2log))
+ +    {
+ +        nbat->neg_2log++;
+ +    }
+ +    nbat->energrp = NULL;
+ +    nbat->alloc((void **)&nbat->shift_vec, SHIFTS*sizeof(*nbat->shift_vec));
+ +    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
+ +    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
+ +    nbat->x       = NULL;
+ +
+ +#ifdef GMX_NBNXN_SIMD
+ +    if (simple)
+ +    {
-          * Here we store j - i for generating the mask for the first i,
++        /* Set the diagonal cluster pair interaction mask setup data.
+ +         * In the kernel we check 0 < j - i to generate the masks.
-          * In the kernel we can subtract 1 to generate the subsequent mask.
++         * Here we store j - i for generating the mask for the first i (i=0);
+ +         * we subtract 0.5 to avoid rounding issues.
-         const int simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
-         int       simd_4xn_diag_size, real_excl, simd_excl_size, j, s;
++         * In the kernel we can subtract 1 to generate the mask for the next i.
+ +         */
-         simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
-         snew_aligned(nbat->simd_4xn_diag, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
-         for (j = 0; j < simd_4xn_diag_size; j++)
++        const int simd_width = GMX_SIMD_WIDTH_HERE;
++        int       simd_4xn_diag_ind_size, simd_interaction_size, j;
+ +
-             nbat->simd_4xn_diag[j] = j - 0.5;
++        simd_4xn_diag_ind_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
++        snew_aligned(nbat->simd_4xn_diagonal_j_minus_i,
++                     simd_4xn_diag_ind_size, NBNXN_MEM_ALIGN);
++        for (j = 0; j < simd_4xn_diag_ind_size; j++)
+ +        {
-         snew_aligned(nbat->simd_2xnn_diag, simd_width, NBNXN_MEM_ALIGN);
++            nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
+ +        }
+ +
-             nbat->simd_2xnn_diag[j]              = j - 0.5;
++        snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i,
++                     simd_width, NBNXN_MEM_ALIGN);
+ +        for (j = 0; j < simd_width/2; j++)
+ +        {
+ +            /* The j-cluster size is half the SIMD width */
-             nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
++            nbat->simd_2xnn_diagonal_j_minus_i[j]              = j - 0.5;
+ +            /* The next half of the SIMD width is for i + 1 */
-         /* We always use 32-bit integer exclusion masks. When we use
-          * double precision, we fit two integers in a double SIMD register.
++            nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
+ +        }
+ +
-         real_excl = sizeof(real)/sizeof(*nbat->simd_excl_mask);
-         /* Set bits for use with both 4xN and 2x(N+N) kernels */
-         simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width*real_excl;
-         snew_aligned(nbat->simd_excl_mask, simd_excl_size*real_excl, NBNXN_MEM_ALIGN);
-         for (j = 0; j < simd_excl_size; j++)
++        /* We use up to 32 bits for exclusion masking.
++         * The same masks are used for the 4xN and 2x(N+N) kernels.
++         * The masks are read either into epi32 SIMD registers or into
++         * real SIMD registers (together with a cast).
++         * In single precision this means the real and epi32 SIMD registers
++         * are of equal size.
++         * In double precision the epi32 registers can be smaller than
++         * the real registers, so depending on the architecture, we might
++         * need to use two, identical, 32-bit masks per real.
+ +         */
-             /* Set the consecutive bits for masking pair exclusions.
-              * For double a single-bit mask would be enough.
-              * But using two bits avoids endianness issues.
-              */
-             for (s = 0; s < real_excl; s++)
-             {
-                 /* Set the consecutive bits for masking pair exclusions */
-                 nbat->simd_excl_mask[j*real_excl + s] = (1U << j);
-             }
++        simd_interaction_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
++        snew_aligned(nbat->simd_exclusion_filter1, simd_interaction_size,   NBNXN_MEM_ALIGN);
++        snew_aligned(nbat->simd_exclusion_filter2, simd_interaction_size*2, NBNXN_MEM_ALIGN);
++        
++        for (j = 0; j < simd_interaction_size; j++)
+ +        {
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
- 
++            /* Set the consecutive bits for filters pair exclusions masks */
++            nbat->simd_exclusion_filter1[j]       = (1U << j);
++            nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
++            nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
+ +        }
+ +    }
+ +#endif
+ +
+ +    /* Initialize the output data structures.
+ +     * Each output gets nenergrp energy groups and a stride of 2^neg_2log.
+ +     */
+ +    nbat->nout    = nout;
+ +    snew(nbat->out, nbat->nout);
+ +    nbat->nalloc  = 0;
+ +    for (i = 0; i < nbat->nout; i++)
+ +    {
+ +        nbnxn_atomdata_output_init(&nbat->out[i],
+ +                                   nb_kernel_type,
+ +                                   nbat->nenergrp, 1<<nbat->neg_2log,
+ +                                   nbat->alloc);
+ +    }
+ +    nbat->buffer_flags.flag        = NULL;
+ +    nbat->buffer_flags.flag_nalloc = 0;
+ +}
+ +
+ +static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
+ +                                       const int *type, int na,
+ +                                       real *ljparam_at)
+ +{
+ +    int is, k, i; /* is: first atom of the current pack, k: offset in the pack, i = is + k */
+ +
+ +    /* The LJ params follow the combination rule:
+ +     * copy the params for the type array to the atom array.
+ +     * Per pack of PACK_X4 atoms the output holds PACK_X4 values of the
+ +     * first parameter followed by PACK_X4 values of the second parameter.
+ +     * The inner loop always handles a complete pack, so type[] and
+ +     * ljparam_at[] are accessed up to the next multiple of PACK_X4 when
+ +     * na is not a multiple of PACK_X4.
+ +     */
+ +    for (is = 0; is < na; is += PACK_X4)
+ +    {
+ +        for (k = 0; k < PACK_X4; k++)
+ +        {
+ +            i = is + k;
+ +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
+ +            ljparam_at[is*2+PACK_X4+k] = ljparam_type[type[i]*2+1];
+ +        }
+ +    }
+ +}
+ +
+ +static void copy_lj_to_nbat_lj_comb_x8(const real *ljparam_type,
+ +                                       const int *type, int na,
+ +                                       real *ljparam_at)
+ +{
+ +    int is, k, i; /* is: first atom of the current pack, k: offset in the pack, i = is + k */
+ +
+ +    /* The LJ params follow the combination rule:
+ +     * copy the params for the type array to the atom array.
+ +     * Per pack of PACK_X8 atoms the output holds PACK_X8 values of the
+ +     * first parameter followed by PACK_X8 values of the second parameter.
+ +     * The inner loop always handles a complete pack, so type[] and
+ +     * ljparam_at[] are accessed up to the next multiple of PACK_X8 when
+ +     * na is not a multiple of PACK_X8.
+ +     */
+ +    for (is = 0; is < na; is += PACK_X8)
+ +    {
+ +        for (k = 0; k < PACK_X8; k++)
+ +        {
+ +            i = is + k;
+ +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
+ +            ljparam_at[is*2+PACK_X8+k] = ljparam_type[type[i]*2+1];
+ +        }
+ +    }
+ +}
+ +
+ +/* Sets the atom type and LJ data in nbnxn_atomdata_t.
+ + * For every column of the first ngrid grids of nbs, the types of the
+ + * atoms in the column are copied to nbat->type and the remaining slots
+ + * of the column get the type index nbat->ntype-1.
+ + * When a combination rule is in use and the coordinate format is nbatX4
+ + * or nbatX8, the per-atom parameters in nbat->lj_comb are set as well;
+ + * for other formats nbat->lj_comb is not touched.
+ + */
+ +static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
+ +                                         int                  ngrid,
+ +                                         const nbnxn_search_t nbs,
+ +                                         const int           *type)
+ +{
+ +    int                 g, i, ncz, ash;
+ +    const nbnxn_grid_t *grid;
+ +
+ +    for (g = 0; g < ngrid; g++)
+ +    {
+ +        grid = &nbs->grid[g];
+ +
+ +        /* Loop over all columns and copy and fill */
+ +        for (i = 0; i < grid->ncx*grid->ncy; i++)
+ +        {
+ +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];           /* number of cells in column i */
+ +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;    /* atom slot offset of column i */
+ +
+ +            copy_int_to_nbat_int(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
+ +                                 type, nbat->ntype-1, nbat->type+ash);
+ +
+ +            if (nbat->comb_rule != ljcrNONE)
+ +            {
+ +                if (nbat->XFormat == nbatX4)
+ +                {
+ +                    copy_lj_to_nbat_lj_comb_x4(nbat->nbfp_comb,
+ +                                               nbat->type+ash, ncz*grid->na_sc,
+ +                                               nbat->lj_comb+ash*2);
+ +                }
+ +                else if (nbat->XFormat == nbatX8)
+ +                {
+ +                    copy_lj_to_nbat_lj_comb_x8(nbat->nbfp_comb,
+ +                                               nbat->type+ash, ncz*grid->na_sc,
+ +                                               nbat->lj_comb+ash*2);
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Sets the charges in nbnxn_atomdata_t *nbat.
+ + * For every column of the first ngrid grids of nbs, the charges of the
+ + * atoms in the column are copied and the remaining slots of the column
+ + * get charge zero.
+ + * With the nbatXYZQ coordinate format the charge is stored in nbat->x,
+ + * in the entry following z of each atom; otherwise it goes to nbat->q.
+ + */
+ +static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
+ +                                       int                  ngrid,
+ +                                       const nbnxn_search_t nbs,
+ +                                       const real          *charge)
+ +{
+ +    int                 g, cxy, ncz, ash, na, na_round, i, j; /* NOTE(review): ncz and j are never used below */
+ +    real               *q;
+ +    const nbnxn_grid_t *grid;
+ +
+ +    for (g = 0; g < ngrid; g++)
+ +    {
+ +        grid = &nbs->grid[g];
+ +
+ +        /* Loop over all columns and copy and fill */
+ +        for (cxy = 0; cxy < grid->ncx*grid->ncy; cxy++)
+ +        {
+ +            ash      = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+ +            na       = grid->cxy_na[cxy];
+ +            na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
+ +
+ +            if (nbat->XFormat == nbatXYZQ)
+ +            {
+ +                q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1; /* entry following z of the first atom of the column */
+ +                for (i = 0; i < na; i++)
+ +                {
+ +                    *q = charge[nbs->a[ash+i]];
+ +                    q += STRIDE_XYZQ;
+ +                }
+ +                /* Complete the partially filled last cell with zeros */
+ +                for (; i < na_round; i++)
+ +                {
+ +                    *q = 0;
+ +                    q += STRIDE_XYZQ;
+ +                }
+ +            }
+ +            else
+ +            {
+ +                q = nbat->q + ash;
+ +                for (i = 0; i < na; i++)
+ +                {
+ +                    *q = charge[nbs->a[ash+i]];
+ +                    q++;
+ +                }
+ +                /* Complete the partially filled last cell with zeros */
+ +                for (; i < na_round; i++)
+ +                {
+ +                    *q = 0;
+ +                    q++;
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Copies the energy group indices to a reordered and packed array.
+ + * For every group of na_c consecutive entries of a, one int is written
+ + * to innb that holds the energy group id of entry sa in the bits
+ + * starting at sa*bit_shift. Entries of a that are negative contribute
+ + * nothing. Groups from na up to na_round get the value 0.
+ + */
+ +static void copy_egp_to_nbat_egps(const int *a, int na, int na_round,
+ +                                  int na_c, int bit_shift,
+ +                                  const int *in, int *innb)
+ +{
+ +    int i, j, sa, at;
+ +    int comb;
+ +
+ +    j = 0;
+ +    for (i = 0; i < na; i += na_c)
+ +    {
+ +        /* Store na_c energy group numbers into one int.
+ +         * All na_c entries a[i] .. a[i+na_c-1] are read, also when
+ +         * na is not a multiple of na_c.
+ +         */
+ +        comb = 0;
+ +        for (sa = 0; sa < na_c; sa++)
+ +        {
+ +            at = a[i+sa];
+ +            if (at >= 0)
+ +            {
+ +                comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
+ +            }
+ +        }
+ +        innb[j++] = comb;
+ +    }
+ +    /* Complete the partially filled last cell with zeros */
+ +    for (; i < na_round; i += na_c)
+ +    {
+ +        innb[j++] = 0;
+ +    }
+ +}
+ +
+ +/* Set the energy group indices for atoms in nbnxn_atomdata_t.
+ + * For every column of the first ngrid grids of nbs, the packed energy
+ + * group indices are written to nbat->energrp using
+ + * copy_egp_to_nbat_egps() with nbat->na_c atoms per packed int and
+ + * nbat->neg_2log bits per atom.
+ + */
+ +static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
+ +                                            int                  ngrid,
+ +                                            const nbnxn_search_t nbs,
+ +                                            const int           *atinfo)
+ +{
+ +    int                 g, i, ncz, ash;
+ +    const nbnxn_grid_t *grid;
+ +
+ +    for (g = 0; g < ngrid; g++)
+ +    {
+ +        grid = &nbs->grid[g];
+ +
+ +        /* Loop over all columns and copy and fill */
+ +        for (i = 0; i < grid->ncx*grid->ncy; i++)
+ +        {
+ +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
+ +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
+ +
+ +            /* The output offset is the atom offset shifted down by
+ +             * grid->na_c_2log (presumably log2 of the cluster size - confirm).
+ +             */
+ +            copy_egp_to_nbat_egps(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
+ +                                  nbat->na_c, nbat->neg_2log,
+ +                                  atinfo, nbat->energrp+(ash>>grid->na_c_2log));
+ +        }
+ +    }
+ +}
+ +
+ +/* Sets all required atom parameter data in nbnxn_atomdata_t.
+ + * With locality eatLocal only the first grid of nbs is processed,
+ + * with any other locality all nbs->ngrid grids are processed.
+ + * Atom types are taken from mdatoms->typeA and charges from
+ + * mdatoms->chargeA. The energy group indices are only set from atinfo
+ + * when nbat->nenergrp > 1.
+ + */
+ +void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
+ +                        int                  locality,
+ +                        const nbnxn_search_t nbs,
+ +                        const t_mdatoms     *mdatoms,
+ +                        const int           *atinfo)
+ +{
+ +    int ngrid;
+ +
+ +    if (locality == eatLocal)
+ +    {
+ +        ngrid = 1;
+ +    }
+ +    else
+ +    {
+ +        ngrid = nbs->ngrid;
+ +    }
+ +
+ +    nbnxn_atomdata_set_atomtypes(nbat, ngrid, nbs, mdatoms->typeA);
+ +
+ +    nbnxn_atomdata_set_charges(nbat, ngrid, nbs, mdatoms->chargeA);
+ +
+ +    if (nbat->nenergrp > 1)
+ +    {
+ +        nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo);
+ +    }
+ +}
+ +
+ +/* Copies the shift vector array to nbnxn_atomdata_t.
+ + * All SHIFTS vectors of shift_vec are copied to nbat->shift_vec and
+ + * bDynamicBox is stored in nbat->bDynamicBox.
+ + */
+ +void nbnxn_atomdata_copy_shiftvec(gmx_bool          bDynamicBox,
+ +                                  rvec             *shift_vec,
+ +                                  nbnxn_atomdata_t *nbat)
+ +{
+ +    int i;
+ +
+ +    nbat->bDynamicBox = bDynamicBox;
+ +    for (i = 0; i < SHIFTS; i++)
+ +    {
+ +        copy_rvec(shift_vec[i], nbat->shift_vec[i]);
+ +    }
+ +}
+ +
+ +/* Copies (and reorders) the coordinates to nbnxn_atomdata_t.
+ + * The locality selects the grids of nbs to process: eatAll uses all
+ + * grids, eatLocal only grid 0 and eatNonlocal grids 1 up to ngrid-1.
+ + * Any other locality leaves the grid range empty, so nothing is copied.
+ + * With FillLocal set, nbat->natoms_local is updated and the columns of
+ + * grid 0 are completed with filler coordinates.
+ + * The columns of each grid are divided into contiguous ranges over the
+ + * OpenMP threads, one range per thread.
+ + */
+ +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
+ +                                     int                  locality,
+ +                                     gmx_bool             FillLocal,
+ +                                     rvec                *x,
+ +                                     nbnxn_atomdata_t    *nbat)
+ +{
+ +    int g0 = 0, g1 = 0; /* grids g0 <= g < g1 are processed */
+ +    int nth, th;
+ +
+ +    switch (locality)
+ +    {
+ +        case eatAll:
+ +            g0 = 0;
+ +            g1 = nbs->ngrid;
+ +            break;
+ +        case eatLocal:
+ +            g0 = 0;
+ +            g1 = 1;
+ +            break;
+ +        case eatNonlocal:
+ +            g0 = 1;
+ +            g1 = nbs->ngrid;
+ +            break;
+ +    }
+ +
+ +    if (FillLocal)
+ +    {
+ +        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
+ +    }
+ +
+ +    nth = gmx_omp_nthreads_get(emntPairsearch);
+ +
+ +#pragma omp parallel for num_threads(nth) schedule(static)
+ +    for (th = 0; th < nth; th++)
+ +    {
+ +        int g;
+ +
+ +        for (g = g0; g < g1; g++)
+ +        {
+ +            const nbnxn_grid_t *grid;
+ +            int                 cxy0, cxy1, cxy;
+ +
+ +            grid = &nbs->grid[g];
+ +
+ +            cxy0 = (grid->ncx*grid->ncy* th   +nth-1)/nth; /* first column of thread th (rounded up) */
+ +            cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth; /* one past the last column of thread th */
+ +
+ +            for (cxy = cxy0; cxy < cxy1; cxy++)
+ +            {
+ +                int na, ash, na_fill;
+ +
+ +                na  = grid->cxy_na[cxy];
+ +                ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+ +
+ +                if (g == 0 && FillLocal)
+ +                {
+ +                    na_fill =
+ +                        (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
+ +                }
+ +                else
+ +                {
+ +                    /* We fill only the real particle locations.
+ +                     * We assume the filling entries at the end have been
+ +                     * properly set before during ns.
+ +                     */
+ +                    na_fill = na;
+ +                }
+ +                /* The cell indices for the filler positions are passed as 0 */
+ +                copy_rvec_to_nbat_real(nbs->a+ash, na, na_fill, x,
+ +                                       nbat->XFormat, nbat->x, ash,
+ +                                       0, 0, 0);
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +static void
+ +nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
+ +                           int i0, int i1)
+ +{   /* Sets dest[i] to zero for i0 <= i < i1 */
+ +    int i;
+ +
+ +    for (i = i0; i < i1; i++)
+ +    {
+ +        dest[i] = 0;
+ +    }
+ +}
+ +
+ +static void
+ +nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
+ +                            gmx_bool bDestSet,
+ +                            real ** gmx_restrict src,
+ +                            int nsrc,
+ +                            int i0, int i1)
+ +{   /* Sums the nsrc source buffers src[0..nsrc-1] into dest for the
+ +     * index range i0 <= i < i1. With bDestSet the sum is added to the
+ +     * current content of dest, otherwise dest is overwritten.
+ +     */
+ +    int i, s;
+ +
+ +    if (bDestSet)
+ +    {
+ +        /* The destination buffer contains data, add to it */
+ +        for (i = i0; i < i1; i++)
+ +        {
+ +            for (s = 0; s < nsrc; s++)
+ +            {
+ +                dest[i] += src[s][i];
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* The destination buffer is uninitialized, set it first.
+ +         * src[0] is read unconditionally here, so this branch
+ +         * requires nsrc >= 1.
+ +         */
+ +        for (i = i0; i < i1; i++)
+ +        {
+ +            dest[i] = src[0][i];
+ +            for (s = 1; s < nsrc; s++)
+ +            {
+ +                dest[i] += src[s][i];
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* SIMD variant of nbnxn_atomdata_reduce_reals: adds the nsrc buffers in src
+ + * to dest over the index range i0 to i1, GMX_SIMD_WIDTH_HERE reals at a time.
+ + * With bDestSet the sources are added to the existing contents of dest,
+ + * otherwise dest is overwritten with the sum and src[0] is read
+ + * unconditionally.
+ + * The loops step by the full SIMD width and there is no scalar remainder loop.
+ + * NOTE(review): this presumably requires i0 and i1-i0 to be multiples of
+ + * GMX_SIMD_WIDTH_HERE and dest/src[] to be suitably aligned for
+ + * gmx_load_pr/gmx_store_pr - confirm against the callers.
+ + * Without GMX_NBNXN_SIMD the function body is empty.
+ + */
+ +static void
+ +nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
+ +                                 gmx_bool bDestSet,
+ +                                 real ** gmx_restrict src,
+ +                                 int nsrc,
+ +                                 int i0, int i1)
+ +{
+ +#ifdef GMX_NBNXN_SIMD
+ +/* The SIMD width here is actually independent of that in the kernels,
+ + * but we use the same width for simplicity (usually optimal anyhow).
+ + */
+ +    int       i, s;
+ +    gmx_mm_pr dest_SSE, src_SSE;
+ +
+ +    if (bDestSet)
+ +    {
+ +        /* The destination holds data: load it and add all sources to it */
+ +        for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
+ +        {
+ +            dest_SSE = gmx_load_pr(dest+i);
+ +            for (s = 0; s < nsrc; s++)
+ +            {
+ +                src_SSE  = gmx_load_pr(src[s]+i);
+ +                dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
+ +            }
+ +            gmx_store_pr(dest+i, dest_SSE);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* The destination is not set: start the sum from source buffer 0 */
+ +        for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
+ +        {
+ +            dest_SSE = gmx_load_pr(src[0]+i);
+ +            for (s = 1; s < nsrc; s++)
+ +            {
+ +                src_SSE  = gmx_load_pr(src[s]+i);
+ +                dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
+ +            }
+ +            gmx_store_pr(dest+i, dest_SSE);
+ +        }
+ +    }
+ +#endif
+ +}
+ +
+ +/* Add part of the force array(s) from nbnxn_atomdata_t to f.
+ + *
+ + * For the atoms a0 up to, but not including, a1 the forces stored in the
+ + * first nfa output buffers, out[0] to out[nfa-1], are added to f[a].
+ + * The position of atom a in the nbat force buffers is looked up through
+ + * nbs->cell[a] and converted to a buffer index according to nbat->FFormat.
+ + * gmx_incons is called for an unsupported format.
+ + */
+ +static void
+ +nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
+ +                                    const nbnxn_atomdata_t *nbat,
+ +                                    nbnxn_atomdata_output_t *out,
+ +                                    int nfa,
+ +                                    int a0, int a1,
+ +                                    rvec *f)
+ +{
+ +    int         a, i, fa;
+ +    const int  *cell;
+ +    const real *fnb;
+ +
+ +    cell = nbs->cell;
+ +
+ +    /* Loop over the atoms and accumulate; the indexing depends on FFormat */
+ +    switch (nbat->FFormat)
+ +    {
+ +        case nbatXYZ:
+ +        case nbatXYZQ:
+ +            /* x, y and z of one atom are adjacent, atoms are fstride apart */
+ +            if (nfa == 1)
+ +            {
+ +                /* Single buffer: avoid the loop over the output buffers */
+ +                fnb = out[0].f;
+ +
+ +                for (a = a0; a < a1; a++)
+ +                {
+ +                    i = cell[a]*nbat->fstride;
+ +
+ +                    f[a][XX] += fnb[i];
+ +                    f[a][YY] += fnb[i+1];
+ +                    f[a][ZZ] += fnb[i+2];
+ +                }
+ +            }
+ +            else
+ +            {
+ +                for (a = a0; a < a1; a++)
+ +                {
+ +                    i = cell[a]*nbat->fstride;
+ +
+ +                    for (fa = 0; fa < nfa; fa++)
+ +                    {
+ +                        f[a][XX] += out[fa].f[i];
+ +                        f[a][YY] += out[fa].f[i+1];
+ +                        f[a][ZZ] += out[fa].f[i+2];
+ +                    }
+ +                }
+ +            }
+ +            break;
+ +        case nbatX4:
+ +            /* Packed layout: the components of one atom are PACK_X4 apart */
+ +            if (nfa == 1)
+ +            {
+ +                fnb = out[0].f;
+ +
+ +                for (a = a0; a < a1; a++)
+ +                {
+ +                    i = X4_IND_A(cell[a]);
+ +
+ +                    f[a][XX] += fnb[i+XX*PACK_X4];
+ +                    f[a][YY] += fnb[i+YY*PACK_X4];
+ +                    f[a][ZZ] += fnb[i+ZZ*PACK_X4];
+ +                }
+ +            }
+ +            else
+ +            {
+ +                for (a = a0; a < a1; a++)
+ +                {
+ +                    i = X4_IND_A(cell[a]);
+ +
+ +                    for (fa = 0; fa < nfa; fa++)
+ +                    {
+ +                        f[a][XX] += out[fa].f[i+XX*PACK_X4];
+ +                        f[a][YY] += out[fa].f[i+YY*PACK_X4];
+ +                        f[a][ZZ] += out[fa].f[i+ZZ*PACK_X4];
+ +                    }
+ +                }
+ +            }
+ +            break;
+ +        case nbatX8:
+ +            /* Packed layout: the components of one atom are PACK_X8 apart */
+ +            if (nfa == 1)
+ +            {
+ +                fnb = out[0].f;
+ +
+ +                for (a = a0; a < a1; a++)
+ +                {
+ +                    i = X8_IND_A(cell[a]);
+ +
+ +                    f[a][XX] += fnb[i+XX*PACK_X8];
+ +                    f[a][YY] += fnb[i+YY*PACK_X8];
+ +                    f[a][ZZ] += fnb[i+ZZ*PACK_X8];
+ +                }
+ +            }
+ +            else
+ +            {
+ +                for (a = a0; a < a1; a++)
+ +                {
+ +                    i = X8_IND_A(cell[a]);
+ +
+ +                    for (fa = 0; fa < nfa; fa++)
+ +                    {
+ +                        f[a][XX] += out[fa].f[i+XX*PACK_X8];
+ +                        f[a][YY] += out[fa].f[i+YY*PACK_X8];
+ +                        f[a][ZZ] += out[fa].f[i+ZZ*PACK_X8];
+ +                    }
+ +                }
+ +            }
+ +            break;
+ +        default:
+ +            gmx_incons("Unsupported nbnxn_atomdata_t format");
+ +    }
+ +}
+ +
+ +/* Add the force array(s) from nbnxn_atomdata_t to f.
+ + *
+ + * locality selects the atom range: eatAll covers 0 to natoms_nonlocal,
+ + * eatLocal covers 0 to natoms_local and eatNonlocal covers natoms_local to
+ + * natoms_nonlocal. For any other value a0 and na stay 0 and no forces
+ + * are added.
+ + * With more than one output buffer (nbat->nout > 1) the buffers are first
+ + * reduced into buffer 0; this is only supported for eatAll and
+ + * gmx_incons is called otherwise.
+ + * The time spent here is accumulated in the enbsCCreducef cycle counter.
+ + */
+ +void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t    nbs,
+ +                                    int                     locality,
+ +                                    const nbnxn_atomdata_t *nbat,
+ +                                    rvec                   *f)
+ +{
+ +    /* Initialized to 0 so an unhandled locality results in an empty range */
+ +    int a0 = 0, na = 0;
+ +    int nth, th;
+ +
+ +    nbs_cycle_start(&nbs->cc[enbsCCreducef]);
+ +
+ +    switch (locality)
+ +    {
+ +        case eatAll:
+ +            a0 = 0;
+ +            na = nbs->natoms_nonlocal;
+ +            break;
+ +        case eatLocal:
+ +            a0 = 0;
+ +            na = nbs->natoms_local;
+ +            break;
+ +        case eatNonlocal:
+ +            a0 = nbs->natoms_local;
+ +            na = nbs->natoms_nonlocal - nbs->natoms_local;
+ +            break;
+ +    }
+ +
+ +    nth = gmx_omp_nthreads_get(emntNonbonded);
+ +
+ +    if (nbat->nout > 1)
+ +    {
+ +        if (locality != eatAll)
+ +        {
+ +            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+ +        }
+ +
+ +        /* Reduce the force thread output buffers into buffer 0, before adding
+ +         * them to the, differently ordered, "real" force buffer.
+ +         */
+ +#pragma omp parallel for num_threads(nth) schedule(static)
+ +        for (th = 0; th < nth; th++)
+ +        {
+ +            const nbnxn_buffer_flags_t *flags;
+ +            int   b0, b1, b;
+ +            int   i0, i1;
+ +            int   nfptr;
+ +            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
+ +            int   out;
+ +
+ +            flags = &nbat->buffer_flags;
+ +
+ +            /* Calculate the cell-block range for our thread */
+ +            b0 = (flags->nflag* th   )/nth;
+ +            b1 = (flags->nflag*(th+1))/nth;
+ +
+ +            for (b = b0; b < b1; b++)
+ +            {
+ +                /* The range of reals in the force buffer for flag block b */
+ +                i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+ +                i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+ +
+ +                /* Collect the buffers 1 to nout-1 that have their bit set
+ +                 * in the flag for this block; only those are reduced.
+ +                 */
+ +                nfptr = 0;
+ +                for (out = 1; out < nbat->nout; out++)
+ +                {
+ +                    if (flags->flag[b] & (1U<<out))
+ +                    {
+ +                        fptr[nfptr++] = nbat->out[out].f;
+ +                    }
+ +                }
+ +                if (nfptr > 0)
+ +                {
+ +                    /* Bit 0 of the flag is passed as bDestSet: it tells
+ +                     * whether buffer 0 is added to or overwritten.
+ +                     */
+ +#ifdef GMX_NBNXN_SIMD
+ +                    nbnxn_atomdata_reduce_reals_simd
+ +#else
+ +                    nbnxn_atomdata_reduce_reals
+ +#endif
+ +                        (nbat->out[0].f,
+ +                        flags->flag[b] & (1U<<0),
+ +                        fptr, nfptr,
+ +                        i0, i1);
+ +                }
+ +                else if (!(flags->flag[b] & (1U<<0)))
+ +                {
+ +                    /* No buffer has its bit set for this block: zero the
+ +                     * block in buffer 0, as it is read unconditionally below.
+ +                     */
+ +                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
+ +                                               i0, i1);
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Only buffer 0 is passed on (nfa = 1); each thread adds the forces
+ +     * for its own contiguous share of the na atoms starting at a0.
+ +     */
+ +#pragma omp parallel for num_threads(nth) schedule(static)
+ +    for (th = 0; th < nth; th++)
+ +    {
+ +        nbnxn_atomdata_add_nbat_f_to_f_part(nbs, nbat,
+ +                                            nbat->out,
+ +                                            1,
+ +                                            a0+((th+0)*na)/nth,
+ +                                            a0+((th+1)*na)/nth,
+ +                                            f);
+ +    }
+ +
+ +    nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
+ +}
+ +
+ +/* Adds the shift forces from nbnxn_atomdata_t to fshift.
+ + * For each of the SHIFTS shift vectors the contributions of all nbat->nout
+ + * output buffers are summed first and the total is then added to fshift.
+ + */
+ +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
+ +                                              rvec                   *fshift)
+ +{
+ +    int shift, buf;
+ +
+ +    for (shift = 0; shift < SHIFTS; shift++)
+ +    {
+ +        /* Offset of this shift vector in the flat per-buffer fshift array */
+ +        int  base = shift*DIM;
+ +        rvec total;
+ +
+ +        clear_rvec(total);
+ +        for (buf = 0; buf < nbat->nout; buf++)
+ +        {
+ +            total[XX] += nbat->out[buf].fshift[base+XX];
+ +            total[YY] += nbat->out[buf].fshift[base+YY];
+ +            total[ZZ] += nbat->out[buf].fshift[base+ZZ];
+ +        }
+ +        rvec_inc(fshift[shift], total);
+ +    }
+ +}
diff --cc src/gromacs/mdlib/nbnxn_consts.h

index bf9e92b7758722e545df17c7441bf0615d7118c6,0000000000000000000000000000000000000000..dc8e9a2c98813bd33eb785c3198b2bcc75d8f3ab

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_consts.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@@ -1,105 -1,0 +1,120 @@@
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#ifndef _nbnxn_consts_h
+ +#define _nbnxn_consts_h
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
+ +
+ +/* The number of pair-search sub-cells per super-cell, used for GPU */
+ +#define GPU_NSUBCELL_Z 2
+ +#define GPU_NSUBCELL_Y 2
+ +#define GPU_NSUBCELL_X 2
+ +#define GPU_NSUBCELL   (GPU_NSUBCELL_Z*GPU_NSUBCELL_Y*GPU_NSUBCELL_X)
+ +/* In the non-bonded GPU kernel we operate on cluster-pairs, not cells.
+ + * The number of cluster in a super-cluster matches the number of sub-cells
+ + * in a pair-search cell, so we introduce a new name for the same value.
+ + */
+ +#define NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER  GPU_NSUBCELL
+ +
+ +/* With CPU kernels the i-cluster size is always 4 atoms.
+ + * With x86 SIMD the j-cluster size can be 2, 4 or 8, otherwise 4.
+ + */
+ +#define NBNXN_CPU_CLUSTER_I_SIZE       4
+ +
+ +#define NBNXN_CPU_CLUSTER_I_SIZE_2LOG  2
+ +
+ +/* With GPU kernels the cluster size is 8 atoms */
+ +#define NBNXN_GPU_CLUSTER_SIZE         8
+ +
+ +/* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
+ + * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ + */
+ +#define NBNXN_GPU_JGROUP_SIZE       4
+ +#define NBNXN_GPU_JGROUP_SIZE_2LOG  2
+ +
+ +/* To avoid NaN when excluded atoms are at zero distance, we add a small
+ + * number to r^2. NBNXN_AVOID_SING_R2_INC^-3 should fit in real.
+ + */
+ +#ifndef GMX_DOUBLE
+ +#define NBNXN_AVOID_SING_R2_INC  1.0e-12f
+ +#else
+ +/* The double prec. x86 SIMD kernels use a single prec. invsqrt, so > 1e-38 */
+ +#define NBNXN_AVOID_SING_R2_INC  1.0e-36
+ +#endif
+ +
+ +/* Coulomb force table size chosen such that it fits alongside the non-bonded
+ +   parameters in the texture cache. */
+ +#define GPU_EWALD_COULOMB_FORCE_TABLE_SIZE 1536
+ +
+ +
+ +/* Strides for x/f with xyz and xyzq coordinate (and charge) storage */
+ +#define STRIDE_XYZ   3
+ +#define STRIDE_XYZQ  4
+ +/* Size of packs of x, y or z with SSE/AVX packed coords/forces */
+ +#define PACK_X4      4
+ +#define PACK_X8      8
+ +/* Strides for a pack of 4 and 8 coordinates/forces */
+ +#define STRIDE_P4    (DIM*PACK_X4)
+ +#define STRIDE_P8    (DIM*PACK_X8)
+ +
+ +/* Index of atom a into the SSE/AVX coordinate/force array */
+ +#define X4_IND_A(a)  (STRIDE_P4*((a) >> 2) + ((a) & (PACK_X4 - 1)))
+ +#define X8_IND_A(a)  (STRIDE_P8*((a) >> 3) + ((a) & (PACK_X8 - 1)))
+ +
+ +
++/* Cluster-pair interaction masks for 4xN and 2xNN kernels.
++ * Bit i*CJ_SIZE + j tells if atom i and j interact.
++ * In the diagonal masks below, reading the bits with this formula, only
++ * the pairs where the j-atom index within the cluster exceeds the i-atom
++ * index are set, so each pair on the diagonal is counted once and
++ * self-pairs are left out.
++ */
++/* The mask with all interactions enabled is the same for all kernels */
++#define NBNXN_INTERACTION_MASK_ALL        0xffffffff
++/* 4x4 kernel diagonal mask: i=0 pairs with j=1,2,3; i=1 with j=2,3;
++ * i=2 with j=3; i=3 with none.
++ */
++#define NBNXN_INTERACTION_MASK_DIAG       0x08ce
++/* 4x2 kernel diagonal masks.
++ * J2_0: only i=0 with j=1. J2_1: i=0,1 with j=0,1 and i=2 with j=1.
++ * NOTE(review): presumably _0 and _1 are for the j-clusters holding
++ * atoms 0-1 and 2-3 of the i-cluster, respectively - confirm.
++ */
++#define NBNXN_INTERACTION_MASK_DIAG_J2_0  0x0002
++#define NBNXN_INTERACTION_MASK_DIAG_J2_1  0x002F
++/* 4x8 kernel diagonal masks.
++ * J8_0: i=0 with j=1..7, i=1 with j=2..7, i=2 with j=3..7, i=3 with j=4..7.
++ * J8_1: i=0 with j=5..7, i=1 with j=6,7, i=2 with j=7, i=3 with none.
++ * NOTE(review): presumably _0 and _1 are for the i-clusters holding
++ * atoms 0-3 and 4-7 of the j-cluster, respectively - confirm.
++ */
++#define NBNXN_INTERACTION_MASK_DIAG_J8_0  0xf0f8fcfe
++#define NBNXN_INTERACTION_MASK_DIAG_J8_1  0x0080c0e0
++
++
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif
diff --cc src/gromacs/mdlib/nbnxn_internal.h

index 37c45fb62ecacbed4e521eb62990d6f04abe0e16,0000000000000000000000000000000000000000..836201be04933e8268a9fdc953e7b941dbf8f6a1

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_internal.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_internal.h
@@@ -1,253 -1,0 +1,259 @@@
- #define NBNXN_MEM_ALIGN  (GMX_NBNXN_SIMD_BITWIDTH/8)
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustr
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#ifndef _nbnxn_internal_h
+ +#define _nbnxn_internal_h
+ +
+ +#include "typedefs.h"
+ +#include "domdec.h"
+ +#include "gmx_cyclecounter.h"
+ +
++#ifdef GMX_NBNXN_SIMD
++/* The include below sets the SIMD instruction type (precision+width)
++ * for all nbnxn SIMD search and non-bonded kernel code.
++ */
++#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
++#define GMX_USE_HALF_WIDTH_SIMD_HERE
++#endif
++#include "gmx_simd_macros.h"
++#endif
++
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
+ +
+ +#ifdef GMX_X86_SSE2
+ +/* Use 4-way SIMD for, always, single precision bounding box calculations */
+ +#define NBNXN_SEARCH_BB_SSE
+ +#endif
+ +
+ +
+ +#ifdef GMX_NBNXN_SIMD
+ +/* Memory alignment in bytes as required by SIMD aligned loads/stores */
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
++#define NBNXN_MEM_ALIGN  (GMX_SIMD_WIDTH_HERE*sizeof(real))
+ +#else
+ +/* No alignment required, but set it so we can call the same routines */
+ +#define NBNXN_MEM_ALIGN  32
+ +#endif
+ +
+ +
+ +/* A pair-search grid struct for one domain decomposition zone.
+ + * The grid is made up of ncx*ncy columns in x,y; the per-column arrays
+ + * cxy_na and cxy_ind are indexed by column.
+ + */
+ +typedef struct {
+ +    rvec     c0;               /* The lower corner of the (local) grid        */
+ +    rvec     c1;               /* The upper corner of the (local) grid        */
+ +    real     atom_density;     /* The atom number density for the local grid  */
+ +
+ +    gmx_bool bSimple;          /* Is this grid simple or super/sub            */
+ +    int      na_c;             /* Number of atoms per cluster                 */
+ +    int      na_cj;            /* Number of atoms for list j-clusters         */
+ +    int      na_sc;            /* Number of atoms per super-cluster           */
+ +    int      na_c_2log;        /* 2log of na_c                                */
+ +
+ +    int      ncx;              /* Number of (super-)cells along x             */
+ +    int      ncy;              /* Number of (super-)cells along y             */
+ +    int      nc;               /* Total number of (super-)cells               */
+ +
+ +    real     sx;               /* x-size of a (super-)cell                    */
+ +    real     sy;               /* y-size of a (super-)cell                    */
+ +    real     inv_sx;           /* 1/sx                                        */
+ +    real     inv_sy;           /* 1/sy                                        */
+ +
+ +    int      cell0;            /* Index in nbs->cell corresponding to cell 0  */
+ +
+ +    int     *cxy_na;           /* The number of atoms for each column in x,y  */
+ +    /* NOTE(review): cxy_ind[cxy+1] is read for column cxy when filling the
+ +     * coordinate buffer, so this array presumably holds ncx*ncy+1 entries
+ +     * - confirm against the allocation.
+ +     */
+ +    int     *cxy_ind;          /* Grid (super)cell index, offset from cell0   */
+ +    int      cxy_nalloc;       /* Allocation size for cxy_na and cxy_ind      */
+ +
+ +    int     *nsubc;            /* The number of sub cells for each super cell */
+ +    float   *bbcz;             /* Bounding boxes in z for the super cells     */
+ +    float   *bb;               /* 3D bounding boxes for the sub cells         */
+ +    float   *bbj;              /* 3D j-b.boxes for SSE-double or AVX-single   */
+ +    int     *flags;            /* Flag for the super cells                    */
+ +    int      nc_nalloc;        /* Allocation size for the pointers above      */
+ +
+ +    float   *bbcz_simple;      /* bbcz for simple grid converted from super   */
+ +    float   *bb_simple;        /* bb for simple grid converted from super     */
+ +    int     *flags_simple;     /* flags for simple grid converted from super  */
+ +    int      nc_nalloc_simple; /* Allocation size for the pointers above   */
+ +
+ +    int      nsubc_tot;        /* Total number of subcell, used for printing  */
+ +} nbnxn_grid_t;
+ +
+ +#ifdef GMX_NBNXN_SIMD
+ +
+ +/* SIMD i-cluster coordinate work data for the 4xN setup: four sets
+ + * (suffix 0 to 3) of x, y and z SIMD registers.
+ + * NOTE(review): presumably one set per i-atom of the cluster; the _SSE
+ + * suffix is only a name, the type is the generic gmx_mm_pr - confirm.
+ + */
+ +typedef struct nbnxn_x_ci_simd_4xn {
+ +    /* The i-cluster coordinates for simple search */
+ +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
+ +    gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
+ +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
+ +    gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
+ +} nbnxn_x_ci_simd_4xn_t;
+ +
+ +/* SIMD i-cluster coordinate work data for the 2xNN setup: two sets
+ + * (suffix 0 and 2) of x, y and z SIMD registers.
+ + * NOTE(review): presumably each register holds the coordinates of two
+ + * i-atoms, which would explain the missing suffixes 1 and 3 - confirm.
+ + */
+ +typedef struct nbnxn_x_ci_simd_2xnn {
+ +    /* The i-cluster coordinates for simple search */
+ +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
+ +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
+ +} nbnxn_x_ci_simd_2xnn_t;
+ +
+ +#endif
+ +
+ +/* Working data for the actual i-supercell during pair search.
+ + * The struct starts and ends with a gmx_cache_protect_t member (cp0, cp1)
+ + * to protect the cache between threads.
+ + */
+ +typedef struct nbnxn_list_work {
+ +    gmx_cache_protect_t     cp0;   /* Protect cache between threads               */
+ +
+ +    float                  *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
+ +    real                   *x_ci;  /* The coordinates, pbc shifted, for each atom       */
+ +#ifdef GMX_NBNXN_SIMD
+ +    /* SIMD versions of the i-cluster coordinates, for the 4xN and 2xNN setups */
+ +    nbnxn_x_ci_simd_4xn_t  *x_ci_simd_4xn;
+ +    nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
+ +#endif
+ +    int                     cj_ind;          /* The current cj_ind index for the current list     */
+ +    int                     cj4_init;        /* The first uninitialized cj4 block                 */
+ +
+ +    float                  *d2;              /* Bounding box distance work array                  */
+ +
+ +    nbnxn_cj_t             *cj;              /* The j-cell list                                   */
+ +    int                     cj_nalloc;       /* Allocation size of cj                             */
+ +
+ +    int                     ncj_noq;         /* Nr. of cluster pairs without Coul for flop count  */
+ +    int                     ncj_hlj;         /* Nr. of cluster pairs with 1/2 LJ for flop count   */
+ +
+ +    int                    *sort;            /* Sort index                    */
+ +    int                     sort_nalloc;     /* Allocation size of sort       */
+ +
+ +    nbnxn_sci_t            *sci_sort;        /* Second sci array, for sorting */
+ +    int                     sci_sort_nalloc; /* Allocation size of sci_sort   */
+ +
+ +    gmx_cache_protect_t     cp1;             /* Protect cache between threads               */
+ +} nbnxn_list_work_t;
+ +
+ +/* Function type for setting the i-atom coordinate working data.
+ + * Arguments: the i-cell index ci, a shift (shx, shy, shz), the number of
+ + * atoms per cluster na_c, the stride and base pointer of the coordinate
+ + * array x, and the work struct that receives the result.
+ + * NOTE(review): the shift is presumably the periodic image shift that is
+ + * added to the coordinates - confirm in the implementations.
+ + */
+ +typedef void
+ +    gmx_icell_set_x_t (int ci,
+ +                       real shx, real shy, real shz,
+ +                       int na_c,
+ +                       int stride, const real *x,
+ +                       nbnxn_list_work_t *work);
+ +
+ +/* Forward declarations of the implementations of the type above.
+ + * As these are static, each file including this header has to provide
+ + * its own definition of every function it uses.
+ + */
+ +static gmx_icell_set_x_t icell_set_x_simple;
+ +#ifdef GMX_NBNXN_SIMD
+ +static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
+ +static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
+ +#endif
+ +static gmx_icell_set_x_t icell_set_x_supersub;
+ +/* NOTE(review): this header only defines NBNXN_SEARCH_BB_SSE, not
+ + * NBNXN_SEARCH_SSE; verify that the guard below is not a stale macro name.
+ + */
+ +#ifdef NBNXN_SEARCH_SSE
+ +static gmx_icell_set_x_t icell_set_x_supersub_sse8;
+ +#endif
+ +
+ +/* Local cycle count struct for profiling.
+ + * Filled through nbs_cycle_start and nbs_cycle_stop.
+ + */
+ +typedef struct {
+ +    int          count; /* Number of completed start/stop intervals     */
+ +    gmx_cycles_t c;     /* Sum of the cycles over all intervals         */
+ +    gmx_cycles_t start; /* Cycle counter value at the most recent start */
+ +} nbnxn_cycle_t;
+ +
+ +/* Local cycle count enum for profiling.
+ + * The entries index the cc arrays of cycle counters; enbsCCnr is the
+ + * number of counters and is used as the size of those arrays.
+ + */
+ +enum {
+ +    enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr
+ +};
+ +
+ +/* Thread-local work struct, contains part of nbnxn_grid_t.
+ + * Like nbnxn_list_work_t it starts and ends with a gmx_cache_protect_t
+ + * member.
+ + */
+ +typedef struct {
+ +    gmx_cache_protect_t  cp0;
+ +
+ +    /* NOTE(review): presumably a per-thread counterpart of
+ +     * nbnxn_grid_t.cxy_na with its allocation size - confirm.
+ +     */
+ +    int                 *cxy_na;
+ +    int                  cxy_na_nalloc;
+ +
+ +    /* NOTE(review): presumably scratch space for sorting with its
+ +     * allocation size - confirm.
+ +     */
+ +    int                 *sort_work;
+ +    int                  sort_work_nalloc;
+ +
+ +    nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
+ +
+ +    int                  ndistc;       /* Number of distance checks for flop counting */
+ +
+ +    nbnxn_cycle_t        cc[enbsCCnr]; /* One cycle counter per enbsCC entry */
+ +
+ +    gmx_cache_protect_t  cp1;
+ +} nbnxn_search_work_t;
+ +
+ +/* Main pair-search struct, contains the grid(s), not the pair-list(s).
+ + * The struct tag is nbnxn_search and the typedef name is nbnxn_search_t_t.
+ + * Code in this module receives it as nbnxn_search_t and accesses the
+ + * members with ->.
+ + */
+ +typedef struct nbnxn_search {
+ +    int                 ePBC;            /* PBC type enum                              */
+ +    matrix              box;             /* The periodic unit-cell                     */
+ +
+ +    gmx_bool            DomDec;          /* Are we doing domain decomposition?         */
+ +    ivec                dd_dim;          /* Are we doing DD in x,y,z?                  */
+ +    gmx_domdec_zones_t *zones;           /* The domain decomposition zones        */
+ +
+ +    int                 ngrid;           /* The number of grids, equal to #DD-zones    */
+ +    nbnxn_grid_t       *grid;            /* Array of grids, size ngrid                 */
+ +    int                *cell;            /* Actual allocated cell array for all grids  */
+ +    int                 cell_nalloc;     /* Allocation size of cell                    */
+ +    int                *a;               /* Atom index for grid, the inverse of cell   */
+ +    int                 a_nalloc;        /* Allocation size of a                       */
+ +
+ +    int                 natoms_local;    /* The local atoms run from 0 to natoms_local */
+ +    int                 natoms_nonlocal; /* The non-local atoms run from natoms_local
+ +                                          * to natoms_nonlocal */
+ +
+ +    gmx_bool             print_cycles;   /* NOTE(review): presumably enables printing of cc - confirm */
+ +    int                  search_count;
+ +    nbnxn_cycle_t        cc[enbsCCnr];   /* One cycle counter per enbsCC entry */
+ +
+ +    gmx_icell_set_x_t   *icell_set_x; /* Function for setting i-coords    */
+ +
+ +    int                  nthread_max; /* Maximum number of threads for pair-search  */
+ +    nbnxn_search_work_t *work;        /* Work array, size nthread_max          */
+ +} nbnxn_search_t_t;
+ +
+ +
+ +/* Start a profiling interval: store the current cycle counter value */
+ +static void nbs_cycle_start(nbnxn_cycle_t *cc)
+ +{
+ +    cc->start = gmx_cycles_read();
+ +}
+ +
+ +/* End a profiling interval: add the cycles elapsed since the matching
+ + * nbs_cycle_start to the running total and count one more interval.
+ + */
+ +static void nbs_cycle_stop(nbnxn_cycle_t *cc)
+ +{
+ +    gmx_cycles_t now = gmx_cycles_read();
+ +
+ +    cc->count += 1;
+ +    cc->c     += now - cc->start;
+ +}
+ +
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c

index 35a0d3bcb5b0a04cd99554dd4618d02b9e23e442,0000000000000000000000000000000000000000..7baaf5ac5db204c121e7e53b4379e4360cf000f6

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c
@@@ -1,334 -1,0 +1,333 @@@
- #include "nbnxn_kernel_simd_2xnn.h"
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +
+ +#include "typedefs.h"
+ +#include "vec.h"
+ +#include "smalloc.h"
+ +#include "force.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "../nbnxn_consts.h"
+ +#include "nbnxn_kernel_common.h"
+ +
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +
- /* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
++/* Include the full width SIMD macros */
++#include "gmx_simd_macros.h"
++#include "gmx_simd_vec.h"
+ +
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #define GMX_MM128_HERE
- #else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- #define GMX_MM256_HERE
- #else
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
- #endif
++#include "nbnxn_kernel_simd_2xnn.h"
+ +
-     const int simd_width   = GMX_SIMD_WIDTH_HERE;
-     const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
++#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16)
++#error "unsupported SIMD width"
+ +#endif
+ +
++
++/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
++
+ +/* Analytical reaction-field kernels */
+ +#define CALC_COUL_RF
+ +
+ +#include "nbnxn_kernel_simd_2xnn_includes.h"
+ +
+ +#undef CALC_COUL_RF
+ +
+ +/* Tabulated exclusion interaction electrostatics kernels */
+ +#define CALC_COUL_TAB
+ +
+ +/* Single cut-off: rcoulomb = rvdw */
+ +#include "nbnxn_kernel_simd_2xnn_includes.h"
+ +
+ +/* Twin cut-off: rcoulomb >= rvdw */
+ +#define VDW_CUTOFF_CHECK
+ +#include "nbnxn_kernel_simd_2xnn_includes.h"
+ +#undef VDW_CUTOFF_CHECK
+ +
+ +#undef CALC_COUL_TAB
+ +
+ +/* Analytical Ewald exclusion interaction electrostatics kernels */
+ +#define CALC_COUL_EWALD
+ +
+ +/* Single cut-off: rcoulomb = rvdw */
+ +#include "nbnxn_kernel_simd_2xnn_includes.h"
+ +
+ +/* Twin cut-off: rcoulomb >= rvdw */
+ +#define VDW_CUTOFF_CHECK
+ +#include "nbnxn_kernel_simd_2xnn_includes.h"
+ +#undef VDW_CUTOFF_CHECK
+ +
+ +#undef CALC_COUL_EWALD
+ +
+ +
+ +typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
+ +                                const nbnxn_atomdata_t     *nbat,
+ +                                const interaction_const_t  *ic,
+ +                                rvec                       *shift_vec,
+ +                                real                       *f,
+ +                                real                       *fshift,
+ +                                real                       *Vvdw,
+ +                                real                       *Vc);
+ +
+ +typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
+ +                                  const nbnxn_atomdata_t     *nbat,
+ +                                  const interaction_const_t  *ic,
+ +                                  rvec                       *shift_vec,
+ +                                  real                       *f,
+ +                                  real                       *fshift);
+ +
+ +enum {
+ +    coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR
+ +};
+ +
+ +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _ener
+ +static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
+ +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+ +#undef NBK_FN
+ +
+ +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _energrp
+ +static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
+ +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+ +#undef NBK_FN
+ +
+ +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _noener
+ +static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
+ +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+ +#undef NBK_FN
+ +
+ +
+ +static void reduce_group_energies(int ng, int ng_2log,
+ +                                  const real *VSvdw, const real *VSc,
+ +                                  real *Vvdw, real *Vc)
+ +{
-                 c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width/2;
++    const int unrollj      = GMX_SIMD_WIDTH_HERE/2;
++    const int unrollj_half = unrollj/2;
+ +    int       ng_p2, i, j, j0, j1, c, s;
+ +
+ +    ng_p2 = (1<<ng_2log);
+ +
+ +    /* The size of the x86 SIMD energy group buffer array is:
+ +     * ng*ng*ng_p2*unrollj_half*simd_width
+ +     */
+ +    for (i = 0; i < ng; i++)
+ +    {
+ +        for (j = 0; j < ng; j++)
+ +        {
+ +            Vvdw[i*ng+j] = 0;
+ +            Vc[i*ng+j]   = 0;
+ +        }
+ +
+ +        for (j1 = 0; j1 < ng; j1++)
+ +        {
+ +            for (j0 = 0; j0 < ng; j0++)
+ +            {
-                     c             += simd_width/2 + 2;
++                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
+ +                for (s = 0; s < unrollj_half; s++)
+ +                {
+ +                    Vvdw[i*ng+j0] += VSvdw[c+0];
+ +                    Vvdw[i*ng+j1] += VSvdw[c+1];
+ +                    Vc  [i*ng+j0] += VSc  [c+0];
+ +                    Vc  [i*ng+j1] += VSc  [c+1];
++                    c             += unrollj + 2;
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +#endif /* GMX_NBNXN_SIMD_2XNN */
+ +
+ +void
+ +nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t       gmx_unused *nbl_list,
+ +                       const nbnxn_atomdata_t     gmx_unused *nbat,
+ +                       const interaction_const_t  gmx_unused *ic,
+ +                       int                        gmx_unused ewald_excl,
+ +                       rvec                       gmx_unused *shift_vec,
+ +                       int                        gmx_unused  force_flags,
+ +                       int                        gmx_unused  clearF,
+ +                       real                       gmx_unused *fshift,
+ +                       real                       gmx_unused *Vc,
+ +                       real                       gmx_unused *Vvdw)
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +{
+ +    int                nnbl;
+ +    nbnxn_pairlist_t **nbl;
+ +    int                coult;
+ +    int                nb;
+ +
+ +    nnbl = nbl_list->nnbl;
+ +    nbl  = nbl_list->nbl;
+ +
+ +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ +    {
+ +        coult = coultRF;
+ +    }
+ +    else
+ +    {
+ +        if (ewald_excl == ewaldexclTable)
+ +        {
+ +            if (ic->rcoulomb == ic->rvdw)
+ +            {
+ +                coult = coultTAB;
+ +            }
+ +            else
+ +            {
+ +                coult = coultTAB_TWIN;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            if (ic->rcoulomb == ic->rvdw)
+ +            {
+ +                coult = coultEWALD;
+ +            }
+ +            else
+ +            {
+ +                coult = coultEWALD_TWIN;
+ +            }
+ +        }
+ +    }
+ +
+ +#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ +    for (nb = 0; nb < nnbl; nb++)
+ +    {
+ +        nbnxn_atomdata_output_t *out;
+ +        real                    *fshift_p;
+ +
+ +        out = &nbat->out[nb];
+ +
+ +        if (clearF == enbvClearFYes)
+ +        {
+ +            clear_f(nbat, nb, out->f);
+ +        }
+ +
+ +        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
+ +        {
+ +            fshift_p = fshift;
+ +        }
+ +        else
+ +        {
+ +            fshift_p = out->fshift;
+ +
+ +            if (clearF == enbvClearFYes)
+ +            {
+ +                clear_fshift(fshift_p);
+ +            }
+ +        }
+ +
+ +        /* With Ewald type electrostatics the forces for excluded atom pairs
+ +         * should not contribute to the virial sum. The exclusion forces
+ +         * are not calculated in the energy kernels, but are in _noener.
+ +         */
+ +        if (!((force_flags & GMX_FORCE_ENERGY) ||
+ +              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+ +        {
+ +            /* Don't calculate energies */
+ +            p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
+ +                                                 ic,
+ +                                                 shift_vec,
+ +                                                 out->f,
+ +                                                 fshift_p);
+ +        }
+ +        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+ +        {
+ +            /* No energy groups */
+ +            out->Vvdw[0] = 0;
+ +            out->Vc[0]   = 0;
+ +
+ +            p_nbk_ener[coult][nbat->comb_rule](nbl[nb], nbat,
+ +                                               ic,
+ +                                               shift_vec,
+ +                                               out->f,
+ +                                               fshift_p,
+ +                                               out->Vvdw,
+ +                                               out->Vc);
+ +        }
+ +        else
+ +        {
+ +            /* Calculate energy group contributions */
+ +            int i;
+ +
+ +            for (i = 0; i < out->nVS; i++)
+ +            {
+ +                out->VSvdw[i] = 0;
+ +            }
+ +            for (i = 0; i < out->nVS; i++)
+ +            {
+ +                out->VSc[i] = 0;
+ +            }
+ +
+ +            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb], nbat,
+ +                                                  ic,
+ +                                                  shift_vec,
+ +                                                  out->f,
+ +                                                  fshift_p,
+ +                                                  out->VSvdw,
+ +                                                  out->VSc);
+ +
+ +            reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
+ +                                  out->VSvdw, out->VSc,
+ +                                  out->Vvdw, out->Vc);
+ +        }
+ +    }
+ +
+ +    if (force_flags & GMX_FORCE_ENERGY)
+ +    {
+ +        reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
+ +    }
+ +}
+ +#else
+ +{
+ +    gmx_incons("nbnxn_kernel_simd_2xnn called while GROMACS was configured without 2x(N+N) SIMD kernels enabled");
+ +}
+ +#endif
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h

index e63a81796ab9557854b6e5c65568f9036bdfd0b4,0000000000000000000000000000000000000000..9068062d7de8f35146b4a9a0c0bf7bb7a62b9eb2

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
@@@ -1,759 -1,0 +1,773 @@@
- #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2009, The GROMACS Development Team
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ + * This flavor of the kernel duplicates the data for N j-particles in
+ + * 2xN wide SIMD registers to operate on 2 i-particles at once.
+ + * This leads to 4/2=2 sets of most instructions. Therefore we call
+ + * this kernel 2x(N+N) = 2xnn
+ + *
+ + * This 2xnn kernel is basically the 4xn equivalent with half the registers
+ + * and instructions removed.
+ + *
+ + * An alternative would be to load two different clusters of N j-particles
+ + * into SIMD registers, giving a 4x(N+N) kernel. This doubles the amount
+ + * of instructions, which could lead to better scheduling. But we actually
+ + * observed worse scheduling for the AVX-256 4x8 normal analytical PME
+ + * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
+ + * It could be worth trying this option, but it takes some more effort.
+ + * This 2xnn kernel is basically the 4xn equivalent with
+ + */
+ +
+ +
+ +/* When calculating RF or Ewald interactions we calculate the electrostatic
+ + * forces on excluded atom pairs here in the non-bonded loops.
+ + * But when energies and/or virial is required we calculate them
+ + * separately, as then it is easier to separate the energy and virial
+ + * contributions.
+ + */
+ +#if defined CHECK_EXCLS && defined CALC_COULOMB
+ +#define EXCL_FORCES
+ +#endif
+ +
+ +/* Without exclusions and energies we only need to mask the cut-off,
+ + * this can be faster with blendv.
+ + */
-     gmx_mm_pr  int_S0;
-     gmx_mm_pr  int_S2;
++#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV && !defined COUNT_PAIRS
+ +/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ + * With gcc this is slower, except for RF on Sandy Bridge.
+ + * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ + */
+ +#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+ +#define CUTOFF_BLENDV
+ +#endif
+ +/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ + * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ + * Tested with icc 13.
+ + */
+ +#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+ +#define CUTOFF_BLENDV
+ +#endif
+ +#endif
+ +
+ +{
+ +    int        cj, aj, ajx, ajy, ajz;
+ +
+ +#ifdef ENERGY_GROUPS
+ +    /* Energy group indices for two atoms packed into one int */
+ +    int        egp_jj[UNROLLJ/2];
+ +#endif
+ +
+ +#ifdef CHECK_EXCLS
+ +    /* Interaction (non-exclusion) mask of all 1's or 0's */
-     gmx_mm_pr  wco_S0;
-     gmx_mm_pr  wco_S2;
++    gmx_mm_pb  interact_S0;
++    gmx_mm_pb  interact_S2;
+ +#endif
+ +
+ +    gmx_mm_pr  jx_S, jy_S, jz_S;
+ +    gmx_mm_pr  dx_S0, dy_S0, dz_S0;
+ +    gmx_mm_pr  dx_S2, dy_S2, dz_S2;
+ +    gmx_mm_pr  tx_S0, ty_S0, tz_S0;
+ +    gmx_mm_pr  tx_S2, ty_S2, tz_S2;
+ +    gmx_mm_pr  rsq_S0, rinv_S0, rinvsq_S0;
+ +    gmx_mm_pr  rsq_S2, rinv_S2, rinvsq_S2;
+ +#ifndef CUTOFF_BLENDV
+ +    /* wco: within cut-off, mask of all 1's or 0's */
-     gmx_mm_pr  wco_vdw_S0;
++    gmx_mm_pb  wco_S0;
++    gmx_mm_pb  wco_S2;
+ +#endif
+ +#ifdef VDW_CUTOFF_CHECK
-     gmx_mm_pr  wco_vdw_S2;
++    gmx_mm_pb  wco_vdw_S0;
+ +#ifndef HALF_LJ
-         /* Load integer interaction mask */
++    gmx_mm_pb  wco_vdw_S2;
+ +#endif
+ +#endif
+ +#ifdef CALC_COULOMB
+ +#ifdef CHECK_EXCLS
+ +    /* 1/r masked with the interaction mask */
+ +    gmx_mm_pr  rinv_ex_S0;
+ +    gmx_mm_pr  rinv_ex_S2;
+ +#endif
+ +    gmx_mm_pr  jq_S;
+ +    gmx_mm_pr  qq_S0;
+ +    gmx_mm_pr  qq_S2;
+ +#ifdef CALC_COUL_TAB
+ +    /* The force (PME mesh force) we need to subtract from 1/r^2 */
+ +    gmx_mm_pr  fsub_S0;
+ +    gmx_mm_pr  fsub_S2;
+ +#endif
+ +#ifdef CALC_COUL_EWALD
+ +    gmx_mm_pr  brsq_S0, brsq_S2;
+ +    gmx_mm_pr  ewcorr_S0, ewcorr_S2;
+ +#endif
+ +
+ +    /* frcoul = (1/r - fsub)*r */
+ +    gmx_mm_pr  frcoul_S0;
+ +    gmx_mm_pr  frcoul_S2;
+ +#ifdef CALC_COUL_TAB
+ +    /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+ +    gmx_mm_pr  r_S0, rs_S0, rf_S0, frac_S0;
+ +    gmx_mm_pr  r_S2, rs_S2, rf_S2, frac_S2;
+ +    /* Table index: rs truncated to an int */
+ +    gmx_epi32  ti_S0, ti_S2;
+ +    /* Linear force table values */
+ +    gmx_mm_pr  ctab0_S0, ctab1_S0;
+ +    gmx_mm_pr  ctab0_S2, ctab1_S2;
+ +#ifdef CALC_ENERGIES
+ +    /* Quadratic energy table value */
+ +    gmx_mm_pr  ctabv_S0;
+ +    gmx_mm_pr  ctabv_S2;
+ +#endif
+ +#endif
+ +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ +    /* The potential (PME mesh) we need to subtract from 1/r */
+ +    gmx_mm_pr  vc_sub_S0;
+ +    gmx_mm_pr  vc_sub_S2;
+ +#endif
+ +#ifdef CALC_ENERGIES
+ +    /* Electrostatic potential */
+ +    gmx_mm_pr  vcoul_S0;
+ +    gmx_mm_pr  vcoul_S2;
+ +#endif
+ +#endif
+ +    /* The force times 1/r */
+ +    gmx_mm_pr  fscal_S0;
+ +    gmx_mm_pr  fscal_S2;
+ +
+ +#ifdef CALC_LJ
+ +#ifdef LJ_COMB_LB
+ +    /* LJ sigma_j/2 and sqrt(epsilon_j) */
+ +    gmx_mm_pr  hsig_j_S, seps_j_S;
+ +    /* LJ sigma_ij and epsilon_ij */
+ +    gmx_mm_pr  sig_S0, eps_S0;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  sig_S2, eps_S2;
+ +#endif
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  sig2_S0, sig6_S0;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  sig2_S2, sig6_S2;
+ +#endif
+ +#endif /* LJ_COMB_LB */
+ +#endif /* CALC_LJ */
+ +
+ +#ifdef LJ_COMB_GEOM
+ +    gmx_mm_pr  c6s_j_S, c12s_j_S;
+ +#endif
+ +
+ +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ +    /* Index for loading LJ parameters, complicated when interleaving */
+ +    int         aj2;
+ +#endif
+ +
+ +#ifndef FIX_LJ_C
+ +    /* LJ C6 and C12 parameters, used with geometric comb. rule */
+ +    gmx_mm_pr  c6_S0, c12_S0;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  c6_S2, c12_S2;
+ +#endif
+ +#endif
+ +
+ +    /* Intermediate variables for LJ calculation */
+ +#ifndef LJ_COMB_LB
+ +    gmx_mm_pr  rinvsix_S0;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  rinvsix_S2;
+ +#endif
+ +#endif
+ +#ifdef LJ_COMB_LB
+ +    gmx_mm_pr  sir_S0, sir2_S0, sir6_S0;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  sir_S2, sir2_S2, sir6_S2;
+ +#endif
+ +#endif
+ +
+ +    gmx_mm_pr  FrLJ6_S0, FrLJ12_S0;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  FrLJ6_S2, FrLJ12_S2;
+ +#endif
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  VLJ6_S0, VLJ12_S0, VLJ_S0;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  VLJ6_S2, VLJ12_S2, VLJ_S2;
+ +#endif
+ +#endif
+ +#endif /* CALC_LJ */
+ +
+ +    gmx_mm_hpr fjx_S, fjy_S, fjz_S;
+ +
+ +    /* j-cluster index */
+ +    cj            = l_cj[cjind].cj;
+ +
+ +    /* Atom indices (of the first atom in the cluster) */
+ +    aj            = cj*UNROLLJ;
+ +#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+ +#if UNROLLJ == STRIDE
+ +    aj2           = aj*2;
+ +#else
+ +    aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+ +#endif
+ +#endif
+ +#if UNROLLJ == STRIDE
+ +    ajx           = aj*DIM;
+ +#else
+ +    ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+ +#endif
+ +    ajy           = ajx + STRIDE;
+ +    ajz           = ajy + STRIDE;
+ +
+ +#ifdef CHECK_EXCLS
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ +    {
-         int_S0  = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
-         int_S2  = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
++        /* Load integer topology exclusion interaction mask */
++        gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
++
++        interact_S0  = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
++        interact_S2  = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
++    }
++#else
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
++    {
++        /* Integer mask set, cast to real and real mask operations */
+ +        gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
+ +
-     gmx_loaddh_pr(jx_S, x+ajx);
-     gmx_loaddh_pr(jy_S, x+ajy);
-     gmx_loaddh_pr(jz_S, x+ajz);
++        interact_S0  = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
++        interact_S2  = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
+ +    }
++#else
++#error "No SIMD bitmask operation available"
++#endif
+ +#endif
++#endif /* CHECK_EXCLS */
+ +
+ +    /* load j atom coordinates */
-         wco_S0  = gmx_and_pr(wco_S0, diag_S0);
-         wco_S2  = gmx_and_pr(wco_S2, diag_S2);
++    gmx_loaddh_pr(&jx_S, x+ajx);
++    gmx_loaddh_pr(&jy_S, x+ajy);
++    gmx_loaddh_pr(&jz_S, x+ajz);
+ +
+ +    /* Calculate distance */
+ +    dx_S0       = gmx_sub_pr(ix_S0, jx_S);
+ +    dy_S0       = gmx_sub_pr(iy_S0, jy_S);
+ +    dz_S0       = gmx_sub_pr(iz_S0, jz_S);
+ +    dx_S2       = gmx_sub_pr(ix_S2, jx_S);
+ +    dy_S2       = gmx_sub_pr(iy_S2, jy_S);
+ +    dz_S2       = gmx_sub_pr(iz_S2, jz_S);
+ +
+ +    /* rsq = dx*dx+dy*dy+dz*dz */
+ +    rsq_S0      = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ +    rsq_S2      = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ +
+ +#ifndef CUTOFF_BLENDV
+ +    wco_S0      = gmx_cmplt_pr(rsq_S0, rc2_S);
+ +    wco_S2      = gmx_cmplt_pr(rsq_S2, rc2_S);
+ +#endif
+ +
+ +#ifdef CHECK_EXCLS
+ +#ifdef EXCL_FORCES
+ +    /* Only remove the (sub-)diagonal to avoid double counting */
+ +#if UNROLLJ == UNROLLI
+ +    if (cj == ci_sh)
+ +    {
-         wco_S0  = gmx_and_pr(wco_S0, diag0_S0);
-         wco_S2  = gmx_and_pr(wco_S2, diag0_S2);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask_S0);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask_S2);
+ +    }
+ +#else
+ +#if UNROLLJ == 2*UNROLLI
+ +    if (cj*2 == ci_sh)
+ +    {
-         wco_S0  = gmx_and_pr(wco_S0, diag1_S0);
-         wco_S2  = gmx_and_pr(wco_S2, diag1_S2);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
+ +    }
+ +    else if (cj*2 + 1 == ci_sh)
+ +    {
-     wco_S0      = gmx_and_pr(wco_S0, int_S0);
-     wco_S2      = gmx_and_pr(wco_S2, int_S2);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
+ +    }
+ +#else
+ +#error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
+ +#endif
+ +#endif
+ +#else /* EXCL_FORCES */
+ +    /* No exclusion forces: remove all excluded atom pairs from the list */
-     rsq_S0      = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
-     rsq_S2      = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
++    wco_S0      = gmx_and_pb(wco_S0, interact_S0);
++    wco_S2      = gmx_and_pb(wco_S2, interact_S2);
+ +#endif
+ +#endif
+ +
+ +#ifdef COUNT_PAIRS
+ +    {
+ +        int  i, j;
+ +        real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
+ +        tmp = gmx_simd_align_real(tmpa);
+ +        for (i = 0; i < UNROLLI; i+=2)
+ +        {
+ +            gmx_store_pr(tmp, i == 0 ? wco_S0 : wco_S2);
+ +            for (j = 0; j < 2*UNROLLJ; j++)
+ +            {
+ +                if (!(tmp[j] == 0))
+ +                {
+ +                    npair++;
+ +                }
+ +            }
+ +        }
+ +    }
+ +#endif
+ +
+ +#ifdef CHECK_EXCLS
+ +    /* For excluded pairs add a small number to avoid r^-6 = NaN */
-     gmx_loaddh_pr(jq_S, q+aj);
++    rsq_S0      = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
++    rsq_S2      = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
+ +#endif
+ +
+ +    /* Calculate 1/r */
+ +    rinv_S0     = gmx_invsqrt_pr(rsq_S0);
+ +    rinv_S2     = gmx_invsqrt_pr(rsq_S2);
+ +
+ +#ifdef CALC_COULOMB
+ +    /* Load parameters for j atom */
-     load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_S0, c12_S0);
++    gmx_loaddh_pr(&jq_S, q+aj);
+ +    qq_S0       = gmx_mul_pr(iq_S0, jq_S);
+ +    qq_S2       = gmx_mul_pr(iq_S2, jq_S);
+ +#endif
+ +
+ +#ifdef CALC_LJ
+ +
+ +#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
-     load_lj_pair_params2(nbfp2, nbfp3, type, aj, c6_S2, c12_S2);
++    load_lj_pair_params2(nbfp0, nbfp1, type, aj, &c6_S0, &c12_S0);
+ +#ifndef HALF_LJ
-     gmx_loaddh_pr(c6s_j_S,  ljc+aj2+0);
-     gmx_loaddh_pr(c12s_j_S, ljc+aj2+STRIDE);
++    load_lj_pair_params2(nbfp2, nbfp3, type, aj, &c6_S2, &c12_S2);
+ +#endif
+ +#endif /* not defined any LJ rule */
+ +
+ +#ifdef LJ_COMB_GEOM
-     gmx_loaddh_pr(hsig_j_S, ljc+aj2+0);
-     gmx_loaddh_pr(seps_j_S, ljc+aj2+STRIDE);
++    gmx_loaddh_pr(&c6s_j_S,  ljc+aj2+0);
++    gmx_loaddh_pr(&c12s_j_S, ljc+aj2+STRIDE);
+ +    c6_S0       = gmx_mul_pr(c6s_S0, c6s_j_S );
+ +#ifndef HALF_LJ
+ +    c6_S2       = gmx_mul_pr(c6s_S2, c6s_j_S );
+ +#endif
+ +    c12_S0      = gmx_mul_pr(c12s_S0, c12s_j_S);
+ +#ifndef HALF_LJ
+ +    c12_S2      = gmx_mul_pr(c12s_S2, c12s_j_S);
+ +#endif
+ +#endif /* LJ_COMB_GEOM */
+ +
+ +#ifdef LJ_COMB_LB
-     rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, int_S0);
-     rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, int_S2);
++    gmx_loaddh_pr(&hsig_j_S, ljc+aj2+0);
++    gmx_loaddh_pr(&seps_j_S, ljc+aj2+STRIDE);
+ +
+ +    sig_S0      = gmx_add_pr(hsig_i_S0, hsig_j_S);
+ +    eps_S0      = gmx_mul_pr(seps_i_S0, seps_j_S);
+ +#ifndef HALF_LJ
+ +    sig_S2      = gmx_add_pr(hsig_i_S2, hsig_j_S);
+ +    eps_S2      = gmx_mul_pr(seps_i_S2, seps_j_S);
+ +#endif
+ +#endif /* LJ_COMB_LB */
+ +
+ +#endif /* CALC_LJ */
+ +
+ +#ifndef CUTOFF_BLENDV
+ +    rinv_S0     = gmx_blendzero_pr(rinv_S0, wco_S0);
+ +    rinv_S2     = gmx_blendzero_pr(rinv_S2, wco_S2);
+ +#else
+ +    /* We only need to mask for the cut-off: blendv is faster */
+ +    rinv_S0     = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
+ +    rinv_S2     = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
+ +#endif
+ +
+ +    rinvsq_S0   = gmx_mul_pr(rinv_S0, rinv_S0);
+ +    rinvsq_S2   = gmx_mul_pr(rinv_S2, rinv_S2);
+ +
+ +#ifdef CALC_COULOMB
+ +    /* Note that here we calculate force*r, not the usual force/r.
+ +     * This allows avoiding masking the reaction-field contribution,
+ +     * as frcoul is later multiplied by rinvsq which has been
+ +     * masked with the cut-off check.
+ +     */
+ +
+ +#ifdef EXCL_FORCES
+ +    /* Only add 1/r for non-excluded atom pairs */
-     frcoul_S0   = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
-     frcoul_S2   = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
++    rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, interact_S0);
++    rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, interact_S2);
+ +#else
+ +    /* No exclusion forces, we always need 1/r */
+ +#define     rinv_ex_S0    rinv_S0
+ +#define     rinv_ex_S2    rinv_S2
+ +#endif
+ +
+ +#ifdef CALC_COUL_RF
+ +    /* Electrostatic interactions */
-     frcoul_S0   = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
-     frcoul_S2   = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
++    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
++    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
+ +
+ +#ifdef CALC_ENERGIES
+ +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
+ +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_COUL_EWALD
+ +    /* We need to mask (or limit) rsq for the cut-off,
+ +     * as large distances can cause an overflow in gmx_pmecorrF/V.
+ +     */
+ +#ifndef CUTOFF_BLENDV
+ +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
+ +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
+ +#else
+ +    /* Strangely, putting mul on a separate line is slower (icc 13) */
+ +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
+ +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
+ +#endif
+ +    ewcorr_S0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
+ +    ewcorr_S2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
- #ifdef GMX_HAVE_SIMD_FLOOR
++    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
++    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
+ +
+ +#ifdef CALC_ENERGIES
+ +    vc_sub_S0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
+ +    vc_sub_S2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
+ +#endif
+ +
+ +#endif /* CALC_COUL_EWALD */
+ +
+ +#ifdef CALC_COUL_TAB
+ +    /* Electrostatic interactions */
+ +    r_S0        = gmx_mul_pr(rsq_S0, rinv_S0);
+ +    r_S2        = gmx_mul_pr(rsq_S2, rinv_S2);
+ +    /* Convert r to scaled table units */
+ +    rs_S0       = gmx_mul_pr(r_S0, invtsp_S);
+ +    rs_S2       = gmx_mul_pr(r_S2, invtsp_S);
+ +    /* Truncate scaled r to an int */
+ +    ti_S0       = gmx_cvttpr_epi32(rs_S0);
+ +    ti_S2       = gmx_cvttpr_epi32(rs_S2);
-     load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
-     load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
++#ifdef GMX_SIMD_HAVE_FLOOR
+ +    rf_S0       = gmx_floor_pr(rs_S0);
+ +    rf_S2       = gmx_floor_pr(rs_S2);
+ +#else
+ +    rf_S0       = gmx_cvtepi32_pr(ti_S0);
+ +    rf_S2       = gmx_cvtepi32_pr(ti_S2);
+ +#endif
+ +    frac_S0     = gmx_sub_pr(rs_S0, rf_S0);
+ +    frac_S2     = gmx_sub_pr(rs_S2, rf_S2);
+ +
+ +    /* Load and interpolate table forces and possibly energies.
+ +     * Force and energy can be combined in one table, stride 4: FDV0
+ +     * or in two separate tables with stride 1: F and V
+ +     * Currently single precision uses FDV0, double F and V.
+ +     */
+ +#ifndef CALC_ENERGIES
-     load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
-     load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
++    load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
++    load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
+ +#else
+ +#ifdef TAB_FDV0
-     load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
-     load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
++    load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++    load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
+ +#else
-     vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
-     vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
++    load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++    load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
+ +#endif
+ +#endif
+ +    fsub_S0     = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
+ +    fsub_S2     = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
+ +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
+ +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
+ +
+ +#ifdef CALC_ENERGIES
+ +    vc_sub_S0   = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
+ +    vc_sub_S2   = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
+ +#endif
+ +#endif /* CALC_COUL_TAB */
+ +
+ +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ +#ifndef NO_SHIFT_EWALD
+ +    /* Add Ewald potential shift to vc_sub for convenience */
+ +#ifdef CHECK_EXCLS
-     rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, int_S0);
++    vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
++    vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
+ +#else
+ +    vc_sub_S0   = gmx_add_pr(vc_sub_S0, sh_ewald_S);
+ +    vc_sub_S2   = gmx_add_pr(vc_sub_S2, sh_ewald_S);
+ +#endif
+ +#endif
+ +
+ +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
+ +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    /* Mask energy for cut-off and diagonal */
+ +    vcoul_S0    = gmx_blendzero_pr(vcoul_S0, wco_S0);
+ +    vcoul_S2    = gmx_blendzero_pr(vcoul_S2, wco_S2);
+ +#endif
+ +
+ +#endif /* CALC_COULOMB */
+ +
+ +#ifdef CALC_LJ
+ +    /* Lennard-Jones interaction */
+ +
+ +#ifdef VDW_CUTOFF_CHECK
+ +    wco_vdw_S0  = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
+ +#ifndef HALF_LJ
+ +    wco_vdw_S2  = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
+ +#endif
+ +#else
+ +    /* Same cut-off for Coulomb and VdW, reuse the registers */
+ +#define     wco_vdw_S0    wco_S0
+ +#define     wco_vdw_S2    wco_S2
+ +#endif
+ +
+ +#ifndef LJ_COMB_LB
+ +    rinvsix_S0  = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
+ +#ifdef EXCL_FORCES
-     rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, int_S2);
++    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, interact_S0);
+ +#endif
+ +#ifndef HALF_LJ
+ +    rinvsix_S2  = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
+ +#ifdef EXCL_FORCES
-     sir6_S0     = gmx_blendzero_pr(sir6_S0, int_S0);
++    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, interact_S2);
+ +#endif
+ +#endif
+ +#ifdef VDW_CUTOFF_CHECK
+ +    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
+ +#ifndef HALF_LJ
+ +    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
+ +#endif
+ +#endif
+ +    FrLJ6_S0    = gmx_mul_pr(c6_S0, rinvsix_S0);
+ +#ifndef HALF_LJ
+ +    FrLJ6_S2    = gmx_mul_pr(c6_S2, rinvsix_S2);
+ +#endif
+ +    FrLJ12_S0   = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
+ +#ifndef HALF_LJ
+ +    FrLJ12_S2   = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
+ +#endif
+ +#endif /* not LJ_COMB_LB */
+ +
+ +#ifdef LJ_COMB_LB
+ +    sir_S0      = gmx_mul_pr(sig_S0, rinv_S0);
+ +#ifndef HALF_LJ
+ +    sir_S2      = gmx_mul_pr(sig_S2, rinv_S2);
+ +#endif
+ +    sir2_S0     = gmx_mul_pr(sir_S0, sir_S0);
+ +#ifndef HALF_LJ
+ +    sir2_S2     = gmx_mul_pr(sir_S2, sir_S2);
+ +#endif
+ +    sir6_S0     = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
+ +#ifdef EXCL_FORCES
-     sir6_S2     = gmx_blendzero_pr(sir6_S2, int_S2);
++    sir6_S0     = gmx_blendzero_pr(sir6_S0, interact_S0);
+ +#endif
+ +#ifndef HALF_LJ
+ +    sir6_S2     = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
+ +#ifdef EXCL_FORCES
-     VLJ_S0      = gmx_blendzero_pr(VLJ_S0, int_S0);
++    sir6_S2     = gmx_blendzero_pr(sir6_S2, interact_S2);
+ +#endif
+ +#endif
+ +#ifdef VDW_CUTOFF_CHECK
+ +    sir6_S0     = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
+ +#ifndef HALF_LJ
+ +    sir6_S2     = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
+ +#endif
+ +#endif
+ +    FrLJ6_S0    = gmx_mul_pr(eps_S0, sir6_S0);
+ +#ifndef HALF_LJ
+ +    FrLJ6_S2    = gmx_mul_pr(eps_S2, sir6_S2);
+ +#endif
+ +    FrLJ12_S0   = gmx_mul_pr(FrLJ6_S0, sir6_S0);
+ +#ifndef HALF_LJ
+ +    FrLJ12_S2   = gmx_mul_pr(FrLJ6_S2, sir6_S2);
+ +#endif
+ +#if defined CALC_ENERGIES
+ +    /* We need C6 and C12 to calculate the LJ potential shift */
+ +    sig2_S0     = gmx_mul_pr(sig_S0, sig_S0);
+ +#ifndef HALF_LJ
+ +    sig2_S2     = gmx_mul_pr(sig_S2, sig_S2);
+ +#endif
+ +    sig6_S0     = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
+ +#ifndef HALF_LJ
+ +    sig6_S2     = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
+ +#endif
+ +    c6_S0       = gmx_mul_pr(eps_S0, sig6_S0);
+ +#ifndef HALF_LJ
+ +    c6_S2       = gmx_mul_pr(eps_S2, sig6_S2);
+ +#endif
+ +    c12_S0      = gmx_mul_pr(c6_S0, sig6_S0);
+ +#ifndef HALF_LJ
+ +    c12_S2      = gmx_mul_pr(c6_S2, sig6_S2);
+ +#endif
+ +#endif
+ +#endif /* LJ_COMB_LB */
+ +
+ +#endif /* CALC_LJ */
+ +
+ +#ifdef CALC_ENERGIES
+ +#ifdef ENERGY_GROUPS
+ +    /* Extract the group pair index per j pair.
+ +     * Energy groups are stored per i-cluster, so things get
+ +     * complicated when the i- and j-cluster size don't match.
+ +     */
+ +    {
+ +        int egps_j;
+ +#if UNROLLJ == 2
+ +        egps_j    = nbat->energrp[cj>>1];
+ +        egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+ +#else
+ +        /* We assume UNROLLI <= UNROLLJ */
+ +        int jdi;
+ +        for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
+ +        {
+ +            int jj;
+ +            egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ +            for (jj = 0; jj < (UNROLLI/2); jj++)
+ +            {
+ +                egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ +            }
+ +        }
+ +#endif
+ +    }
+ +#endif
+ +
+ +#ifdef CALC_COULOMB
+ +#ifndef ENERGY_GROUPS
+ +    vctot_S      = gmx_add_pr(vctot_S, gmx_add_pr(vcoul_S0, vcoul_S2));
+ +#else
+ +    add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
+ +    add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_LJ
+ +    /* Calculate the LJ energies */
+ +    VLJ6_S0     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
+ +#ifndef HALF_LJ
+ +    VLJ6_S2     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
+ +#endif
+ +    VLJ12_S0    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
+ +#ifndef HALF_LJ
+ +    VLJ12_S2    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
+ +#endif
+ +
+ +    VLJ_S0      = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
+ +#ifndef HALF_LJ
+ +    VLJ_S2      = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
+ +#endif
+ +    /* The potential shift should be removed for pairs beyond cut-off */
+ +    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
+ +#ifndef HALF_LJ
+ +    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
+ +#endif
+ +#ifdef CHECK_EXCLS
+ +    /* The potential shift should be removed for excluded pairs */
-     VLJ_S2      = gmx_blendzero_pr(VLJ_S2, int_S2);
++    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, interact_S0);
+ +#ifndef HALF_LJ
-     gmx_load_hpr(fjx_S, f+ajx);
-     gmx_load_hpr(fjy_S, f+ajy);
-     gmx_load_hpr(fjz_S, f+ajz);
++    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, interact_S2);
+ +#endif
+ +#endif
+ +#ifndef ENERGY_GROUPS
+ +    Vvdwtot_S    = gmx_add_pr(Vvdwtot_S,
+ +#ifndef HALF_LJ
+ +                              gmx_add_pr(VLJ_S0, VLJ_S2)
+ +#else
+ +                              VLJ_S0
+ +#endif
+ +                              );
+ +#else
+ +    add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
+ +#ifndef HALF_LJ
+ +    add_ener_grp_halves(VLJ_S2, vvdwtp[2], vvdwtp[3], egp_jj);
+ +#endif
+ +#endif
+ +#endif /* CALC_LJ */
+ +#endif /* CALC_ENERGIES */
+ +
+ +#ifdef CALC_LJ
+ +    fscal_S0    = gmx_mul_pr(rinvsq_S0,
+ +#ifdef CALC_COULOMB
+ +                             gmx_add_pr(frcoul_S0,
+ +#else
+ +                             (
+ +#endif
+ +                              gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+ +#else
+ +    fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
+ +#endif /* CALC_LJ */
+ +#if defined CALC_LJ && !defined HALF_LJ
+ +    fscal_S2    = gmx_mul_pr(rinvsq_S2,
+ +#ifdef CALC_COULOMB
+ +                             gmx_add_pr(frcoul_S2,
+ +#else
+ +                             (
+ +#endif
+ +                              gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+ +#else
+ +    /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
+ +    fscal_S2    = gmx_mul_pr(rinvsq_S2, frcoul_S2);
+ +#endif
+ +
+ +    /* Calculate temporary vectorial force */
+ +    tx_S0       = gmx_mul_pr(fscal_S0, dx_S0);
+ +    tx_S2       = gmx_mul_pr(fscal_S2, dx_S2);
+ +    ty_S0       = gmx_mul_pr(fscal_S0, dy_S0);
+ +    ty_S2       = gmx_mul_pr(fscal_S2, dy_S2);
+ +    tz_S0       = gmx_mul_pr(fscal_S0, dz_S0);
+ +    tz_S2       = gmx_mul_pr(fscal_S2, dz_S2);
+ +
+ +    /* Increment i atom force */
+ +    fix_S0      = gmx_add_pr(fix_S0, tx_S0);
+ +    fix_S2      = gmx_add_pr(fix_S2, tx_S2);
+ +    fiy_S0      = gmx_add_pr(fiy_S0, ty_S0);
+ +    fiy_S2      = gmx_add_pr(fiy_S2, ty_S2);
+ +    fiz_S0      = gmx_add_pr(fiz_S0, tz_S0);
+ +    fiz_S2      = gmx_add_pr(fiz_S2, tz_S2);
+ +
+ +    /* Decrement j atom force */
++    gmx_load_hpr(&fjx_S, f+ajx);
++    gmx_load_hpr(&fjy_S, f+ajy);
++    gmx_load_hpr(&fjz_S, f+ajz);
+ +    gmx_store_hpr(f+ajx, gmx_sub_hpr(fjx_S, gmx_sum4_hpr(tx_S0, tx_S2)));
+ +    gmx_store_hpr(f+ajy, gmx_sub_hpr(fjy_S, gmx_sum4_hpr(ty_S0, ty_S2)));
+ +    gmx_store_hpr(f+ajz, gmx_sub_hpr(fjz_S, gmx_sum4_hpr(tz_S0, tz_S2)));
+ +}
+ +
+ +#undef  rinv_ex_S0
+ +#undef  rinv_ex_S2
+ +
+ +#undef  wco_vdw_S0
+ +#undef  wco_vdw_S2
+ +
+ +#undef  CUTOFF_BLENDV
+ +
+ +#undef  EXCL_FORCES
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h

index 20b2e0f4bb737ba9fcbec8e018a4a5068870d96f,0000000000000000000000000000000000000000..fec158de8f1a38377f5ce6b1509562bb379be84f

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
@@@ -1,747 -1,0 +1,744 @@@
- /* Include the full width SIMD macros */
- #include "gmx_simd_macros.h"
- 
- 
- /* Define a few macros for half-width SIMD */
- #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
- 
- /* Half-width SIMD real type */
- #define gmx_mm_hpr  __m128
- 
- /* Half-width SIMD operations */
- /* Load reals at half-width aligned pointer b into half-width SIMD register a */
- #define gmx_load_hpr(a, b)    a = _mm_load_ps(b)
- /* Load one real at pointer b into half-width SIMD register a */
- #define gmx_load1_hpr(a, b)   a = _mm_load1_ps(b)
- /* Load one real at b and one real at b+1 into halves of a, respectively */
- #define gmx_load1p1_pr(a, b)  a = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)), _mm_load1_ps(b+1), 0x1)
- /* Load reals at half-width aligned pointer b into two halves of a */
- #define gmx_loaddh_pr(a, b)   a = gmx_mm256_load4_ps(b)
- /* To half-width SIMD register b into half width aligned memory a */
- #define gmx_store_hpr(a, b)       _mm_store_ps(a, b)
- #define gmx_add_hpr               _mm_add_ps
- #define gmx_sub_hpr               _mm_sub_ps
- /* Horizontal sum over a half SIMD register */
- #define gmx_sum4_hpr              gmx_mm256_sum4h_m128
- 
- #else
- #error "Half-width SIMD macros are not yet defined"
- #endif
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2009, The GROMACS Development Team
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +
- #define SIMD_MASK_ALL   0xffffffff
++/* Half-width SIMD operations are required here.
++ * As the 4xn kernels are the "standard" kernels and some special operations
++ * are required only here, we define those in nbnxn_kernel_simd_utils_...
++ *
++ * Half-width SIMD real type:
++ * gmx_mm_hpr
++ *
++ * Half-width SIMD operations
++ * Load reals at half-width aligned pointer b into half-width SIMD register a:
++ * gmx_load_hpr(a, b)
++ * Set all entries in half-width SIMD register *a to b:
++ * gmx_set1_hpr(a, b)
++ * Load one real at b and one real at b+1 into halves of a, respectively:
++ * gmx_load1p1_pr(a, b)
++ * Load reals at half-width aligned pointer b into two halves of a:
++ * gmx_loaddh_pr(a, b)
++ * Store half-width SIMD register b into half width aligned memory a:
++ * gmx_store_hpr(a, b)
++ * gmx_add_hpr(a, b)
++ * gmx_sub_hpr(a, b)
++ * Sum over 4 half SIMD registers:
++ * gmx_sum4_hpr(a, b)
++ * Sum the elements of halves of each input register and store sums in out:
++ * gmx_mm_transpose_sum4h_pr(a, b)
++ * Extract two half-width registers *b, *c from a full width register a:
++ * gmx_pr_to_2hpr(a, b, c)
++ */
+ +
+ +
+ +#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+ +
+ +#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
+ +#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
+ +
+ +/* The stride of all the atom data arrays is equal to half the SIMD width */
+ +#define STRIDE     (GMX_SIMD_WIDTH_HERE/2)
+ +
+ +#if GMX_SIMD_WIDTH_HERE == 8
+ +#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+ +#else
+ +#if GMX_SIMD_WIDTH_HERE == 16
+ +/* This is getting ridiculous, SIMD horizontal adds would help,
+ + * but this is not performance critical (only used to reduce energies)
+ + */
+ +#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
+ +#else
+ +#error "unsupported kernel configuration"
+ +#endif
+ +#endif
+ +
+ +
+ +#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+ +/* AVX-256 single precision 2x(4+4) kernel,
+ + * we can do half SIMD-width aligned FDV0 table loads.
+ + */
+ +#define TAB_FDV0
+ +#endif
+ +
++/* Currently stride 4 for the 2 LJ parameters is hard coded */
++#define NBFP_STRIDE  4
+ +
- #if UNROLLJ >= 4
- #ifndef GMX_DOUBLE
-     __m128     fix_S, fiy_S, fiz_S;
- #else
-     __m256d    fix_S, fiy_S, fiz_S;
- #endif
- #else
-     __m128d    fix0_S, fiy0_S, fiz0_S;
-     __m128d    fix2_S, fiy2_S, fiz2_S;
- #endif
+ +
+ +#include "nbnxn_kernel_simd_utils.h"
+ +
+ +/* All functionality defines are set here, except for:
+ + * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
+ + * CHECK_EXCLS, which is set just before including the inner loop contents.
+ + * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
+ + * set before calling the kernel function. We might want to move that
+ + * to inside the n-loop and have a different combination rule for different
+ + * ci's, as no combination rule gives a 50% performance hit for LJ.
+ + */
+ +
+ +/* We always calculate shift forces, because it's cheap anyhow */
+ +#define CALC_SHIFTFORCES
+ +
+ +/* Assumes all LJ parameters are identical */
+ +/* #define FIX_LJ_C */
+ +
+ +/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ + * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ + * and energy calculations (ene), depending on the defines set.
+ + */
+ +
+ +#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
+ +
+ +#if defined LJ_COMB_GEOM
+ +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
+ +#else
+ +#if defined LJ_COMB_LB
+ +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
+ +#else
+ +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_COUL_RF
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
+ +#endif
+ +#ifdef CALC_COUL_TAB
+ +#ifndef VDW_CUTOFF_CHECK
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
+ +#else
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
+ +#endif
+ +#endif
+ +#ifdef CALC_COUL_EWALD
+ +#ifndef VDW_CUTOFF_CHECK
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
+ +#else
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
+ +#endif
+ +#endif
+ +
+ +static void
+ +#ifndef CALC_ENERGIES
+ +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, noener)
+ +#else
+ +#ifndef ENERGY_GROUPS
+ +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener)
+ +#else
+ +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, energrp)
+ +#endif
+ +#endif
+ +#undef NBK_FUNC_NAME
+ +#undef NBK_FUNC_NAME_C
+ +#undef NBK_FUNC_NAME_C_LJC
+ +(const nbnxn_pairlist_t     *nbl,
+ + const nbnxn_atomdata_t     *nbat,
+ + const interaction_const_t  *ic,
+ + rvec                       *shift_vec,
+ + real                       *f
+ +#ifdef CALC_SHIFTFORCES
+ + ,
+ + real                       *fshift
+ +#endif
+ +#ifdef CALC_ENERGIES
+ + ,
+ + real                       *Vvdw,
+ + real                       *Vc
+ +#endif
+ +)
+ +{
+ +    const nbnxn_ci_t   *nbln;
+ +    const nbnxn_cj_t   *l_cj;
+ +    const int          *type;
+ +    const real         *q;
+ +    const real         *shiftvec;
+ +    const real         *x;
+ +    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
+ +    real                facel;
+ +    real               *nbfp_ptr;
+ +    int                 nbfp_stride;
+ +    int                 n, ci, ci_sh;
+ +    int                 ish, ish3;
+ +    gmx_bool            do_LJ, half_LJ, do_coul;
+ +    int                 sci, scix, sciy, sciz, sci2;
+ +    int                 cjind0, cjind1, cjind;
+ +    int                 ip, jp;
+ +
+ +#ifdef ENERGY_GROUPS
+ +    int         Vstride_i;
+ +    int         egps_ishift, egps_imask;
+ +    int         egps_jshift, egps_jmask, egps_jstride;
+ +    int         egps_i;
+ +    real       *vvdwtp[UNROLLI];
+ +    real       *vctp[UNROLLI];
+ +#endif
+ +
+ +    gmx_mm_pr  shX_S;
+ +    gmx_mm_pr  shY_S;
+ +    gmx_mm_pr  shZ_S;
+ +    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
+ +    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
+ +    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
+ +    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
-     gmx_mm_pr  diag_jmi_S;
++    /* We use an i-force SIMD register width of 4 */
++    /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
++    gmx_mm_pr4 fix_S, fiy_S, fiz_S;
+ +
-     gmx_mm_pr  diag_S0, diag_S2;
++    gmx_mm_pr  diagonal_jmi_S;
+ +#if UNROLLI == UNROLLJ
-     gmx_mm_pr  diag0_S0, diag0_S2;
-     gmx_mm_pr  diag1_S0, diag1_S2;
++    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S2;
+ +#else
-     gmx_mm_pr  mask_S0, mask_S2;
++    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S2;
++    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S2;
+ +#endif
+ +
- #ifndef GMX_DOUBLE
++    unsigned   *excl_filter;
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++    gmx_epi32  filter_S0, filter_S2;
++#else
++    gmx_mm_pr  filter_S0, filter_S2;
++#endif
+ +
+ +    gmx_mm_pr  zero_S = gmx_set1_pr(0);
+ +
+ +    gmx_mm_pr  one_S = gmx_set1_pr(1.0);
+ +    gmx_mm_pr  iq_S0 = gmx_setzero_pr();
+ +    gmx_mm_pr  iq_S2 = gmx_setzero_pr();
+ +    gmx_mm_pr  mrc_3_S;
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  hrc_3_S, moh_rc_S;
+ +#endif
+ +
+ +#ifdef CALC_COUL_TAB
+ +    /* Coulomb table variables */
+ +    gmx_mm_pr   invtsp_S;
+ +    const real *tab_coul_F;
+ +#ifndef TAB_FDV0
+ +    const real *tab_coul_V;
+ +#endif
+ +    int        ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0;
+ +    int        ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2;
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  mhalfsp_S;
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_COUL_EWALD
+ +    gmx_mm_pr beta2_S, beta_S;
+ +#endif
+ +
+ +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ +    gmx_mm_pr  sh_ewald_S;
+ +#endif
+ +
+ +#ifdef LJ_COMB_LB
+ +    const real *ljc;
+ +
+ +    gmx_mm_pr   hsig_i_S0, seps_i_S0;
+ +    gmx_mm_pr   hsig_i_S2, seps_i_S2;
+ +#else
+ +#ifdef FIX_LJ_C
+ +    real        pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE];
+ +    real       *pvdw_c6, *pvdw_c12;
+ +    gmx_mm_pr   c6_S0, c12_S0;
+ +    gmx_mm_pr   c6_S2, c12_S2;
+ +#endif
+ +
+ +#ifdef LJ_COMB_GEOM
+ +    const real *ljc;
+ +
+ +    gmx_mm_pr   c6s_S0, c12s_S0;
+ +    gmx_mm_pr   c6s_S1, c12s_S1;
+ +    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ +    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
+ +#endif
+ +#endif /* LJ_COMB_LB */
+ +
+ +    gmx_mm_pr  vctot_S, Vvdwtot_S;
+ +    gmx_mm_pr  sixth_S, twelveth_S;
+ +
+ +    gmx_mm_pr  avoid_sing_S;
+ +    gmx_mm_pr  rc2_S;
+ +#ifdef VDW_CUTOFF_CHECK
+ +    gmx_mm_pr  rcvdw2_S;
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
+ +
+ +    /* cppcheck-suppress unassignedVariable */
+ +    real       tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum;
+ +#endif
+ +#ifdef CALC_SHIFTFORCES
+ +    /* cppcheck-suppress unassignedVariable */
+ +    real       shf_array[2*GMX_SIMD_WIDTH_HERE], *shf;
+ +#endif
+ +
+ +    int ninner;
+ +
+ +#ifdef COUNT_PAIRS
+ +    int npair = 0;
+ +#endif
+ +
+ +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ +    ljc = nbat->lj_comb;
+ +#else
+ +    /* No combination rule used */
- #define NBFP_STRIDE  4
++#if NBFP_STRIDE == 2
++    nbfp_ptr    = nbat->nbfp;
++#else
++#if NBFP_STRIDE == 4
+ +    nbfp_ptr    = nbat->nbfp_s4;
-     nbfp_ptr    = nbat->nbfp;
- #define NBFP_STRIDE  2
+ +#else
-     diag_jmi_S = gmx_load_pr(nbat->simd_2xnn_diag);
++#error "Only NBFP_STRIDE 2 and 4 are currently supported"
++#endif
+ +#endif
+ +    nbfp_stride = NBFP_STRIDE;
+ +#endif
+ +
+ +    /* Load j-i for the first i */
-     diag_S0    = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag_S2    = gmx_cmplt_pr(zero_S, diag_jmi_S);
++    diagonal_jmi_S    = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i);
+ +    /* Generate all the diagonal masks as comparison results */
+ +#if UNROLLI == UNROLLJ
-     diag0_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
-     diag_i_S = gmx_add_pr(diag_i_S, one_S);
-     diag_i_S = gmx_add_pr(diag_i_S, one_S);
-     diag0_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
-     diag_i_S = gmx_add_pr(diag_i_S, one_S);
-     diag_i_S = gmx_add_pr(diag_i_S, one_S);
-     diag1_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
-     diag_i_S = gmx_add_pr(diag_i_S, one_S);
-     diag_i_S = gmx_add_pr(diag_i_S, one_S);
-     diag1_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
++    diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ +#else
+ +#if 2*UNROLLI == UNROLLJ
-     mask_S0    = gmx_load_pr((real *)nbat->simd_excl_mask + 0*2*UNROLLJ);
-     mask_S2    = gmx_load_pr((real *)nbat->simd_excl_mask + 1*2*UNROLLJ);
++    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ +#endif
+ +#endif
+ +
+ +    /* Load masks for topology exclusion masking */
- #if UNROLLJ == 2
-         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
- #endif
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#define FILTER_STRIDE  (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
++#else
++#ifdef GMX_DOUBLE
++#define FILTER_STRIDE  2
++#else
++#define FILTER_STRIDE  1
++#endif
++#endif
++#if FILTER_STRIDE == 1
++    excl_filter = nbat->simd_exclusion_filter1;
++#else
++    excl_filter = nbat->simd_exclusion_filter2;
++#endif
++    /* Here we cast the exclusion filters from unsigned * to int * or real *.
++     * Since we only check bits, the actual value they represent does not
++     * matter, as long as both filter and mask data are treated the same way.
++     */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++    filter_S0 = gmx_load_si((int *)excl_filter + 0*2*UNROLLJ*FILTER_STRIDE);
++    filter_S2 = gmx_load_si((int *)excl_filter + 1*2*UNROLLJ*FILTER_STRIDE);
++#else
++    filter_S0 = gmx_load_pr((real *)excl_filter + 0*2*UNROLLJ);
++    filter_S2 = gmx_load_pr((real *)excl_filter + 1*2*UNROLLJ);
++#endif
++#undef FILTER_STRIDE
+ +
+ +#ifdef CALC_COUL_TAB
+ +    /* Generate aligned table index pointers */
+ +    ti0 = gmx_simd_align_int(ti0_array);
+ +    ti2 = gmx_simd_align_int(ti2_array);
+ +
+ +    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
+ +#ifdef CALC_ENERGIES
+ +    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
+ +#endif
+ +
+ +#ifdef TAB_FDV0
+ +    tab_coul_F = ic->tabq_coul_FDV0;
+ +#else
+ +    tab_coul_F = ic->tabq_coul_F;
+ +    tab_coul_V = ic->tabq_coul_V;
+ +#endif
+ +#endif /* CALC_COUL_TAB */
+ +
+ +#ifdef CALC_COUL_EWALD
+ +    beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ +    beta_S  = gmx_set1_pr(ic->ewaldcoeff);
+ +#endif
+ +
+ +#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+ +    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
+ +#endif
+ +
+ +    q                   = nbat->q;
+ +    type                = nbat->type;
+ +    facel               = ic->epsfac;
+ +    shiftvec            = shift_vec[0];
+ +    x                   = nbat->x;
+ +
+ +    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+ +
+ +    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+ +    rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+ +#ifdef VDW_CUTOFF_CHECK
+ +    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    sixth_S      = gmx_set1_pr(1.0/6.0);
+ +    twelveth_S   = gmx_set1_pr(1.0/12.0);
+ +
+ +    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
+ +    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+ +#endif
+ +
+ +    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
+ +
+ +#ifdef CALC_ENERGIES
+ +    hrc_3_S  = gmx_set1_pr(ic->k_rf);
+ +
+ +    moh_rc_S = gmx_set1_pr(-ic->c_rf);
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    tmpsum   = gmx_simd_align_real(tmpsum_array);
+ +#endif
+ +#ifdef CALC_SHIFTFORCES
+ +    shf      = gmx_simd_align_real(shf_array);
+ +#endif
+ +
+ +#ifdef FIX_LJ_C
+ +    pvdw_c6  = gmx_simd_align_real(pvdw_array);
+ +    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+ +
+ +    for (jp = 0; jp < UNROLLJ; jp++)
+ +    {
+ +        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +
+ +        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +    }
+ +    c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ +    c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ +    c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ +    c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+ +
+ +    c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ +    c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ +    c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ +    c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+ +#endif /* FIX_LJ_C */
+ +
+ +#ifdef ENERGY_GROUPS
+ +    egps_ishift  = nbat->neg_2log;
+ +    egps_imask   = (1<<egps_ishift) - 1;
+ +    egps_jshift  = 2*nbat->neg_2log;
+ +    egps_jmask   = (1<<egps_jshift) - 1;
+ +    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+ +    /* Major division is over i-particle energy groups, determine the stride */
+ +    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+ +#endif
+ +
+ +    l_cj = nbl->cj;
+ +
+ +    ninner = 0;
+ +    for (n = 0; n < nbl->nci; n++)
+ +    {
+ +        nbln = &nbl->ci[n];
+ +
+ +        ish              = (nbln->shift & NBNXN_CI_SHIFT);
+ +        ish3             = ish*3;
+ +        cjind0           = nbln->cj_ind_start;
+ +        cjind1           = nbln->cj_ind_end;
+ +        ci               = nbln->ci;
+ +        ci_sh            = (ish == CENTRAL ? ci : -1);
+ +
+ +        shX_S = gmx_load1_pr(shiftvec+ish3);
+ +        shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ +        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
+ +
+ +#if UNROLLJ <= 4
+ +        sci              = ci*STRIDE;
+ +        scix             = sci*DIM;
+ +        sci2             = sci*2;
+ +#else
+ +        sci              = (ci>>1)*STRIDE;
+ +        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
+ +        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
+ +        sci             += (ci & 1)*(STRIDE>>1);
+ +#endif
+ +
+ +        /* We have 5 LJ/C combinations, but use only three inner loops,
+ +         * as the other combinations are unlikely and/or not much faster:
+ +         * inner half-LJ + C for half-LJ + C / no-LJ + C
+ +         * inner LJ + C      for full-LJ + C
+ +         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+ +         */
+ +        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
+ +        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ +        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
+ +
+ +#ifdef ENERGY_GROUPS
+ +        egps_i = nbat->energrp[ci];
+ +        {
+ +            int ia, egp_ia;
+ +
+ +            for (ia = 0; ia < UNROLLI; ia++)
+ +            {
+ +                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
+ +                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+ +                vctp[ia]   = Vc   + egp_ia*Vstride_i;
+ +            }
+ +        }
+ +#endif
+ +#if defined CALC_ENERGIES
+ +#if UNROLLJ == 4
+ +        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+ +#endif
-         gmx_load1p1_pr(ix_S0, x+scix);
-         gmx_load1p1_pr(ix_S2, x+scix+2);
-         gmx_load1p1_pr(iy_S0, x+sciy);
-         gmx_load1p1_pr(iy_S2, x+sciy+2);
-         gmx_load1p1_pr(iz_S0, x+sciz);
-         gmx_load1p1_pr(iz_S2, x+sciz+2);
+ +#if UNROLLJ == 8
+ +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+ +#endif
+ +        {
+ +            int  ia;
+ +            real Vc_sub_self;
+ +
+ +#ifdef CALC_COUL_RF
+ +            Vc_sub_self = 0.5*ic->c_rf;
+ +#endif
+ +#ifdef CALC_COUL_TAB
+ +#ifdef TAB_FDV0
+ +            Vc_sub_self = 0.5*tab_coul_F[2];
+ +#else
+ +            Vc_sub_self = 0.5*tab_coul_V[0];
+ +#endif
+ +#endif
+ +#ifdef CALC_COUL_EWALD
+ +            /* beta/sqrt(pi) */
+ +            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+ +#endif
+ +
+ +            for (ia = 0; ia < UNROLLI; ia++)
+ +            {
+ +                real qi;
+ +
+ +                qi = q[sci+ia];
+ +#ifdef ENERGY_GROUPS
+ +                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+ +#else
+ +                Vc[0]
+ +#endif
+ +                    -= facel*qi*qi*Vc_sub_self;
+ +            }
+ +        }
+ +#endif
+ +
+ +        /* Load i atom data */
+ +        sciy             = scix + STRIDE;
+ +        sciz             = sciy + STRIDE;
-             gmx_load1p1_pr(iq_S0, q+sci);
-             gmx_load1p1_pr(iq_S2, q+sci+2);
++        gmx_load1p1_pr(&ix_S0, x+scix);
++        gmx_load1p1_pr(&ix_S2, x+scix+2);
++        gmx_load1p1_pr(&iy_S0, x+sciy);
++        gmx_load1p1_pr(&iy_S2, x+sciy+2);
++        gmx_load1p1_pr(&iz_S0, x+sciz);
++        gmx_load1p1_pr(&iz_S2, x+sciz+2);
+ +        ix_S0          = gmx_add_pr(ix_S0, shX_S);
+ +        ix_S2          = gmx_add_pr(ix_S2, shX_S);
+ +        iy_S0          = gmx_add_pr(iy_S0, shY_S);
+ +        iy_S2          = gmx_add_pr(iy_S2, shY_S);
+ +        iz_S0          = gmx_add_pr(iz_S0, shZ_S);
+ +        iz_S2          = gmx_add_pr(iz_S2, shZ_S);
+ +
+ +        if (do_coul)
+ +        {
+ +            gmx_mm_pr facel_S;
+ +
+ +            facel_S    = gmx_set1_pr(facel);
+ +
-         gmx_load1p1_pr(hsig_i_S0, ljc+sci2+0);
-         gmx_load1p1_pr(hsig_i_S2, ljc+sci2+2);
-         gmx_load1p1_pr(seps_i_S0, ljc+sci2+STRIDE+0);
-         gmx_load1p1_pr(seps_i_S2, ljc+sci2+STRIDE+2);
++            gmx_load1p1_pr(&iq_S0, q+sci);
++            gmx_load1p1_pr(&iq_S2, q+sci+2);
+ +            iq_S0      = gmx_mul_pr(facel_S, iq_S0);
+ +            iq_S2      = gmx_mul_pr(facel_S, iq_S2);
+ +        }
+ +
+ +#ifdef LJ_COMB_LB
-         gmx_load1p1_pr(c6s_S0, ljc+sci2+0);
++        gmx_load1p1_pr(&hsig_i_S0, ljc+sci2+0);
++        gmx_load1p1_pr(&hsig_i_S2, ljc+sci2+2);
++        gmx_load1p1_pr(&seps_i_S0, ljc+sci2+STRIDE+0);
++        gmx_load1p1_pr(&seps_i_S2, ljc+sci2+STRIDE+2);
+ +#else
+ +#ifdef LJ_COMB_GEOM
-             gmx_load1p1_pr(c6s_S2, ljc+sci2+2);
++        gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
+ +        if (!half_LJ)
+ +        {
-         gmx_load1p1_pr(c12s_S0, ljc+sci2+STRIDE+0);
++            gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
+ +        }
-             gmx_load1p1_pr(c12s_S2, ljc+sci2+STRIDE+2);
++        gmx_load1p1_pr(&c12s_S0, ljc+sci2+STRIDE+0);
+ +        if (!half_LJ)
+ +        {
-             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++            gmx_load1p1_pr(&c12s_S2, ljc+sci2+STRIDE+2);
+ +        }
+ +#else
+ +        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
+ +        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+ +        if (!half_LJ)
+ +        {
+ +            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+ +            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+ +        }
+ +#endif
+ +#endif
+ +
+ +        /* Zero the potential energy for this list */
+ +        Vvdwtot_S        = gmx_setzero_pr();
+ +        vctot_S          = gmx_setzero_pr();
+ +
+ +        /* Clear i atom forces */
+ +        fix_S0           = gmx_setzero_pr();
+ +        fix_S2           = gmx_setzero_pr();
+ +        fiy_S0           = gmx_setzero_pr();
+ +        fiy_S2           = gmx_setzero_pr();
+ +        fiz_S0           = gmx_setzero_pr();
+ +        fiz_S2           = gmx_setzero_pr();
+ +
+ +        cjind = cjind0;
+ +
+ +        /* Currently all kernels use (at least half) LJ */
+ +#define CALC_LJ
+ +        if (half_LJ)
+ +        {
+ +#define CALC_COULOMB
+ +#define HALF_LJ
+ +#define CHECK_EXCLS
-             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ +            {
+ +#include "nbnxn_kernel_simd_2xnn_inner.h"
+ +                cjind++;
+ +            }
+ +#undef CHECK_EXCLS
+ +            for (; (cjind < cjind1); cjind++)
+ +            {
+ +#include "nbnxn_kernel_simd_2xnn_inner.h"
+ +            }
+ +#undef HALF_LJ
+ +#undef CALC_COULOMB
+ +        }
+ +        else if (do_coul)
+ +        {
+ +#define CALC_COULOMB
+ +#define CHECK_EXCLS
-             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ +            {
+ +#include "nbnxn_kernel_simd_2xnn_inner.h"
+ +                cjind++;
+ +            }
+ +#undef CHECK_EXCLS
+ +            for (; (cjind < cjind1); cjind++)
+ +            {
+ +#include "nbnxn_kernel_simd_2xnn_inner.h"
+ +            }
+ +#undef CALC_COULOMB
+ +        }
+ +        else
+ +        {
+ +#define CHECK_EXCLS
- #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
- #define gmx_load_pr4  _mm_load_ps
- #define gmx_store_pr4 _mm_store_ps
- #define gmx_add_pr4   _mm_add_ps
- #else
- #error "You need to define 4-width SIM macros for i-force reduction"
- #endif
-         GMX_MM_TRANSPOSE_SUM4H_PR(fix_S0, fix_S2, fix_S);
++            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ +            {
+ +#include "nbnxn_kernel_simd_2xnn_inner.h"
+ +                cjind++;
+ +            }
+ +#undef CHECK_EXCLS
+ +            for (; (cjind < cjind1); cjind++)
+ +            {
+ +#include "nbnxn_kernel_simd_2xnn_inner.h"
+ +            }
+ +        }
+ +#undef CALC_LJ
+ +        ninner += cjind1 - cjind0;
+ +
+ +        /* Add accumulated i-forces to the force array */
-         GMX_MM_TRANSPOSE_SUM4H_PR(fiy_S0, fiy_S2, fiy_S);
++        fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
+ +        gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
+ +
-         GMX_MM_TRANSPOSE_SUM4H_PR(fiz_S0, fiz_S2, fiz_S);
++        fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
+ +        gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
+ +
- #undef gmx_load_pr4
- #undef gmx_store_pr4
- #undef gmx_store_pr4
- 
++        fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
+ +        gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
+ +
+ +#ifdef CALC_SHIFTFORCES
+ +        gmx_store_pr4(shf, fix_S);
+ +        fshift[ish3+0] += SUM_SIMD4(shf);
+ +        gmx_store_pr4(shf, fiy_S);
+ +        fshift[ish3+1] += SUM_SIMD4(shf);
+ +        gmx_store_pr4(shf, fiz_S);
+ +        fshift[ish3+2] += SUM_SIMD4(shf);
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +        if (do_coul)
+ +        {
+ +            gmx_store_pr(tmpsum, vctot_S);
+ +            *Vc += SUM_SIMD(tmpsum);
+ +        }
+ +
+ +        gmx_store_pr(tmpsum, Vvdwtot_S);
+ +        *Vvdw += SUM_SIMD(tmpsum);
+ +#endif
+ +
+ +        /* Outer loop uses 6 flops/iteration */
+ +    }
+ +
+ +#ifdef COUNT_PAIRS
+ +    printf("atom pairs %d\n", npair);
+ +#endif
+ +}
+ +
+ +
- 
- #undef gmx_mm_hpr
- 
- #undef gmx_load_hpr
- #undef gmx_load1_hpr
- #undef gmx_load1p1_pr
- #undef gmx_loaddh_pr
- #undef gmx_store_hpr
- #undef gmx_add_hpr
- #undef gmx_sub_hpr
- 
- #undef gmx_sum4_hpr
+ +#undef CALC_SHIFTFORCES
+ +
+ +#undef UNROLLI
+ +#undef UNROLLJ
+ +#undef STRIDE
+ +#undef TAB_FDV0
+ +#undef NBFP_STRIDE
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c

index 848fefa0da20c25c49deeb1d9d1dfb96db30c330,0000000000000000000000000000000000000000..3faedd9b4538e48fa6727e4fb9a46127da2bafe0

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c
@@@ -1,328 -1,0 +1,335 @@@
- #include "nbnxn_kernel_simd_4xn.h"
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +
+ +#include "typedefs.h"
+ +#include "vec.h"
+ +#include "smalloc.h"
+ +#include "force.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "../nbnxn_consts.h"
+ +#include "nbnxn_kernel_common.h"
+ +
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +
- /* Include all flavors of the SSE or AVX 4xN kernel loops */
++#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
++#define GMX_USE_HALF_WIDTH_SIMD_HERE
++#endif
++#include "gmx_simd_macros.h"
++#include "gmx_simd_vec.h"
+ +
- #if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
++#include "nbnxn_kernel_simd_4xn.h"
+ +
-     const int simd_width   = GMX_SIMD_WIDTH_HERE;
-     const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
++#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8)
++#error "unsupported SIMD width"
+ +#endif
+ +
++
++/* Include all flavors of the SSE or AVX 4xN kernel loops */
++
+ +/* Analytical reaction-field kernels */
+ +#define CALC_COUL_RF
+ +
+ +#include "nbnxn_kernel_simd_4xn_includes.h"
+ +
+ +#undef CALC_COUL_RF
+ +
+ +/* Tabulated exclusion interaction electrostatics kernels */
+ +#define CALC_COUL_TAB
+ +
+ +/* Single cut-off: rcoulomb = rvdw */
+ +#include "nbnxn_kernel_simd_4xn_includes.h"
+ +
+ +/* Twin cut-off: rcoulomb >= rvdw */
+ +#define VDW_CUTOFF_CHECK
+ +#include "nbnxn_kernel_simd_4xn_includes.h"
+ +#undef VDW_CUTOFF_CHECK
+ +
+ +#undef CALC_COUL_TAB
+ +
+ +/* Analytical Ewald exclusion interaction electrostatics kernels */
+ +#define CALC_COUL_EWALD
+ +
+ +/* Single cut-off: rcoulomb = rvdw */
+ +#include "nbnxn_kernel_simd_4xn_includes.h"
+ +
+ +/* Twin cut-off: rcoulomb >= rvdw */
+ +#define VDW_CUTOFF_CHECK
+ +#include "nbnxn_kernel_simd_4xn_includes.h"
+ +#undef VDW_CUTOFF_CHECK
+ +
+ +#undef CALC_COUL_EWALD
+ +
+ +
+ +typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
+ +                                const nbnxn_atomdata_t     *nbat,
+ +                                const interaction_const_t  *ic,
+ +                                rvec                       *shift_vec,
+ +                                real                       *f,
+ +                                real                       *fshift,
+ +                                real                       *Vvdw,
+ +                                real                       *Vc);
+ +
+ +typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
+ +                                  const nbnxn_atomdata_t     *nbat,
+ +                                  const interaction_const_t  *ic,
+ +                                  rvec                       *shift_vec,
+ +                                  real                       *f,
+ +                                  real                       *fshift);
+ +
+ +enum {
+ +    coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR
+ +};
+ +
+ +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _ener
+ +static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
+ +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+ +#undef NBK_FN
+ +
+ +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _energrp
+ +static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
+ +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+ +#undef NBK_FN
+ +
+ +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _noener
+ +static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
+ +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+ +#undef NBK_FN
+ +
+ +
+ +static void reduce_group_energies(int ng, int ng_2log,
+ +                                  const real *VSvdw, const real *VSc,
+ +                                  real *Vvdw, real *Vc)
+ +{
-                 c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width;
++    const int unrollj      = GMX_SIMD_WIDTH_HERE;
++    const int unrollj_half = unrollj/2;
+ +    int       ng_p2, i, j, j0, j1, c, s;
+ +
+ +    ng_p2 = (1<<ng_2log);
+ +
+ +    /* The size of the x86 SIMD energy group buffer array is:
+ +     * ng*ng*ng_p2*unrollj_half*unrollj
+ +     */
+ +    for (i = 0; i < ng; i++)
+ +    {
+ +        for (j = 0; j < ng; j++)
+ +        {
+ +            Vvdw[i*ng+j] = 0;
+ +            Vc[i*ng+j]   = 0;
+ +        }
+ +
+ +        for (j1 = 0; j1 < ng; j1++)
+ +        {
+ +            for (j0 = 0; j0 < ng; j0++)
+ +            {
-                     c             += simd_width + 2;
++                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
+ +                for (s = 0; s < unrollj_half; s++)
+ +                {
+ +                    Vvdw[i*ng+j0] += VSvdw[c+0];
+ +                    Vvdw[i*ng+j1] += VSvdw[c+1];
+ +                    Vc  [i*ng+j0] += VSc  [c+0];
+ +                    Vc  [i*ng+j1] += VSc  [c+1];
++                    c             += unrollj + 2;
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +#endif /* GMX_NBNXN_SIMD_4XN */
+ +
+ +void
+ +nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t       *nbl_list,
+ +                      const nbnxn_atomdata_t     *nbat,
+ +                      const interaction_const_t  *ic,
+ +                      int                         ewald_excl,
+ +                      rvec                       *shift_vec,
+ +                      int                         force_flags,
+ +                      int                         clearF,
+ +                      real                       *fshift,
+ +                      real                       *Vc,
+ +                      real                       *Vvdw)
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +{
+ +    int                nnbl;
+ +    nbnxn_pairlist_t **nbl;
+ +    int                coult;
+ +    int                nb;
+ +
+ +    nnbl = nbl_list->nnbl;
+ +    nbl  = nbl_list->nbl;
+ +
+ +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ +    {
+ +        coult = coultRF;
+ +    }
+ +    else
+ +    {
+ +        if (ewald_excl == ewaldexclTable)
+ +        {
+ +            if (ic->rcoulomb == ic->rvdw)
+ +            {
+ +                coult = coultTAB;
+ +            }
+ +            else
+ +            {
+ +                coult = coultTAB_TWIN;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            if (ic->rcoulomb == ic->rvdw)
+ +            {
+ +                coult = coultEWALD;
+ +            }
+ +            else
+ +            {
+ +                coult = coultEWALD_TWIN;
+ +            }
+ +        }
+ +    }
+ +
+ +#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ +    for (nb = 0; nb < nnbl; nb++)
+ +    {
+ +        nbnxn_atomdata_output_t *out;
+ +        real                    *fshift_p;
+ +
+ +        out = &nbat->out[nb];
+ +
+ +        if (clearF == enbvClearFYes)
+ +        {
+ +            clear_f(nbat, nb, out->f);
+ +        }
+ +
+ +        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
+ +        {
+ +            fshift_p = fshift;
+ +        }
+ +        else
+ +        {
+ +            fshift_p = out->fshift;
+ +
+ +            if (clearF == enbvClearFYes)
+ +            {
+ +                clear_fshift(fshift_p);
+ +            }
+ +        }
+ +
+ +        /* With Ewald type electrostatics the forces for excluded atom pairs
+ +         * should not contribute to the virial sum. The exclusion forces
+ +         * are not calculated in the energy kernels, but are in _noener.
+ +         */
+ +        if (!((force_flags & GMX_FORCE_ENERGY) ||
+ +              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+ +        {
+ +            /* Don't calculate energies */
+ +            p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
+ +                                                 ic,
+ +                                                 shift_vec,
+ +                                                 out->f,
+ +                                                 fshift_p);
+ +        }
+ +        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+ +        {
+ +            /* No energy groups */
+ +            out->Vvdw[0] = 0;
+ +            out->Vc[0]   = 0;
+ +
+ +            p_nbk_ener[coult][nbat->comb_rule](nbl[nb], nbat,
+ +                                               ic,
+ +                                               shift_vec,
+ +                                               out->f,
+ +                                               fshift_p,
+ +                                               out->Vvdw,
+ +                                               out->Vc);
+ +        }
+ +        else
+ +        {
+ +            /* Calculate energy group contributions */
+ +            int i;
+ +
+ +            for (i = 0; i < out->nVS; i++)
+ +            {
+ +                out->VSvdw[i] = 0;
+ +            }
+ +            for (i = 0; i < out->nVS; i++)
+ +            {
+ +                out->VSc[i] = 0;
+ +            }
+ +
+ +            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb], nbat,
+ +                                                  ic,
+ +                                                  shift_vec,
+ +                                                  out->f,
+ +                                                  fshift_p,
+ +                                                  out->VSvdw,
+ +                                                  out->VSc);
+ +
+ +            reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
+ +                                  out->VSvdw, out->VSc,
+ +                                  out->Vvdw, out->Vc);
+ +        }
+ +    }
+ +
+ +    if (force_flags & GMX_FORCE_ENERGY)
+ +    {
+ +        reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
+ +    }
+ +}
+ +#else
+ +{
+ +    gmx_incons("nbnxn_kernel_simd_4xn called while GROMACS was configured without 4xN SIMD kernels enabled");
+ +}
+ +#endif
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h

index 329fd97602bb2738e0e7b1fde2b4e3be70a57950,0000000000000000000000000000000000000000..241c180c901bb8709eb12e8a290e4c40d886c543

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
@@@ -1,997 -1,0 +1,1001 @@@
- #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2009, The GROMACS Development Team
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ + * This flavor of the kernel calculates interactions of 4 i-atoms
+ + * with N j-atoms stored in N wide SIMD registers.
+ + */
+ +
+ +
+ +/* When calculating RF or Ewald interactions we calculate the electrostatic
+ + * forces on excluded atom pairs here in the non-bonded loops.
+ + * But when energies and/or virial is required we calculate them
+ + * separately, as it is then easier to separate the energy and virial
+ + * contributions.
+ + */
+ +#if defined CHECK_EXCLS && defined CALC_COULOMB
+ +#define EXCL_FORCES
+ +#endif
+ +
+ +/* Without exclusions and energies we only need to mask the cut-off,
+ + * this can be faster when we have defined gmx_blendv_pr, i.e. an instruction
+ + * that selects from two SIMD registers based on the contents of a third.
+ + */
-     gmx_mm_pr  int_S0;
-     gmx_mm_pr  int_S1;
-     gmx_mm_pr  int_S2;
-     gmx_mm_pr  int_S3;
++#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV
+ +/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ + * With gcc this is slower, except for RF on Sandy Bridge.
+ + * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ + */
+ +#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+ +#define NBNXN_CUTOFF_USE_BLENDV
+ +#endif
+ +/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ + * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ + * Tested with icc 13.
+ + */
+ +#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+ +#define NBNXN_CUTOFF_USE_BLENDV
+ +#endif
+ +#endif
+ +
+ +{
+ +    int        cj, aj, ajx, ajy, ajz;
+ +
+ +#ifdef ENERGY_GROUPS
+ +    /* Energy group indices for two atoms packed into one int */
+ +    int        egp_jj[UNROLLJ/2];
+ +#endif
+ +
+ +#ifdef CHECK_EXCLS
+ +    /* Interaction (non-exclusion) mask of all 1's or 0's */
-     gmx_mm_pr  wco_S0;
-     gmx_mm_pr  wco_S1;
-     gmx_mm_pr  wco_S2;
-     gmx_mm_pr  wco_S3;
++    gmx_mm_pb  interact_S0;
++    gmx_mm_pb  interact_S1;
++    gmx_mm_pb  interact_S2;
++    gmx_mm_pb  interact_S3;
+ +#endif
+ +
+ +    gmx_mm_pr  jx_S, jy_S, jz_S;
+ +    gmx_mm_pr  dx_S0, dy_S0, dz_S0;
+ +    gmx_mm_pr  dx_S1, dy_S1, dz_S1;
+ +    gmx_mm_pr  dx_S2, dy_S2, dz_S2;
+ +    gmx_mm_pr  dx_S3, dy_S3, dz_S3;
+ +    gmx_mm_pr  tx_S0, ty_S0, tz_S0;
+ +    gmx_mm_pr  tx_S1, ty_S1, tz_S1;
+ +    gmx_mm_pr  tx_S2, ty_S2, tz_S2;
+ +    gmx_mm_pr  tx_S3, ty_S3, tz_S3;
+ +    gmx_mm_pr  rsq_S0, rinv_S0, rinvsq_S0;
+ +    gmx_mm_pr  rsq_S1, rinv_S1, rinvsq_S1;
+ +    gmx_mm_pr  rsq_S2, rinv_S2, rinvsq_S2;
+ +    gmx_mm_pr  rsq_S3, rinv_S3, rinvsq_S3;
+ +#ifndef NBNXN_CUTOFF_USE_BLENDV
+ +    /* wco: within cut-off, mask of all 1's or 0's */
-     gmx_mm_pr  wco_vdw_S0;
-     gmx_mm_pr  wco_vdw_S1;
++    gmx_mm_pb  wco_S0;
++    gmx_mm_pb  wco_S1;
++    gmx_mm_pb  wco_S2;
++    gmx_mm_pb  wco_S3;
+ +#endif
+ +#ifdef VDW_CUTOFF_CHECK
-     gmx_mm_pr  wco_vdw_S2;
-     gmx_mm_pr  wco_vdw_S3;
++    gmx_mm_pb  wco_vdw_S0;
++    gmx_mm_pb  wco_vdw_S1;
+ +#ifndef HALF_LJ
- #ifdef gmx_checkbitmask_epi32
++    gmx_mm_pb  wco_vdw_S2;
++    gmx_mm_pb  wco_vdw_S3;
+ +#endif
+ +#endif
+ +#ifdef CALC_COULOMB
+ +#ifdef CHECK_EXCLS
+ +    /* 1/r masked with the interaction mask */
+ +    gmx_mm_pr  rinv_ex_S0;
+ +    gmx_mm_pr  rinv_ex_S1;
+ +    gmx_mm_pr  rinv_ex_S2;
+ +    gmx_mm_pr  rinv_ex_S3;
+ +#endif
+ +    gmx_mm_pr  jq_S;
+ +    gmx_mm_pr  qq_S0;
+ +    gmx_mm_pr  qq_S1;
+ +    gmx_mm_pr  qq_S2;
+ +    gmx_mm_pr  qq_S3;
+ +#ifdef CALC_COUL_TAB
+ +    /* The force (PME mesh force) we need to subtract from 1/r^2 */
+ +    gmx_mm_pr  fsub_S0;
+ +    gmx_mm_pr  fsub_S1;
+ +    gmx_mm_pr  fsub_S2;
+ +    gmx_mm_pr  fsub_S3;
+ +#endif
+ +#ifdef CALC_COUL_EWALD
+ +    gmx_mm_pr  brsq_S0, brsq_S1, brsq_S2, brsq_S3;
+ +    gmx_mm_pr  ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
+ +#endif
+ +
+ +    /* frcoul = (1/r - fsub)*r */
+ +    gmx_mm_pr  frcoul_S0;
+ +    gmx_mm_pr  frcoul_S1;
+ +    gmx_mm_pr  frcoul_S2;
+ +    gmx_mm_pr  frcoul_S3;
+ +#ifdef CALC_COUL_TAB
+ +    /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+ +    gmx_mm_pr  r_S0, rs_S0, rf_S0, frac_S0;
+ +    gmx_mm_pr  r_S1, rs_S1, rf_S1, frac_S1;
+ +    gmx_mm_pr  r_S2, rs_S2, rf_S2, frac_S2;
+ +    gmx_mm_pr  r_S3, rs_S3, rf_S3, frac_S3;
+ +    /* Table index: rs truncated to an int */
+ +    gmx_epi32  ti_S0, ti_S1, ti_S2, ti_S3;
+ +    /* Linear force table values */
+ +    gmx_mm_pr  ctab0_S0, ctab1_S0;
+ +    gmx_mm_pr  ctab0_S1, ctab1_S1;
+ +    gmx_mm_pr  ctab0_S2, ctab1_S2;
+ +    gmx_mm_pr  ctab0_S3, ctab1_S3;
+ +#ifdef CALC_ENERGIES
+ +    /* Quadratic energy table value */
+ +    gmx_mm_pr  ctabv_S0;
+ +    gmx_mm_pr  ctabv_S1;
+ +    gmx_mm_pr  ctabv_S2;
+ +    gmx_mm_pr  ctabv_S3;
+ +#endif
+ +#endif
+ +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ +    /* The potential (PME mesh) we need to subtract from 1/r */
+ +    gmx_mm_pr  vc_sub_S0;
+ +    gmx_mm_pr  vc_sub_S1;
+ +    gmx_mm_pr  vc_sub_S2;
+ +    gmx_mm_pr  vc_sub_S3;
+ +#endif
+ +#ifdef CALC_ENERGIES
+ +    /* Electrostatic potential */
+ +    gmx_mm_pr  vcoul_S0;
+ +    gmx_mm_pr  vcoul_S1;
+ +    gmx_mm_pr  vcoul_S2;
+ +    gmx_mm_pr  vcoul_S3;
+ +#endif
+ +#endif
+ +    /* The force times 1/r */
+ +    gmx_mm_pr  fscal_S0;
+ +    gmx_mm_pr  fscal_S1;
+ +    gmx_mm_pr  fscal_S2;
+ +    gmx_mm_pr  fscal_S3;
+ +
+ +#ifdef CALC_LJ
+ +#ifdef LJ_COMB_LB
+ +    /* LJ sigma_j/2 and sqrt(epsilon_j) */
+ +    gmx_mm_pr  hsig_j_S, seps_j_S;
+ +    /* LJ sigma_ij and epsilon_ij */
+ +    gmx_mm_pr  sig_S0, eps_S0;
+ +    gmx_mm_pr  sig_S1, eps_S1;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  sig_S2, eps_S2;
+ +    gmx_mm_pr  sig_S3, eps_S3;
+ +#endif
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  sig2_S0, sig6_S0;
+ +    gmx_mm_pr  sig2_S1, sig6_S1;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  sig2_S2, sig6_S2;
+ +    gmx_mm_pr  sig2_S3, sig6_S3;
+ +#endif
+ +#endif /* CALC_ENERGIES */
+ +#endif /* LJ_COMB_LB */
+ +
+ +#ifdef LJ_COMB_GEOM
+ +    gmx_mm_pr  c6s_j_S, c12s_j_S;
+ +#endif
+ +
+ +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ +    /* Index for loading LJ parameters, complicated when interleaving */
+ +    int         aj2;
+ +#endif
+ +
+ +#ifndef FIX_LJ_C
+ +    /* LJ C6 and C12 parameters: from load_lj_pair_params, the geometric comb. rule, or (energies only) the LB sigma/epsilon */
+ +    gmx_mm_pr  c6_S0, c12_S0;
+ +    gmx_mm_pr  c6_S1, c12_S1;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  c6_S2, c12_S2;
+ +    gmx_mm_pr  c6_S3, c12_S3;
+ +#endif /* not HALF_LJ */
+ +#endif /* not FIX_LJ_C */
+ +
+ +    /* Intermediate variables for LJ calculation */
+ +#ifndef LJ_COMB_LB
+ +    gmx_mm_pr  rinvsix_S0;
+ +    gmx_mm_pr  rinvsix_S1;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  rinvsix_S2;
+ +    gmx_mm_pr  rinvsix_S3;
+ +#endif
+ +#endif
+ +#ifdef LJ_COMB_LB
+ +    gmx_mm_pr  sir_S0, sir2_S0, sir6_S0;
+ +    gmx_mm_pr  sir_S1, sir2_S1, sir6_S1;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  sir_S2, sir2_S2, sir6_S2;
+ +    gmx_mm_pr  sir_S3, sir2_S3, sir6_S3;
+ +#endif
+ +#endif
+ +
+ +    gmx_mm_pr  FrLJ6_S0, FrLJ12_S0;
+ +    gmx_mm_pr  FrLJ6_S1, FrLJ12_S1;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  FrLJ6_S2, FrLJ12_S2;
+ +    gmx_mm_pr  FrLJ6_S3, FrLJ12_S3;
+ +#endif
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  VLJ6_S0, VLJ12_S0, VLJ_S0;
+ +    gmx_mm_pr  VLJ6_S1, VLJ12_S1, VLJ_S1;
+ +#ifndef HALF_LJ
+ +    gmx_mm_pr  VLJ6_S2, VLJ12_S2, VLJ_S2;
+ +    gmx_mm_pr  VLJ6_S3, VLJ12_S3, VLJ_S3;
+ +#endif
+ +#endif
+ +#endif /* CALC_LJ */
+ +
+ +    /* j-cluster index */
+ +    cj            = l_cj[cjind].cj;
+ +
+ +    /* Atom indices (of the first atom in the cluster) */
+ +    aj            = cj*UNROLLJ;
+ +#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+ +#if UNROLLJ == STRIDE
+ +    aj2           = aj*2;
+ +#else
+ +    aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+ +#endif
+ +#endif
+ +#if UNROLLJ == STRIDE
+ +    ajx           = aj*DIM;
+ +#else
+ +    ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+ +#endif
+ +    ajy           = ajx + STRIDE;
+ +    ajz           = ajy + STRIDE;
+ +
+ +#ifdef CHECK_EXCLS
-         /* Integer mask set and operations, cast result to real */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ +    {
-         int_S0  = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S0));
-         int_S1  = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S1));
-         int_S2  = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S2));
-         int_S3  = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S3));
++        /* Load integer topology exclusion interaction mask */
+ +        gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
+ +
-         int_S0  = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
-         int_S1  = gmx_checkbitmask_pr(mask_pr_S, mask_S1);
-         int_S2  = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
-         int_S3  = gmx_checkbitmask_pr(mask_pr_S, mask_S3);
++        interact_S0  = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
++        interact_S1  = gmx_checkbitmask_epi32(mask_pr_S, filter_S1);
++        interact_S2  = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
++        interact_S3  = gmx_checkbitmask_epi32(mask_pr_S, filter_S3);
+ +    }
+ +#else
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
+ +    {
+ +        /* Integer mask set, cast to real and real mask operations */
+ +        gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
+ +
-         wco_S0  = gmx_and_pr(wco_S0, diag_S0);
-         wco_S1  = gmx_and_pr(wco_S1, diag_S1);
-         wco_S2  = gmx_and_pr(wco_S2, diag_S2);
-         wco_S3  = gmx_and_pr(wco_S3, diag_S3);
++        interact_S0  = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
++        interact_S1  = gmx_checkbitmask_pr(mask_pr_S, filter_S1);
++        interact_S2  = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
++        interact_S3  = gmx_checkbitmask_pr(mask_pr_S, filter_S3);
+ +    }
++#else
++#error "No SIMD bitmask operation available"
+ +#endif
+ +#endif
++#endif /* CHECK_EXCLS */
+ +
+ +    /* load j atom coordinates */
+ +    jx_S        = gmx_load_pr(x+ajx);
+ +    jy_S        = gmx_load_pr(x+ajy);
+ +    jz_S        = gmx_load_pr(x+ajz);
+ +
+ +    /* Calculate distance */
+ +    dx_S0       = gmx_sub_pr(ix_S0, jx_S);
+ +    dy_S0       = gmx_sub_pr(iy_S0, jy_S);
+ +    dz_S0       = gmx_sub_pr(iz_S0, jz_S);
+ +    dx_S1       = gmx_sub_pr(ix_S1, jx_S);
+ +    dy_S1       = gmx_sub_pr(iy_S1, jy_S);
+ +    dz_S1       = gmx_sub_pr(iz_S1, jz_S);
+ +    dx_S2       = gmx_sub_pr(ix_S2, jx_S);
+ +    dy_S2       = gmx_sub_pr(iy_S2, jy_S);
+ +    dz_S2       = gmx_sub_pr(iz_S2, jz_S);
+ +    dx_S3       = gmx_sub_pr(ix_S3, jx_S);
+ +    dy_S3       = gmx_sub_pr(iy_S3, jy_S);
+ +    dz_S3       = gmx_sub_pr(iz_S3, jz_S);
+ +
+ +    /* rsq = dx*dx+dy*dy+dz*dz */
+ +    rsq_S0      = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ +    rsq_S1      = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+ +    rsq_S2      = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ +    rsq_S3      = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+ +
+ +#ifndef NBNXN_CUTOFF_USE_BLENDV
+ +    wco_S0      = gmx_cmplt_pr(rsq_S0, rc2_S);
+ +    wco_S1      = gmx_cmplt_pr(rsq_S1, rc2_S);
+ +    wco_S2      = gmx_cmplt_pr(rsq_S2, rc2_S);
+ +    wco_S3      = gmx_cmplt_pr(rsq_S3, rc2_S);
+ +#endif
+ +
+ +#ifdef CHECK_EXCLS
+ +#ifdef EXCL_FORCES
+ +    /* Only remove the (sub-)diagonal to avoid double counting */
+ +#if UNROLLJ == UNROLLI
+ +    if (cj == ci_sh)
+ +    {
-         wco_S0  = gmx_and_pr(wco_S0, diag0_S0);
-         wco_S1  = gmx_and_pr(wco_S1, diag0_S1);
-         wco_S2  = gmx_and_pr(wco_S2, diag0_S2);
-         wco_S3  = gmx_and_pr(wco_S3, diag0_S3);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask_S0);
++        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask_S1);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask_S2);
++        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask_S3);
+ +    }
+ +#else
+ +#if UNROLLJ < UNROLLI
+ +    if (cj == ci_sh*2)
+ +    {
-         wco_S0  = gmx_and_pr(wco_S0, diag1_S0);
-         wco_S1  = gmx_and_pr(wco_S1, diag1_S1);
-         wco_S2  = gmx_and_pr(wco_S2, diag1_S2);
-         wco_S3  = gmx_and_pr(wco_S3, diag1_S3);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
++        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask0_S1);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
++        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask0_S3);
+ +    }
+ +    if (cj == ci_sh*2 + 1)
+ +    {
-         wco_S0  = gmx_and_pr(wco_S0, diag0_S0);
-         wco_S1  = gmx_and_pr(wco_S1, diag0_S1);
-         wco_S2  = gmx_and_pr(wco_S2, diag0_S2);
-         wco_S3  = gmx_and_pr(wco_S3, diag0_S3);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
++        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask1_S1);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
++        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask1_S3);
+ +    }
+ +#else
+ +    if (cj*2 == ci_sh)
+ +    {
-         wco_S0  = gmx_and_pr(wco_S0, diag1_S0);
-         wco_S1  = gmx_and_pr(wco_S1, diag1_S1);
-         wco_S2  = gmx_and_pr(wco_S2, diag1_S2);
-         wco_S3  = gmx_and_pr(wco_S3, diag1_S3);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
++        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask0_S1);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
++        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask0_S3);
+ +    }
+ +    else if (cj*2 + 1 == ci_sh)
+ +    {
-     wco_S0      = gmx_and_pr(wco_S0, int_S0);
-     wco_S1      = gmx_and_pr(wco_S1, int_S1);
-     wco_S2      = gmx_and_pr(wco_S2, int_S2);
-     wco_S3      = gmx_and_pr(wco_S3, int_S3);
++        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
++        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask1_S1);
++        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
++        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask1_S3);
+ +    }
+ +#endif
+ +#endif
+ +#else /* EXCL_FORCES */
+ +    /* No exclusion forces: remove all excluded atom pairs from the list */
-             gmx_store_pr(tmp, i == 0 ? wco_S0 : (i == 1 ? wco_S1 : (i == 2 ? wco_S2 : wco_S3)));
++    wco_S0      = gmx_and_pb(wco_S0, interact_S0);
++    wco_S1      = gmx_and_pb(wco_S1, interact_S1);
++    wco_S2      = gmx_and_pb(wco_S2, interact_S2);
++    wco_S3      = gmx_and_pb(wco_S3, interact_S3);
+ +#endif
+ +#endif
+ +
+ +#ifdef COUNT_PAIRS
+ +    {
+ +        int  i, j;
+ +        real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
+ +        tmp = gmx_simd_align_real(tmpa);
+ +        for (i = 0; i < UNROLLI; i++)
+ +        {
-                 if (!(tmp[j] == 0))
++            gmx_store_pr(tmp, gmx_sub_pr(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
+ +            for (j = 0; j < UNROLLJ; j++)
+ +            {
-     rsq_S0      = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
-     rsq_S1      = gmx_add_pr(rsq_S1, gmx_andnot_pr(int_S1, avoid_sing_S));
-     rsq_S2      = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
-     rsq_S3      = gmx_add_pr(rsq_S3, gmx_andnot_pr(int_S3, avoid_sing_S));
++                if (tmp[j] >= 0)
+ +                {
+ +                    npair++;
+ +                }
+ +            }
+ +        }
+ +    }
+ +#endif
+ +
+ +#ifdef CHECK_EXCLS
+ +    /* For excluded pairs add a small number to avoid r^-6 = NaN */
-     GMX_MM_INVSQRT2_PD(rsq_S0, rsq_S1, rinv_S0, rinv_S1);
-     GMX_MM_INVSQRT2_PD(rsq_S2, rsq_S3, rinv_S2, rinv_S3);
++    rsq_S0      = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
++    rsq_S1      = gmx_masknot_add_pr(interact_S1, rsq_S1, avoid_sing_S);
++    rsq_S2      = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
++    rsq_S3      = gmx_masknot_add_pr(interact_S3, rsq_S3, avoid_sing_S);
+ +#endif
+ +
+ +    /* Calculate 1/r */
+ +#ifndef GMX_DOUBLE
+ +    rinv_S0     = gmx_invsqrt_pr(rsq_S0);
+ +    rinv_S1     = gmx_invsqrt_pr(rsq_S1);
+ +    rinv_S2     = gmx_invsqrt_pr(rsq_S2);
+ +    rinv_S3     = gmx_invsqrt_pr(rsq_S3);
+ +#else
-     load_lj_pair_params(nbfp0, type, aj, c6_S0, c12_S0);
-     load_lj_pair_params(nbfp1, type, aj, c6_S1, c12_S1);
++    gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
++    gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
+ +#endif
+ +
+ +#ifdef CALC_COULOMB
+ +    /* Load parameters for j atom */
+ +    jq_S        = gmx_load_pr(q+aj);
+ +    qq_S0       = gmx_mul_pr(iq_S0, jq_S);
+ +    qq_S1       = gmx_mul_pr(iq_S1, jq_S);
+ +    qq_S2       = gmx_mul_pr(iq_S2, jq_S);
+ +    qq_S3       = gmx_mul_pr(iq_S3, jq_S);
+ +#endif
+ +
+ +#ifdef CALC_LJ
+ +
+ +#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
-     load_lj_pair_params(nbfp2, type, aj, c6_S2, c12_S2);
-     load_lj_pair_params(nbfp3, type, aj, c6_S3, c12_S3);
++    load_lj_pair_params(nbfp0, type, aj, &c6_S0, &c12_S0);
++    load_lj_pair_params(nbfp1, type, aj, &c6_S1, &c12_S1);
+ +#ifndef HALF_LJ
-     rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, int_S0);
-     rinv_ex_S1  = gmx_blendzero_pr(rinv_S1, int_S1);
-     rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, int_S2);
-     rinv_ex_S3  = gmx_blendzero_pr(rinv_S3, int_S3);
++    load_lj_pair_params(nbfp2, type, aj, &c6_S2, &c12_S2);
++    load_lj_pair_params(nbfp3, type, aj, &c6_S3, &c12_S3);
+ +#endif
+ +#endif /* not defined any LJ rule */
+ +
+ +#ifdef LJ_COMB_GEOM
+ +    c6s_j_S     = gmx_load_pr(ljc+aj2+0);
+ +    c12s_j_S    = gmx_load_pr(ljc+aj2+STRIDE);
+ +    c6_S0       = gmx_mul_pr(c6s_S0, c6s_j_S );
+ +    c6_S1       = gmx_mul_pr(c6s_S1, c6s_j_S );
+ +#ifndef HALF_LJ
+ +    c6_S2       = gmx_mul_pr(c6s_S2, c6s_j_S );
+ +    c6_S3       = gmx_mul_pr(c6s_S3, c6s_j_S );
+ +#endif
+ +    c12_S0      = gmx_mul_pr(c12s_S0, c12s_j_S);
+ +    c12_S1      = gmx_mul_pr(c12s_S1, c12s_j_S);
+ +#ifndef HALF_LJ
+ +    c12_S2      = gmx_mul_pr(c12s_S2, c12s_j_S);
+ +    c12_S3      = gmx_mul_pr(c12s_S3, c12s_j_S);
+ +#endif
+ +#endif /* LJ_COMB_GEOM */
+ +
+ +#ifdef LJ_COMB_LB
+ +    hsig_j_S    = gmx_load_pr(ljc+aj2+0);
+ +    seps_j_S    = gmx_load_pr(ljc+aj2+STRIDE);
+ +
+ +    sig_S0      = gmx_add_pr(hsig_i_S0, hsig_j_S);
+ +    sig_S1      = gmx_add_pr(hsig_i_S1, hsig_j_S);
+ +    eps_S0      = gmx_mul_pr(seps_i_S0, seps_j_S);
+ +    eps_S1      = gmx_mul_pr(seps_i_S1, seps_j_S);
+ +#ifndef HALF_LJ
+ +    sig_S2      = gmx_add_pr(hsig_i_S2, hsig_j_S);
+ +    sig_S3      = gmx_add_pr(hsig_i_S3, hsig_j_S);
+ +    eps_S2      = gmx_mul_pr(seps_i_S2, seps_j_S);
+ +    eps_S3      = gmx_mul_pr(seps_i_S3, seps_j_S);
+ +#endif
+ +#endif /* LJ_COMB_LB */
+ +
+ +#endif /* CALC_LJ */
+ +
+ +#ifndef NBNXN_CUTOFF_USE_BLENDV
+ +    rinv_S0     = gmx_blendzero_pr(rinv_S0, wco_S0);
+ +    rinv_S1     = gmx_blendzero_pr(rinv_S1, wco_S1);
+ +    rinv_S2     = gmx_blendzero_pr(rinv_S2, wco_S2);
+ +    rinv_S3     = gmx_blendzero_pr(rinv_S3, wco_S3);
+ +#else
+ +    /* We only need to mask for the cut-off: blendv is faster */
+ +    rinv_S0     = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
+ +    rinv_S1     = gmx_blendv_pr(rinv_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1));
+ +    rinv_S2     = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
+ +    rinv_S3     = gmx_blendv_pr(rinv_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3));
+ +#endif
+ +
+ +    rinvsq_S0   = gmx_mul_pr(rinv_S0, rinv_S0);
+ +    rinvsq_S1   = gmx_mul_pr(rinv_S1, rinv_S1);
+ +    rinvsq_S2   = gmx_mul_pr(rinv_S2, rinv_S2);
+ +    rinvsq_S3   = gmx_mul_pr(rinv_S3, rinv_S3);
+ +
+ +#ifdef CALC_COULOMB
+ +    /* Note that here we calculate force*r, not the usual force/r.
+ +     * This allows avoiding masking the reaction-field contribution,
+ +     * as frcoul is later multiplied by rinvsq which has been
+ +     * masked with the cut-off check.
+ +     */
+ +
+ +#ifdef EXCL_FORCES
+ +    /* Only add 1/r for non-excluded atom pairs */
-     frcoul_S0   = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
-     frcoul_S1   = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(rsq_S1, mrc_3_S)));
-     frcoul_S2   = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
-     frcoul_S3   = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(rsq_S3, mrc_3_S)));
++    rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, interact_S0);
++    rinv_ex_S1  = gmx_blendzero_pr(rinv_S1, interact_S1);
++    rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, interact_S2);
++    rinv_ex_S3  = gmx_blendzero_pr(rinv_S3, interact_S3);
+ +#else
+ +    /* No exclusion forces, we always need 1/r */
+ +#define     rinv_ex_S0    rinv_S0
+ +#define     rinv_ex_S1    rinv_S1
+ +#define     rinv_ex_S2    rinv_S2
+ +#define     rinv_ex_S3    rinv_S3
+ +#endif
+ +
+ +#ifdef CALC_COUL_RF
+ +    /* Electrostatic interactions */
-     frcoul_S0   = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
-     frcoul_S1   = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(ewcorr_S1, brsq_S1)));
-     frcoul_S2   = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
-     frcoul_S3   = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(ewcorr_S3, brsq_S3)));
++    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
++    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_madd_pr(rsq_S1, mrc_3_S, rinv_ex_S1));
++    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
++    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_madd_pr(rsq_S3, mrc_3_S, rinv_ex_S3));
+ +
+ +#ifdef CALC_ENERGIES
+ +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
+ +    vcoul_S1    = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_add_pr(gmx_mul_pr(rsq_S1, hrc_3_S), moh_rc_S)));
+ +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
+ +    vcoul_S3    = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_add_pr(gmx_mul_pr(rsq_S3, hrc_3_S), moh_rc_S)));
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_COUL_EWALD
+ +    /* We need to mask (or limit) rsq for the cut-off,
+ +     * as large distances can cause an overflow in gmx_pmecorrF/V.
+ +     */
+ +#ifndef NBNXN_CUTOFF_USE_BLENDV
+ +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
+ +    brsq_S1     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S1, wco_S1));
+ +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
+ +    brsq_S3     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S3, wco_S3));
+ +#else
+ +    /* Strangely, putting mul on a separate line is slower (icc 13) */
+ +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
+ +    brsq_S1     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1)));
+ +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
+ +    brsq_S3     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3)));
+ +#endif
+ +    ewcorr_S0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
+ +    ewcorr_S1   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S1), beta_S);
+ +    ewcorr_S2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
+ +    ewcorr_S3   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S3), beta_S);
- #ifdef GMX_HAVE_SIMD_FLOOR
++    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
++    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_madd_pr(ewcorr_S1, brsq_S1, rinv_ex_S1));
++    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
++    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_madd_pr(ewcorr_S3, brsq_S3, rinv_ex_S3));
+ +
+ +#ifdef CALC_ENERGIES
+ +    vc_sub_S0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
+ +    vc_sub_S1   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S1), beta_S);
+ +    vc_sub_S2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
+ +    vc_sub_S3   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S3), beta_S);
+ +#endif
+ +
+ +#endif /* CALC_COUL_EWALD */
+ +
+ +#ifdef CALC_COUL_TAB
+ +    /* Electrostatic interactions */
+ +    r_S0        = gmx_mul_pr(rsq_S0, rinv_S0);
+ +    r_S1        = gmx_mul_pr(rsq_S1, rinv_S1);
+ +    r_S2        = gmx_mul_pr(rsq_S2, rinv_S2);
+ +    r_S3        = gmx_mul_pr(rsq_S3, rinv_S3);
+ +    /* Convert r to scaled table units */
+ +    rs_S0       = gmx_mul_pr(r_S0, invtsp_S);
+ +    rs_S1       = gmx_mul_pr(r_S1, invtsp_S);
+ +    rs_S2       = gmx_mul_pr(r_S2, invtsp_S);
+ +    rs_S3       = gmx_mul_pr(r_S3, invtsp_S);
+ +    /* Truncate scaled r to an int */
+ +    ti_S0       = gmx_cvttpr_epi32(rs_S0);
+ +    ti_S1       = gmx_cvttpr_epi32(rs_S1);
+ +    ti_S2       = gmx_cvttpr_epi32(rs_S2);
+ +    ti_S3       = gmx_cvttpr_epi32(rs_S3);
-     load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
-     load_table_f(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1);
-     load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
-     load_table_f(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3);
++#ifdef GMX_SIMD_HAVE_FLOOR
+ +    /* Where available (e.g. SSE4.1), floor is faster than the gmx_cvtepi32_pr int->float conversion */
+ +    rf_S0       = gmx_floor_pr(rs_S0);
+ +    rf_S1       = gmx_floor_pr(rs_S1);
+ +    rf_S2       = gmx_floor_pr(rs_S2);
+ +    rf_S3       = gmx_floor_pr(rs_S3);
+ +#else /* no SIMD floor: convert the truncated integer table index back to real */
+ +    rf_S0       = gmx_cvtepi32_pr(ti_S0);
+ +    rf_S1       = gmx_cvtepi32_pr(ti_S1);
+ +    rf_S2       = gmx_cvtepi32_pr(ti_S2);
+ +    rf_S3       = gmx_cvtepi32_pr(ti_S3);
+ +#endif /* GMX_SIMD_HAVE_FLOOR */
+ +    frac_S0     = gmx_sub_pr(rs_S0, rf_S0);
+ +    frac_S1     = gmx_sub_pr(rs_S1, rf_S1);
+ +    frac_S2     = gmx_sub_pr(rs_S2, rf_S2);
+ +    frac_S3     = gmx_sub_pr(rs_S3, rf_S3);
+ +
+ +    /* Load and interpolate table forces and possibly energies.
+ +     * Force and energy can be combined in one table, stride 4: FDV0
+ +     * or in two separate tables with stride 1: F and V
+ +     * Currently single precision uses FDV0, double F and V.
+ +     */
+ +#ifndef CALC_ENERGIES
-     load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
-     load_table_f_v(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
-     load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
-     load_table_f_v(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
++    load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
++    load_table_f(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1);
++    load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
++    load_table_f(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3);
+ +#else
+ +#ifdef TAB_FDV0
-     load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
-     load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
-     load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
-     load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
++    load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++    load_table_f_v(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
++    load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
++    load_table_f_v(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
+ +#else
-     vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
-     vc_sub_S1   = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, int_S1));
-     vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
-     vc_sub_S3   = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, int_S3));
++    load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++    load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
++    load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
++    load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
+ +#endif
+ +#endif
+ +    fsub_S0     = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
+ +    fsub_S1     = gmx_add_pr(ctab0_S1, gmx_mul_pr(frac_S1, ctab1_S1));
+ +    fsub_S2     = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
+ +    fsub_S3     = gmx_add_pr(ctab0_S3, gmx_mul_pr(frac_S3, ctab1_S3));
+ +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
+ +    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, gmx_mul_pr(fsub_S1, r_S1)));
+ +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
+ +    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, gmx_mul_pr(fsub_S3, r_S3)));
+ +
+ +#ifdef CALC_ENERGIES
+ +    vc_sub_S0   = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
+ +    vc_sub_S1   = gmx_add_pr(ctabv_S1, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S1), gmx_add_pr(ctab0_S1, fsub_S1)));
+ +    vc_sub_S2   = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
+ +    vc_sub_S3   = gmx_add_pr(ctabv_S3, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S3), gmx_add_pr(ctab0_S3, fsub_S3)));
+ +#endif
+ +#endif /* CALC_COUL_TAB */
+ +
+ +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ +#ifndef NO_SHIFT_EWALD
+ +    /* Add Ewald potential shift to vc_sub for convenience */
+ +#ifdef CHECK_EXCLS
-     rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, int_S0);
-     rinvsix_S1  = gmx_blendzero_pr(rinvsix_S1, int_S1);
++    vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
++    vc_sub_S1   = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, interact_S1));
++    vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
++    vc_sub_S3   = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, interact_S3));
+ +#else
+ +    vc_sub_S0   = gmx_add_pr(vc_sub_S0, sh_ewald_S);
+ +    vc_sub_S1   = gmx_add_pr(vc_sub_S1, sh_ewald_S);
+ +    vc_sub_S2   = gmx_add_pr(vc_sub_S2, sh_ewald_S);
+ +    vc_sub_S3   = gmx_add_pr(vc_sub_S3, sh_ewald_S);
+ +#endif
+ +#endif
+ +
+ +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
+ +    vcoul_S1    = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, vc_sub_S1));
+ +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
+ +    vcoul_S3    = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, vc_sub_S3));
+ +
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    /* Mask energy for cut-off and diagonal */
+ +    vcoul_S0    = gmx_blendzero_pr(vcoul_S0, wco_S0);
+ +    vcoul_S1    = gmx_blendzero_pr(vcoul_S1, wco_S1);
+ +    vcoul_S2    = gmx_blendzero_pr(vcoul_S2, wco_S2);
+ +    vcoul_S3    = gmx_blendzero_pr(vcoul_S3, wco_S3);
+ +#endif
+ +
+ +#endif /* CALC_COULOMB */
+ +
+ +#ifdef CALC_LJ
+ +    /* Lennard-Jones interaction */
+ +
+ +#ifdef VDW_CUTOFF_CHECK
+ +    wco_vdw_S0  = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
+ +    wco_vdw_S1  = gmx_cmplt_pr(rsq_S1, rcvdw2_S);
+ +#ifndef HALF_LJ
+ +    wco_vdw_S2  = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
+ +    wco_vdw_S3  = gmx_cmplt_pr(rsq_S3, rcvdw2_S);
+ +#endif
+ +#else
+ +    /* Same cut-off for Coulomb and VdW, reuse the registers */
+ +#define     wco_vdw_S0    wco_S0
+ +#define     wco_vdw_S1    wco_S1
+ +#define     wco_vdw_S2    wco_S2
+ +#define     wco_vdw_S3    wco_S3
+ +#endif
+ +
+ +#ifndef LJ_COMB_LB
+ +    rinvsix_S0  = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
+ +    rinvsix_S1  = gmx_mul_pr(rinvsq_S1, gmx_mul_pr(rinvsq_S1, rinvsq_S1));
+ +#ifdef EXCL_FORCES
-     rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, int_S2);
-     rinvsix_S3  = gmx_blendzero_pr(rinvsix_S3, int_S3);
++    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, interact_S0);
++    rinvsix_S1  = gmx_blendzero_pr(rinvsix_S1, interact_S1);
+ +#endif
+ +#ifndef HALF_LJ
+ +    rinvsix_S2  = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
+ +    rinvsix_S3  = gmx_mul_pr(rinvsq_S3, gmx_mul_pr(rinvsq_S3, rinvsq_S3));
+ +#ifdef EXCL_FORCES
-     sir6_S0     = gmx_blendzero_pr(sir6_S0, int_S0);
-     sir6_S1     = gmx_blendzero_pr(sir6_S1, int_S1);
++    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, interact_S2);
++    rinvsix_S3  = gmx_blendzero_pr(rinvsix_S3, interact_S3);
+ +#endif
+ +#endif
+ +#ifdef VDW_CUTOFF_CHECK
+ +    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
+ +    rinvsix_S1  = gmx_blendzero_pr(rinvsix_S1, wco_vdw_S1);
+ +#ifndef HALF_LJ
+ +    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
+ +    rinvsix_S3  = gmx_blendzero_pr(rinvsix_S3, wco_vdw_S3);
+ +#endif
+ +#endif
+ +    FrLJ6_S0    = gmx_mul_pr(c6_S0, rinvsix_S0);
+ +    FrLJ6_S1    = gmx_mul_pr(c6_S1, rinvsix_S1);
+ +#ifndef HALF_LJ
+ +    FrLJ6_S2    = gmx_mul_pr(c6_S2, rinvsix_S2);
+ +    FrLJ6_S3    = gmx_mul_pr(c6_S3, rinvsix_S3);
+ +#endif
+ +    FrLJ12_S0   = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
+ +    FrLJ12_S1   = gmx_mul_pr(c12_S1, gmx_mul_pr(rinvsix_S1, rinvsix_S1));
+ +#ifndef HALF_LJ
+ +    FrLJ12_S2   = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
+ +    FrLJ12_S3   = gmx_mul_pr(c12_S3, gmx_mul_pr(rinvsix_S3, rinvsix_S3));
+ +#endif
+ +#endif /* not LJ_COMB_LB */
+ +
+ +#ifdef LJ_COMB_LB
+ +    sir_S0      = gmx_mul_pr(sig_S0, rinv_S0);
+ +    sir_S1      = gmx_mul_pr(sig_S1, rinv_S1);
+ +#ifndef HALF_LJ
+ +    sir_S2      = gmx_mul_pr(sig_S2, rinv_S2);
+ +    sir_S3      = gmx_mul_pr(sig_S3, rinv_S3);
+ +#endif
+ +    sir2_S0     = gmx_mul_pr(sir_S0, sir_S0);
+ +    sir2_S1     = gmx_mul_pr(sir_S1, sir_S1);
+ +#ifndef HALF_LJ
+ +    sir2_S2     = gmx_mul_pr(sir_S2, sir_S2);
+ +    sir2_S3     = gmx_mul_pr(sir_S3, sir_S3);
+ +#endif
+ +    sir6_S0     = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
+ +    sir6_S1     = gmx_mul_pr(sir2_S1, gmx_mul_pr(sir2_S1, sir2_S1));
+ +#ifdef EXCL_FORCES
-     sir6_S2     = gmx_blendzero_pr(sir6_S2, int_S2);
-     sir6_S3     = gmx_blendzero_pr(sir6_S3, int_S3);
++    sir6_S0     = gmx_blendzero_pr(sir6_S0, interact_S0);
++    sir6_S1     = gmx_blendzero_pr(sir6_S1, interact_S1);
+ +#endif
+ +#ifndef HALF_LJ
+ +    sir6_S2     = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
+ +    sir6_S3     = gmx_mul_pr(sir2_S3, gmx_mul_pr(sir2_S3, sir2_S3));
+ +#ifdef EXCL_FORCES
-     VLJ_S0      = gmx_blendzero_pr(VLJ_S0, int_S0);
-     VLJ_S1      = gmx_blendzero_pr(VLJ_S1, int_S1);
++    sir6_S2     = gmx_blendzero_pr(sir6_S2, interact_S2);
++    sir6_S3     = gmx_blendzero_pr(sir6_S3, interact_S3);
+ +#endif
+ +#endif
+ +#ifdef VDW_CUTOFF_CHECK
+ +    sir6_S0     = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
+ +    sir6_S1     = gmx_blendzero_pr(sir6_S1, wco_vdw_S1);
+ +#ifndef HALF_LJ
+ +    sir6_S2     = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
+ +    sir6_S3     = gmx_blendzero_pr(sir6_S3, wco_vdw_S3);
+ +#endif
+ +#endif
+ +    FrLJ6_S0    = gmx_mul_pr(eps_S0, sir6_S0);
+ +    FrLJ6_S1    = gmx_mul_pr(eps_S1, sir6_S1);
+ +#ifndef HALF_LJ
+ +    FrLJ6_S2    = gmx_mul_pr(eps_S2, sir6_S2);
+ +    FrLJ6_S3    = gmx_mul_pr(eps_S3, sir6_S3);
+ +#endif
+ +    FrLJ12_S0   = gmx_mul_pr(FrLJ6_S0, sir6_S0);
+ +    FrLJ12_S1   = gmx_mul_pr(FrLJ6_S1, sir6_S1);
+ +#ifndef HALF_LJ
+ +    FrLJ12_S2   = gmx_mul_pr(FrLJ6_S2, sir6_S2);
+ +    FrLJ12_S3   = gmx_mul_pr(FrLJ6_S3, sir6_S3);
+ +#endif
+ +#if defined CALC_ENERGIES
+ +    /* We need C6 and C12 to calculate the LJ potential shift */
+ +    sig2_S0     = gmx_mul_pr(sig_S0, sig_S0);
+ +    sig2_S1     = gmx_mul_pr(sig_S1, sig_S1);
+ +#ifndef HALF_LJ
+ +    sig2_S2     = gmx_mul_pr(sig_S2, sig_S2);
+ +    sig2_S3     = gmx_mul_pr(sig_S3, sig_S3);
+ +#endif
+ +    sig6_S0     = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
+ +    sig6_S1     = gmx_mul_pr(sig2_S1, gmx_mul_pr(sig2_S1, sig2_S1));
+ +#ifndef HALF_LJ
+ +    sig6_S2     = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
+ +    sig6_S3     = gmx_mul_pr(sig2_S3, gmx_mul_pr(sig2_S3, sig2_S3));
+ +#endif
+ +    c6_S0       = gmx_mul_pr(eps_S0, sig6_S0);
+ +    c6_S1       = gmx_mul_pr(eps_S1, sig6_S1);
+ +#ifndef HALF_LJ
+ +    c6_S2       = gmx_mul_pr(eps_S2, sig6_S2);
+ +    c6_S3       = gmx_mul_pr(eps_S3, sig6_S3);
+ +#endif
+ +    c12_S0      = gmx_mul_pr(c6_S0, sig6_S0);
+ +    c12_S1      = gmx_mul_pr(c6_S1, sig6_S1);
+ +#ifndef HALF_LJ
+ +    c12_S2      = gmx_mul_pr(c6_S2, sig6_S2);
+ +    c12_S3      = gmx_mul_pr(c6_S3, sig6_S3);
+ +#endif
+ +#endif
+ +#endif /* LJ_COMB_LB */
+ +
+ +#endif /* CALC_LJ */
+ +
+ +#ifdef CALC_ENERGIES
+ +#ifdef ENERGY_GROUPS
+ +    /* Extract the group pair index per j pair.
+ +     * Energy groups are stored per i-cluster, so things get
+ +     * complicated when the i- and j-cluster size don't match.
+ +     * Each egp_jj entry is one bit field of a packed energrp word
+ +     * (shifted right by a multiple of egps_jshift and masked with
+ +     * egps_jmask), multiplied by egps_jstride.
+ +     */
+ +    {
+ +        int egps_j;
+ +#if UNROLLJ == 2
+ +        egps_j    = nbat->energrp[cj>>1]; /* two j-clusters share one energrp entry */
+ +        egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride; /* odd cj uses the second field */
+ +#else
+ +        /* We assume UNROLLI <= UNROLLJ, so one j-cluster spans UNROLLJ/UNROLLI energrp entries */
+ +        int jdi;
+ +        for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
+ +        {
+ +            int jj;
+ +            egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ +            for (jj = 0; jj < (UNROLLI/2); jj++) /* UNROLLI/2 fields are read from each entry */
+ +            {
+ +                egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ +            }
+ +        }
+ +#endif /* UNROLLJ == 2 */
+ +    }
+ +#endif /* ENERGY_GROUPS */
+ +
+ +#ifdef CALC_COULOMB
+ +#ifndef ENERGY_GROUPS
+ +    vctot_S      = gmx_add_pr(vctot_S, gmx_sum4_pr(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3));
+ +#else
+ +    add_ener_grp(vcoul_S0, vctp[0], egp_jj);
+ +    add_ener_grp(vcoul_S1, vctp[1], egp_jj);
+ +    add_ener_grp(vcoul_S2, vctp[2], egp_jj);
+ +    add_ener_grp(vcoul_S3, vctp[3], egp_jj);
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_LJ
+ +    /* Calculate the LJ energies */
+ +    VLJ6_S0     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
+ +    VLJ6_S1     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S1, gmx_mul_pr(c6_S1, sh_invrc6_S)));
+ +#ifndef HALF_LJ
+ +    VLJ6_S2     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
+ +    VLJ6_S3     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S3, gmx_mul_pr(c6_S3, sh_invrc6_S)));
+ +#endif
+ +    VLJ12_S0    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
+ +    VLJ12_S1    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S1, gmx_mul_pr(c12_S1, sh_invrc12_S)));
+ +#ifndef HALF_LJ
+ +    VLJ12_S2    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
+ +    VLJ12_S3    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S3, gmx_mul_pr(c12_S3, sh_invrc12_S)));
+ +#endif
+ +
+ +    VLJ_S0      = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
+ +    VLJ_S1      = gmx_sub_pr(VLJ12_S1, VLJ6_S1);
+ +#ifndef HALF_LJ
+ +    VLJ_S2      = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
+ +    VLJ_S3      = gmx_sub_pr(VLJ12_S3, VLJ6_S3);
+ +#endif
+ +    /* The potential shift should be removed for pairs beyond cut-off */
+ +    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
+ +    VLJ_S1      = gmx_blendzero_pr(VLJ_S1, wco_vdw_S1);
+ +#ifndef HALF_LJ
+ +    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
+ +    VLJ_S3      = gmx_blendzero_pr(VLJ_S3, wco_vdw_S3);
+ +#endif
+ +#ifdef CHECK_EXCLS
+ +    /* The potential shift should be removed for excluded pairs */
-     VLJ_S2      = gmx_blendzero_pr(VLJ_S2, int_S2);
-     VLJ_S3      = gmx_blendzero_pr(VLJ_S3, int_S3);
++    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, interact_S0);
++    VLJ_S1      = gmx_blendzero_pr(VLJ_S1, interact_S1);
+ +#ifndef HALF_LJ
++    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, interact_S2);
++    VLJ_S3      = gmx_blendzero_pr(VLJ_S3, interact_S3);
+ +#endif
+ +#endif
+ +#ifndef ENERGY_GROUPS
+ +    Vvdwtot_S   = gmx_add_pr(Vvdwtot_S,
+ +#ifndef HALF_LJ
+ +                             gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
+ +#else
+ +                             gmx_add_pr(VLJ_S0, VLJ_S1)
+ +#endif
+ +                             );
+ +#else
+ +    add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
+ +    add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
+ +#ifndef HALF_LJ
+ +    add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
+ +    add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
+ +#endif
+ +#endif
+ +#endif /* CALC_LJ */
+ +#endif /* CALC_ENERGIES */
+ +
+ +#ifdef CALC_LJ
+ +    fscal_S0    = gmx_mul_pr(rinvsq_S0,
+ +#ifdef CALC_COULOMB
+ +                               gmx_add_pr(frcoul_S0,
+ +#else
+ +                               (
+ +#endif
+ +                                          gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+ +    fscal_S1    = gmx_mul_pr(rinvsq_S1,
+ +#ifdef CALC_COULOMB
+ +                               gmx_add_pr(frcoul_S1,
+ +#else
+ +                               (
+ +#endif
+ +                                          gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
+ +#else
+ +    fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
+ +    fscal_S1    = gmx_mul_pr(rinvsq_S1, frcoul_S1);
+ +#endif /* CALC_LJ */
+ +#if defined CALC_LJ && !defined HALF_LJ
+ +    fscal_S2    = gmx_mul_pr(rinvsq_S2,
+ +#ifdef CALC_COULOMB
+ +                               gmx_add_pr(frcoul_S2,
+ +#else
+ +                               (
+ +#endif
+ +                                          gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+ +    fscal_S3    = gmx_mul_pr(rinvsq_S3,
+ +#ifdef CALC_COULOMB
+ +                               gmx_add_pr(frcoul_S3,
+ +#else
+ +                               (
+ +#endif
+ +                                          gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
+ +#else
+ +    /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
+ +    fscal_S2    = gmx_mul_pr(rinvsq_S2, frcoul_S2);
+ +    fscal_S3    = gmx_mul_pr(rinvsq_S3, frcoul_S3);
+ +#endif
+ +
+ +    /* Calculate temporary vectorial force */
+ +    tx_S0       = gmx_mul_pr(fscal_S0, dx_S0);
+ +    tx_S1       = gmx_mul_pr(fscal_S1, dx_S1);
+ +    tx_S2       = gmx_mul_pr(fscal_S2, dx_S2);
+ +    tx_S3       = gmx_mul_pr(fscal_S3, dx_S3);
+ +    ty_S0       = gmx_mul_pr(fscal_S0, dy_S0);
+ +    ty_S1       = gmx_mul_pr(fscal_S1, dy_S1);
+ +    ty_S2       = gmx_mul_pr(fscal_S2, dy_S2);
+ +    ty_S3       = gmx_mul_pr(fscal_S3, dy_S3);
+ +    tz_S0       = gmx_mul_pr(fscal_S0, dz_S0);
+ +    tz_S1       = gmx_mul_pr(fscal_S1, dz_S1);
+ +    tz_S2       = gmx_mul_pr(fscal_S2, dz_S2);
+ +    tz_S3       = gmx_mul_pr(fscal_S3, dz_S3);
+ +
+ +    /* Increment i atom force */
+ +    fix_S0      = gmx_add_pr(fix_S0, tx_S0);
+ +    fix_S1      = gmx_add_pr(fix_S1, tx_S1);
+ +    fix_S2      = gmx_add_pr(fix_S2, tx_S2);
+ +    fix_S3      = gmx_add_pr(fix_S3, tx_S3);
+ +    fiy_S0      = gmx_add_pr(fiy_S0, ty_S0);
+ +    fiy_S1      = gmx_add_pr(fiy_S1, ty_S1);
+ +    fiy_S2      = gmx_add_pr(fiy_S2, ty_S2);
+ +    fiy_S3      = gmx_add_pr(fiy_S3, ty_S3);
+ +    fiz_S0      = gmx_add_pr(fiz_S0, tz_S0);
+ +    fiz_S1      = gmx_add_pr(fiz_S1, tz_S1);
+ +    fiz_S2      = gmx_add_pr(fiz_S2, tz_S2);
+ +    fiz_S3      = gmx_add_pr(fiz_S3, tz_S3);
+ +
+ +    /* Decrement j atom force */
+ +    gmx_store_pr(f+ajx,
+ +                 gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_S0, tx_S1, tx_S2, tx_S3) ));
+ +    gmx_store_pr(f+ajy,
+ +                 gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_S0, ty_S1, ty_S2, ty_S3) ));
+ +    gmx_store_pr(f+ajz,
+ +                 gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_S0, tz_S1, tz_S2, tz_S3) ));
+ +}
+ +
+ +#undef  rinv_ex_S0
+ +#undef  rinv_ex_S1
+ +#undef  rinv_ex_S2
+ +#undef  rinv_ex_S3
+ +
+ +#undef  wco_vdw_S0
+ +#undef  wco_vdw_S1
+ +#undef  wco_vdw_S2
+ +#undef  wco_vdw_S3
+ +
+ +#undef  NBNXN_CUTOFF_USE_BLENDV
+ +
+ +#undef  EXCL_FORCES
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h

index e5f4ae0fa8a520f98ee90e7c14eba96eda9e9215,0000000000000000000000000000000000000000..e5b71bac4f265896b0c4d16c4c801d1184893cd9

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
@@@ -1,808 -1,0 +1,842 @@@
- #if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
- #endif
- 
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
- 
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2009, The GROMACS Development Team
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
- #define SIMD_MASK_ALL   0xffffffff
+ +#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+ +
+ +#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
+ +#define UNROLLJ    GMX_SIMD_WIDTH_HERE
+ +
+ +/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
+ +#if GMX_SIMD_WIDTH_HERE >= UNROLLI
+ +#define STRIDE     GMX_SIMD_WIDTH_HERE
+ +#else
+ +#define STRIDE     UNROLLI
+ +#endif
+ +
+ +#if GMX_SIMD_WIDTH_HERE == 2
+ +#define SUM_SIMD(x)  (x[0]+x[1])
+ +#else
+ +#if GMX_SIMD_WIDTH_HERE == 4
+ +#define SUM_SIMD(x)  SUM_SIMD4(x)
+ +#else
+ +#if GMX_SIMD_WIDTH_HERE == 8
+ +#define SUM_SIMD(x)  (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+ +#else
+ +#error "unsupported kernel configuration"
+ +#endif
+ +#endif
+ +#endif
+ +
+ +
+ +/* Decide if we should use the FDV0 table layout */
+ +#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+ +/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+ +#if GMX_SIMD_WIDTH_HERE/2 == 4
+ +#define TAB_FDV0
+ +#endif
+ +#else
+ +/* We use the FDV0 table layout when we can use aligned table loads */
+ +#if GMX_SIMD_WIDTH_HERE == 4
+ +#define TAB_FDV0
+ +#endif
+ +#endif
+ +
++/* Decide the stride for the 2 LJ parameters */
++#ifdef GMX_X86_SSE2
++#ifdef GMX_DOUBLE
++#define NBFP_STRIDE  2
++#else
++#define NBFP_STRIDE  4
++#endif
++#else
++#if GMX_SIMD_WIDTH_HERE > 4
++#define NBFP_STRIDE  4
++#else
++#define NBFP_STRIDE  GMX_SIMD_WIDTH_HERE
++#endif
++#endif
+ +
- #ifndef GMX_DOUBLE
-     __m128     fix_S, fiy_S, fiz_S;
+ +
+ +#include "nbnxn_kernel_simd_utils.h"
+ +
+ +/* All functionality defines are set here, except for:
+ + * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
+ + * CHECK_EXCLS, which is set just before including the inner loop contents.
+ + * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
+ + * set before calling the kernel function. We might want to move that
+ + * to inside the n-loop and have a different combination rule for different
+ + * ci's, as using no combination rule gives a 50% performance hit for LJ.
+ + */
+ +
+ +/* We always calculate shift forces, because it's cheap anyhow */
+ +#define CALC_SHIFTFORCES
+ +
+ +/* Assumes all LJ parameters are identical */
+ +/* #define FIX_LJ_C */
+ +
+ +/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ + * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ + * and energy calculations (ene), depending on the defines set.
+ + */
+ +
+ +#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
+ +
+ +#if defined LJ_COMB_GEOM
+ +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
+ +#else
+ +#if defined LJ_COMB_LB
+ +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
+ +#else
+ +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_COUL_RF
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
+ +#endif
+ +#ifdef CALC_COUL_TAB
+ +#ifndef VDW_CUTOFF_CHECK
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
+ +#else
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
+ +#endif
+ +#endif
+ +#ifdef CALC_COUL_EWALD
+ +#ifndef VDW_CUTOFF_CHECK
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
+ +#else
+ +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
+ +#endif
+ +#endif
+ +
+ +static void
+ +#ifndef CALC_ENERGIES
+ +NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, noener)
+ +#else
+ +#ifndef ENERGY_GROUPS
+ +NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener)
+ +#else
+ +NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, energrp)
+ +#endif
+ +#endif
+ +#undef NBK_FUNC_NAME
+ +#undef NBK_FUNC_NAME_C
+ +#undef NBK_FUNC_NAME_C_LJC
+ +(const nbnxn_pairlist_t     *nbl,
+ + const nbnxn_atomdata_t     *nbat,
+ + const interaction_const_t  *ic,
+ + rvec                       *shift_vec,
+ + real                       *f
+ +#ifdef CALC_SHIFTFORCES
+ + ,
+ + real                       *fshift
+ +#endif
+ +#ifdef CALC_ENERGIES
+ + ,
+ + real                       *Vvdw,
+ + real                       *Vc
+ +#endif
+ +)
+ +{
+ +    const nbnxn_ci_t   *nbln;
+ +    const nbnxn_cj_t   *l_cj;
+ +    const int          *type;
+ +    const real         *q;
+ +    const real         *shiftvec;
+ +    const real         *x;
+ +    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
+ +    real                facel;
+ +    real               *nbfp_ptr;
+ +    int                 nbfp_stride;
+ +    int                 n, ci, ci_sh;
+ +    int                 ish, ish3;
+ +    gmx_bool            do_LJ, half_LJ, do_coul;
+ +    int                 sci, scix, sciy, sciz, sci2;
+ +    int                 cjind0, cjind1, cjind;
+ +    int                 ip, jp;
+ +
+ +#ifdef ENERGY_GROUPS
+ +    int         Vstride_i;
+ +    int         egps_ishift, egps_imask;
+ +    int         egps_jshift, egps_jmask, egps_jstride;
+ +    int         egps_i;
+ +    real       *vvdwtp[UNROLLI];
+ +    real       *vctp[UNROLLI];
+ +#endif
+ +
+ +    gmx_mm_pr  shX_S;
+ +    gmx_mm_pr  shY_S;
+ +    gmx_mm_pr  shZ_S;
+ +    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
+ +    gmx_mm_pr  ix_S1, iy_S1, iz_S1;
+ +    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
+ +    gmx_mm_pr  ix_S3, iy_S3, iz_S3;
+ +    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
+ +    gmx_mm_pr  fix_S1, fiy_S1, fiz_S1;
+ +    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
+ +    gmx_mm_pr  fix_S3, fiy_S3, fiz_S3;
+ +#if UNROLLJ >= 4
-     __m256d    fix_S, fiy_S, fiz_S;
++    /* We use an i-force SIMD register width of 4 */
++#if UNROLLJ == 4
++#define gmx_mm_pr4     gmx_mm_pr
++#define gmx_load_pr4   gmx_load_pr
++#define gmx_store_pr4  gmx_store_pr
++#define gmx_add_pr4    gmx_add_pr
+ +#else
-     __m128d    fix0_S, fiy0_S, fiz0_S;
-     __m128d    fix2_S, fiy2_S, fiz2_S;
++    /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
+ +#endif
++    gmx_mm_pr4 fix_S, fiy_S, fiz_S;
+ +#else
-     gmx_mm_pr diag_jmi_S;
++    /* We use an i-force SIMD register width of 2 */
++    gmx_mm_pr  fix0_S, fiy0_S, fiz0_S;
++    gmx_mm_pr  fix2_S, fiy2_S, fiz2_S;
+ +#endif
+ +
-     gmx_mm_pr diag_S0, diag_S1, diag_S2, diag_S3;
++    gmx_mm_pr  diagonal_jmi_S;
+ +#if UNROLLI == UNROLLJ
-     gmx_mm_pr diag0_S0, diag0_S1, diag0_S2, diag0_S3;
-     gmx_mm_pr diag1_S0, diag1_S1, diag1_S2, diag1_S3;
++    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
+ +#else
- #ifdef gmx_checkbitmask_epi32
-     gmx_epi32 mask_S0, mask_S1, mask_S2, mask_S3;
++    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
++    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
+ +#endif
+ +
-     gmx_mm_pr mask_S0, mask_S1, mask_S2, mask_S3;
++    unsigned   *excl_filter;
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++    gmx_epi32  filter_S0, filter_S1, filter_S2, filter_S3;
+ +#else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
++    gmx_mm_pr  filter_S0, filter_S1, filter_S2, filter_S3;
+ +#endif
+ +
+ +    gmx_mm_pr  zero_S = gmx_set1_pr(0);
+ +
+ +    gmx_mm_pr  one_S  = gmx_set1_pr(1.0);
+ +    gmx_mm_pr  iq_S0  = gmx_setzero_pr();
+ +    gmx_mm_pr  iq_S1  = gmx_setzero_pr();
+ +    gmx_mm_pr  iq_S2  = gmx_setzero_pr();
+ +    gmx_mm_pr  iq_S3  = gmx_setzero_pr();
+ +    gmx_mm_pr  mrc_3_S;
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  hrc_3_S, moh_rc_S;
+ +#endif
+ +
+ +#ifdef CALC_COUL_TAB
+ +    /* Coulomb table variables */
+ +    gmx_mm_pr   invtsp_S;
+ +    const real *tab_coul_F;
+ +#ifndef TAB_FDV0
+ +    const real *tab_coul_V;
+ +#endif
- #ifndef GMX_DOUBLE
++#if GMX_SIMD_WIDTH_HERE >= 8 || (defined GMX_DOUBLE && GMX_SIMD_WIDTH_HERE >= 4)
++#define STORE_TABLE_INDICES
++#endif
++#ifdef STORE_TABLE_INDICES
+ +    int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
+ +    int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
+ +    int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
+ +    int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
++#else
++    /* Table indices not used, but a function requires the argument */
++    int        *ti0 = NULL, *ti1 = NULL, *ti2 = NULL, *ti3 = NULL;
+ +#endif
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  mhalfsp_S;
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_COUL_EWALD
+ +    gmx_mm_pr beta2_S, beta_S;
+ +#endif
+ +
+ +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ +    gmx_mm_pr  sh_ewald_S;
+ +#endif
+ +
+ +#ifdef LJ_COMB_LB
+ +    const real *ljc;
+ +
+ +    gmx_mm_pr   hsig_i_S0, seps_i_S0;
+ +    gmx_mm_pr   hsig_i_S1, seps_i_S1;
+ +    gmx_mm_pr   hsig_i_S2, seps_i_S2;
+ +    gmx_mm_pr   hsig_i_S3, seps_i_S3;
+ +#else
+ +#ifdef FIX_LJ_C
+ +    real        pvdw_array[2*UNROLLI*UNROLLJ+3];
+ +    real       *pvdw_c6, *pvdw_c12;
+ +    gmx_mm_pr   c6_S0, c12_S0;
+ +    gmx_mm_pr   c6_S1, c12_S1;
+ +    gmx_mm_pr   c6_S2, c12_S2;
+ +    gmx_mm_pr   c6_S3, c12_S3;
+ +#endif
+ +
+ +#ifdef LJ_COMB_GEOM
+ +    const real *ljc;
+ +
+ +    gmx_mm_pr   c6s_S0, c12s_S0;
+ +    gmx_mm_pr   c6s_S1, c12s_S1;
+ +    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ +    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
+ +#endif
+ +#endif /* LJ_COMB_LB */
+ +
+ +    gmx_mm_pr  vctot_S, Vvdwtot_S;
+ +    gmx_mm_pr  sixth_S, twelveth_S;
+ +
+ +    gmx_mm_pr  avoid_sing_S;
+ +    gmx_mm_pr  rc2_S;
+ +#ifdef VDW_CUTOFF_CHECK
+ +    gmx_mm_pr  rcvdw2_S;
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
+ +
+ +    /* cppcheck-suppress unassignedVariable */
+ +    real       tmpsum_array[15], *tmpsum;
+ +#endif
+ +#ifdef CALC_SHIFTFORCES
+ +    /* cppcheck-suppress unassignedVariable */
+ +    real       shf_array[15], *shf;
+ +#endif
+ +
+ +    int ninner;
+ +
+ +#ifdef COUNT_PAIRS
+ +    int npair = 0;
+ +#endif
+ +
+ +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ +    ljc = nbat->lj_comb;
+ +#else
+ +    /* No combination rule used */
- #define NBFP_STRIDE  4
++#if NBFP_STRIDE == 2
++    nbfp_ptr    = nbat->nbfp;
++#else
++#if NBFP_STRIDE == 4
+ +    nbfp_ptr    = nbat->nbfp_s4;
-     nbfp_ptr    = nbat->nbfp;
- #define NBFP_STRIDE  2
+ +#else
-     diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
++#error "Only NBFP_STRIDE 2 and 4 are currently supported"
++#endif
+ +#endif
+ +    nbfp_stride = NBFP_STRIDE;
+ +#endif
+ +
+ +    /* Load j-i for the first i */
-     diag_S0    = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag_S1    = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag_S2    = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag_S3    = gmx_cmplt_pr(zero_S, diag_jmi_S);
++    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i);
+ +    /* Generate all the diagonal masks as comparison results */
+ +#if UNROLLI == UNROLLJ
-     diag0_S0   = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag0_S1   = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag0_S2   = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag0_S3   = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
++    diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask_S1  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask_S3  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ +#else
+ +#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
-     diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
++    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
+ +
+ +#if UNROLLI == 2*UNROLLJ
+ +    /* Load j-i for the second half of the j-cluster */
-     diag1_S0   = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag1_S1   = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag1_S2   = gmx_cmplt_pr(zero_S, diag_jmi_S);
-     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
-     diag1_S3   = gmx_cmplt_pr(zero_S, diag_jmi_S);
++    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
+ +#endif
+ +
- #ifdef gmx_checkbitmask_epi32
-     mask_S0    = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
-     mask_S1    = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
-     mask_S2    = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
-     mask_S3    = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
++    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
++    diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ +#endif
+ +#endif
+ +
+ +    /* Load masks for topology exclusion masking */
-     mask_S0    = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
-     mask_S1    = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
-     mask_S2    = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
-     mask_S3    = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#define FILTER_STRIDE  (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
++#else
++#ifdef GMX_DOUBLE
++#define FILTER_STRIDE  2
++#else
++#define FILTER_STRIDE  1
++#endif
++#endif
++#if FILTER_STRIDE == 1
++    excl_filter = nbat->simd_exclusion_filter1;
+ +#else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
++    excl_filter = nbat->simd_exclusion_filter2;
++#endif
++    /* Here we cast the exclusion filters from unsigned * to int * or real *.
++     * Since we only check bits, the actual value they represent does not
++     * matter, as long as both filter and mask data are treated the same way.
++     */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++    filter_S0 = gmx_load_si((int *)excl_filter + 0*UNROLLJ*FILTER_STRIDE);
++    filter_S1 = gmx_load_si((int *)excl_filter + 1*UNROLLJ*FILTER_STRIDE);
++    filter_S2 = gmx_load_si((int *)excl_filter + 2*UNROLLJ*FILTER_STRIDE);
++    filter_S3 = gmx_load_si((int *)excl_filter + 3*UNROLLJ*FILTER_STRIDE);
++#else
++    filter_S0 = gmx_load_pr((real *)excl_filter + 0*UNROLLJ);
++    filter_S1 = gmx_load_pr((real *)excl_filter + 1*UNROLLJ);
++    filter_S2 = gmx_load_pr((real *)excl_filter + 2*UNROLLJ);
++    filter_S3 = gmx_load_pr((real *)excl_filter + 3*UNROLLJ);
+ +#endif
++#undef FILTER_STRIDE
+ +
+ +#ifdef CALC_COUL_TAB
-             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++#ifdef STORE_TABLE_INDICES
+ +    /* Generate aligned table index pointers */
+ +    ti0 = gmx_simd_align_int(ti0_array);
+ +    ti1 = gmx_simd_align_int(ti1_array);
+ +    ti2 = gmx_simd_align_int(ti2_array);
+ +    ti3 = gmx_simd_align_int(ti3_array);
+ +#endif
+ +
+ +    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
+ +#ifdef CALC_ENERGIES
+ +    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
+ +#endif
+ +
+ +#ifdef TAB_FDV0
+ +    tab_coul_F = ic->tabq_coul_FDV0;
+ +#else
+ +    tab_coul_F = ic->tabq_coul_F;
+ +    tab_coul_V = ic->tabq_coul_V;
+ +#endif
+ +#endif /* CALC_COUL_TAB */
+ +
+ +#ifdef CALC_COUL_EWALD
+ +    beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ +    beta_S  = gmx_set1_pr(ic->ewaldcoeff);
+ +#endif
+ +
+ +#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+ +    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
+ +#endif
+ +
+ +    q                   = nbat->q;
+ +    type                = nbat->type;
+ +    facel               = ic->epsfac;
+ +    shiftvec            = shift_vec[0];
+ +    x                   = nbat->x;
+ +
+ +    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+ +
+ +    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+ +    rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+ +#ifdef VDW_CUTOFF_CHECK
+ +    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    sixth_S      = gmx_set1_pr(1.0/6.0);
+ +    twelveth_S   = gmx_set1_pr(1.0/12.0);
+ +
+ +    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
+ +    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+ +#endif
+ +
+ +    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
+ +
+ +#ifdef CALC_ENERGIES
+ +    hrc_3_S  = gmx_set1_pr(ic->k_rf);
+ +
+ +    moh_rc_S = gmx_set1_pr(-ic->c_rf);
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +    tmpsum   = gmx_simd_align_real(tmpsum_array);
+ +#endif
+ +#ifdef CALC_SHIFTFORCES
+ +    shf      = gmx_simd_align_real(shf_array);
+ +#endif
+ +
+ +#ifdef FIX_LJ_C
+ +    pvdw_c6  = gmx_simd_align_real(pvdw_array+3);
+ +    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+ +
+ +    for (jp = 0; jp < UNROLLJ; jp++)
+ +    {
+ +        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+ +
+ +        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ +    }
+ +    c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ +    c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ +    c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ +    c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+ +
+ +    c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ +    c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ +    c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ +    c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+ +#endif /* FIX_LJ_C */
+ +
+ +#ifdef ENERGY_GROUPS
+ +    egps_ishift  = nbat->neg_2log;
+ +    egps_imask   = (1<<egps_ishift) - 1;
+ +    egps_jshift  = 2*nbat->neg_2log;
+ +    egps_jmask   = (1<<egps_jshift) - 1;
+ +    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+ +    /* Major division is over i-particle energy groups, determine the stride */
+ +    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+ +#endif
+ +
+ +    l_cj = nbl->cj;
+ +
+ +    ninner = 0;
+ +    for (n = 0; n < nbl->nci; n++)
+ +    {
+ +        nbln = &nbl->ci[n];
+ +
+ +        ish              = (nbln->shift & NBNXN_CI_SHIFT);
+ +        ish3             = ish*3;
+ +        cjind0           = nbln->cj_ind_start;
+ +        cjind1           = nbln->cj_ind_end;
+ +        ci               = nbln->ci;
+ +        ci_sh            = (ish == CENTRAL ? ci : -1);
+ +
+ +        shX_S = gmx_load1_pr(shiftvec+ish3);
+ +        shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ +        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
+ +
+ +#if UNROLLJ <= 4
+ +        sci              = ci*STRIDE;
+ +        scix             = sci*DIM;
+ +        sci2             = sci*2;
+ +#else
+ +        sci              = (ci>>1)*STRIDE;
+ +        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
+ +        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
+ +        sci             += (ci & 1)*(STRIDE>>1);
+ +#endif
+ +
+ +        /* We have 5 LJ/C combinations, but use only three inner loops,
+ +         * as the other combinations are unlikely and/or not much faster:
+ +         * inner half-LJ + C for half-LJ + C / no-LJ + C
+ +         * inner LJ + C      for full-LJ + C
+ +         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+ +         */
+ +        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
+ +        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ +        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
+ +
+ +#ifdef ENERGY_GROUPS
+ +        egps_i = nbat->energrp[ci];
+ +        {
+ +            int ia, egp_ia;
+ +
+ +            for (ia = 0; ia < UNROLLI; ia++)
+ +            {
+ +                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
+ +                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+ +                vctp[ia]   = Vc   + egp_ia*Vstride_i;
+ +            }
+ +        }
+ +#endif
+ +#if defined CALC_ENERGIES
+ +#if UNROLLJ == 4
+ +        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+ +#endif
+ +#if UNROLLJ == 2
+ +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
+ +#endif
+ +#if UNROLLJ == 8
+ +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+ +#endif
+ +        {
+ +            int  ia;
+ +            real Vc_sub_self;
+ +
+ +#ifdef CALC_COUL_RF
+ +            Vc_sub_self = 0.5*ic->c_rf;
+ +#endif
+ +#ifdef CALC_COUL_TAB
+ +#ifdef TAB_FDV0
+ +            Vc_sub_self = 0.5*tab_coul_F[2];
+ +#else
+ +            Vc_sub_self = 0.5*tab_coul_V[0];
+ +#endif
+ +#endif
+ +#ifdef CALC_COUL_EWALD
+ +            /* beta/sqrt(pi) */
+ +            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+ +#endif
+ +
+ +            for (ia = 0; ia < UNROLLI; ia++)
+ +            {
+ +                real qi;
+ +
+ +                qi = q[sci+ia];
+ +#ifdef ENERGY_GROUPS
+ +                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+ +#else
+ +                Vc[0]
+ +#endif
+ +                    -= facel*qi*qi*Vc_sub_self;
+ +            }
+ +        }
+ +#endif
+ +
+ +        /* Load i atom data */
+ +        sciy             = scix + STRIDE;
+ +        sciz             = sciy + STRIDE;
+ +        ix_S0          = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
+ +        ix_S1          = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
+ +        ix_S2          = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
+ +        ix_S3          = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
+ +        iy_S0          = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
+ +        iy_S1          = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
+ +        iy_S2          = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
+ +        iy_S3          = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
+ +        iz_S0          = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
+ +        iz_S1          = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
+ +        iz_S2          = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
+ +        iz_S3          = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
+ +
+ +        if (do_coul)
+ +        {
+ +            iq_S0      = gmx_set1_pr(facel*q[sci]);
+ +            iq_S1      = gmx_set1_pr(facel*q[sci+1]);
+ +            iq_S2      = gmx_set1_pr(facel*q[sci+2]);
+ +            iq_S3      = gmx_set1_pr(facel*q[sci+3]);
+ +        }
+ +
+ +#ifdef LJ_COMB_LB
+ +        hsig_i_S0      = gmx_load1_pr(ljc+sci2+0);
+ +        hsig_i_S1      = gmx_load1_pr(ljc+sci2+1);
+ +        hsig_i_S2      = gmx_load1_pr(ljc+sci2+2);
+ +        hsig_i_S3      = gmx_load1_pr(ljc+sci2+3);
+ +        seps_i_S0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ +        seps_i_S1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ +        seps_i_S2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ +        seps_i_S3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ +#else
+ +#ifdef LJ_COMB_GEOM
+ +        c6s_S0         = gmx_load1_pr(ljc+sci2+0);
+ +        c6s_S1         = gmx_load1_pr(ljc+sci2+1);
+ +        if (!half_LJ)
+ +        {
+ +            c6s_S2     = gmx_load1_pr(ljc+sci2+2);
+ +            c6s_S3     = gmx_load1_pr(ljc+sci2+3);
+ +        }
+ +        c12s_S0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ +        c12s_S1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ +        if (!half_LJ)
+ +        {
+ +            c12s_S2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ +            c12s_S3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ +        }
+ +#else
+ +        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
+ +        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+ +        if (!half_LJ)
+ +        {
+ +            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+ +            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+ +        }
+ +#endif
+ +#endif
+ +
+ +        /* Zero the potential energy for this list */
+ +        Vvdwtot_S        = gmx_setzero_pr();
+ +        vctot_S          = gmx_setzero_pr();
+ +
+ +        /* Clear i atom forces */
+ +        fix_S0           = gmx_setzero_pr();
+ +        fix_S1           = gmx_setzero_pr();
+ +        fix_S2           = gmx_setzero_pr();
+ +        fix_S3           = gmx_setzero_pr();
+ +        fiy_S0           = gmx_setzero_pr();
+ +        fiy_S1           = gmx_setzero_pr();
+ +        fiy_S2           = gmx_setzero_pr();
+ +        fiy_S3           = gmx_setzero_pr();
+ +        fiz_S0           = gmx_setzero_pr();
+ +        fiz_S1           = gmx_setzero_pr();
+ +        fiz_S2           = gmx_setzero_pr();
+ +        fiz_S3           = gmx_setzero_pr();
+ +
+ +        cjind = cjind0;
+ +
+ +        /* Currently all kernels use (at least half) LJ */
+ +#define CALC_LJ
+ +        if (half_LJ)
+ +        {
+ +#define CALC_COULOMB
+ +#define HALF_LJ
+ +#define CHECK_EXCLS
-             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ +            {
+ +#include "nbnxn_kernel_simd_4xn_inner.h"
+ +                cjind++;
+ +            }
+ +#undef CHECK_EXCLS
+ +            for (; (cjind < cjind1); cjind++)
+ +            {
+ +#include "nbnxn_kernel_simd_4xn_inner.h"
+ +            }
+ +#undef HALF_LJ
+ +#undef CALC_COULOMB
+ +        }
+ +        else if (do_coul)
+ +        {
+ +#define CALC_COULOMB
+ +#define CHECK_EXCLS
-             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ +            {
+ +#include "nbnxn_kernel_simd_4xn_inner.h"
+ +                cjind++;
+ +            }
+ +#undef CHECK_EXCLS
+ +            for (; (cjind < cjind1); cjind++)
+ +            {
+ +#include "nbnxn_kernel_simd_4xn_inner.h"
+ +            }
+ +#undef CALC_COULOMB
+ +        }
+ +        else
+ +        {
+ +#define CHECK_EXCLS
- #ifndef GMX_DOUBLE
- #define gmx_load_pr4  _mm_load_ps
- #define gmx_store_pr4 _mm_store_ps
- #define gmx_add_pr4   _mm_add_ps
- #else
- #define gmx_load_pr4  _mm256_load_pd
- #define gmx_store_pr4 _mm256_store_pd
- #define gmx_add_pr4   _mm256_add_pd
- #endif
-         GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
++            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ +            {
+ +#include "nbnxn_kernel_simd_4xn_inner.h"
+ +                cjind++;
+ +            }
+ +#undef CHECK_EXCLS
+ +            for (; (cjind < cjind1); cjind++)
+ +            {
+ +#include "nbnxn_kernel_simd_4xn_inner.h"
+ +            }
+ +        }
+ +#undef CALC_LJ
+ +        ninner += cjind1 - cjind0;
+ +
+ +        /* Add accumulated i-forces to the force array */
+ +#if UNROLLJ >= 4
-         GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
++        fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
+ +        gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
+ +
-         GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
++        fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
+ +        gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
+ +
-         GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
-         _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
-         GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
-         _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));
++        fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
+ +        gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
+ +
+ +#ifdef CALC_SHIFTFORCES
+ +        gmx_store_pr4(shf, fix_S);
+ +        fshift[ish3+0] += SUM_SIMD4(shf);
+ +        gmx_store_pr4(shf, fiy_S);
+ +        fshift[ish3+1] += SUM_SIMD4(shf);
+ +        gmx_store_pr4(shf, fiz_S);
+ +        fshift[ish3+2] += SUM_SIMD4(shf);
+ +#endif
+ +#else
-         GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
-         _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
-         GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
-         _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));
++        fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
++        gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix)));
++        fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
++        gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2)));
+ +
-         GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
-         _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
-         GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
-         _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));
++        fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
++        gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy)));
++        fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
++        gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2)));
+ +
-         _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
++        fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
++        gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz)));
++        fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
++        gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2)));
+ +
+ +#ifdef CALC_SHIFTFORCES
-         _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
++        gmx_store_pr(shf, gmx_add_pr(fix0_S, fix2_S));
+ +        fshift[ish3+0] += shf[0] + shf[1];
-         _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
++        gmx_store_pr(shf, gmx_add_pr(fiy0_S, fiy2_S));
+ +        fshift[ish3+1] += shf[0] + shf[1];
++        gmx_store_pr(shf, gmx_add_pr(fiz0_S, fiz2_S));
+ +        fshift[ish3+2] += shf[0] + shf[1];
+ +#endif
+ +#endif
+ +
+ +#ifdef CALC_ENERGIES
+ +        if (do_coul)
+ +        {
+ +            gmx_store_pr(tmpsum, vctot_S);
+ +            *Vc += SUM_SIMD(tmpsum);
+ +        }
+ +
+ +        gmx_store_pr(tmpsum, Vvdwtot_S);
+ +        *Vvdw += SUM_SIMD(tmpsum);
+ +#endif
+ +
+ +        /* Outer loop uses 6 flops/iteration */
+ +    }
+ +
+ +#ifdef COUNT_PAIRS
+ +    printf("atom pairs %d\n", npair);
+ +#endif
+ +}
+ +
+ +
++#if UNROLLJ == 4
++#undef gmx_mm_pr4
+ +#undef gmx_load_pr4
+ +#undef gmx_store_pr4
+ +#undef gmx_add_pr4
++#endif
++
++#undef STORE_TABLE_INDICES
+ +
+ +#undef CALC_SHIFTFORCES
+ +
+ +#undef UNROLLI
+ +#undef UNROLLJ
+ +#undef STRIDE
+ +#undef TAB_FDV0
+ +#undef NBFP_STRIDE
+ +
+ +#undef GMX_USE_HALF_WIDTH_SIMD_HERE
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h

index 6f52427c75bfec9667c4b90099aba3df1e5c3bae,0000000000000000000000000000000000000000..4ad646534d521fa981ea90ec447c2e11ddec3590

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@@ -1,572 -1,0 +1,121 @@@
- #ifdef GMX_X86_SSE2
- 
- /* Transpose 2 double precision registers */
- #define GMX_MM_TRANSPOSE2_OP_PD(in0, in1, out0, out1)                      \
-     {                                                                       \
-         out0 = _mm_unpacklo_pd(in0, in1);                                    \
-         out1 = _mm_unpackhi_pd(in0, in1);                                    \
-     }
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS Development Team
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +#ifndef _nbnxn_kernel_sse_utils_h_
+ +#define _nbnxn_kernel_sse_utils_h_
+ +
+ +/* This file contains all functions/macros for the SIMD kernels
+ + * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ + * The functionality which depends on the j-cluster size is:
+ + *   LJ-parameter lookup
+ + *   force table lookup
+ + *   energy group pair energy storage
+ + */
+ +
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 || !defined GMX_DOUBLE
- /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
- #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0, in1, in2, in3, out0, out1)    \
-     {                                                                       \
-         __m128 _c01, _c23;                                                   \
-         _c01 = _mm_movelh_ps(in0, in1);                                      \
-         _c23 = _mm_movelh_ps(in2, in3);                                      \
-         out0 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0));              \
-         out1 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1));              \
-     }
+ +
- /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
- #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0, in1, in2, in3, out0, out1)    \
-     {                                                                       \
-         __m256d _c01, _c23;                                                  \
-         _c01 = _mm256_shuffle_pd(in0, in1, _MM_SHUFFLE(1, 0, 1, 0));             \
-         _c23 = _mm256_shuffle_pd(in2, in3, _MM_SHUFFLE(1, 0, 1, 0));             \
-         out0 = _mm256_shuffle_pd(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0));           \
-         out1 = _mm256_shuffle_pd(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1));           \
-     }
- #endif
- 
- /* Collect element 2 of the 4 inputs to out */
- #define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0, in1, in2, in3, out)           \
-     {                                                                       \
-         __m128 _c01, _c23;                                                   \
-         _c01 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 2, 3, 2));                \
-         _c23 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 2, 3, 2));                \
-         out  = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0));              \
-     }
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #ifndef GMX_DOUBLE
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out)                   \
-     {                                                                       \
-         _MM_TRANSPOSE4_PS(in0, in1, in2, in3);                                 \
-         in0  = _mm_add_ps(in0, in1);                                          \
-         in2  = _mm_add_ps(in2, in3);                                          \
-         out  = _mm_add_ps(in0, in2);                                         \
-     }
++/* Include SIMD architecture specific versions of the 4/5 functions above */
++#ifdef GMX_SIMD_REFERENCE_PLAIN_C
++#include "nbnxn_kernel_simd_utils_ref.h"
+ +#else
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM2_PD(in0, in1, out)                           \
-     {                                                                       \
-         GMX_MM_TRANSPOSE2_PD(in0, in1);                                      \
-         out  = _mm_add_pd(in0, in1);                                         \
-     }
++#ifdef GMX_X86_SSE2
++/* Include x86 SSE2 compatible SIMD functions */
++#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
++#ifdef GMX_DOUBLE
++#include "nbnxn_kernel_simd_utils_x86_256d.h"
+ +#else
- #ifndef GMX_DOUBLE
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out)                   \
-     {                                                                       \
-         in0 = _mm256_hadd_ps(in0, in1);                                      \
-         in2 = _mm256_hadd_ps(in2, in3);                                      \
-         in1 = _mm256_hadd_ps(in0, in2);                                      \
-         out = _mm_add_ps(_mm256_castps256_ps128(in1), _mm256_extractf128_ps(in1, 1)); \
-     }
- /* Sum the elements of halfs of each input register and store sums in out */
- #define GMX_MM_TRANSPOSE_SUM4H_PR(in0, in2, out)                          \
-     {                                                                       \
-         in0 = _mm256_hadd_ps(in0, _mm256_setzero_ps());                      \
-         in2 = _mm256_hadd_ps(in2, _mm256_setzero_ps());                      \
-         in0 = _mm256_hadd_ps(in0, in2);                                      \
-         in2 = _mm256_permute_ps(in0, _MM_SHUFFLE(2, 3, 0, 1));                  \
-         out = _mm_add_ps(_mm256_castps256_ps128(in0), _mm256_extractf128_ps(in2, 1)); \
-     }
++#include "nbnxn_kernel_simd_utils_x86_256s.h"
+ +#endif
+ +#else
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out)                   \
-     {                                                                       \
-         in0 = _mm256_hadd_pd(in0, in1);                                      \
-         in2 = _mm256_hadd_pd(in2, in3);                                      \
-         out = _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31)); \
-     }
- #endif
- #endif
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- 
- static inline __m128
- gmx_mm128_invsqrt_ps_single(__m128 x)
- {
-     const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
-     const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
- 
-     __m128       lu = _mm_rsqrt_ps(x);
- 
-     return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
- }
- 
- /* Do 2 double precision invsqrt operations.
-  * Doing the SIMD rsqrt and the first Newton Raphson iteration
-  * in single precision gives full double precision accuracy.
-  * The speed is more than double that of two gmx_mm_invsqrt_pd calls.
-  */
- #define GMX_MM128_INVSQRT2_PD(in0, in1, out0, out1)                        \
-     {                                                                       \
-         const __m128d half  = _mm_set1_pd(0.5);                             \
-         const __m128d three = _mm_set1_pd(3.0);                             \
-         __m128        s, ir;                                                       \
-         __m128d       lu0, lu1;                                                    \
-                                                                         \
-         s    = _mm_movelh_ps(_mm_cvtpd_ps(in0), _mm_cvtpd_ps(in1));          \
-         ir   = gmx_mm128_invsqrt_ps_single(s);                              \
-         lu0  = _mm_cvtps_pd(ir);                                            \
-         lu1  = _mm_cvtps_pd(_mm_movehl_ps(ir, ir));                          \
-         out0 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu0, lu0), in0)), lu0)); \
-         out1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), in1)), lu1)); \
-     }
- 
- #define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
- 
- #endif
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- 
- static inline __m256
- gmx_mm256_invsqrt_ps_single(__m256 x)
- {
-     const __m256 half  = _mm256_set_ps(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5);
-     const __m256 three = _mm256_set_ps(3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0);
- 
-     __m256       lu = _mm256_rsqrt_ps(x);
- 
-     return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
- }
- 
- /* Do 4 double precision invsqrt operations.
-  * Doing the SIMD rsqrt and the first Newton Raphson iteration
-  * in single precision gives full double precision accuracy.
-  */
- #define GMX_MM256_INVSQRT2_PD(in0, in1, out0, out1)                        \
-     {                                                                       \
-         const __m256d half  = _mm256_set1_pd(0.5);                          \
-         const __m256d three = _mm256_set1_pd(3.0);                          \
-         __m256        s, ir;                                                       \
-         __m256d       lu0, lu1;                                                    \
-                                                                         \
-         s    = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(in0)), _mm256_cvtpd_ps(in1), 1); \
-         ir   = gmx_mm256_invsqrt_ps_single(s);                              \
-         lu0  = _mm256_cvtps_pd(_mm256_castps256_ps128(ir));                 \
-         lu1  = _mm256_cvtps_pd(_mm256_extractf128_ps(ir, 1));                \
-         out0 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu0, lu0), in0)), lu0)); \
-         out1 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu1, lu1), in1)), lu1)); \
-     }
- 
- #define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
- 
- #endif
- 
- /* Force and energy table load and interpolation routines */
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
- 
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE)                \
-     {                                                                       \
-         gmx_mm_pr clj_SSE[UNROLLJ];                                         \
-         int       p;                                                              \
-                                                                         \
-         for (p = 0; p < UNROLLJ; p++)                                            \
-         {                                                                   \
-             /* Here we load 4 aligned floats, but we need just 2 */         \
-             clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE);          \
-         }                                                                   \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0], clj_SSE[1], clj_SSE[2], clj_SSE[3], c6_SSE, c12_SSE); \
-     }
- 
- #endif
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
- 
- /* Put two 128-bit 4-float registers into one 256-bit 8-float register */
- #define GMX_2_MM_TO_M256(in0, in1, out)                                   \
-     {                                                                       \
-         out = _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1);      \
-     }
- 
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE)                \
-     {                                                                       \
-         __m128 clj_SSE[UNROLLJ], c6t_SSE[2], c12t_SSE[2];                     \
-         int    p;                                                              \
-                                                                         \
-         for (p = 0; p < UNROLLJ; p++)                                            \
-         {                                                                   \
-             /* Here we load 4 aligned floats, but we need just 2 */         \
-             clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);          \
-         }                                                                   \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0], clj_SSE[1], clj_SSE[2], clj_SSE[3], c6t_SSE[0], c12t_SSE[0]); \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4], clj_SSE[5], clj_SSE[6], clj_SSE[7], c6t_SSE[1], c12t_SSE[1]); \
-                                                                         \
-         GMX_2_MM_TO_M256(c6t_SSE[0], c6t_SSE[1], c6_SSE);                     \
-         GMX_2_MM_TO_M256(c12t_SSE[0], c12t_SSE[1], c12_SSE);                  \
-     }
- 
- #define load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_SSE, c12_SSE)        \
-     {                                                                       \
-         __m128 clj_SSE0[UNROLLJ], clj_SSE1[UNROLLJ], c6t_SSE[2], c12t_SSE[2];  \
-         int    p;                                                              \
-                                                                         \
-         for (p = 0; p < UNROLLJ; p++)                                            \
-         {                                                                   \
-             /* Here we load 4 aligned floats, but we need just 2 */         \
-             clj_SSE0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE);        \
-         }                                                                   \
-         for (p = 0; p < UNROLLJ; p++)                                            \
-         {                                                                   \
-             /* Here we load 4 aligned floats, but we need just 2 */         \
-             clj_SSE1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE);        \
-         }                                                                   \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE0[0], clj_SSE0[1], clj_SSE0[2], clj_SSE0[3], c6t_SSE[0], c12t_SSE[0]); \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE1[0], clj_SSE1[1], clj_SSE1[2], clj_SSE1[3], c6t_SSE[1], c12t_SSE[1]); \
-                                                                         \
-         GMX_2_MM_TO_M256(c6t_SSE[0], c6t_SSE[1], c6_SSE);                     \
-         GMX_2_MM_TO_M256(c12t_SSE[0], c12t_SSE[1], c12_SSE);                  \
-     }
- 
++#ifdef GMX_DOUBLE
++#include "nbnxn_kernel_simd_utils_x86_128d.h"
+ +#else
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
- 
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE)                \
-     {                                                                       \
-         gmx_mm_pr clj_SSE[UNROLLJ];                                         \
-         int       p;                                                              \
-                                                                         \
-         for (p = 0; p < UNROLLJ; p++)                                            \
-         {                                                                   \
-             clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE);          \
-         }                                                                   \
-         GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0], clj_SSE[1], c6_SSE, c12_SSE);      \
-     }
- 
++#include "nbnxn_kernel_simd_utils_x86_128s.h"
+ +#endif
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
- 
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE)                \
-     {                                                                       \
-         __m128d clj_SSE[UNROLLJ], c6t_SSE[2], c12t_SSE[2];                    \
-         int     p;                                                              \
-                                                                         \
-         for (p = 0; p < UNROLLJ; p++)                                            \
-         {                                                                   \
-             clj_SSE[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE);          \
-         }                                                                   \
-         GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0], clj_SSE[1], c6t_SSE[0], c12t_SSE[0]); \
-         GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[2], clj_SSE[3], c6t_SSE[1], c12t_SSE[1]); \
-         GMX_2_M128D_TO_M256D(c6t_SSE[0], c6t_SSE[1], c6_SSE);                 \
-         GMX_2_M128D_TO_M256D(c12t_SSE[0], c12t_SSE[1], c12_SSE);              \
-     }
- 
+ +#endif
- 
- 
- /* The load_table functions below are performance critical.
-  * The routines issue UNROLLI*UNROLLJ _mm_load_ps calls.
-  * As these all have latencies, scheduling is crucial.
-  * The Intel compilers and CPUs seem to do a good job at this.
-  * But AMD CPUs perform significantly worse with gcc than with icc.
-  * Performance is improved a bit by using the extract function UNROLLJ times,
-  * instead of doing an _mm_store_si128 for every i-particle.
-  * This is only faster when we use FDV0 formatted tables, where we also need
-  * to multiple the index by 4, which can be done by a SIMD bit shift.
-  * With single precision AVX, 8 extracts are much slower than 1 store.
-  * Because of this, the load_table_f macro always takes the ti parameter,
-  * but it is only used with AVX.
-  */
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
- 
- #define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE)   \
-     {                                                                       \
-         int    idx[4];                                                      \
-         __m128 ctab_SSE[4];                                                 \
-                                                                         \
-         /* Table has 4 entries, left-shift index by 2 */                    \
-         ti_SSE = _mm_slli_epi32(ti_SSE, 2);                                  \
-         /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
-         idx[0]      = gmx_mm_extract_epi32(ti_SSE, 0);                            \
-         ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);                    \
-         idx[1]      = gmx_mm_extract_epi32(ti_SSE, 1);                            \
-         ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);                    \
-         idx[2]      = gmx_mm_extract_epi32(ti_SSE, 2);                            \
-         ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);                    \
-         idx[3]      = gmx_mm_extract_epi32(ti_SSE, 3);                            \
-         ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);                    \
-                                                                         \
-         /* Shuffle the force table entries to a convenient order */         \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctab0_SSE, ctab1_SSE); \
-     }
- 
- #define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
-     {                                                                       \
-         int    idx[4];                                                      \
-         __m128 ctab_SSE[4];                                                 \
-                                                                         \
-         /* Table has 4 entries, left-shift index by 2 */                    \
-         ti_SSE = _mm_slli_epi32(ti_SSE, 2);                                  \
-         /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
-         idx[0]      = gmx_mm_extract_epi32(ti_SSE, 0);                            \
-         ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);                    \
-         idx[1]      = gmx_mm_extract_epi32(ti_SSE, 1);                            \
-         ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);                    \
-         idx[2]      = gmx_mm_extract_epi32(ti_SSE, 2);                            \
-         ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);                    \
-         idx[3]      = gmx_mm_extract_epi32(ti_SSE, 3);                            \
-         ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);                    \
-                                                                         \
-         /* Shuffle the force  table entries to a convenient order */        \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctab0_SSE, ctab1_SSE); \
-         /* Shuffle the energy table entries to a convenient order */        \
-         GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabv_SSE); \
-     }
- 
+ +#endif
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
- 
- #define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE)   \
-     {                                                                       \
-         __m128 ctab_SSE[8], ctabt_SSE[4];                                    \
-         int    j;                                                           \
-                                                                         \
-         /* Bit shifting would be faster, but AVX doesn't support that */    \
-         _mm256_store_si256((__m256i *)ti, ti_SSE);                           \
-         for (j = 0; j < 8; j++)                                                  \
-         {                                                                   \
-             ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4);               \
-         }                                                                   \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabt_SSE[0], ctabt_SSE[2]); \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabt_SSE[1], ctabt_SSE[3]); \
-                                                                         \
-         GMX_2_MM_TO_M256(ctabt_SSE[0], ctabt_SSE[1], ctab0_SSE);              \
-         GMX_2_MM_TO_M256(ctabt_SSE[2], ctabt_SSE[3], ctab1_SSE);              \
-     }
+ +#endif
+ +
- #define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
-     {                                                                       \
-         __m128 ctab_SSE[8], ctabt_SSE[4], ctabvt_SSE[2];                      \
-         int    j;                                                           \
-                                                                         \
-         /* Bit shifting would be faster, but AVX doesn't support that */    \
-         _mm256_store_si256((__m256i *)ti, ti_SSE);                           \
-         for (j = 0; j < 8; j++)                                                  \
-         {                                                                   \
-             ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4);               \
-         }                                                                   \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabt_SSE[0], ctabt_SSE[2]); \
-         GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabt_SSE[1], ctabt_SSE[3]); \
-                                                                         \
-         GMX_2_MM_TO_M256(ctabt_SSE[0], ctabt_SSE[1], ctab0_SSE);              \
-         GMX_2_MM_TO_M256(ctabt_SSE[2], ctabt_SSE[3], ctab1_SSE);              \
-                                                                         \
-         GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabvt_SSE[0]); \
-         GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabvt_SSE[1]); \
-                                                                         \
-         GMX_2_MM_TO_M256(ctabvt_SSE[0], ctabvt_SSE[1], ctabv_SSE);            \
-     }
- 
- #endif
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
- 
- #define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE)      \
-     {                                                                       \
-         int     idx[2];                                                     \
-         __m128d ctab_SSE[2];                                                \
-                                                                         \
-         /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
-         idx[0]      = gmx_mm_extract_epi32(ti_SSE, 0);                            \
-         ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]);                      \
-         idx[1]      = gmx_mm_extract_epi32(ti_SSE, 1);                            \
-         ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]);                      \
-                                                                         \
-         /* Shuffle the force table entries to a convenient order */         \
-         GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], ctab0_SSE, ctab1_SSE); \
-         /* The second force table entry should contain the difference */    \
-         ctab1_SSE = _mm_sub_pd(ctab1_SSE, ctab0_SSE);                        \
-     }
- 
- #define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
-     {                                                                       \
-         int     idx[2];                                                     \
-         __m128d ctab_SSE[4];                                                \
-                                                                         \
-         /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
-         idx[0]      = gmx_mm_extract_epi32(ti_SSE, 0);                            \
-         ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]);                      \
-         idx[1]      = gmx_mm_extract_epi32(ti_SSE, 1);                            \
-         ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]);                      \
-                                                                         \
-         /* Shuffle the force table entries to a convenient order */         \
-         GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], ctab0_SSE, ctab1_SSE); \
-         /* The second force table entry should contain the difference */    \
-         ctab1_SSE = _mm_sub_pd(ctab1_SSE, ctab0_SSE);                        \
-                                                                         \
-         ctab_SSE[2] = _mm_loadu_pd(tab_coul_V+idx[0]);                      \
-         ctab_SSE[3] = _mm_loadu_pd(tab_coul_V+idx[1]);                      \
-                                                                         \
-         /* Shuffle the energy table entries to a single register */         \
-         ctabv_SSE = _mm_shuffle_pd(ctab_SSE[2], ctab_SSE[3], _MM_SHUFFLE2(0, 0)); \
-     }
- 
- #endif
- 
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
- 
- /* Put two 128-bit 2-double registers into one 256-bit 4-ouble register */
- #define GMX_2_M128D_TO_M256D(in0, in1, out)                               \
-     {                                                                       \
-         out = _mm256_insertf128_pd(_mm256_castpd128_pd256(in0), in1, 1);      \
-     }
- 
- #define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE)      \
-     {                                                                       \
-         __m128d ctab_SSE[4], tr_SSE[4];                                      \
-         int     j;                                                          \
-                                                                         \
-         _mm_store_si128((__m128i *)ti, ti_SSE);                              \
-         for (j = 0; j < 4; j++)                                                  \
-         {                                                                   \
-             ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]);                   \
-         }                                                                   \
-         /* Shuffle the force table entries to a convenient order */         \
-         GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], tr_SSE[0], tr_SSE[1]); \
-         GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2], ctab_SSE[3], tr_SSE[2], tr_SSE[3]); \
-         GMX_2_M128D_TO_M256D(tr_SSE[0], tr_SSE[2], ctab0_SSE);                \
-         GMX_2_M128D_TO_M256D(tr_SSE[1], tr_SSE[3], ctab1_SSE);                \
-         /* The second force table entry should contain the difference */    \
-         ctab1_SSE = _mm256_sub_pd(ctab1_SSE, ctab0_SSE);                     \
-     }
- 
- #define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
-     {                                                                       \
-         __m128d ctab_SSE[8], tr_SSE[4];                                      \
-         int     j;                                                          \
-                                                                         \
-         _mm_store_si128((__m128i *)ti, ti_SSE);                              \
-         for (j = 0; j < 4; j++)                                                  \
-         {                                                                   \
-             ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]);                   \
-         }                                                                   \
-         /* Shuffle the force table entries to a convenient order */         \
-         GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], tr_SSE[0], tr_SSE[1]); \
-         GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2], ctab_SSE[3], tr_SSE[2], tr_SSE[3]); \
-         GMX_2_M128D_TO_M256D(tr_SSE[0], tr_SSE[2], ctab0_SSE);                \
-         GMX_2_M128D_TO_M256D(tr_SSE[1], tr_SSE[3], ctab1_SSE);                \
-         /* The second force table entry should contain the difference */    \
-         ctab1_SSE = _mm256_sub_pd(ctab1_SSE, ctab0_SSE);                     \
-                                                                         \
-         for (j = 0; j < 4; j++)                                                  \
-         {                                                                   \
-             ctab_SSE[4+j] = _mm_loadu_pd(tab_coul_V+ti[j]);                 \
-         }                                                                   \
-         /* Shuffle the energy table entries to a single register */         \
-         GMX_2_M128D_TO_M256D(_mm_shuffle_pd(ctab_SSE[4], ctab_SSE[5], _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(ctab_SSE[6], ctab_SSE[7], _MM_SHUFFLE2(0, 0)), ctabv_SSE); \
-     }
- 
- #endif
- 
- 
- /* Add energy register to possibly multiple terms in the energy array.
-  * This function is the same for SSE/AVX single/double.
-  */
- static inline void add_ener_grp(gmx_mm_pr e_SSE, real *v, const int *offset_jj)
+ +
-         gmx_mm_pr v_SSE;
++#ifdef UNROLLJ
++/* Add energy register to possibly multiple terms in the energy array */
++static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
+ +{
+ +    int jj;
+ +
+ +    /* We need to balance the number of store operations with
+ +     * the rapidly increasing number of combinations of energy groups.
+ +     * We add to a temporary buffer for 1 i-group vs 2 j-groups.
+ +     */
+ +    for (jj = 0; jj < (UNROLLJ/2); jj++)
+ +    {
-         v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
-         gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_SSE, e_SSE));
++        gmx_mm_pr v_S;
+ +
- #if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8 && defined gmx_mm_hpr
- /* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
++        v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
++        gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S));
+ +    }
+ +}
++#endif
+ +
- static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
-                                        real *v0, real *v1, const int *offset_jj)
++#if defined GMX_NBNXN_SIMD_2XNN && defined UNROLLJ
++/* As add_ener_grp, but for two groups of UNROLLJ/2 stored in
+ + * a single SIMD register.
+ + */
-     gmx_mm_hpr e_SSE0, e_SSE1;
++static inline void
++add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
+ +{
-     e_SSE0 = _mm256_extractf128_ps(e_SSE, 0);
-     e_SSE1 = _mm256_extractf128_ps(e_SSE, 1);
++    gmx_mm_hpr e_S0, e_S1;
+ +    int        jj;
+ +
-         gmx_mm_hpr v_SSE;
++    gmx_pr_to_2hpr(e_S, &e_S0, &e_S1);
+ +
+ +    for (jj = 0; jj < (UNROLLJ/2); jj++)
+ +    {
-         gmx_load_hpr(v_SSE, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
-         gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE0));
++        gmx_mm_hpr v_S;
+ +
-         gmx_mm_hpr v_SSE;
++        gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
++        gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0));
+ +    }
+ +    for (jj = 0; jj < (UNROLLJ/2); jj++)
+ +    {
-         gmx_load_hpr(v_SSE, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
-         gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE1));
++        gmx_mm_hpr v_S;
+ +
- #endif /* GMX_X86_SSE2 */
- 
++        gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
++        gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1));
+ +    }
+ +}
+ +#endif
+ +
+ +#endif /* _nbnxn_kernel_sse_utils_h_ */
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h

index 0000000000000000000000000000000000000000,3121b40ce6c2bdebde7fc4c8a42eb27c0b8ede74..3121b40ce6c2bdebde7fc4c8a42eb27c0b8ede74

mode 000000,100644..100644
--- /dev/null
--- 2/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h

index 0000000000000000000000000000000000000000,e170f7600de607460dcc8afda8a7a665bafde178..e170f7600de607460dcc8afda8a7a665bafde178

mode 000000,100644..100644
--- /dev/null
--- 2/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h

index 0000000000000000000000000000000000000000,d03154a93861412297130a788c59ef30cfa8ea03..d03154a93861412297130a788c59ef30cfa8ea03

mode 000000,100644..100644
--- /dev/null
--- 2/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h

index 0000000000000000000000000000000000000000,97f313045db539e861c65ac1278c0e275eadeebb..97f313045db539e861c65ac1278c0e275eadeebb

mode 000000,100644..100644
--- /dev/null
--- 2/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h

index 0000000000000000000000000000000000000000,73171af0400e10e6e640424a343c523eb6f71eb9..73171af0400e10e6e640424a343c523eb6f71eb9

mode 000000,100644..100644
--- /dev/null
--- 2/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
diff --cc src/gromacs/mdlib/nbnxn_search.c

index ccfed3293c04399a0554975505a17666c5d6569d,0000000000000000000000000000000000000000..bf33c95c31cee2ed810195eae26be3ea72b3db6c

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_search.c
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@@ -1,5139 -1,0 +1,5164 @@@
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #ifdef GMX_DOUBLE
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + */
+ +
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +#include <string.h>
+ +#include "sysstuff.h"
+ +#include "smalloc.h"
+ +#include "macros.h"
+ +#include "maths.h"
+ +#include "vec.h"
+ +#include "pbc.h"
+ +#include "nbnxn_consts.h"
++/* nbnxn_internal.h included gmx_simd_macros.h */
+ +#include "nbnxn_internal.h"
++#ifdef GMX_NBNXN_SIMD
++#include "gmx_simd_vec.h"
++#endif
+ +#include "nbnxn_atomdata.h"
+ +#include "nbnxn_search.h"
+ +#include "gmx_cyclecounter.h"
+ +#include "gmxfio.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "nrnb.h"
+ +
+ +
+ +/* Pair search box lower and upper corner in x,y,z.
+ + * Store this in 4 instead of 3 reals, which is useful with SSE.
+ + * To avoid complicating the code we also use 4 without SSE.
+ + */
+ +#define NNBSBB_C         4
+ +#define NNBSBB_B         (2*NNBSBB_C)
+ +/* Pair search box lower and upper bound in z only. */
+ +#define NNBSBB_D         2
+ +/* Pair search box lower and upper corner x,y,z indices */
+ +#define BBL_X  0
+ +#define BBL_Y  1
+ +#define BBL_Z  2
+ +#define BBU_X  4
+ +#define BBU_Y  5
+ +#define BBU_Z  6
+ +
+ +
+ +#ifdef NBNXN_SEARCH_BB_SSE
+ +/* We use SSE or AVX-128bit for bounding box calculations */
+ +
+ +#ifndef GMX_DOUBLE
+ +/* Single precision BBs + coordinates, we can also load coordinates using SSE */
+ +#define NBNXN_SEARCH_SSE_SINGLE
+ +#endif
+ +
+ +/* Include basic SSE2 stuff */
+ +#include <emmintrin.h>
+ +
+ +#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+ +/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+ +#define NBNXN_PBB_SSE
+ +#endif
+ +
+ +/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
+ + * Here AVX-256 turns out to be slightly slower than AVX-128.
+ + */
+ +#define STRIDE_PBB        4
+ +#define STRIDE_PBB_2LOG   2
+ +
+ +#endif /* NBNXN_SEARCH_BB_SSE */
+ +
+ +#ifdef GMX_NBNXN_SIMD
+ +
+ +/* The functions below are macros as they are performance sensitive */
+ +
+ +/* 4x4 list, pack=4: no complex conversion required */
+ +/* i-cluster to j-cluster conversion */
+ +#define CI_TO_CJ_J4(ci)   (ci)
+ +/* cluster index to coordinate array index conversion */
+ +#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
+ +#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
+ +
+ +/* 4x2 list, pack=4: j-cluster size is half the packing width */
+ +/* i-cluster to j-cluster conversion */
+ +#define CI_TO_CJ_J2(ci)  ((ci)<<1)
+ +/* cluster index to coordinate array index conversion */
+ +#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
+ +#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
+ +
+ +/* 4x8 list, pack=8: i-cluster size is half the packing width */
+ +/* i-cluster to j-cluster conversion */
+ +#define CI_TO_CJ_J8(ci)  ((ci)>>1)
+ +/* cluster index to coordinate array index conversion */
+ +#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
+ +#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
+ +
+ +/* The j-cluster size is matched to the SIMD width */
- #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
- #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
- #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
- #endif
- #else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- #ifdef GMX_DOUBLE
++#if GMX_SIMD_WIDTH_HERE == 2
+ +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
+ +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
+ +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
+ +#else
- #endif
++#if GMX_SIMD_WIDTH_HERE == 4
+ +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
+ +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
+ +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
+ +#else
++#if GMX_SIMD_WIDTH_HERE == 8
+ +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
+ +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
+ +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
+ +/* Half SIMD with j-cluster size */
+ +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
+ +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
+ +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
- /* Interaction masks for 4xN atom interactions.
-  * Bit i*CJ_SIZE + j tells if atom i and j interact.
-  */
- /* All interaction mask is the same for all kernels */
- #define NBNXN_INT_MASK_ALL        0xffffffff
- /* 4x4 kernel diagonal mask */
- #define NBNXN_INT_MASK_DIAG       0x08ce
- /* 4x2 kernel diagonal masks */
- #define NBNXN_INT_MASK_DIAG_J2_0  0x0002
- #define NBNXN_INT_MASK_DIAG_J2_1  0x002F
- /* 4x8 kernel diagonal masks */
- #define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
- #define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
- 
- 
++#else
++#if GMX_SIMD_WIDTH_HERE == 16
++#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
++#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
++#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
+ +#else
+ +#error "unsupported GMX_NBNXN_SIMD_WIDTH"
+ +#endif
+ +#endif
++#endif
++#endif
+ +
+ +#endif /* GMX_NBNXN_SIMD */
+ +
+ +
-     nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+ +#ifdef NBNXN_SEARCH_BB_SSE
+ +/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
+ +#define NBNXN_BBXXXX
+ +/* Size of bounding box corners quadruplet */
+ +#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
+ +#endif
+ +
+ +/* We shift the i-particles backward for PBC.
+ + * This leads to more conditionals than shifting forward.
+ + * We do this to get more balanced pair lists.
+ + */
+ +#define NBNXN_SHIFT_BACKWARD
+ +
+ +
+ +/* This define is a lazy way to avoid interdependence of the grid
+ + * and searching data structures.
+ + */
+ +#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
+ +
+ +
+ +/* Zero the call count and the accumulated value of each of the
+ + * enbsCCnr entries in the cycle counter array cc.
+ + */
+ +static void nbs_cycle_clear(nbnxn_cycle_t *cc)
+ +{
+ +    nbnxn_cycle_t *counter;
+ +
+ +    for (counter = cc; counter < cc + enbsCCnr; counter++)
+ +    {
+ +        counter->count = 0;
+ +        counter->c     = 0;
+ +    }
+ +}
+ +
+ +/* Return the accumulated counter value cc->c averaged over the
+ + * number of recorded calls cc->count and scaled by 1e-6.
+ + * A counter without any recorded call yields 0, which avoids
+ + * a floating-point division by zero (and printing nan/inf).
+ + */
+ +static double Mcyc_av(const nbnxn_cycle_t *cc)
+ +{
+ +    if (cc->count == 0)
+ +    {
+ +        return 0;
+ +    }
+ +
+ +    return (double)cc->c*1e-6/cc->count;
+ +}
+ +
+ +/* Write a summary of the pair-search cycle counters of nbs to fp.
+ + * The line holds the call count of the grid counter followed by the
+ + * Mcyc_av() averages labelled "grid", "search" and "red.f".
+ + * With more than one search thread, the "comb" average (only when that
+ + * counter was used) and the search average of every thread are appended.
+ + */
+ +static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
+ +{
+ +    const gmx_bool bMultiThread = (nbs->nthread_max > 1);
+ +    int            th;
+ +
+ +    fprintf(fp, "\n");
+ +    fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
+ +            nbs->cc[enbsCCgrid].count,
+ +            Mcyc_av(&nbs->cc[enbsCCgrid]),
+ +            Mcyc_av(&nbs->cc[enbsCCsearch]),
+ +            Mcyc_av(&nbs->cc[enbsCCreducef]));
+ +
+ +    if (bMultiThread && nbs->cc[enbsCCcombine].count > 0)
+ +    {
+ +        fprintf(fp, " comb %5.2f",
+ +                Mcyc_av(&nbs->cc[enbsCCcombine]));
+ +    }
+ +    if (bMultiThread)
+ +    {
+ +        /* Per-thread search averages */
+ +        fprintf(fp, " s. th");
+ +        for (th = 0; th < nbs->nthread_max; th++)
+ +        {
+ +            fprintf(fp, " %4.1f",
+ +                    Mcyc_av(&nbs->work[th].cc[enbsCCsearch]));
+ +        }
+ +    }
+ +    fprintf(fp, "\n");
+ +}
+ +
+ +/* Put grid in its empty state: the column and bounding box array
+ + * pointers are NULL and both allocation sizes are zero.
+ + */
+ +static void nbnxn_grid_init(nbnxn_grid_t * grid)
+ +{
+ +    /* Allocation sizes */
+ +    grid->cxy_nalloc  = 0;
+ +    grid->nc_nalloc   = 0;
+ +
+ +    /* The arrays start out unallocated */
+ +    grid->cxy_na      = NULL;
+ +    grid->cxy_ind     = NULL;
+ +    grid->bb          = NULL;
+ +    grid->bbj         = NULL;
+ +}
+ +
+ +/* Return the base-2 logarithm of n.
+ + * n has to be an exact power of 2, otherwise gmx_fatal is called.
+ + */
+ +static int get_2log(int n)
+ +{
+ +    int exponent;
+ +
+ +    /* Find the smallest exponent for which 2^exponent is not below n */
+ +    for (exponent = 0; (1<<exponent) < n; exponent++)
+ +    {
+ +    }
+ +
+ +    if ((1<<exponent) != n)
+ +    {
+ +        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
+ +    }
+ +
+ +    return exponent;
+ +}
+ +
+ +/* Return the i-cluster size that goes with non-bonded kernel type
+ + * nb_kernel_type: NBNXN_CPU_CLUSTER_I_SIZE for the plain-C 4x4 and
+ + * the SIMD kernels, NBNXN_GPU_CLUSTER_SIZE for the 8x8x8 kernels.
+ + * An unknown type is reported through gmx_incons; 0 is the result
+ + * should that call return.
+ + */
+ +static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
+ +{
+ +    int ci_size = 0;
+ +
+ +    switch (nb_kernel_type)
+ +    {
+ +        case nbnxnk4x4_PlainC:
+ +        case nbnxnk4xN_SIMD_4xN:
+ +        case nbnxnk4xN_SIMD_2xNN:
+ +            ci_size = NBNXN_CPU_CLUSTER_I_SIZE;
+ +            break;
+ +        case nbnxnk8x8x8_CUDA:
+ +        case nbnxnk8x8x8_PlainC:
+ +            /* The cluster size for super/sub lists is only set here.
+ +             * Any value should work for the pair-search and atomdata code.
+ +             * The kernels, of course, might require a particular value.
+ +             */
+ +            ci_size = NBNXN_GPU_CLUSTER_SIZE;
+ +            break;
+ +        default:
+ +            gmx_incons("unknown kernel type");
+ +    }
+ +
+ +    return ci_size;
+ +}
+ +
+ +int nbnxn_kernel_to_cj_size(int nb_kernel_type) /* returns the j-cluster size for this kernel type */
+ +{
+ +    int nbnxn_simd_width = 0; /* stays 0 when GMX_NBNXN_SIMD is not defined */
+ +    int cj_size          = 0; /* stays 0 when the kernel type is not recognized */
+ +
+ +#ifdef GMX_NBNXN_SIMD
- #ifdef NBNXN_SEARCH_BB_SSE
- 
++    nbnxn_simd_width = GMX_SIMD_WIDTH_HERE;
+ +#endif
+ +
+ +    switch (nb_kernel_type)
+ +    {
+ +        case nbnxnk4x4_PlainC:
+ +            cj_size = NBNXN_CPU_CLUSTER_I_SIZE; /* same size as the i-cluster */
+ +            break;
+ +        case nbnxnk4xN_SIMD_4xN:
+ +            cj_size = nbnxn_simd_width; /* full SIMD width */
+ +            break;
+ +        case nbnxnk4xN_SIMD_2xNN:
+ +            cj_size = nbnxn_simd_width/2; /* half the SIMD width */
+ +            break;
+ +        case nbnxnk8x8x8_CUDA:
+ +        case nbnxnk8x8x8_PlainC:
+ +            cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type); /* i- and j-cluster sizes are equal */
+ +            break;
+ +        default:
+ +            gmx_incons("unknown kernel type");
+ +    }
+ +
+ +    return cj_size;
+ +}
+ +
+ +/* Convert i-cluster index ci to the matching j-cluster index, given
+ + * the 2-log of the j-cluster size:
+ + *   na_cj_2log 2: the index is unchanged,
+ + *   na_cj_2log 1: the index is doubled,
+ + *   na_cj_2log 3: the index is halved.
+ + * Any other value of na_cj_2log gives 0.
+ + */
+ +static int ci_to_cj(int na_cj_2log, int ci)
+ +{
+ +    int cj = 0;
+ +
+ +    if (na_cj_2log == 2)
+ +    {
+ +        cj = ci;
+ +    }
+ +    else if (na_cj_2log == 1)
+ +    {
+ +        cj = (ci<<1);
+ +    }
+ +    else if (na_cj_2log == 3)
+ +    {
+ +        cj = (ci>>1);
+ +    }
+ +
+ +    return cj;
+ +}
+ +
+ +/* Tell whether kernel type nb_kernel_type belongs to the 4x4 plain-C /
+ + * SIMD group (TRUE) or to the 8x8x8 group (FALSE).
+ + * An unset kernel type is a fatal error; any other unknown value is
+ + * reported through gmx_incons, with FALSE as the result.
+ + */
+ +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
+ +{
+ +    gmx_bool bSimple = FALSE;
+ +
+ +    if (nb_kernel_type == nbnxnkNotSet)
+ +    {
+ +        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
+ +    }
+ +
+ +    switch (nb_kernel_type)
+ +    {
+ +        case nbnxnk4x4_PlainC:
+ +        case nbnxnk4xN_SIMD_4xN:
+ +        case nbnxnk4xN_SIMD_2xNN:
+ +            bSimple = TRUE;
+ +            break;
+ +
+ +        case nbnxnk8x8x8_CUDA:
+ +        case nbnxnk8x8x8_PlainC:
+ +            bSimple = FALSE;
+ +            break;
+ +
+ +        default:
+ +            gmx_incons("Invalid nonbonded kernel type passed!");
+ +            bSimple = FALSE;
+ +            break;
+ +    }
+ +
+ +    return bSimple;
+ +}
+ +
+ +/* Allocate a pair-search data structure, initialize it and return it
+ + * through nbs_ptr.
+ + *
+ + * A non-NULL n_dd_cells switches on the domain decomposition setup:
+ + * zones is stored and the number of grids is doubled for every
+ + * dimension that has more than one DD cell. Without it a single grid
+ + * is used. Work data is set up for nthread_max threads.
+ + * Cycle printing is enabled when the environment variable
+ + * GMX_NBNXN_CYCLE is set.
+ + */
+ +void nbnxn_init_search(nbnxn_search_t    * nbs_ptr,
+ +                       ivec               *n_dd_cells,
+ +                       gmx_domdec_zones_t *zones,
+ +                       int                 nthread_max)
+ +{
+ +    nbnxn_search_t nbs;
+ +    int            dim, igrid, thread;
+ +
+ +    snew(nbs, 1);
+ +    *nbs_ptr = nbs;
+ +
+ +    nbs->nthread_max = nthread_max;
+ +    nbs->DomDec      = (n_dd_cells != NULL);
+ +
+ +    clear_ivec(nbs->dd_dim);
+ +    nbs->ngrid = 1;
+ +    if (nbs->DomDec)
+ +    {
+ +        nbs->zones = zones;
+ +
+ +        for (dim = 0; dim < DIM; dim++)
+ +        {
+ +            if ((*n_dd_cells)[dim] <= 1)
+ +            {
+ +                /* No decomposition along this dimension */
+ +                continue;
+ +            }
+ +
+ +            nbs->dd_dim[dim] = 1;
+ +            /* Each grid matches a DD zone */
+ +            nbs->ngrid *= 2;
+ +        }
+ +    }
+ +
+ +    snew(nbs->grid, nbs->ngrid);
+ +    for (igrid = 0; igrid < nbs->ngrid; igrid++)
+ +    {
+ +        nbnxn_grid_init(&nbs->grid[igrid]);
+ +    }
+ +    nbs->cell        = NULL;
+ +    nbs->cell_nalloc = 0;
+ +    nbs->a           = NULL;
+ +    nbs->a_nalloc    = 0;
+ +
+ +    /* Initialize detailed nbsearch cycle counting */
+ +    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != NULL);
+ +    nbs->search_count = 0;
+ +    nbs_cycle_clear(nbs->cc);
+ +
+ +    /* Initialize the work data structures and the cycle counters
+ +     * for each thread.
+ +     */
+ +    snew(nbs->work, nbs->nthread_max);
+ +    for (thread = 0; thread < nbs->nthread_max; thread++)
+ +    {
+ +        nbs->work[thread].cxy_na           = NULL;
+ +        nbs->work[thread].cxy_na_nalloc    = 0;
+ +        nbs->work[thread].sort_work        = NULL;
+ +        nbs->work[thread].sort_work_nalloc = 0;
+ +
+ +        nbs_cycle_clear(nbs->work[thread].cc);
+ +    }
+ +}
+ +
+ +/* Return n divided by the volume of the rectangular box that is
+ + * spanned by corner0 and corner1 (the product of corner1 - corner0
+ + * over x, y and z).
+ + */
+ +static real grid_atom_density(int n, rvec corner0, rvec corner1)
+ +{
+ +    rvec extent;
+ +
+ +    /* Edge lengths of the box along x, y and z */
+ +    rvec_sub(corner1, corner0, extent);
+ +
+ +    return n/(extent[XX]*extent[YY]*extent[ZZ]);
+ +}
+ +
+ +static int set_grid_size_xy(const nbnxn_search_t nbs,
+ +                            nbnxn_grid_t *grid,
+ +                            int dd_zone,
+ +                            int n, rvec corner0, rvec corner1,
+ +                            real atom_density)
+ +{
+ +    rvec size;
+ +    int  na_c;
+ +    real adens, tlen, tlen_x, tlen_y, nc_max;
+ +    int  t;
+ +
+ +    rvec_sub(corner1, corner0, size);
+ +
+ +    if (n > grid->na_sc)
+ +    {
+ +        /* target cell length */
+ +        if (grid->bSimple)
+ +        {
+ +            /* To minimize the zero interactions, we should make
+ +             * the largest of the i/j cell cubic.
+ +             */
+ +            na_c = max(grid->na_c, grid->na_cj);
+ +
+ +            /* Approximately cubic cells */
+ +            tlen   = pow(na_c/atom_density, 1.0/3.0);
+ +            tlen_x = tlen;
+ +            tlen_y = tlen;
+ +        }
+ +        else
+ +        {
+ +            /* Approximately cubic sub cells */
+ +            tlen   = pow(grid->na_c/atom_density, 1.0/3.0);
+ +            tlen_x = tlen*GPU_NSUBCELL_X;
+ +            tlen_y = tlen*GPU_NSUBCELL_Y;
+ +        }
+ +        /* We round ncx and ncy down, because we get less cell pairs
+ +         * in the nbsist when the fixed cell dimensions (x,y) are
+ +         * larger than the variable one (z) than the other way around.
+ +         */
+ +        grid->ncx = max(1, (int)(size[XX]/tlen_x));
+ +        grid->ncy = max(1, (int)(size[YY]/tlen_y));
+ +    }
+ +    else
+ +    {
+ +        grid->ncx = 1;
+ +        grid->ncy = 1;
+ +    }
+ +
+ +    grid->sx     = size[XX]/grid->ncx;
+ +    grid->sy     = size[YY]/grid->ncy;
+ +    grid->inv_sx = 1/grid->sx;
+ +    grid->inv_sy = 1/grid->sy;
+ +
+ +    if (dd_zone > 0)
+ +    {
+ +        /* This is a non-home zone, add an extra row of cells
+ +         * for particles communicated for bonded interactions.
+ +         * These can be beyond the cut-off. It doesn't matter where
+ +         * they end up on the grid, but for performance it's better
+ +         * if they don't end up in cells that can be within cut-off range.
+ +         */
+ +        grid->ncx++;
+ +        grid->ncy++;
+ +    }
+ +
+ +    /* We need one additional cell entry for particles moved by DD */
+ +    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
+ +    {
+ +        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
+ +        srenew(grid->cxy_na, grid->cxy_nalloc);
+ +        srenew(grid->cxy_ind, grid->cxy_nalloc+1);
+ +    }
+ +    for (t = 0; t < nbs->nthread_max; t++)
+ +    {
+ +        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
+ +        {
+ +            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
+ +            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
+ +        }
+ +    }
+ +
+ +    /* Worst case scenario of 1 atom in each last cell */
+ +    if (grid->na_cj <= grid->na_c)
+ +    {
+ +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
+ +    }
+ +    else
+ +    {
+ +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
+ +    }
+ +
+ +    if (nc_max > grid->nc_nalloc)
+ +    {
+ +        int bb_nalloc;
+ +
+ +        grid->nc_nalloc = over_alloc_large(nc_max);
+ +        srenew(grid->nsubc, grid->nc_nalloc);
+ +        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
+ +#ifdef NBNXN_PBB_SSE
+ +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
+ +#else
+ +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
+ +#endif
+ +        sfree_aligned(grid->bb);
+ +        /* This snew also zeros the contents, this avoid possible
+ +         * floating exceptions in SSE with the unused bb elements.
+ +         */
+ +        snew_aligned(grid->bb, bb_nalloc, 16);
+ +
+ +        if (grid->bSimple)
+ +        {
+ +            if (grid->na_cj == grid->na_c)
+ +            {
+ +                grid->bbj = grid->bb;
+ +            }
+ +            else
+ +            {
+ +                sfree_aligned(grid->bbj);
+ +                snew_aligned(grid->bbj, bb_nalloc*grid->na_c/grid->na_cj, 16);
+ +            }
+ +        }
+ +
+ +        srenew(grid->flags, grid->nc_nalloc);
+ +    }
+ +
+ +    copy_rvec(corner0, grid->c0);
+ +    copy_rvec(corner1, grid->c1);
+ +
+ +    return nc_max;
+ +}
+ +
+ +/* We need to sort particles in grid columns on z-coordinate.
+ + * As particles are very often distributed homogeneously, we use a sorting
+ + * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ + * by a factor, cast to an int and try to store in that hole. If the hole
+ + * is full, we move this or another particle. A second pass is needed to make
+ + * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
+ + * 4 is the optimal value for homogeneous particle distribution and allows
+ + * for an O(#particles) sort up till distributions where all particles are
+ + * concentrated in 1/4 of the space. No NlogN fallback is implemented,
+ + * as it can be expensive to detect inhomogeneous particle distributions.
+ + * SGSF is the maximum ratio of holes used, in the worst case all particles
+ + * end up in the last hole and we need #particles extra holes at the end.
+ + */
+ +#define SORT_GRID_OVERSIZE 4
+ +#define SGSF (SORT_GRID_OVERSIZE + 1)
+ +
+ +/* Sort particle index a on coordinates x along dim.
+ + * Backwards tells if we want decreasing instead of increasing coordinates.
+ + * h0 is the minimum of the coordinate range.
+ + * invh is the 1/length of the sorting range.
+ + * n_per_h (>=n) is the expected average number of particles per 1/invh
+ + * sort is the sorting work array.
+ + * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
+ + * or easier, allocate at least n*SGSF elements.
+ + */
+ +static void sort_atoms(int dim, gmx_bool Backwards,
+ +                       int *a, int n, rvec *x,
+ +                       real h0, real invh, int n_per_h,
+ +                       int *sort)
+ +{
+ +    int nsort, i, c;
+ +    int zi, zim, zi_min, zi_max;
+ +    int cp, tmp;
+ +
+ +    if (n <= 1)
+ +    {
+ +        /* Nothing to do */
+ +        return;
+ +    }
+ +
+ +#ifndef NDEBUG
+ +    if (n > n_per_h)
+ +    {
+ +        gmx_incons("n > n_per_h");
+ +    }
+ +#endif
+ +
+ +    /* Transform the inverse range height into the inverse hole height */
+ +    invh *= n_per_h*SORT_GRID_OVERSIZE;
+ +
+ +    /* Set nsort to the maximum possible number of holes used.
+ +     * In worst case all n elements end up in the last bin.
+ +     */
+ +    nsort = n_per_h*SORT_GRID_OVERSIZE + n;
+ +
+ +    /* Determine the index range used, so we can limit it for the second pass */
+ +    zi_min = INT_MAX;
+ +    zi_max = -1;
+ +
+ +    /* Sort the particles using a simple index sort */
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        /* The cast takes care of float-point rounding effects below zero.
+ +         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
+ +         * times the box height out of the box.
+ +         */
+ +        zi = (int)((x[a[i]][dim] - h0)*invh);
+ +
+ +#ifndef NDEBUG
+ +        /* As we can have rounding effect, we use > iso >= here */
+ +        if (zi < 0 || zi > n_per_h*SORT_GRID_OVERSIZE)
+ +        {
+ +            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
+ +                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
+ +                      n_per_h, SORT_GRID_OVERSIZE);
+ +        }
+ +#endif
+ +
+ +        /* Ideally this particle should go in sort cell zi,
+ +         * but that might already be in use,
+ +         * in that case find the first empty cell higher up
+ +         */
+ +        if (sort[zi] < 0)
+ +        {
+ +            sort[zi] = a[i];
+ +            zi_min   = min(zi_min, zi);
+ +            zi_max   = max(zi_max, zi);
+ +        }
+ +        else
+ +        {
+ +            /* We have multiple atoms in the same sorting slot.
+ +             * Sort on real z for minimal bounding box size.
+ +             * There is an extra check for identical z to ensure
+ +             * well-defined output order, independent of input order
+ +             * to ensure binary reproducibility after restarts.
+ +             */
+ +            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
+ +                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
+ +                                       a[i] > sort[zi])))
+ +            {
+ +                zi++;
+ +            }
+ +
+ +            if (sort[zi] >= 0)
+ +            {
+ +                /* Shift all elements by one slot until we find an empty slot */
+ +                cp  = sort[zi];
+ +                zim = zi + 1;
+ +                while (sort[zim] >= 0)
+ +                {
+ +                    tmp       = sort[zim];
+ +                    sort[zim] = cp;
+ +                    cp        = tmp;
+ +                    zim++;
+ +                }
+ +                sort[zim] = cp;
+ +                zi_max    = max(zi_max, zim);
+ +            }
+ +            sort[zi] = a[i];
+ +            zi_max   = max(zi_max, zi);
+ +        }
+ +    }
+ +
+ +    c = 0;
+ +    if (!Backwards)
+ +    {
+ +        for (zi = 0; zi < nsort; zi++)
+ +        {
+ +            if (sort[zi] >= 0)
+ +            {
+ +                a[c++]   = sort[zi];
+ +                sort[zi] = -1;
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        for (zi = zi_max; zi >= zi_min; zi--)
+ +        {
+ +            if (sort[zi] >= 0)
+ +            {
+ +                a[c++]   = sort[zi];
+ +                sort[zi] = -1;
+ +            }
+ +        }
+ +    }
+ +    if (c < n)
+ +    {
+ +        gmx_incons("Lost particles while sorting");
+ +    }
+ +}
+ +
+ +/* R2F_D and R2F_U convert a value to float with a safety margin.
+ + * With GMX_DOUBLE defined the value is first scaled by
+ + * (1 -/+ GMX_FLOAT_EPS), chosen by sign such that R2F_D moves it down
+ + * and R2F_U moves it up, and then cast to float.
+ + * Without GMX_DOUBLE both macros pass the value through unchanged.
+ + */
+ +#ifdef GMX_DOUBLE
+ +#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
+ +#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
+ +#else
+ +#define R2F_D(x) (x)
+ +#define R2F_U(x) (x)
+ +#endif
+ +
+ +/* Bounding box of na atoms with coordinate order x,y,z, where
+ + * consecutive atoms are stride reals apart in x.
+ + * The box is written to bb in xyz0 order: lower bounds at BBL_X/Y/Z,
+ + * upper bounds at BBU_X/Y/Z. The first atom is read unconditionally.
+ + */
+ +static void calc_bounding_box(int na, int stride, const real *x, float *bb)
+ +{
+ +    int  atom, offset;
+ +    real x_lo, x_hi, y_lo, y_hi, z_lo, z_hi;
+ +
+ +    /* Start from a box that contains only the first atom */
+ +    x_lo = x_hi = x[XX];
+ +    y_lo = y_hi = x[YY];
+ +    z_lo = z_hi = x[ZZ];
+ +
+ +    /* Grow the box with every remaining atom */
+ +    offset = stride;
+ +    atom   = 1;
+ +    while (atom < na)
+ +    {
+ +        x_lo = min(x_lo, x[offset+XX]);
+ +        x_hi = max(x_hi, x[offset+XX]);
+ +        y_lo = min(y_lo, x[offset+YY]);
+ +        y_hi = max(y_hi, x[offset+YY]);
+ +        z_lo = min(z_lo, x[offset+ZZ]);
+ +        z_hi = max(z_hi, x[offset+ZZ]);
+ +
+ +        offset += stride;
+ +        atom++;
+ +    }
+ +
+ +    /* Note: possible double to float conversion here */
+ +    bb[BBL_X] = R2F_D(x_lo);
+ +    bb[BBL_Y] = R2F_D(y_lo);
+ +    bb[BBL_Z] = R2F_D(z_lo);
+ +    bb[BBU_X] = R2F_U(x_hi);
+ +    bb[BBU_Y] = R2F_U(y_hi);
+ +    bb[BBU_Z] = R2F_U(z_hi);
+ +}
+ +
+ +/* Bounding box of na atoms in packed coordinate storage where the x, y
+ + * and z values of the atoms start at XX*PACK_X4, YY*PACK_X4 and
+ + * ZZ*PACK_X4 respectively. The box is written to bb in xyz0 order.
+ + * The first atom is read unconditionally.
+ + */
+ +static void calc_bounding_box_x_x4(int na, const real *x, float *bb)
+ +{
+ +    const real *px = x + XX*PACK_X4;
+ +    const real *py = x + YY*PACK_X4;
+ +    const real *pz = x + ZZ*PACK_X4;
+ +    real        x_lo, x_hi, y_lo, y_hi, z_lo, z_hi;
+ +    int         atom;
+ +
+ +    /* Start from a box that contains only the first atom */
+ +    x_lo = x_hi = px[0];
+ +    y_lo = y_hi = py[0];
+ +    z_lo = z_hi = pz[0];
+ +
+ +    /* Grow the box with every remaining atom */
+ +    for (atom = 1; atom < na; atom++)
+ +    {
+ +        x_lo = min(x_lo, px[atom]);
+ +        x_hi = max(x_hi, px[atom]);
+ +        y_lo = min(y_lo, py[atom]);
+ +        y_hi = max(y_hi, py[atom]);
+ +        z_lo = min(z_lo, pz[atom]);
+ +        z_hi = max(z_hi, pz[atom]);
+ +    }
+ +
+ +    /* Note: possible double to float conversion here */
+ +    bb[BBL_X] = R2F_D(x_lo);
+ +    bb[BBL_Y] = R2F_D(y_lo);
+ +    bb[BBL_Z] = R2F_D(z_lo);
+ +    bb[BBU_X] = R2F_U(x_hi);
+ +    bb[BBU_Y] = R2F_U(y_hi);
+ +    bb[BBU_Z] = R2F_U(z_hi);
+ +}
+ +
+ +/* Bounding box of na atoms in packed coordinate storage where the x, y
+ + * and z values of the atoms start at XX*PACK_X8, YY*PACK_X8 and
+ + * ZZ*PACK_X8 respectively. The box is written to bb in xyz0 order.
+ + * The first atom is read unconditionally.
+ + */
+ +static void calc_bounding_box_x_x8(int na, const real *x, float *bb)
+ +{
+ +    const real *px = x + XX*PACK_X8;
+ +    const real *py = x + YY*PACK_X8;
+ +    const real *pz = x + ZZ*PACK_X8;
+ +    real        x_lo, x_hi, y_lo, y_hi, z_lo, z_hi;
+ +    int         atom;
+ +
+ +    /* Start from a box that contains only the first atom */
+ +    x_lo = x_hi = px[0];
+ +    y_lo = y_hi = py[0];
+ +    z_lo = z_hi = pz[0];
+ +
+ +    /* Grow the box with every remaining atom */
+ +    for (atom = 1; atom < na; atom++)
+ +    {
+ +        x_lo = min(x_lo, px[atom]);
+ +        x_hi = max(x_hi, px[atom]);
+ +        y_lo = min(y_lo, py[atom]);
+ +        y_hi = max(y_hi, py[atom]);
+ +        z_lo = min(z_lo, pz[atom]);
+ +        z_hi = max(z_hi, pz[atom]);
+ +    }
+ +
+ +    /* Note: possible double to float conversion here */
+ +    bb[BBL_X] = R2F_D(x_lo);
+ +    bb[BBL_Y] = R2F_D(y_lo);
+ +    bb[BBL_Z] = R2F_D(z_lo);
+ +    bb[BBU_X] = R2F_U(x_hi);
+ +    bb[BBU_Y] = R2F_U(y_hi);
+ +    bb[BBU_Z] = R2F_U(z_hi);
+ +}
+ +
- #ifdef NBNXN_SEARCH_BB_SSE
+ +/* Packed coordinates, bb order xyz0.
+ + * Computes two bounding boxes and stores them consecutively in bbj:
+ + * one for the first min(na, 2) atoms at x and, when na > 2, one for
+ + * min(na-2, 2) atoms starting PACK_X4/2 reals further on.
+ + * When na <= 2 the second box in bbj is a copy of the first.
+ + * bb receives the box enclosing both: the element-wise minimum of the
+ + * two lower bounds and the element-wise maximum of the two upper bounds.
+ + * With NBNXN_SEARCH_BB_SSE defined aligned SSE loads and stores are used
+ + * on bb and bbj, otherwise plain loops.
+ + */
+ +static void calc_bounding_box_x_x4_halves(int na, const real *x,
+ +                                          float *bb, float *bbj)
+ +{
++#ifndef NBNXN_SEARCH_BB_SSE
++    int i;
++#endif
++
+ +    calc_bounding_box_x_x4(min(na, 2), x, bbj);
+ +
+ +    if (na > 2)
+ +    {
+ +        calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+NNBSBB_B);
+ +    }
+ +    else
+ +    {
+ +        /* Set the "empty" bounding box to the same as the first one,
+ +         * so we don't need to treat special cases in the rest of the code.
+ +         */
++#ifdef NBNXN_SEARCH_BB_SSE
+ +        _mm_store_ps(bbj+NNBSBB_B, _mm_load_ps(bbj));
+ +        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C, _mm_load_ps(bbj+NNBSBB_C));
++#else
++        for (i = 0; i < NNBSBB_B; i++)
++        {
++            bbj[NNBSBB_B + i] = bbj[i];
++        }
++#endif
+ +    }
+ +
++#ifdef NBNXN_SEARCH_BB_SSE
+ +    _mm_store_ps(bb, _mm_min_ps(_mm_load_ps(bbj),
+ +                                _mm_load_ps(bbj+NNBSBB_B)));
+ +    _mm_store_ps(bb+NNBSBB_C, _mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
+ +                                         _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
++#else
++    for (i = 0; i < NNBSBB_C; i++)
++    {
++        bb[           i] = min(bbj[           i], bbj[NNBSBB_B +            i]);
++        bb[NNBSBB_C + i] = max(bbj[NNBSBB_C + i], bbj[NNBSBB_B + NNBSBB_C + i]);
++    }
++#endif
+ +}
+ +
++#ifdef NBNXN_SEARCH_BB_SSE
++
+ +/* Bounding box of na atoms with coordinate order xyz, where consecutive
+ + * atoms are stride reals apart in x. The box is written to bb in the
+ + * xxxxyyyyzzzz layout: the six bounds (lower x,y,z, then upper x,y,z)
+ + * are stored STRIDE_PBB floats apart.
+ + * The first atom is read unconditionally.
+ + */
+ +static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
+ +{
+ +    int  atom, offset;
+ +    real x_lo, x_hi, y_lo, y_hi, z_lo, z_hi;
+ +
+ +    /* Start from a box that contains only the first atom */
+ +    x_lo = x_hi = x[XX];
+ +    y_lo = y_hi = x[YY];
+ +    z_lo = z_hi = x[ZZ];
+ +
+ +    /* Grow the box with every remaining atom */
+ +    offset = stride;
+ +    atom   = 1;
+ +    while (atom < na)
+ +    {
+ +        x_lo = min(x_lo, x[offset+XX]);
+ +        x_hi = max(x_hi, x[offset+XX]);
+ +        y_lo = min(y_lo, x[offset+YY]);
+ +        y_hi = max(y_hi, x[offset+YY]);
+ +        z_lo = min(z_lo, x[offset+ZZ]);
+ +        z_hi = max(z_hi, x[offset+ZZ]);
+ +
+ +        offset += stride;
+ +        atom++;
+ +    }
+ +
+ +    /* Note: possible double to float conversion here */
+ +    bb[0*STRIDE_PBB] = R2F_D(x_lo);
+ +    bb[1*STRIDE_PBB] = R2F_D(y_lo);
+ +    bb[2*STRIDE_PBB] = R2F_D(z_lo);
+ +    bb[3*STRIDE_PBB] = R2F_U(x_hi);
+ +    bb[4*STRIDE_PBB] = R2F_U(y_hi);
+ +    bb[5*STRIDE_PBB] = R2F_U(z_hi);
+ +}
+ +
+ +#endif /* NBNXN_SEARCH_BB_SSE */
+ +
+ +#ifdef NBNXN_SEARCH_SSE_SINGLE
+ +
+ +/* Bounding box of na atoms stored as groups of NNBSBB_C floats (xyz?),
+ + * computed with SSE min/max. The lower bounds are stored at bb and the
+ + * upper bounds at bb+4 (bb order xyz0). Aligned loads and stores are
+ + * used on both x and bb. The first atom is read unconditionally.
+ + */
+ +static void calc_bounding_box_sse(int na, const float *x, float *bb)
+ +{
+ +    __m128 lower_SSE, upper_SSE, atom_SSE;
+ +    int    atom;
+ +
+ +    /* Start from a box that contains only the first atom */
+ +    lower_SSE = _mm_load_ps(x);
+ +    upper_SSE = lower_SSE;
+ +
+ +    /* Grow the box with every remaining atom */
+ +    atom = 1;
+ +    while (atom < na)
+ +    {
+ +        atom_SSE  = _mm_load_ps(x+atom*NNBSBB_C);
+ +        lower_SSE = _mm_min_ps(lower_SSE, atom_SSE);
+ +        upper_SSE = _mm_max_ps(upper_SSE, atom_SSE);
+ +        atom++;
+ +    }
+ +
+ +    _mm_store_ps(bb, lower_SSE);
+ +    _mm_store_ps(bb+4, upper_SSE);
+ +}
+ +
+ +/* Computes the bounding box of na atoms (coordinate order xyz?) with
+ + * calc_bounding_box_sse into the scratch buffer bb_work and then
+ + * scatters the six bounds to bb in the xxxxyyyyzzzz layout, i.e.
+ + * STRIDE_PBB elements apart.
+ + */
+ +static void calc_bounding_box_xxxx_sse(int na, const float *x,
+ +                                       float *bb_work,
+ +                                       real *bb)
+ +{
+ +    real *dest = bb;
+ +
+ +    calc_bounding_box_sse(na, x, bb_work);
+ +
+ +    /* Lower bounds */
+ +    *dest = bb_work[BBL_X];
+ +    dest += STRIDE_PBB;
+ +    *dest = bb_work[BBL_Y];
+ +    dest += STRIDE_PBB;
+ +    *dest = bb_work[BBL_Z];
+ +    dest += STRIDE_PBB;
+ +
+ +    /* Upper bounds */
+ +    *dest = bb_work[BBU_X];
+ +    dest += STRIDE_PBB;
+ +    *dest = bb_work[BBU_Y];
+ +    dest += STRIDE_PBB;
+ +    *dest = bb_work[BBU_Z];
+ +}
+ +
+ +#endif /* NBNXN_SEARCH_SSE_SINGLE */
+ +
-     __m128 min_SSE, max_SSE;
+ +
+ +/* Combines pairs of consecutive bounding boxes.
+ + * For every grid column, each pair of consecutive boxes in bb is merged
+ + * into one box in grid->bbj: the element-wise minimum of the two lower
+ + * bounds and the element-wise maximum of the two upper bounds.
+ + * The number of boxes in a column is taken as (grid->cxy_na[i]+3)>>2;
+ + * when that count is odd the last box is copied to grid->bbj unchanged.
+ + * With NBNXN_SEARCH_BB_SSE defined aligned SSE loads and stores are used,
+ + * otherwise a plain loop over the NNBSBB_C elements.
+ + */
+ +static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const float *bb)
+ +{
+ +    int    i, j, sc2, nc2, c2;
- #endif
- 
+ +
+ +    for (i = 0; i < grid->ncx*grid->ncy; i++)
+ +    {
+ +        /* Starting bb in a column is expected to be 2-aligned */
+ +        sc2 = grid->cxy_ind[i]>>1;
+ +        /* For odd numbers skip the last bb here */
+ +        nc2 = (grid->cxy_na[i]+3)>>(2+1);
+ +        for (c2 = sc2; c2 < sc2+nc2; c2++)
+ +        {
++#ifdef NBNXN_SEARCH_BB_SSE
++            __m128 min_SSE, max_SSE;
++
+ +            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
+ +                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
+ +            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
+ +                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
+ +            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C, min_SSE);
+ +            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C, max_SSE);
++#else
++            for (j = 0; j < NNBSBB_C; j++)
++            {
++                grid->bbj[(c2*2+0)*NNBSBB_C+j] = min(bb[(c2*4+0)*NNBSBB_C+j],
++                                                     bb[(c2*4+2)*NNBSBB_C+j]);
++                grid->bbj[(c2*2+1)*NNBSBB_C+j] = max(bb[(c2*4+1)*NNBSBB_C+j],
++                                                     bb[(c2*4+3)*NNBSBB_C+j]);
++            }
++#endif
+ +        }
+ +        if (((grid->cxy_na[i]+3)>>2) & 1)
+ +        {
+ +            /* Copy the last bb for odd bb count in this column.
+ +             * c2 equals sc2+nc2 here, the index just past the last pair.
+ +             */
+ +            for (j = 0; j < NNBSBB_C; j++)
+ +            {
+ +                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
+ +                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
+ +            }
+ +        }
+ +    }
+ +}
+ +
- #if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
+ +
+ +/* Debug output: prints the bounding box size averaged over the
+ + * grid->nc cells of the grid. The line printed to fp holds three
+ + * box-derived sizes, the average box extents along x, y and z,
+ + * and the same extents relative to those sizes.
+ + */
+ +static void print_bbsizes_simple(FILE                *fp,
+ +                                 const nbnxn_search_t nbs,
+ +                                 const nbnxn_grid_t  *grid)
+ +{
+ +    int  cell, dim, lower;
+ +    dvec avg;
+ +
+ +    clear_dvec(avg);
+ +    for (cell = 0; cell < grid->nc; cell++)
+ +    {
+ +        /* Index of the first lower bound of the box of this cell */
+ +        lower = cell*NNBSBB_B;
+ +        for (dim = 0; dim < DIM; dim++)
+ +        {
+ +            avg[dim] += grid->bb[lower+NNBSBB_C+dim] - grid->bb[lower+dim];
+ +        }
+ +    }
+ +    /* Turn the sums into averages */
+ +    dsvmul(1.0/grid->nc, avg, avg);
+ +
+ +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
+ +            nbs->box[XX][XX]/grid->ncx,
+ +            nbs->box[YY][YY]/grid->ncy,
+ +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
+ +            avg[XX], avg[YY], avg[ZZ],
+ +            avg[XX]*grid->ncx/nbs->box[XX][XX],
+ +            avg[YY]*grid->ncy/nbs->box[YY][YY],
+ +            avg[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
+ +}
+ +
+ +/* Prints the average bb size, used for debug output.
+ + * The box extents are summed over the sub-cells of all grid->nc cells
+ + * and divided by the total sub-cell count ns (the sum of grid->nsubc[c]).
+ + * With NBNXN_BBXXXX defined the boxes are read from the packed
+ + * xxxxyyyyzzzz layout in groups of STRIDE_PBB; otherwise one box of
+ + * NNBSBB_B floats is read per sub-cell.
+ + */
+ +static void print_bbsizes_supersub(FILE                *fp,
+ +                                   const nbnxn_search_t nbs,
+ +                                   const nbnxn_grid_t  *grid)
+ +{
+ +    int  ns, c, s;
+ +    dvec ba;
+ +
+ +    clear_dvec(ba);
+ +    ns = 0;
+ +    for (c = 0; c < grid->nc; c++)
+ +    {
+ +#ifdef NBNXN_BBXXXX
+ +        /* NOTE(review): all STRIDE_PBB entries of each group are summed,
+ +         * also when grid->nsubc[c] is not a multiple of STRIDE_PBB,
+ +         * while ns only counts grid->nsubc[c] -- confirm this is intended.
+ +         */
+ +        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
+ +        {
+ +            int cs_w, i, d;
+ +
+ +            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+ +            for (i = 0; i < STRIDE_PBB; i++)
+ +            {
+ +                for (d = 0; d < DIM; d++)
+ +                {
+ +                    ba[d] +=
+ +                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+ +                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
+ +                }
+ +            }
+ +        }
+ +#else
+ +        for (s = 0; s < grid->nsubc[c]; s++)
+ +        {
+ +            int cs, d;
+ +
+ +            cs = c*GPU_NSUBCELL + s;
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                ba[d] +=
+ +                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
+ +                    grid->bb[cs*NNBSBB_B         +d];
+ +            }
+ +        }
+ +#endif
+ +        ns += grid->nsubc[c];
+ +    }
+ +    dsvmul(1.0/ns, ba, ba);
+ +
+ +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
+ +            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
+ +            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
+ +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
+ +            ba[XX], ba[YY], ba[ZZ],
+ +            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
+ +            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
+ +            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
+ +}
+ +
+ +/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
+ + * Also sets interaction flags.
+ + *
+ + * The entries a0 up to, but not including, a1 of order are processed
+ + * in (sub-)cells of na_c atoms; the last one may be shorter.
+ + * atinfo is indexed by the atom indices stored in order.
+ + * For (sub-)cell number subc, *flags gets:
+ + *   NBNXN_CI_DO_LJ(subc)   when at least one atom has VdW interactions,
+ + *   NBNXN_CI_HALF_LJ(subc) when at most half of na_c atoms have VdW,
+ + *   NBNXN_CI_DO_COUL(subc) when at least one atom has charge.
+ + * *flags is cleared first. The entries of a (sub-)cell in order are only
+ + * rearranged, VdW atoms first, when the half-LJ flag is set and a VdW
+ + * atom is located in the second half of the (sub-)cell.
+ + */
+ +void sort_on_lj(int na_c,
+ +                int a0, int a1, const int *atinfo,
+ +                int *order,
+ +                int *flags)
+ +{
+ +    int      subc, s, a, n1, n2, a_lj_max, i, j;
+ +    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
+ +    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
+ +    gmx_bool haveQ;
+ +
+ +    *flags = 0;
+ +
+ +    subc = 0;
+ +    for (s = a0; s < a1; s += na_c)
+ +    {
+ +        /* Make lists for this (sub-)cell on atoms with and without LJ */
+ +        n1       = 0;
+ +        n2       = 0;
+ +        haveQ    = FALSE;
+ +        a_lj_max = -1;
+ +        for (a = s; a < min(s+na_c, a1); a++)
+ +        {
+ +            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
+ +
+ +            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
+ +            {
+ +                sort1[n1++] = order[a];
+ +                a_lj_max    = a;
+ +            }
+ +            else
+ +            {
+ +                sort2[n2++] = order[a];
+ +            }
+ +        }
+ +
+ +        /* If we don't have atom with LJ, there's nothing to sort */
+ +        if (n1 > 0)
+ +        {
+ +            *flags |= NBNXN_CI_DO_LJ(subc);
+ +
+ +            if (2*n1 <= na_c)
+ +            {
+ +                /* Only sort when strictly necessary.
+ +                 * Ordering particles can lead to less accurate summation
+ +                 * due to rounding, both for LJ and Coulomb interactions.
+ +                 */
+ +                if (2*(a_lj_max - s) >= na_c)
+ +                {
+ +                    /* Write back at the start of this (sub-)cell, s,
+ +                     * not at a0: the lists were built from order[s...],
+ +                     * so using a0 would overwrite the first (sub-)cell
+ +                     * whenever the range holds more than one.
+ +                     */
+ +                    for (i = 0; i < n1; i++)
+ +                    {
+ +                        order[s+i] = sort1[i];
+ +                    }
+ +                    for (j = 0; j < n2; j++)
+ +                    {
+ +                        order[s+n1+j] = sort2[j];
+ +                    }
+ +                }
+ +
+ +                *flags |= NBNXN_CI_HALF_LJ(subc);
+ +            }
+ +        }
+ +        if (haveQ)
+ +        {
+ +            *flags |= NBNXN_CI_DO_COUL(subc);
+ +        }
+ +        subc++;
+ +    }
+ +}
+ +
+ +/* Fill a pair search cell with atoms.
+ + * Potentially sorts atoms and sets the interaction flags.
+ + *
+ + * Handles the grid atom indices a0 up to, but not including, a1:
+ + * for simple grids the atoms are first passed to sort_on_lj, then
+ + * nbs->cell is set for each atom, the coordinates from x are copied
+ + * into nbat->x and the bounding box of the cell is computed and stored
+ + * in grid->bb (and also in grid->bbj in the half-cluster case).
+ + * Which bounding box routine and layout is used depends on
+ + * nbat->XFormat, grid->bSimple and the compile-time settings.
+ + * sx, sy and sz are forwarded to copy_rvec_to_nbat_real and printed in
+ + * the debug output. bb_work is only passed on as scratch space to
+ + * calc_bounding_box_xxxx_sse.
+ + */
+ +void fill_cell(const nbnxn_search_t nbs,
+ +               nbnxn_grid_t *grid,
+ +               nbnxn_atomdata_t *nbat,
+ +               int a0, int a1,
+ +               const int *atinfo,
+ +               rvec *x,
+ +               int sx, int sy, int sz,
+ +               float *bb_work)
+ +{
+ +    int     na, a;
+ +    size_t  offset;
+ +    float  *bb_ptr;
+ +
+ +    na = a1 - a0;
+ +
+ +    if (grid->bSimple)
+ +    {
+ +        sort_on_lj(grid->na_c, a0, a1, atinfo, nbs->a,
+ +                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
+ +    }
+ +
+ +    /* Now we have sorted the atoms, set the cell indices */
+ +    for (a = a0; a < a1; a++)
+ +    {
+ +        nbs->cell[nbs->a[a]] = a;
+ +    }
+ +
+ +    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
+ +                           nbat->XFormat, nbat->x, a0,
+ +                           sx, sy, sz);
+ +
+ +    if (nbat->XFormat == nbatX4)
+ +    {
+ +        /* Store the bounding boxes as xyz.xyz. */
+ +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ +        bb_ptr = grid->bb + offset;
+ +
- #ifdef NBNXN_SEARCH_BB_SSE
++#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
+ +        if (2*grid->na_cj == grid->na_c)
+ +        {
+ +            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
+ +                                          grid->bbj+offset*2);
+ +        }
+ +        else
+ +#endif
+ +        {
+ +            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
+ +        }
+ +    }
+ +    else if (nbat->XFormat == nbatX8)
+ +    {
+ +        /* Store the bounding boxes as xyz.xyz. */
+ +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ +        bb_ptr = grid->bb + offset;
+ +
+ +        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
+ +    }
+ +#ifdef NBNXN_BBXXXX
+ +    else if (!grid->bSimple)
+ +    {
+ +        /* Store the bounding boxes in a format convenient
+ +         * for SSE calculations: xxxxyyyyzzzz...
+ +         * The first term selects the group of STRIDE_PBB boxes,
+ +         * the second term the position of this box within the group.
+ +         */
+ +        bb_ptr =
+ +            grid->bb +
+ +            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+ +            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
+ +
+ +#ifdef NBNXN_SEARCH_SSE_SINGLE
+ +        if (nbat->XFormat == nbatXYZQ)
+ +        {
+ +            calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
+ +                                       bb_work, bb_ptr);
+ +        }
+ +        else
+ +#endif
+ +        {
+ +            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
+ +                                   bb_ptr);
+ +        }
+ +        if (gmx_debug_at)
+ +        {
+ +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
+ +                    sx, sy, sz,
+ +                    bb_ptr[0*STRIDE_PBB], bb_ptr[3*STRIDE_PBB],
+ +                    bb_ptr[1*STRIDE_PBB], bb_ptr[4*STRIDE_PBB],
+ +                    bb_ptr[2*STRIDE_PBB], bb_ptr[5*STRIDE_PBB]);
+ +        }
+ +    }
+ +#endif
+ +    else
+ +    {
+ +        /* Store the bounding boxes as xyz.xyz. */
+ +        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ +
+ +        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
+ +                          bb_ptr);
+ +
+ +        if (gmx_debug_at)
+ +        {
+ +            int bbo;
+ +            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
+ +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
+ +                    sx, sy, sz,
+ +                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
+ +                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
+ +                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
+ +                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
+ +                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
+ +                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
+ +        }
+ +    }
+ +}
+ +
+ +/* Spatially sort the atoms within one grid column.
+ + * For each column cxy in [cxy_start, cxy_end) the atoms are sorted on
+ + * their z coordinate and then handed to fill_cell in chunks of
+ + * grid->na_sc atoms, one chunk per cell of the column.
+ + * grid->bbcz receives two values per cell, taken from the bounding box
+ + * of the last non-empty cell. Atom slots of the column that are not
+ + * used are set to -1 in nbs->a.
+ + * sort_work is the work array passed to sort_atoms.
+ + * a0 and a1 are only used for the debug output here.
+ + */
+ +static void sort_columns_simple(const nbnxn_search_t nbs,
+ +                                int dd_zone,
+ +                                nbnxn_grid_t *grid,
+ +                                int a0, int a1,
+ +                                const int *atinfo,
+ +                                rvec *x,
+ +                                nbnxn_atomdata_t *nbat,
+ +                                int cxy_start, int cxy_end,
+ +                                int *sort_work)
+ +{
+ +    int  cxy;
+ +    int  cx, cy, cz, ncz, cfilled, c;
+ +    int  na, ash, ind, a;
+ +    int  na_c, ash_c;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
+ +                grid->cell0, cxy_start, cxy_end, a0, a1);
+ +    }
+ +
+ +    /* Sort the atoms within each x,y column in 3 dimensions */
+ +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
+ +    {
+ +        cx = cxy/grid->ncy;
+ +        cy = cxy - cx*grid->ncy;
+ +
+ +        na  = grid->cxy_na[cxy];
+ +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
+ +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+ +
+ +        /* Sort the atoms within each x,y column on z coordinate */
+ +        sort_atoms(ZZ, FALSE,
+ +                   nbs->a+ash, na, x,
+ +                   grid->c0[ZZ],
+ +                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
+ +                   sort_work);
+ +
+ +        /* Fill the ncz cells in this column */
+ +        cfilled = grid->cxy_ind[cxy];
+ +        for (cz = 0; cz < ncz; cz++)
+ +        {
+ +            c  = grid->cxy_ind[cxy] + cz;
+ +
+ +            ash_c = ash + cz*grid->na_sc;
+ +            na_c  = min(grid->na_sc, na-(ash_c-ash));
+ +
+ +            fill_cell(nbs, grid, nbat,
+ +                      ash_c, ash_c+na_c, atinfo, x,
+ +                      grid->na_sc*cx + (dd_zone >> 2),
+ +                      grid->na_sc*cy + (dd_zone & 3),
+ +                      grid->na_sc*cz,
+ +                      NULL);
+ +
+ +            /* This copy to bbcz is not really necessary.
+ +             * But it allows to use the same grid search code
+ +             * for the simple and supersub cell setups.
+ +             * For an empty cell the values of the last non-empty cell
+ +             * (cfilled) are used. Elements 2 and 6 of the box are
+ +             * presumably the lower and upper z bound -- TODO confirm
+ +             * against the BBL_Z/BBU_Z definitions.
+ +             */
+ +            if (na_c > 0)
+ +            {
+ +                cfilled = c;
+ +            }
+ +            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
+ +            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
+ +        }
+ +
+ +        /* Set the unused atom indices to -1 */
+ +        for (ind = na; ind < ncz*grid->na_sc; ind++)
+ +        {
+ +            nbs->a[ash+ind] = -1;
+ +        }
+ +    }
+ +}
+ +
+ +/* Spatially sort the atoms within one grid column.
+ + * For each column cxy in [cxy_start, cxy_end) the atoms are first sorted
+ + * on z. The column is then cut into slabs of subdiv_z atoms, each slab is
+ + * sorted on y (when GPU_NSUBCELL_Y > 1) and cut into rows of subdiv_y
+ + * atoms, each row is sorted on x (when GPU_NSUBCELL_X > 1) and cut into
+ + * sub-cells of grid->na_c atoms that are handed to fill_cell.
+ + * The y and x sorts alternate between increasing and decreasing order.
+ + * Per super-cell, grid->nsubc and the z-bounds in grid->bbcz are set.
+ + * Atom slots of the column that are not used are set to -1 in nbs->a.
+ + * sort_work is the work array passed to sort_atoms.
+ + * a0 and a1 are only used for the debug output here.
+ + */
+ +static void sort_columns_supersub(const nbnxn_search_t nbs,
+ +                                  int dd_zone,
+ +                                  nbnxn_grid_t *grid,
+ +                                  int a0, int a1,
+ +                                  const int *atinfo,
+ +                                  rvec *x,
+ +                                  nbnxn_atomdata_t *nbat,
+ +                                  int cxy_start, int cxy_end,
+ +                                  int *sort_work)
+ +{
+ +    int  cxy;
+ +    int  cx, cy, cz = -1, c = -1, ncz;
+ +    int  na, ash, na_c, ind, a;
+ +    int  subdiv_z, sub_z, na_z, ash_z;
+ +    int  subdiv_y, sub_y, na_y, ash_y;
+ +    int  subdiv_x, sub_x, na_x, ash_x;
+ +
+ +    /* cppcheck-suppress unassignedVariable */
+ +    float bb_work_array[NNBSBB_B+3], *bb_work_align;
+ +
+ +    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
+ +                grid->cell0, cxy_start, cxy_end, a0, a1);
+ +    }
+ +
+ +    subdiv_x = grid->na_c;
+ +    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
+ +    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
+ +
+ +    /* Sort the atoms within each x,y column in 3 dimensions */
+ +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
+ +    {
+ +        cx = cxy/grid->ncy;
+ +        cy = cxy - cx*grid->ncy;
+ +
+ +        na  = grid->cxy_na[cxy];
+ +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
+ +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+ +
+ +        /* Sort the atoms within each x,y column on z coordinate */
+ +        sort_atoms(ZZ, FALSE,
+ +                   nbs->a+ash, na, x,
+ +                   grid->c0[ZZ],
+ +                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
+ +                   sort_work);
+ +
+ +        /* This loop goes over the supercells and subcells along z at once */
+ +        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
+ +        {
+ +            ash_z = ash + sub_z*subdiv_z;
+ +            na_z  = min(subdiv_z, na-(ash_z-ash));
+ +
+ +            /* We have already sorted on z */
+ +
+ +            if (sub_z % GPU_NSUBCELL_Z == 0)
+ +            {
+ +                cz = sub_z/GPU_NSUBCELL_Z;
+ +                c  = grid->cxy_ind[cxy] + cz;
+ +
+ +                /* The number of atoms in this supercell */
+ +                na_c = min(grid->na_sc, na-(ash_z-ash));
+ +
+ +                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
+ +
+ +                /* Store the z-boundaries of the super cell:
+ +                 * the z coordinates of its first and last atom
+ +                 * in the z-sorted order.
+ +                 */
+ +                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
+ +                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
+ +            }
+ +
+ +#if GPU_NSUBCELL_Y > 1
+ +            /* Sort the atoms along y */
+ +            sort_atoms(YY, (sub_z & 1),
+ +                       nbs->a+ash_z, na_z, x,
+ +                       grid->c0[YY]+cy*grid->sy,
+ +                       grid->inv_sy, subdiv_z,
+ +                       sort_work);
+ +#endif
+ +
+ +            for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
+ +            {
+ +                ash_y = ash_z + sub_y*subdiv_y;
+ +                na_y  = min(subdiv_y, na-(ash_y-ash));
+ +
+ +#if GPU_NSUBCELL_X > 1
+ +                /* Sort the atoms along x */
+ +                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
+ +                           nbs->a+ash_y, na_y, x,
+ +                           grid->c0[XX]+cx*grid->sx,
+ +                           grid->inv_sx, subdiv_y,
+ +                           sort_work);
+ +#endif
+ +
+ +                for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
+ +                {
+ +                    ash_x = ash_y + sub_x*subdiv_x;
+ +                    na_x  = min(subdiv_x, na-(ash_x-ash));
+ +
+ +                    fill_cell(nbs, grid, nbat,
+ +                              ash_x, ash_x+na_x, atinfo, x,
+ +                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
+ +                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
+ +                              grid->na_c*sub_z,
+ +                              bb_work_align);
+ +                }
+ +            }
+ +        }
+ +
+ +        /* Set the unused atom indices to -1 */
+ +        for (ind = na; ind < ncz*grid->na_sc; ind++)
+ +        {
+ +            nbs->a[ash+ind] = -1;
+ +        }
+ +    }
+ +}
+ +
+ +/* Determine in which grid column atoms should go.
+ + * Handles the share of the atom range a0 up to, but not including, a1
+ + * that belongs to thread number thread out of nthread.
+ + * For every atom i handled, cell[i] is set to its column index
+ + * cx*grid->ncy + cy and the matching entry of cxy_na is incremented.
+ + * cxy_na is zeroed first and has grid->ncx*grid->ncy + 1 entries;
+ + * the extra last entry counts home-zone atoms with move[i] < 0.
+ + * move may be NULL, in which case no atom is treated as moved.
+ + * move is only consulted for dd_zone == 0.
+ + */
+ +static void calc_column_indices(nbnxn_grid_t *grid,
+ +                                int a0, int a1,
+ +                                rvec *x,
+ +                                int dd_zone, const int *move,
+ +                                int thread, int nthread,
+ +                                int *cell,
+ +                                int *cxy_na)
+ +{
+ +    int  n0, n1, i;
+ +    int  cx, cy;
+ +
+ +    /* We add one extra cell for particles which moved during DD */
+ +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
+ +    {
+ +        cxy_na[i] = 0;
+ +    }
+ +
+ +    /* This thread handles the atoms n0 up to, but not including, n1 */
+ +    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
+ +    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
+ +    if (dd_zone == 0)
+ +    {
+ +        /* Home zone */
+ +        for (i = n0; i < n1; i++)
+ +        {
+ +            if (move == NULL || move[i] >= 0)
+ +            {
+ +                /* We need to be careful with rounding,
+ +                 * particles might be a few bits outside the local zone.
+ +                 * The int cast takes care of the lower bound,
+ +                 * we will explicitly take care of the upper bound.
+ +                 */
+ +                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ +                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+ +
+ +#ifndef NDEBUG
+ +                if (cx < 0 || cx > grid->ncx ||
+ +                    cy < 0 || cy > grid->ncy)
+ +                {
+ +                    gmx_fatal(FARGS,
+ +                              "grid cell cx %d cy %d out of range (max %d %d)\n"
+ +                              "atom %f %f %f, grid->c0 %f %f",
+ +                              cx, cy, grid->ncx, grid->ncy,
+ +                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
+ +                }
+ +#endif
+ +                /* Take care of potential rounding issues */
+ +                cx = min(cx, grid->ncx - 1);
+ +                cy = min(cy, grid->ncy - 1);
+ +
+ +                /* For the moment cell will contain only the, grid local,
+ +                 * x and y indices, not z.
+ +                 */
+ +                cell[i] = cx*grid->ncy + cy;
+ +            }
+ +            else
+ +            {
+ +                /* Put this moved particle after the end of the grid,
+ +                 * so we can process it later without using conditionals.
+ +                 */
+ +                cell[i] = grid->ncx*grid->ncy;
+ +            }
+ +
+ +            cxy_na[cell[i]]++;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* Non-home zone */
+ +        for (i = n0; i < n1; i++)
+ +        {
+ +            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ +            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+ +
+ +            /* For non-home zones there could be particles outside
+ +             * the non-bonded cut-off range, which have been communicated
+ +             * for bonded interactions only. For the result it doesn't
+ +             * matter where these end up on the grid. For performance
+ +             * we put them in an extra row at the border.
+ +             */
+ +            cx = max(cx, 0);
+ +            cx = min(cx, grid->ncx - 1);
+ +            cy = max(cy, 0);
+ +            cy = min(cy, grid->ncy - 1);
+ +
+ +            /* For the moment cell will contain only the, grid local,
+ +             * x and y indices, not z.
+ +             */
+ +            cell[i] = cx*grid->ncy + cy;
+ +
+ +            cxy_na[cell[i]]++;
+ +        }
+ +    }
+ +}
+ +
+ +/* Determine in which grid cells the atoms should go */
+ +static void calc_cell_indices(const nbnxn_search_t nbs,
+ +                              int dd_zone,
+ +                              nbnxn_grid_t *grid,
+ +                              int a0, int a1,
+ +                              const int *atinfo,
+ +                              rvec *x,
+ +                              const int *move,
+ +                              nbnxn_atomdata_t *nbat)
+ +{
+ +    int   n0, n1, i;
+ +    int   cx, cy, cxy, ncz_max, ncz;
+ +    int   nthread, thread;
+ +    int  *cxy_na, cxy_na_i;
+ +
+ +    nthread = gmx_omp_nthreads_get(emntPairsearch);
+ +
+ +#pragma omp parallel for num_threads(nthread) schedule(static)
+ +    for (thread = 0; thread < nthread; thread++)
+ +    {
+ +        calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
+ +                            nbs->cell, nbs->work[thread].cxy_na);
+ +    }
+ +
+ +    /* Make the cell index as a function of x and y */
+ +    ncz_max          = 0;
+ +    ncz              = 0;
+ +    grid->cxy_ind[0] = 0;
+ +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
+ +    {
+ +        /* We set ncz_max at the beginning of the loop iso at the end
+ +         * to skip i=grid->ncx*grid->ncy which are moved particles
+ +         * that do not need to be ordered on the grid.
+ +         */
+ +        if (ncz > ncz_max)
+ +        {
+ +            ncz_max = ncz;
+ +        }
+ +        cxy_na_i = nbs->work[0].cxy_na[i];
+ +        for (thread = 1; thread < nthread; thread++)
+ +        {
+ +            cxy_na_i += nbs->work[thread].cxy_na[i];
+ +        }
+ +        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
+ +        if (nbat->XFormat == nbatX8)
+ +        {
+ +            /* Make the number of cell a multiple of 2 */
+ +            ncz = (ncz + 1) & ~1;
+ +        }
+ +        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
+ +        /* Clear cxy_na, so we can reuse the array below */
+ +        grid->cxy_na[i] = 0;
+ +    }
+ +    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
+ +
+ +    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
+ +                grid->na_sc, grid->na_c, grid->nc,
+ +                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
+ +                ncz_max);
+ +        if (gmx_debug_at)
+ +        {
+ +            i = 0;
+ +            for (cy = 0; cy < grid->ncy; cy++)
+ +            {
+ +                for (cx = 0; cx < grid->ncx; cx++)
+ +                {
+ +                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
+ +                    i++;
+ +                }
+ +                fprintf(debug, "\n");
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Make sure the work array for sorting is large enough */
+ +    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
+ +    {
+ +        for (thread = 0; thread < nbs->nthread_max; thread++)
+ +        {
+ +            nbs->work[thread].sort_work_nalloc =
+ +                over_alloc_large(ncz_max*grid->na_sc*SGSF);
+ +            srenew(nbs->work[thread].sort_work,
+ +                   nbs->work[thread].sort_work_nalloc);
+ +            /* When not in use, all elements should be -1 */
+ +            for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
+ +            {
+ +                nbs->work[thread].sort_work[i] = -1;
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Now we know the dimensions we can fill the grid.
+ +     * This is the first, unsorted fill. We sort the columns after this.
+ +     */
+ +    for (i = a0; i < a1; i++)
+ +    {
+ +        /* At this point nbs->cell contains the local grid x,y indices */
+ +        cxy = nbs->cell[i];
+ +        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
+ +    }
+ +
+ +    if (dd_zone == 0)
+ +    {
+ +        /* Set the cell indices for the moved particles */
+ +        n0 = grid->nc*grid->na_sc;
+ +        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+ +        if (dd_zone == 0)
+ +        {
+ +            for (i = n0; i < n1; i++)
+ +            {
+ +                nbs->cell[nbs->a[i]] = i;
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Sort the super-cell columns along z into the sub-cells. */
+ +#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
+ +    for (thread = 0; thread < nbs->nthread_max; thread++)
+ +    {
+ +        if (grid->bSimple)
+ +        {
+ +            sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
+ +                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
+ +                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
+ +                                nbs->work[thread].sort_work);
+ +        }
+ +        else
+ +        {
+ +            sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
+ +                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
+ +                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
+ +                                  nbs->work[thread].sort_work);
+ +        }
+ +    }
+ +
- #endif
+ +    if (grid->bSimple && nbat->XFormat == nbatX8)
+ +    {
+ +        combine_bounding_box_pairs(grid, grid->bb);
+ +    }
- #ifdef NBNXN_SEARCH_BB_SSE
+ +
+ +    if (!grid->bSimple)
+ +    {
+ +        grid->nsubc_tot = 0;
+ +        for (i = 0; i < grid->nc; i++)
+ +        {
+ +            grid->nsubc_tot += grid->nsubc[i];
+ +        }
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        if (grid->bSimple)
+ +        {
+ +            print_bbsizes_simple(debug, nbs, grid);
+ +        }
+ +        else
+ +        {
+ +            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
+ +                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
+ +
+ +            print_bbsizes_supersub(debug, nbs, grid);
+ +        }
+ +    }
+ +}
+ +
+ +/* Sizes the buffer flag array for natoms atoms and clears every flag.
+ + * One flag covers NBNXN_BUFFERFLAG_SIZE atoms (count rounded up); the
+ + * array is only reallocated when the current allocation is too small.
+ + */
+ +static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
+ +                              int                   natoms)
+ +{
+ +    int iflag;
+ +
+ +    /* Ceiling division: number of flag blocks needed to cover natoms */
+ +    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
+ +
+ +    if (flags->flag_nalloc < flags->nflag)
+ +    {
+ +        flags->flag_nalloc = over_alloc_large(flags->nflag);
+ +        srenew(flags->flag, flags->flag_nalloc);
+ +    }
+ +
+ +    iflag = 0;
+ +    while (iflag < flags->nflag)
+ +    {
+ +        flags->flag[iflag] = 0;
+ +        iflag++;
+ +    }
+ +}
+ +
+ +/* Sets up a grid and puts the atoms on the grid.
+ + * This function only operates on one domain of the domain decomposition.
+ + * Note that without domain decomposition there is only one domain.
+ + *
+ + * dd_zone selects the grid nbs->grid[dd_zone]; zone 0 additionally stores
+ + * ePBC and box in nbs and sets the atom density used for all zones.
+ + * corner0/corner1 are forwarded to set_grid_size_xy (and, when
+ + * atom_density < 0 for zone 0, to grid_atom_density).
+ + * a0/a1 give the half-open atom range that is gridded.
+ + * nmoved and move are only meaningful for zone 0; move is forwarded to
+ + * calc_cell_indices.
+ + * nb_kernel_type determines the cluster sizes and whether the grid is
+ + * of the simple type.
+ + * The whole call is timed with the enbsCCgrid cycle counter.
+ + */
+ +void nbnxn_put_on_grid(nbnxn_search_t nbs,
+ +                       int ePBC, matrix box,
+ +                       int dd_zone,
+ +                       rvec corner0, rvec corner1,
+ +                       int a0, int a1,
+ +                       real atom_density,
+ +                       const int *atinfo,
+ +                       rvec *x,
+ +                       int nmoved, int *move,
+ +                       int nb_kernel_type,
+ +                       nbnxn_atomdata_t *nbat)
+ +{
+ +    nbnxn_grid_t *grid;
+ +    int           n;
+ +    int           nc_max_grid, nc_max;
+ +
+ +    grid = &nbs->grid[dd_zone];
+ +
+ +    nbs_cycle_start(&nbs->cc[enbsCCgrid]);
+ +
+ +    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
+ +
+ +    /* Cluster sizes; na_sc is the number of atoms per (super-)cell:
+ +     * one cluster for simple grids, GPU_NSUBCELL clusters otherwise.
+ +     */
+ +    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
+ +    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
+ +    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
+ +    grid->na_c_2log = get_2log(grid->na_c);
+ +
+ +    nbat->na_c = grid->na_c;
+ +
+ +    if (dd_zone == 0)
+ +    {
+ +        grid->cell0 = 0;
+ +    }
+ +    else
+ +    {
+ +        /* Start after the last cell of the previous zone, converted
+ +         * from the previous grid's cell size to this grid's cell size.
+ +         */
+ +        grid->cell0 =
+ +            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
+ +            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
+ +    }
+ +
+ +    n = a1 - a0;
+ +
+ +    if (dd_zone == 0)
+ +    {
+ +        nbs->ePBC = ePBC;
+ +        copy_mat(box, nbs->box);
+ +
+ +        /* A negative atom_density requests computing it from the atoms */
+ +        if (atom_density >= 0)
+ +        {
+ +            grid->atom_density = atom_density;
+ +        }
+ +        else
+ +        {
+ +            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
+ +        }
+ +
+ +        /* NOTE(review): cell0 was already set to 0 above for dd_zone 0 */
+ +        grid->cell0 = 0;
+ +
+ +        nbs->natoms_local    = a1 - nmoved;
+ +        /* We assume that nbnxn_put_on_grid is called first
+ +         * for the local atoms (dd_zone=0).
+ +         */
+ +        nbs->natoms_nonlocal = a1 - nmoved;
+ +    }
+ +    else
+ +    {
+ +        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);
+ +    }
+ +
+ +    /* All zones use the atom density of the zone 0 grid */
+ +    nc_max_grid = set_grid_size_xy(nbs, grid,
+ +                                   dd_zone, n-nmoved, corner0, corner1,
+ +                                   nbs->grid[0].atom_density);
+ +
+ +    nc_max = grid->cell0 + nc_max_grid;
+ +
+ +    /* nbs->cell is indexed by atom index, so it needs a1 elements */
+ +    if (a1 > nbs->cell_nalloc)
+ +    {
+ +        nbs->cell_nalloc = over_alloc_large(a1);
+ +        srenew(nbs->cell, nbs->cell_nalloc);
+ +    }
+ +
+ +    /* To avoid conditionals we store the moved particles at the end of a,
+ +     * make sure we have enough space.
+ +     */
+ +    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
+ +    {
+ +        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
+ +        srenew(nbs->a, nbs->a_nalloc);
+ +    }
+ +
+ +    /* We need padding up to a multiple of the buffer flag size: simply add */
+ +    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
+ +    {
+ +        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
+ +    }
+ +
+ +    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
+ +
+ +    if (dd_zone == 0)
+ +    {
+ +        /* nbat->natoms was set by calc_cell_indices for the local grid */
+ +        nbat->natoms_local = nbat->natoms;
+ +    }
+ +
+ +    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
+ +}
+ +
+ +/* Grids all non-local domain decomposition zones (1 up to zones->n - 1).
+ + * For each zone nbnxn_put_on_grid is called with the zone bounding box
+ + * corners bb_x0/bb_x1, the atom range cg_range[zone]..cg_range[zone+1],
+ + * a negative atom density, no box and no moved atoms.
+ + */
+ +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t            nbs,
+ +                                const gmx_domdec_zones_t *zones,
+ +                                const int                *atinfo,
+ +                                rvec                     *x,
+ +                                int                       nb_kernel_type,
+ +                                nbnxn_atomdata_t         *nbat)
+ +{
+ +    int  izone, dim;
+ +    rvec corner_lo, corner_hi;
+ +
+ +    izone = 1;
+ +    while (izone < zones->n)
+ +    {
+ +        /* Copy the zone bounding box into local corner vectors */
+ +        for (dim = 0; dim < DIM; dim++)
+ +        {
+ +            corner_lo[dim] = zones->size[izone].bb_x0[dim];
+ +            corner_hi[dim] = zones->size[izone].bb_x1[dim];
+ +        }
+ +
+ +        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
+ +                          izone, corner_lo, corner_hi,
+ +                          zones->cg_range[izone],
+ +                          zones->cg_range[izone+1],
+ +                          -1,
+ +                          atinfo,
+ +                          x,
+ +                          0, NULL,
+ +                          nb_kernel_type,
+ +                          nbat);
+ +
+ +        izone++;
+ +    }
+ +}
+ +
+ +/* Add simple grid type information to the local super/sub grid.
+ + *
+ + * Operates on nbs->grid[0], which must not be a simple grid.
+ + * Every cell is split into ncd clusters of NBNXN_CPU_CLUSTER_I_SIZE atoms;
+ + * for each cluster the bounding box (bb_simple), its z-bounds (bbcz_simple)
+ + * and interaction flags (flags_simple) are filled in.
+ + */
+ +void nbnxn_grid_add_simple(nbnxn_search_t    nbs,
+ +                           nbnxn_atomdata_t *nbat)
+ +{
+ +    nbnxn_grid_t *grid;
+ +    float        *bbcz, *bb;
+ +    int           ncd, sc;
+ +
+ +    grid = &nbs->grid[0];
+ +
+ +    if (grid->bSimple)
+ +    {
+ +        gmx_incons("nbnxn_grid_simple called with a simple grid");
+ +    }
+ +
+ +    /* Number of CPU-size clusters per cell */
+ +    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
+ +
+ +    if (grid->nc*ncd > grid->nc_nalloc_simple)
+ +    {
+ +        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
+ +        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
+ +        srenew(grid->bb_simple, grid->nc_nalloc_simple*NNBSBB_B);
+ +        srenew(grid->flags_simple, grid->nc_nalloc_simple);
+ +        /* NOTE(review): this tests XFormat for any non-zero value, whereas
+ +         * the other checks in this file compare against nbatX8 - confirm
+ +         * which formats need bbj.
+ +         */
+ +        if (nbat->XFormat)
+ +        {
+ +            sfree_aligned(grid->bbj);
+ +            snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
+ +        }
+ +    }
+ +
+ +    bbcz = grid->bbcz_simple;
+ +    bb   = grid->bb_simple;
+ +
+ +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ +    for (sc = 0; sc < grid->nc; sc++)
+ +    {
+ +        int c, tx, na;
+ +
+ +        for (c = 0; c < ncd; c++)
+ +        {
+ +            /* Index of this cluster in the simple arrays */
+ +            tx = sc*ncd + c;
+ +
+ +            /* Count the atoms in the cluster by skipping trailing atoms
+ +             * of type ntype-1 (presumably filler atoms - TODO confirm).
+ +             */
+ +            na = NBNXN_CPU_CLUSTER_I_SIZE;
+ +            while (na > 0 &&
+ +                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
+ +            {
+ +                na--;
+ +            }
+ +
+ +            if (na > 0)
+ +            {
+ +                /* Pick the bounding box routine matching the coordinate layout */
+ +                switch (nbat->XFormat)
+ +                {
+ +                    case nbatX4:
+ +                        /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
+ +                        calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
+ +                                               bb+tx*NNBSBB_B);
+ +                        break;
+ +                    case nbatX8:
+ +                        /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
+ +                        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
+ +                                               bb+tx*NNBSBB_B);
+ +                        break;
+ +                    default:
+ +                        calc_bounding_box(na, nbat->xstride,
+ +                                          nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
+ +                                          bb+tx*NNBSBB_B);
+ +                        break;
+ +                }
+ +                /* Copy the lower and upper z-bound of the bounding box */
+ +                bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
+ +                bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
+ +
+ +                /* No interaction optimization yet here */
+ +                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
+ +            }
+ +            else
+ +            {
+ +                /* Cluster without atoms: no interactions */
+ +                grid->flags_simple[tx] = 0;
+ +            }
+ +        }
+ +    }
+ +
- #endif
+ +    /* NOTE(review): grid->bSimple was checked at the top of this function;
+ +     * if gmx_incons does not return, this condition can never be true and
+ +     * combine_bounding_box_pairs is never called here - confirm intent.
+ +     */
+ +    if (grid->bSimple && nbat->XFormat == nbatX8)
+ +    {
+ +        combine_bounding_box_pairs(grid, grid->bb_simple);
+ +    }
-         excl->pair[t] = NBNXN_INT_MASK_ALL;
+ +}
+ +
+ +/* Returns the number of grid columns along x and y of grid 0 */
+ +void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
+ +{
+ +    const nbnxn_grid_t *home_grid;
+ +
+ +    home_grid = &nbs->grid[0];
+ +
+ +    *ncx = home_grid->ncx;
+ +    *ncy = home_grid->ncy;
+ +}
+ +
+ +/* Returns the atom order array and its length for the home cell (grid 0).
+ + * *a points to the search structure's own array (no copy is made);
+ + * *n is the number of cells on grid 0 times the atoms per cell.
+ + */
+ +void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
+ +{
+ +    const nbnxn_grid_t *home_grid = &nbs->grid[0];
+ +    int                 ncolumns;
+ +
+ +    *a = nbs->a;
+ +
+ +    /* cxy_ind of the last column + 1 is the total cell count */
+ +    ncolumns = home_grid->ncx*home_grid->ncy;
+ +    *n       = home_grid->cxy_ind[ncolumns]*home_grid->na_sc;
+ +}
+ +
+ +/* Sets the atom order for the home cell (grid 0) to consecutive atom
+ + * indices: walking the columns in storage order, the atoms of each
+ + * column get the next cxy_na[column] indices, and nbs->cell is set to
+ + * the inverse mapping.
+ + */
+ +void nbnxn_set_atomorder(nbnxn_search_t nbs)
+ +{
+ +    nbnxn_grid_t *grid;
+ +    int           atom, column, ncolumns, slot, slot_end;
+ +
+ +    grid = &nbs->grid[0];
+ +
+ +    atom     = 0;
+ +    ncolumns = grid->ncx*grid->ncy;
+ +    /* Column index cx*ncy + cy increases by one per (cx, cy) step,
+ +     * so a single loop visits the columns in the same order.
+ +     */
+ +    for (column = 0; column < ncolumns; column++)
+ +    {
+ +        slot     = grid->cxy_ind[column]*grid->na_sc;
+ +        slot_end = slot + grid->cxy_na[column];
+ +        while (slot < slot_end)
+ +        {
+ +            nbs->a[slot]    = atom;
+ +            nbs->cell[atom] = slot;
+ +            atom++;
+ +            slot++;
+ +        }
+ +    }
+ +}
+ +
+ +/* Determines the first (*cf) and last (*cl) cell along one dimension
+ + * that the bounding box b0 - b1 sees.
+ + * The grid has nc cells of size s (inverse invs) starting at c0.
+ + * Starting from the cells containing b0 and b1, the range is extended
+ + * while d2 plus the squared distance to the next cell boundary is
+ + * smaller than r2.
+ + */
+ +static void get_cell_range(real b0, real b1,
+ +                           int nc, real c0, real s, real invs,
+ +                           real d2, real r2, int *cf, int *cl)
+ +{
+ +    int first, last;
+ +
+ +    /* Lower end: clamp to the grid, then extend downwards */
+ +    first = (int)((b0 - c0)*invs);
+ +    if (first < 0)
+ +    {
+ +        first = 0;
+ +    }
+ +    while (first > 0 && d2 + sqr((b0 - c0) - first*s) < r2)
+ +    {
+ +        first--;
+ +    }
+ +    *cf = first;
+ +
+ +    /* Upper end: clamp to the grid, then extend upwards */
+ +    last = (int)((b1 - c0)*invs);
+ +    if (last > nc-1)
+ +    {
+ +        last = nc-1;
+ +    }
+ +    while (last < nc-1 && d2 + sqr((last+1)*s - (b1 - c0)) < r2)
+ +    {
+ +        last++;
+ +    }
+ +    *cl = last;
+ +}
+ +
+ +/* Reference code calculating the distance^2 between two bounding boxes.
+ + * The first box is given by its bounds bx0..bx1, by0..by1, bz0..bz1,
+ + * the second by the array bb indexed with BBL_* and BBU_*.
+ + * Per dimension the gap between the boxes is clamped at zero, so
+ + * overlapping boxes contribute nothing for that dimension.
+ + */
+ +static float box_dist2(float bx0, float bx1, float by0,
+ +                       float by1, float bz0, float bz1,
+ +                       const float *bb)
+ +{
+ +    float dist2;
+ +    float gap_lo, gap_hi, gap;
+ +
+ +    dist2 = 0;
+ +
+ +    /* x */
+ +    gap_lo = bx0 - bb[BBU_X];
+ +    gap_hi = bb[BBL_X] - bx1;
+ +    gap    = max(gap_lo, gap_hi);
+ +    gap    = max(gap, 0);
+ +    dist2 += gap*gap;
+ +
+ +    /* y */
+ +    gap_lo = by0 - bb[BBU_Y];
+ +    gap_hi = bb[BBL_Y] - by1;
+ +    gap    = max(gap_lo, gap_hi);
+ +    gap    = max(gap, 0);
+ +    dist2 += gap*gap;
+ +
+ +    /* z */
+ +    gap_lo = bz0 - bb[BBU_Z];
+ +    gap_hi = bb[BBL_Z] - bz1;
+ +    gap    = max(gap_lo, gap_hi);
+ +    gap    = max(gap, 0);
+ +    dist2 += gap*gap;
+ +
+ +    return dist2;
+ +}
+ +
+ +/* Plain C code calculating the distance^2 between two bounding boxes:
+ + * box si in the list bb_i_ci and box csj in the list bb_j_all.
+ + * Each box occupies NNBSBB_B floats, indexed with BBL_* and BBU_*.
+ + * Per dimension the gap between the boxes is clamped at zero.
+ + */
+ +static float subc_bb_dist2(int si, const float *bb_i_ci,
+ +                           int csj, const float *bb_j_all)
+ +{
+ +    const float *box_i = bb_i_ci  +  si*NNBSBB_B;
+ +    const float *box_j = bb_j_all + csj*NNBSBB_B;
+ +    float        dist2;
+ +    float        gap_lo, gap_hi, gap;
+ +
+ +    dist2 = 0;
+ +
+ +    /* x */
+ +    gap_lo = box_i[BBL_X] - box_j[BBU_X];
+ +    gap_hi = box_j[BBL_X] - box_i[BBU_X];
+ +    gap    = max(gap_lo, gap_hi);
+ +    gap    = max(gap, 0);
+ +    dist2 += gap*gap;
+ +
+ +    /* y */
+ +    gap_lo = box_i[BBL_Y] - box_j[BBU_Y];
+ +    gap_hi = box_j[BBL_Y] - box_i[BBU_Y];
+ +    gap    = max(gap_lo, gap_hi);
+ +    gap    = max(gap, 0);
+ +    dist2 += gap*gap;
+ +
+ +    /* z */
+ +    gap_lo = box_i[BBL_Z] - box_j[BBU_Z];
+ +    gap_hi = box_j[BBL_Z] - box_i[BBU_Z];
+ +    gap    = max(gap_lo, gap_hi);
+ +    gap    = max(gap, 0);
+ +    dist2 += gap*gap;
+ +
+ +    return dist2;
+ +}
+ +
+ +#ifdef NBNXN_SEARCH_BB_SSE
+ +
+ +/* SSE code for bb distance for bb format xyz0.
+ + * Returns the distance^2 between box si of bb_i_ci and box csj of bb_j_all.
+ + * Both boxes are read with aligned loads: 4 floats at the box start and
+ + * 4 floats at offset NNBSBB_C.
+ + * Only components 0, 1 and 2 of the result are summed.
+ + */
+ +static float subc_bb_dist2_sse(int si, const float *bb_i_ci,
+ +                               int csj, const float *bb_j_all)
+ +{
+ +    const float *bb_i, *bb_j;
+ +
+ +    __m128       bb_i_SSE0, bb_i_SSE1;
+ +    __m128       bb_j_SSE0, bb_j_SSE1;
+ +    __m128       dl_SSE;
+ +    __m128       dh_SSE;
+ +    __m128       dm_SSE;
+ +    __m128       dm0_SSE;
+ +    __m128       d2_SSE;
+ +#ifndef GMX_X86_SSE4_1
+ +    float        d2_array[7], *d2_align;
+ +
+ +    /* Round an address inside d2_array down to a 16-byte boundary,
+ +     * as required by the aligned store below.
+ +     */
+ +    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
+ +#else
+ +    float d2;
+ +#endif
+ +
+ +    bb_i = bb_i_ci  +  si*NNBSBB_B;
+ +    bb_j = bb_j_all + csj*NNBSBB_B;
+ +
+ +    bb_i_SSE0 = _mm_load_ps(bb_i);
+ +    bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
+ +    bb_j_SSE0 = _mm_load_ps(bb_j);
+ +    bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
+ +
+ +    /* Gaps between the boxes in both directions, for all components */
+ +    dl_SSE    = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
+ +    dh_SSE    = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
+ +
+ +    /* Largest gap per component, clamped at zero */
+ +    dm_SSE    = _mm_max_ps(dl_SSE, dh_SSE);
+ +    dm0_SSE   = _mm_max_ps(dm_SSE, _mm_setzero_ps());
+ +#ifndef GMX_X86_SSE4_1
+ +    d2_SSE    = _mm_mul_ps(dm0_SSE, dm0_SSE);
+ +
+ +    _mm_store_ps(d2_align, d2_SSE);
+ +
+ +    /* The fourth component is ignored */
+ +    return d2_align[0] + d2_align[1] + d2_align[2];
+ +#else
+ +    /* SSE4.1 dot product of components 0,1,2 */
+ +    d2_SSE    = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71);
+ +
+ +    _mm_store_ss(&d2, d2_SSE);
+ +
+ +    return d2;
+ +#endif
+ +}
+ +
+ +/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2.
+ + * The squared distances to the j-box are written to d2[si,...,si+3]
+ + * with an aligned store.
+ + * The macro does not declare xi_l, yi_l, zi_l, xi_h, yi_h, zi_h,
+ + * xj_l, yj_l, zj_l, xj_h, yj_h, zj_h and zero: these __m128 variables
+ + * must exist in the enclosing scope, with the xj/yj/zj ones and zero
+ + * already set.
+ + */
+ +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \
+ +    {                                                \
+ +        int    shi;                                  \
+ +                                                 \
+ +        __m128 dx_0, dy_0, dz_0;                       \
+ +        __m128 dx_1, dy_1, dz_1;                       \
+ +                                                 \
+ +        __m128 mx, my, mz;                             \
+ +        __m128 m0x, m0y, m0z;                          \
+ +                                                 \
+ +        __m128 d2x, d2y, d2z;                          \
+ +        __m128 d2s, d2t;                              \
+ +                                                 \
+ +        shi = si*NNBSBB_D*DIM;                       \
+ +                                                 \
+ +        xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
+ +        yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
+ +        zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
+ +        xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
+ +        yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
+ +        zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
+ +                                                 \
+ +        dx_0 = _mm_sub_ps(xi_l, xj_h);                \
+ +        dy_0 = _mm_sub_ps(yi_l, yj_h);                \
+ +        dz_0 = _mm_sub_ps(zi_l, zj_h);                \
+ +                                                 \
+ +        dx_1 = _mm_sub_ps(xj_l, xi_h);                \
+ +        dy_1 = _mm_sub_ps(yj_l, yi_h);                \
+ +        dz_1 = _mm_sub_ps(zj_l, zi_h);                \
+ +                                                 \
+ +        mx   = _mm_max_ps(dx_0, dx_1);                \
+ +        my   = _mm_max_ps(dy_0, dy_1);                \
+ +        mz   = _mm_max_ps(dz_0, dz_1);                \
+ +                                                 \
+ +        m0x  = _mm_max_ps(mx, zero);                  \
+ +        m0y  = _mm_max_ps(my, zero);                  \
+ +        m0z  = _mm_max_ps(mz, zero);                  \
+ +                                                 \
+ +        d2x  = _mm_mul_ps(m0x, m0x);                  \
+ +        d2y  = _mm_mul_ps(m0y, m0y);                  \
+ +        d2z  = _mm_mul_ps(m0z, m0z);                  \
+ +                                                 \
+ +        d2s  = _mm_add_ps(d2x, d2y);                  \
+ +        d2t  = _mm_add_ps(d2s, d2z);                  \
+ +                                                 \
+ +        _mm_store_ps(d2+si, d2t);                     \
+ +    }
+ +
+ +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz.
+ + * Computes the distance^2 between the single j-box bb_j and the i-boxes
+ + * in bb_i and stores the results in d2.
+ + * The first STRIDE_PBB i-boxes are always processed; a second group of
+ + * STRIDE_PBB is processed only when nsi > STRIDE_PBB.
+ + * bb_i and d2 are accessed with aligned loads and stores.
+ + */
+ +static void subc_bb_dist2_sse_xxxx(const float *bb_j,
+ +                                   int nsi, const float *bb_i,
+ +                                   float *d2)
+ +{
+ +    /* These variables are used by SUBC_BB_DIST2_SSE_XXXX_INNER */
+ +    __m128 xj_l, yj_l, zj_l;
+ +    __m128 xj_h, yj_h, zj_h;
+ +    __m128 xi_l, yi_l, zi_l;
+ +    __m128 xi_h, yi_h, zi_h;
+ +
+ +    __m128 zero;
+ +
+ +    zero = _mm_setzero_ps();
+ +
+ +    /* Broadcast each bound of the j-box to all four SIMD lanes */
+ +    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+ +    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+ +    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+ +    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+ +    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+ +    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
+ +
+ +    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
+ +     * But as we know the number of iterations is 1 or 2, we unroll manually.
+ +     */
+ +    SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2);
+ +    if (STRIDE_PBB < nsi)
+ +    {
+ +        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2);
+ +    }
+ +}
+ +
+ +#endif /* NBNXN_SEARCH_BB_SSE */
+ +
+ +/* Plain C function which determines if any atom pair between two cells
+ + * is within distance sqrt(rl2).
+ + * The i-cell is cell si in x_i, stored with DIM reals per atom;
+ + * the j-cell is cell csj in x_j, stored with stride reals per atom.
+ + * Both cells hold na_c atoms. Returns TRUE at the first pair in range.
+ + */
+ +static gmx_bool subc_in_range_x(int na_c,
+ +                                int si, const real *x_i,
+ +                                int csj, int stride, const real *x_j,
+ +                                real rl2)
+ +{
+ +    int ia, ja;
+ +
+ +    for (ia = 0; ia < na_c; ia++)
+ +    {
+ +        const real *xi = x_i + (si*na_c + ia)*DIM;
+ +
+ +        for (ja = 0; ja < na_c; ja++)
+ +        {
+ +            const real *xj = x_j + (csj*na_c + ja)*stride;
+ +            real        dist2;
+ +
+ +            dist2 = sqr(xi[0] - xj[0]) +
+ +                sqr(xi[1] - xj[1]) +
+ +                sqr(xi[2] - xj[2]);
+ +
+ +            if (dist2 < rl2)
+ +            {
+ +                return TRUE;
+ +            }
+ +        }
+ +    }
+ +
+ +    return FALSE;
+ +}
+ +
++#ifdef NBNXN_SEARCH_SSE_SINGLE
++/* Returns x*x + y*y + z*z for each of the four SIMD lanes.
++ * When we make separate single/double precision SIMD vector operation
++ * include files, this function should be moved there (also using FMA).
++ */
++static inline __m128
++gmx_mm_calc_rsq_ps(__m128 x, __m128 y, __m128 z)
++{
++    __m128 xx_plus_yy;
++
++    xx_plus_yy = _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
++
++    return _mm_add_ps(xx_plus_yy, _mm_mul_ps(z, z));
++}
++#endif
++
+ +/* SSE function which determines if any atom pair between two cells,
+ + * both with 8 atoms, is within distance sqrt(rl2).
++ * Not performance critical, so only uses plain SSE.
+ + *
+ + * The i-cell coordinates are read from x_i as six aligned groups of
+ + * STRIDE_PBB reals (x, y, z of the first half, then x, y, z of the
+ + * second half); j-atom coordinates are read from x_j with the given stride.
+ + * Without NBNXN_SEARCH_SSE_SINGLE this function calls gmx_incons.
+ + */
+ +static gmx_bool subc_in_range_sse8(int na_c,
+ +                                   int si, const real *x_i,
+ +                                   int csj, int stride, const real *x_j,
+ +                                   real rl2)
+ +{
+ +#ifdef NBNXN_SEARCH_SSE_SINGLE
+ +    __m128 ix_SSE0, iy_SSE0, iz_SSE0;
+ +    __m128 ix_SSE1, iy_SSE1, iz_SSE1;
+ +
+ +    __m128 rc2_SSE;
+ +
+ +    int    na_c_sse;
+ +    int    j0, j1;
+ +
+ +    rc2_SSE   = _mm_set1_ps(rl2);
+ +
+ +    /* Number of SIMD registers needed per coordinate for one i-cell */
+ +    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+ +    ix_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+ +    iy_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+ +    iz_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+ +    ix_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+ +    iy_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+ +    iz_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
+ +
+ +    /* We loop from the outer to the inner particles to maximize
+ +     * the chance that we find a pair in range quickly and return.
+ +     */
+ +    j0 = csj*na_c;
+ +    j1 = j0 + na_c - 1;
+ +    /* Two j-atoms are handled per iteration, one from each end.
+ +     * NOTE(review): with an odd na_c the middle atom would be skipped;
+ +     * presumably na_c is always even here - confirm.
+ +     */
+ +    while (j0 < j1)
+ +    {
+ +        __m128 jx0_SSE, jy0_SSE, jz0_SSE;
+ +        __m128 jx1_SSE, jy1_SSE, jz1_SSE;
+ +
+ +        __m128 dx_SSE0, dy_SSE0, dz_SSE0;
+ +        __m128 dx_SSE1, dy_SSE1, dz_SSE1;
+ +        __m128 dx_SSE2, dy_SSE2, dz_SSE2;
+ +        __m128 dx_SSE3, dy_SSE3, dz_SSE3;
+ +
+ +        __m128 rsq_SSE0;
+ +        __m128 rsq_SSE1;
+ +        __m128 rsq_SSE2;
+ +        __m128 rsq_SSE3;
+ +
+ +        __m128 wco_SSE0;
+ +        __m128 wco_SSE1;
+ +        __m128 wco_SSE2;
+ +        __m128 wco_SSE3;
+ +        __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+ +
+ +        /* Broadcast the coordinates of j-atoms j0 and j1 to all lanes */
+ +        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
+ +        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
+ +        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
+ +
+ +        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
+ +        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
+ +        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
+ +
+ +        /* Calculate distance */
+ +        dx_SSE0            = _mm_sub_ps(ix_SSE0, jx0_SSE);
+ +        dy_SSE0            = _mm_sub_ps(iy_SSE0, jy0_SSE);
+ +        dz_SSE0            = _mm_sub_ps(iz_SSE0, jz0_SSE);
+ +        dx_SSE1            = _mm_sub_ps(ix_SSE1, jx0_SSE);
+ +        dy_SSE1            = _mm_sub_ps(iy_SSE1, jy0_SSE);
+ +        dz_SSE1            = _mm_sub_ps(iz_SSE1, jz0_SSE);
+ +        dx_SSE2            = _mm_sub_ps(ix_SSE0, jx1_SSE);
+ +        dy_SSE2            = _mm_sub_ps(iy_SSE0, jy1_SSE);
+ +        dz_SSE2            = _mm_sub_ps(iz_SSE0, jz1_SSE);
+ +        dx_SSE3            = _mm_sub_ps(ix_SSE1, jx1_SSE);
+ +        dy_SSE3            = _mm_sub_ps(iy_SSE1, jy1_SSE);
+ +        dz_SSE3            = _mm_sub_ps(iz_SSE1, jz1_SSE);
+ +
+ +        /* rsq = dx*dx+dy*dy+dz*dz */
+ +        rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+ +        rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+ +        rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+ +        rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+ +
+ +        /* Per-lane mask: set where the pair is within the cut-off */
+ +        wco_SSE0           = _mm_cmplt_ps(rsq_SSE0, rc2_SSE);
+ +        wco_SSE1           = _mm_cmplt_ps(rsq_SSE1, rc2_SSE);
+ +        wco_SSE2           = _mm_cmplt_ps(rsq_SSE2, rc2_SSE);
+ +        wco_SSE3           = _mm_cmplt_ps(rsq_SSE3, rc2_SSE);
+ +
+ +        wco_any_SSE01      = _mm_or_ps(wco_SSE0, wco_SSE1);
+ +        wco_any_SSE23      = _mm_or_ps(wco_SSE2, wco_SSE3);
+ +        wco_any_SSE        = _mm_or_ps(wco_any_SSE01, wco_any_SSE23);
+ +
+ +        /* Non-zero when at least one lane of any mask is set */
+ +        if (_mm_movemask_ps(wco_any_SSE))
+ +        {
+ +            return TRUE;
+ +        }
+ +
+ +        j0++;
+ +        j1--;
+ +    }
+ +    return FALSE;
+ +
+ +#else
+ +    /* No SSE */
+ +    gmx_incons("SSE function called without SSE support");
+ +
+ +    return TRUE;
+ +#endif
+ +}
+ +
+ +/* Returns the j sub-cell for index cj_ind.
+ + * The high bits of cj_ind select the cj4 group, the low bits select
+ + * the entry within that group.
+ + */
+ +static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
+ +{
+ +    const int group  = cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG;
+ +    const int offset = cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1);
+ +
+ +    return nbl->cj4[group].cj[offset];
+ +}
+ +
+ +/* Returns the i-interaction mask (imei[0].imask) of the cj4 group
+ + * that contains the j sub-cell with index cj_ind.
+ + */
+ +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
+ +{
+ +    const int group = cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG;
+ +
+ +    return nbl->cj4[group].imei[0].imask;
+ +}
+ +
+ +/* Ensures there is enough space for extra additional exclusion masks,
+ + * reallocating nbl->excl with the list's own alloc/free functions
+ + * when the current allocation is too small.
+ + */
+ +static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
+ +{
+ +    if (nbl->nexcl+extra <= nbl->excl_nalloc)
+ +    {
+ +        /* Already large enough */
+ +        return;
+ +    }
+ +
+ +    nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
+ +    nbnxn_realloc_void((void **)&nbl->excl,
+ +                       nbl->nexcl*sizeof(*nbl->excl),
+ +                       nbl->excl_nalloc*sizeof(*nbl->excl),
+ +                       nbl->alloc, nbl->free);
+ +}
+ +
+ +/* Ensures there is enough space for ncell extra j-cells in the list,
+ + * reallocating nbl->cj with the list's own alloc/free functions
+ + * when the current allocation is too small.
+ + */
+ +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
+ +                                            int               ncell)
+ +{
+ +    const int ncj_needed = nbl->ncj + ncell;
+ +
+ +    if (ncj_needed <= nbl->cj_nalloc)
+ +    {
+ +        /* Already large enough */
+ +        return;
+ +    }
+ +
+ +    nbl->cj_nalloc = over_alloc_small(ncj_needed);
+ +    nbnxn_realloc_void((void **)&nbl->cj,
+ +                       nbl->ncj*sizeof(*nbl->cj),
+ +                       nbl->cj_nalloc*sizeof(*nbl->cj),
+ +                       nbl->alloc, nbl->free);
+ +}
+ +
+ +/* Ensures there is enough space for ncell extra j-subcells in the list */
+ +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
+ +                                              int               nsupercell)
+ +{
+ +    int ncj4_max, j4, j, w, t;
+ +
+ +#define NWARP       2
+ +#define WARP_SIZE  32
+ +
+ +    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
+ +    /* We can store 4 j-subcell - i-supercell pairs in one struct.
+ +     * since we round down, we need one extra entry.
+ +     */
+ +    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
+ +
+ +    if (ncj4_max > nbl->cj4_nalloc)
+ +    {
+ +        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
+ +        nbnxn_realloc_void((void **)&nbl->cj4,
+ +                           nbl->work->cj4_init*sizeof(*nbl->cj4),
+ +                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
+ +                           nbl->alloc, nbl->free);
+ +    }
+ +
+ +    if (ncj4_max > nbl->work->cj4_init)
+ +    {
+ +        for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
+ +        {
+ +            /* No i-subcells and no excl's in the list initially */
+ +            for (w = 0; w < NWARP; w++)
+ +            {
+ +                nbl->cj4[j4].imei[w].imask    = 0U;
+ +                nbl->cj4[j4].imei[w].excl_ind = 0;
+ +
+ +            }
+ +        }
+ +        nbl->work->cj4_init = ncj4_max;
+ +    }
+ +}
+ +
+ +/* Set all excl masks for one GPU warp to no exclusions */
+ +static void set_no_excls(nbnxn_excl_t *excl)
+ +{
+ +    int t;
+ +
+ +    for (t = 0; t < WARP_SIZE; t++)
+ +    {
+ +        /* Turn all interaction bits on */
-                nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
++        excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
+ +    }
+ +}
+ +
+ +/* Initializes a single nbnxn_pairlist_t data structure */
+ +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
+ +                                gmx_bool          bSimple,
+ +                                nbnxn_alloc_t    *alloc,
+ +                                nbnxn_free_t     *free)
+ +{
+ +    if (alloc == NULL)
+ +    {
+ +        nbl->alloc = nbnxn_alloc_aligned;
+ +    }
+ +    else
+ +    {
+ +        nbl->alloc = alloc;
+ +    }
+ +    if (free == NULL)
+ +    {
+ +        nbl->free = nbnxn_free_aligned;
+ +    }
+ +    else
+ +    {
+ +        nbl->free = free;
+ +    }
+ +
+ +    nbl->bSimple     = bSimple;
+ +    nbl->na_sc       = 0;
+ +    nbl->na_ci       = 0;
+ +    nbl->na_cj       = 0;
+ +    nbl->nci         = 0;
+ +    nbl->ci          = NULL;
+ +    nbl->ci_nalloc   = 0;
+ +    nbl->ncj         = 0;
+ +    nbl->cj          = NULL;
+ +    nbl->cj_nalloc   = 0;
+ +    nbl->ncj4        = 0;
+ +    /* We need one element extra in sj, so alloc initially with 1 */
+ +    nbl->cj4_nalloc  = 0;
+ +    nbl->cj4         = NULL;
+ +    nbl->nci_tot     = 0;
+ +
+ +    if (!nbl->bSimple)
+ +    {
+ +        nbl->excl        = NULL;
+ +        nbl->excl_nalloc = 0;
+ +        nbl->nexcl       = 0;
+ +        check_excl_space(nbl, 1);
+ +        nbl->nexcl       = 1;
+ +        set_no_excls(&nbl->excl[0]);
+ +    }
+ +
+ +    snew(nbl->work, 1);
+ +#ifdef NBNXN_BBXXXX
+ +    snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
+ +#else
+ +    snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL*NNBSBB_B, NBNXN_MEM_ALIGN);
+ +#endif
+ +    snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
+ +#ifdef GMX_NBNXN_SIMD
+ +    snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
+ +    snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
+ +#endif
+ +    snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
+ +
+ +    nbl->work->sort            = NULL;
+ +    nbl->work->sort_nalloc     = 0;
+ +    nbl->work->sci_sort        = NULL;
+ +    nbl->work->sci_sort_nalloc = 0;
+ +}
+ +
+ +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
+ +                             gmx_bool bSimple, gmx_bool bCombined,
+ +                             nbnxn_alloc_t *alloc,
+ +                             nbnxn_free_t  *free)
+ +{
+ +    int i;
+ +
+ +    nbl_list->bSimple   = bSimple;
+ +    nbl_list->bCombined = bCombined;
+ +
+ +    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
+ +
+ +    if (!nbl_list->bCombined &&
+ +        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
+ +    {
+ +        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
+ +                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+ +    }
+ +
+ +    snew(nbl_list->nbl, nbl_list->nnbl);
+ +    /* Execute in order to avoid memory interleaving between threads */
+ +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
+ +    for (i = 0; i < nbl_list->nnbl; i++)
+ +    {
+ +        /* Allocate the nblist data structure locally on each thread
+ +         * to optimize memory access for NUMA architectures.
+ +         */
+ +        snew(nbl_list->nbl[i], 1);
+ +
+ +        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
+ +        if (i == 0)
+ +        {
+ +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
+ +        }
+ +        else
+ +        {
+ +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL);
+ +        }
+ +    }
+ +}
+ +
+ +/* Print statistics of a pair list, used for debug output */
+ +static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
+ +                                           const nbnxn_search_t nbs, real rl)
+ +{
+ +    const nbnxn_grid_t *grid;
+ +    int                 cs[SHIFTS];
+ +    int                 s, i, j;
+ +    int                 npexcl;
+ +
+ +    /* This code only produces correct statistics with domain decomposition */
+ +    grid = &nbs->grid[0];
+ +
+ +    fprintf(fp, "nbl nci %d ncj %d\n",
+ +            nbl->nci, nbl->ncj);
+ +    fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ +            nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
+ +            nbl->ncj/(double)grid->nc*grid->na_sc,
+ +            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
+ +
+ +    fprintf(fp, "nbl average j cell list length %.1f\n",
+ +            0.25*nbl->ncj/(double)nbl->nci);
+ +
+ +    for (s = 0; s < SHIFTS; s++)
+ +    {
+ +        cs[s] = 0;
+ +    }
+ +    npexcl = 0;
+ +    for (i = 0; i < nbl->nci; i++)
+ +    {
+ +        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
+ +            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
+ +
+ +        j = nbl->ci[i].cj_ind_start;
+ +        while (j < nbl->ci[i].cj_ind_end &&
-     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
++               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+ +        {
+ +            npexcl++;
+ +            j++;
+ +        }
+ +    }
+ +    fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
+ +            nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
+ +    for (s = 0; s < SHIFTS; s++)
+ +    {
+ +        if (cs[s] > 0)
+ +        {
+ +            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
+ +        }
+ +    }
+ +}
+ +
+ +/* Print statistics of a pair list, used for debug output */
+ +static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
+ +                                             const nbnxn_search_t nbs, real rl)
+ +{
+ +    const nbnxn_grid_t *grid;
+ +    int                 i, j4, j, si, b;
+ +    int                 c[GPU_NSUBCELL+1];
+ +
+ +    /* This code only produces correct statistics with domain decomposition */
+ +    grid = &nbs->grid[0];
+ +
+ +    fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
+ +            nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
+ +    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ +            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
+ +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
+ +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
+ +
+ +    fprintf(fp, "nbl average j super cell list length %.1f\n",
+ +            0.25*nbl->ncj4/(double)nbl->nsci);
+ +    fprintf(fp, "nbl average i sub cell list length %.1f\n",
+ +            nbl->nci_tot/((double)nbl->ncj4));
+ +
+ +    for (si = 0; si <= GPU_NSUBCELL; si++)
+ +    {
+ +        c[si] = 0;
+ +    }
+ +    for (i = 0; i < nbl->nsci; i++)
+ +    {
+ +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
+ +        {
+ +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
+ +            {
+ +                b = 0;
+ +                for (si = 0; si < GPU_NSUBCELL; si++)
+ +                {
+ +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
+ +                    {
+ +                        b++;
+ +                    }
+ +                }
+ +                c[b]++;
+ +            }
+ +        }
+ +    }
+ +    for (b = 0; b <= GPU_NSUBCELL; b++)
+ +    {
+ +        fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
+ +                b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
+ +    }
+ +}
+ +
+ +/* Returns, through excl, a pointer to the exclusion mask for cj4-unit cj4, warp warp */
+ +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
+ +                                   int warp, nbnxn_excl_t **excl)
+ +{
+ +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
+ +    {
+ +        /* No exclusions set, make a new list entry */
+ +        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
+ +        nbl->nexcl++;
+ +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
+ +        set_no_excls(*excl);
+ +    }
+ +    else
+ +    {
+ +        /* We already have some exclusions, new ones can be added to the list */
+ +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
+ +    }
+ +}
+ +
+ +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
+ + * allocates extra memory, if necessary.
+ + */
+ +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
+ +                                 int warp, nbnxn_excl_t **excl)
+ +{
+ +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
+ +    {
+ +        /* We need to make a new list entry, check if we have space */
+ +        check_excl_space(nbl, 1);
+ +    }
+ +    low_get_nbl_exclusions(nbl, cj4, warp, excl);
+ +}
+ +
+ +/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
+ + * allocates extra memory, if necessary.
+ + */
+ +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
+ +                                 nbnxn_excl_t **excl_w0,
+ +                                 nbnxn_excl_t **excl_w1)
+ +{
+ +    /* Check for space we might need */
+ +    check_excl_space(nbl, 2);
+ +
+ +    low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
+ +    low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
+ +}
+ +
+ +/* Sets the self exclusions i=j and pair exclusions i>j */
+ +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
+ +                                               int cj4_ind, int sj_offset,
+ +                                               int si)
+ +{
+ +    nbnxn_excl_t *excl[2];
+ +    int           ei, ej, w;
+ +
+ +    /* Here we only set the self and double pair exclusions */
+ +
+ +    get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
+ +
+ +    /* Only minor < major bits set */
+ +    for (ej = 0; ej < nbl->na_ci; ej++)
+ +    {
+ +        w = (ej>>2);
+ +        for (ei = ej; ei < nbl->na_ci; ei++)
+ +        {
+ +            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+ +                ~(1U << (sj_offset*GPU_NSUBCELL + si));
+ +        }
+ +    }
+ +}
+ +
+ +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
+ +static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
+ +{
- /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
- static unsigned int get_imask_simd128(gmx_bool rdiag, int ci, int cj)
++    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+ +}
+ +
- #ifndef GMX_DOUBLE /* cj-size = 4 */
-     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
- #else              /* cj-size = 2 */
-     return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
-             (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
-              NBNXN_INT_MASK_ALL));
- #endif
++/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
++static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
+ +{
- /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
- static unsigned int get_imask_simd256(gmx_bool rdiag, int ci, int cj)
++    return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
++            (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
++             NBNXN_INTERACTION_MASK_ALL));
+ +}
+ +
- #ifndef GMX_DOUBLE /* cj-size = 8 */
-     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
-             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
-              NBNXN_INT_MASK_ALL));
- #else              /* cj-size = 4 */
-     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
- #endif
++/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
++static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
+ +{
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #define get_imask_simd_4xn  get_imask_simd128
- #else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- #define get_imask_simd_4xn  get_imask_simd256
- #define get_imask_simd_2xnn get_imask_simd128
- #else
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
++    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
++}
++
++/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
++static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
++{
++    return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
++            (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
++             NBNXN_INTERACTION_MASK_ALL));
+ +}
+ +
+ +#ifdef GMX_NBNXN_SIMD
-         if (cj[j].excl != NBNXN_INT_MASK_ALL)
++#if GMX_SIMD_WIDTH_HERE == 2
++#define get_imask_simd_4xn  get_imask_simd_j2
++#endif
++#if GMX_SIMD_WIDTH_HERE == 4
++#define get_imask_simd_4xn  get_imask_simd_j4
++#endif
++#if GMX_SIMD_WIDTH_HERE == 8
++#define get_imask_simd_4xn  get_imask_simd_j8
++#define get_imask_simd_2xnn get_imask_simd_j4
+ +#endif
++#if GMX_SIMD_WIDTH_HERE == 16
++#define get_imask_simd_2xnn get_imask_simd_j8
+ +#endif
+ +#endif
+ +
+ +/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
+ + * Checks bounding box distances and possibly atom pair distances.
+ + */
+ +static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
+ +                                     nbnxn_pairlist_t *nbl,
+ +                                     int ci, int cjf, int cjl,
+ +                                     gmx_bool remove_sub_diag,
+ +                                     const real *x_j,
+ +                                     real rl2, float rbb2,
+ +                                     int *ndistc)
+ +{
+ +    const nbnxn_list_work_t *work;
+ +
+ +    const float             *bb_ci;
+ +    const real              *x_ci;
+ +
+ +    gmx_bool                 InRange;
+ +    real                     d2;
+ +    int                      cjf_gl, cjl_gl, cj;
+ +
+ +    work = nbl->work;
+ +
+ +    bb_ci = nbl->work->bb_ci;
+ +    x_ci  = nbl->work->x_ci;
+ +
+ +    InRange = FALSE;
+ +    while (!InRange && cjf <= cjl)
+ +    {
+ +        d2       = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
+ +        *ndistc += 2;
+ +
+ +        /* Check if the distance is within the distance where
+ +         * we use only the bounding box distance rbb,
+ +         * or within the cut-off and there is at least one atom pair
+ +         * within the cut-off.
+ +         */
+ +        if (d2 < rbb2)
+ +        {
+ +            InRange = TRUE;
+ +        }
+ +        else if (d2 < rl2)
+ +        {
+ +            int i, j;
+ +
+ +            cjf_gl = gridj->cell0 + cjf;
+ +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
+ +            {
+ +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
+ +                {
+ +                    InRange = InRange ||
+ +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
+ +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
+ +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
+ +                }
+ +            }
+ +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
+ +        }
+ +        if (!InRange)
+ +        {
+ +            cjf++;
+ +        }
+ +    }
+ +    if (!InRange)
+ +    {
+ +        return;
+ +    }
+ +
+ +    InRange = FALSE;
+ +    while (!InRange && cjl > cjf)
+ +    {
+ +        d2       = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
+ +        *ndistc += 2;
+ +
+ +        /* Check if the distance is within the distance where
+ +         * we use only the bounding box distance rbb,
+ +         * or within the cut-off and there is at least one atom pair
+ +         * within the cut-off.
+ +         */
+ +        if (d2 < rbb2)
+ +        {
+ +            InRange = TRUE;
+ +        }
+ +        else if (d2 < rl2)
+ +        {
+ +            int i, j;
+ +
+ +            cjl_gl = gridj->cell0 + cjl;
+ +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
+ +            {
+ +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
+ +                {
+ +                    InRange = InRange ||
+ +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
+ +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
+ +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
+ +                }
+ +            }
+ +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
+ +        }
+ +        if (!InRange)
+ +        {
+ +            cjl--;
+ +        }
+ +    }
+ +
+ +    if (cjf <= cjl)
+ +    {
+ +        for (cj = cjf; cj <= cjl; cj++)
+ +        {
+ +            /* Store cj and the interaction mask */
+ +            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
+ +            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
+ +            nbl->ncj++;
+ +        }
+ +        /* Increase the closing index in i super-cell list */
+ +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ +    }
+ +}
+ +
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +#include "nbnxn_search_simd_4xn.h"
+ +#endif
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +#include "nbnxn_search_simd_2xnn.h"
+ +#endif
+ +
+ +/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
+ + * Checks bounding box distances and possibly atom pair distances.
+ + */
+ +static void make_cluster_list_supersub(const nbnxn_grid_t *gridi,
+ +                                       const nbnxn_grid_t *gridj,
+ +                                       nbnxn_pairlist_t *nbl,
+ +                                       int sci, int scj,
+ +                                       gmx_bool sci_equals_scj,
+ +                                       int stride, const real *x,
+ +                                       real rl2, float rbb2,
+ +                                       int *ndistc)
+ +{
+ +    int          na_c;
+ +    int          npair;
+ +    int          cjo, ci1, ci, cj, cj_gl;
+ +    int          cj4_ind, cj_offset;
+ +    unsigned     imask;
+ +    nbnxn_cj4_t *cj4;
+ +    const float *bb_ci;
+ +    const real  *x_ci;
+ +    float       *d2l, d2;
+ +    int          w;
+ +#define PRUNE_LIST_CPU_ONE
+ +#ifdef PRUNE_LIST_CPU_ONE
+ +    int  ci_last = -1;
+ +#endif
+ +
+ +    d2l = nbl->work->d2;
+ +
+ +    bb_ci = nbl->work->bb_ci;
+ +    x_ci  = nbl->work->x_ci;
+ +
+ +    na_c = gridj->na_c;
+ +
+ +    for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
+ +    {
+ +        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
+ +        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
+ +        cj4       = &nbl->cj4[cj4_ind];
+ +
+ +        cj = scj*GPU_NSUBCELL + cjo;
+ +
+ +        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
+ +
+ +        /* Initialize this j-subcell i-subcell list */
+ +        cj4->cj[cj_offset] = cj_gl;
+ +        imask              = 0;
+ +
+ +        if (sci_equals_scj)
+ +        {
+ +            ci1 = cjo + 1;
+ +        }
+ +        else
+ +        {
+ +            ci1 = gridi->nsubc[sci];
+ +        }
+ +
+ +#ifdef NBNXN_BBXXXX
+ +        /* Determine all ci1 bb distances in one call with SSE */
+ +        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
+ +                               ci1, bb_ci, d2l);
+ +        *ndistc += na_c*2;
+ +#endif
+ +
+ +        npair = 0;
+ +        /* We use a fixed upper-bound instead of ci1 to help optimization */
+ +        for (ci = 0; ci < GPU_NSUBCELL; ci++)
+ +        {
+ +            if (ci == ci1)
+ +            {
+ +                break;
+ +            }
+ +
+ +#ifndef NBNXN_BBXXXX
+ +            /* Determine the bb distance between ci and cj */
+ +            d2l[ci]  = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
+ +            *ndistc += 2;
+ +#endif
+ +            d2 = d2l[ci];
+ +
+ +#ifdef PRUNE_LIST_CPU_ALL
+ +            /* Check if the distance is within the distance where
+ +             * we use only the bounding box distance rbb,
+ +             * or within the cut-off and there is at least one atom pair
+ +             * within the cut-off. This check is very costly.
+ +             */
+ +            *ndistc += na_c*na_c;
+ +            if (d2 < rbb2 ||
+ +                (d2 < rl2 &&
+ +#ifdef NBNXN_PBB_SSE
+ +                 subc_in_range_sse8
+ +#else
+ +                 subc_in_range_x
+ +#endif
+ +                     (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
+ +#else
+ +            /* Check if the distance between the two bounding boxes
+ +             * is within the pair-list cut-off.
+ +             */
+ +            if (d2 < rl2)
+ +#endif
+ +            {
+ +                /* Flag this i-subcell to be taken into account */
+ +                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
+ +
+ +#ifdef PRUNE_LIST_CPU_ONE
+ +                ci_last = ci;
+ +#endif
+ +
+ +                npair++;
+ +            }
+ +        }
+ +
+ +#ifdef PRUNE_LIST_CPU_ONE
+ +        /* If we only found 1 pair, check if any atoms are actually
+ +         * within the cut-off, so we could get rid of it.
+ +         */
+ +        if (npair == 1 && d2l[ci_last] >= rbb2)
+ +        {
+ +            /* Avoid using function pointers here, as it's slower */
+ +            if (
+ +#ifdef NBNXN_PBB_SSE
+ +                !subc_in_range_sse8
+ +#else
+ +                !subc_in_range_x
+ +#endif
+ +                    (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
+ +            {
+ +                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
+ +                npair--;
+ +            }
+ +        }
+ +#endif
+ +
+ +        if (npair > 0)
+ +        {
+ +            /* We have a useful sj entry, close it now */
+ +
+ +            /* Set the exclusions for the ci == sj entry.
+ +             * Here we don't bother to check if this entry is actually flagged,
+ +             * as it will nearly always be in the list.
+ +             */
+ +            if (sci_equals_scj)
+ +            {
+ +                set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
+ +            }
+ +
+ +            /* Copy the cluster interaction mask to the list */
+ +            for (w = 0; w < NWARP; w++)
+ +            {
+ +                cj4->imei[w].imask |= imask;
+ +            }
+ +
+ +            nbl->work->cj_ind++;
+ +
+ +            /* Keep the count */
+ +            nbl->nci_tot += npair;
+ +
+ +            /* Increase the closing index in i super-cell list */
+ +            nbl->sci[nbl->nsci].cj4_ind_end =
+ +                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
+ +        }
+ +    }
+ +}
+ +
+ +/* Set all atom-pair exclusions from the topology stored in excl
+ + * as masks in the pair-list for simple list i-entry nbl_ci
+ + */
+ +static void set_ci_top_excls(const nbnxn_search_t nbs,
+ +                             nbnxn_pairlist_t    *nbl,
+ +                             gmx_bool             diagRemoved,
+ +                             int                  na_ci_2log,
+ +                             int                  na_cj_2log,
+ +                             const nbnxn_ci_t    *nbl_ci,
+ +                             const t_blocka      *excl)
+ +{
+ +    const int    *cell;
+ +    int           ci;
+ +    int           cj_ind_first, cj_ind_last;
+ +    int           cj_first, cj_last;
+ +    int           ndirect;
+ +    int           i, ai, aj, si, eind, ge, se;
+ +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
+ +    int           cj_m;
+ +    gmx_bool      Found_si;
+ +    int           si_ind;
+ +    nbnxn_excl_t *nbl_excl;
+ +    int           inner_i, inner_e;
+ +
+ +    cell = nbs->cell;
+ +
+ +    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
+ +    {
+ +        /* Empty list */
+ +        return;
+ +    }
+ +
+ +    ci = nbl_ci->ci;
+ +
+ +    cj_ind_first = nbl_ci->cj_ind_start;
+ +    cj_ind_last  = nbl->ncj - 1;
+ +
+ +    cj_first = nbl->cj[cj_ind_first].cj;
+ +    cj_last  = nbl->cj[cj_ind_last].cj;
+ +
+ +    /* Determine how many contiguous j-cells we have starting
+ +     * from the first i-cell. This number can be used to directly
+ +     * calculate j-cell indices for excluded atoms.
+ +     */
+ +    ndirect = 0;
+ +    if (na_ci_2log == na_cj_2log)
+ +    {
+ +        while (cj_ind_first + ndirect <= cj_ind_last &&
+ +               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
+ +        {
+ +            ndirect++;
+ +        }
+ +    }
+ +#ifdef NBNXN_SEARCH_BB_SSE
+ +    else
+ +    {
+ +        while (cj_ind_first + ndirect <= cj_ind_last &&
+ +               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
+ +        {
+ +            ndirect++;
+ +        }
+ +    }
+ +#endif
+ +
+ +    /* Loop over the atoms in the i super-cell */
+ +    for (i = 0; i < nbl->na_sc; i++)
+ +    {
+ +        ai = nbs->a[ci*nbl->na_sc+i];
+ +        if (ai >= 0)
+ +        {
+ +            si  = (i>>na_ci_2log);
+ +
+ +            /* Loop over the topology-based exclusions for this i-atom */
+ +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
+ +            {
+ +                aj = excl->a[eind];
+ +
+ +                if (aj == ai)
+ +                {
+ +                    /* The self exclusions are already set, save some time */
+ +                    continue;
+ +                }
+ +
+ +                ge = cell[aj];
+ +
+ +                /* Without shifts we only calculate interactions j>i
+ +                 * for one-way pair-lists.
+ +                 */
+ +                if (diagRemoved && ge <= ci*nbl->na_sc + i)
+ +                {
+ +                    continue;
+ +                }
+ +
+ +                se = (ge >> na_cj_2log);
+ +
+ +                /* Could the cluster se be in our list? */
+ +                if (se >= cj_first && se <= cj_last)
+ +                {
+ +                    if (se < cj_first + ndirect)
+ +                    {
+ +                        /* We can calculate cj_ind directly from se */
+ +                        found = cj_ind_first + se - cj_first;
+ +                    }
+ +                    else
+ +                    {
+ +                        /* Search for se using bisection */
+ +                        found    = -1;
+ +                        cj_ind_0 = cj_ind_first + ndirect;
+ +                        cj_ind_1 = cj_ind_last + 1;
+ +                        while (found == -1 && cj_ind_0 < cj_ind_1)
+ +                        {
+ +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
+ +
+ +                            cj_m = nbl->cj[cj_ind_m].cj;
+ +
+ +                            if (se == cj_m)
+ +                            {
+ +                                found = cj_ind_m;
+ +                            }
+ +                            else if (se < cj_m)
+ +                            {
+ +                                cj_ind_1 = cj_ind_m;
+ +                            }
+ +                            else
+ +                            {
+ +                                cj_ind_0 = cj_ind_m + 1;
+ +                            }
+ +                        }
+ +                    }
+ +
+ +                    if (found >= 0)
+ +                    {
+ +                        inner_i = i  - (si << na_ci_2log);
+ +                        inner_e = ge - (se << na_cj_2log);
+ +
+ +                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Set all atom-pair exclusions from the topology stored in excl
+ + * as masks in the pair-list for i-super-cell entry nbl_sci
+ + */
+ +static void set_sci_top_excls(const nbnxn_search_t nbs,
+ +                              nbnxn_pairlist_t    *nbl,
+ +                              gmx_bool             diagRemoved,
+ +                              int                  na_c_2log,
+ +                              const nbnxn_sci_t   *nbl_sci,
+ +                              const t_blocka      *excl)
+ +{
+ +    const int    *cell;
+ +    int           na_c;
+ +    int           sci;
+ +    int           cj_ind_first, cj_ind_last;
+ +    int           cj_first, cj_last;
+ +    int           ndirect;
+ +    int           i, ai, aj, si, eind, ge, se;
+ +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
+ +    int           cj_m;
+ +    gmx_bool      Found_si;
+ +    int           si_ind;
+ +    nbnxn_excl_t *nbl_excl;
+ +    int           inner_i, inner_e, w;
+ +
+ +    cell = nbs->cell;
+ +
+ +    na_c = nbl->na_ci;
+ +
+ +    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
+ +    {
+ +        /* Empty list */
+ +        return;
+ +    }
+ +
+ +    sci = nbl_sci->sci;
+ +
+ +    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
+ +    cj_ind_last  = nbl->work->cj_ind - 1;
+ +
+ +    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
+ +    cj_last  = nbl_cj(nbl, cj_ind_last);
+ +
+ +    /* Determine how many contiguous j-clusters we have starting
+ +     * from the first i-cluster. This number can be used to directly
+ +     * calculate j-cluster indices for excluded atoms.
+ +     */
+ +    ndirect = 0;
+ +    while (cj_ind_first + ndirect <= cj_ind_last &&
+ +           nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
+ +    {
+ +        ndirect++;
+ +    }
+ +
+ +    /* Loop over the atoms in the i super-cell */
+ +    for (i = 0; i < nbl->na_sc; i++)
+ +    {
+ +        ai = nbs->a[sci*nbl->na_sc+i];
+ +        if (ai >= 0)
+ +        {
+ +            si  = (i>>na_c_2log);
+ +
+ +            /* Loop over the topology-based exclusions for this i-atom */
+ +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
+ +            {
+ +                aj = excl->a[eind];
+ +
+ +                if (aj == ai)
+ +                {
+ +                    /* The self exclusions are already set, save some time */
+ +                    continue;
+ +                }
+ +
+ +                ge = cell[aj];
+ +
+ +                /* Without shifts we only calculate interactions j>i
+ +                 * for one-way pair-lists.
+ +                 */
+ +                if (diagRemoved && ge <= sci*nbl->na_sc + i)
+ +                {
+ +                    continue;
+ +                }
+ +
+ +                se = ge>>na_c_2log;
+ +                /* Could the cluster se be in our list? */
+ +                if (se >= cj_first && se <= cj_last)
+ +                {
+ +                    if (se < cj_first + ndirect)
+ +                    {
+ +                        /* We can calculate cj_ind directly from se */
+ +                        found = cj_ind_first + se - cj_first;
+ +                    }
+ +                    else
+ +                    {
+ +                        /* Search for se using bisection */
+ +                        found    = -1;
+ +                        cj_ind_0 = cj_ind_first + ndirect;
+ +                        cj_ind_1 = cj_ind_last + 1;
+ +                        while (found == -1 && cj_ind_0 < cj_ind_1)
+ +                        {
+ +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
+ +
+ +                            cj_m = nbl_cj(nbl, cj_ind_m);
+ +
+ +                            if (se == cj_m)
+ +                            {
+ +                                found = cj_ind_m;
+ +                            }
+ +                            else if (se < cj_m)
+ +                            {
+ +                                cj_ind_1 = cj_ind_m;
+ +                            }
+ +                            else
+ +                            {
+ +                                cj_ind_0 = cj_ind_m + 1;
+ +                            }
+ +                        }
+ +                    }
+ +
+ +                    if (found >= 0)
+ +                    {
+ +                        inner_i = i  - si*na_c;
+ +                        inner_e = ge - se*na_c;
+ +
+ +/* Macro for getting the index of atom a within a cluster */
+ +#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
+ +/* Macro for converting an atom number to a cluster number */
+ +#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+ +/* Macro for getting the index of an i-atom within a warp */
+ +#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
+ +
+ +                        if (nbl_imask0(nbl, found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
+ +                        {
+ +                            w       = (inner_e >> 2);
+ +
+ +                            get_nbl_exclusions_1(nbl, A2CJ4(found), w, &nbl_excl);
+ +
+ +                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+ +                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
+ +                        }
+ +
+ +#undef AMODCJ4
+ +#undef A2CJ4
+ +#undef AMODWI
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Reallocate the simple ci list for at least n entries.
+ + * The new capacity is over_alloc_small(n) and is stored in nbl->ci_nalloc.
+ + * The resize goes through nbnxn_realloc_void() with the pair-list's own
+ + * nbl->alloc/nbl->free callbacks; the byte size of the nbl->nci entries
+ + * currently in use is passed as the old size (presumably the amount of
+ + * data that is preserved - confirm against nbnxn_realloc_void).
+ + */
+ +static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
+ +{
+ +    nbl->ci_nalloc = over_alloc_small(n);
+ +    nbnxn_realloc_void((void **)&nbl->ci,
+ +                       nbl->nci*sizeof(*nbl->ci),
+ +                       nbl->ci_nalloc*sizeof(*nbl->ci),
+ +                       nbl->alloc, nbl->free);
+ +}
+ +
+ +/* Reallocate the super-cell sci list for at least n entries.
+ + * The new capacity is over_alloc_small(n) and is stored in nbl->sci_nalloc.
+ + * The resize goes through nbnxn_realloc_void() with the pair-list's own
+ + * nbl->alloc/nbl->free callbacks; the byte size of the nbl->nsci entries
+ + * currently in use is passed as the old size (presumably the amount of
+ + * data that is preserved - confirm against nbnxn_realloc_void).
+ + */
+ +static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
+ +{
+ +    nbl->sci_nalloc = over_alloc_small(n);
+ +    nbnxn_realloc_void((void **)&nbl->sci,
+ +                       nbl->nsci*sizeof(*nbl->sci),
+ +                       nbl->sci_nalloc*sizeof(*nbl->sci),
+ +                       nbl->alloc, nbl->free);
+ +}
+ +
+ +/* Make a new ci entry at index nbl->nci.
+ + * The ci list is grown first when it has no room for one more entry.
+ + * The entry gets cluster index ci, the shift index with the interaction
+ + * flags OR-ed into the same field, and an empty j-range that starts and
+ + * ends at the current nbl->ncj. nbl->nci itself is not incremented here.
+ + */
+ +static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags)
+ +{
+ +    if (nbl->nci + 1 > nbl->ci_nalloc)
+ +    {
+ +        nb_realloc_ci(nbl, nbl->nci+1);
+ +    }
+ +    nbl->ci[nbl->nci].ci            = ci;
+ +    nbl->ci[nbl->nci].shift         = shift;
+ +    /* Store the interaction flags along with the shift */
+ +    nbl->ci[nbl->nci].shift        |= flags;
+ +    nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
+ +    nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
+ +}
+ +
+ +/* Make a new sci entry at index nbl->nsci.
+ + * The sci list is grown first when it has no room for one more entry.
+ + * The entry gets super-cell index sci, the shift index, and an empty
+ + * cj4 range that starts and ends at the current nbl->ncj4.
+ + * nbl->nsci itself is not incremented here.
+ + */
+ +static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift)
+ +{
+ +    if (nbl->nsci + 1 > nbl->sci_nalloc)
+ +    {
+ +        nb_realloc_sci(nbl, nbl->nsci+1);
+ +    }
+ +    nbl->sci[nbl->nsci].sci           = sci;
+ +    nbl->sci[nbl->nsci].shift         = shift;
+ +    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
+ +    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
+ +}
+ +
+ +/* Sort the simple j-list cj on exclusions.
+ + * Entries with exclusions will all be sorted to the beginning of the list.
+ + * An entry counts as having exclusions when its excl mask differs from
+ + * the all-interactions mask. The relative order within each of the two
+ + * groups is kept. work->cj is used as scratch space and is grown to hold
+ + * at least ncj entries when needed.
+ + */
+ +static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
+ +                         nbnxn_list_work_t *work)
+ +{
+ +    int jnew, j;
+ +
+ +    if (ncj > work->cj_nalloc)
+ +    {
+ +        work->cj_nalloc = over_alloc_large(ncj);
+ +        srenew(work->cj, work->cj_nalloc);
+ +    }
+ +
+ +    /* Make a list of the j-cells involving exclusions */
+ +    jnew = 0;
+ +    for (j = 0; j < ncj; j++)
+ +    {
-           (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
++        if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+ +        {
+ +            work->cj[jnew++] = cj[j];
+ +        }
+ +    }
+ +    /* Nothing needs to move when no entry has exclusions, or when the
+ +     * only entry with exclusions is already the first one in the list.
+ +     */
+ +    if (!((jnew == 0) ||
-             if (cj[j].excl == NBNXN_INT_MASK_ALL)
++          (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
+ +    {
+ +        for (j = 0; j < ncj; j++)
+ +        {
++            if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
+ +            {
+ +                work->cj[jnew++] = cj[j];
+ +            }
+ +        }
+ +        for (j = 0; j < ncj; j++)
+ +        {
+ +            cj[j] = work->cj[j];
+ +        }
+ +    }
+ +}
+ +
+ +/* Close this simple list i entry.
+ + * An entry with an empty j-range is dropped by not advancing nbl->nci.
+ + * Otherwise the entry's j-list is sorted on exclusions, the per-list
+ + * j-cluster counters in nbl->work are updated and nbl->nci is incremented.
+ + */
+ +static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
+ +{
+ +    int jlen;
+ +
+ +    /* All content of the new ci entry have already been filled correctly,
+ +     * we only need to increase the count here (for non empty lists).
+ +     */
+ +    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
+ +    if (jlen > 0)
+ +    {
+ +        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);
+ +
+ +        /* The counts below are used for non-bonded pair/flop counts
+ +         * and should therefore match the available kernel setups.
+ +         * ncj_noq collects entries without the Coulomb flag; ncj_hlj
+ +         * collects entries with Coulomb that have the half-LJ flag set
+ +         * or lack the LJ flag. The flags are read from the shift field.
+ +         */
+ +        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+ +        {
+ +            nbl->work->ncj_noq += jlen;
+ +        }
+ +        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+ +                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
+ +        {
+ +            nbl->work->ncj_hlj += jlen;
+ +        }
+ +
+ +        nbl->nci++;
+ +    }
+ +}
+ +
+ +/* Split sci entry for load balancing on the GPU.
+ + * Splitting ensures we have enough lists to fully utilize the whole GPU.
+ + * With progBal we generate progressively smaller lists, which improves
+ + * load balancing. As we only know the current count on our own thread,
+ + * we will need to estimate the current total amount of i-entries.
+ + * As the lists get concatenated later, this estimate depends
+ + * both on nthread and our own thread index.
+ + *
+ + * The entry that is examined is the last closed one, nbl->sci[nbl->nsci-1].
+ + * nsp_max_av is the target maximum number of cluster pairs per entry;
+ + * it is used as is without progBal and scaled using nc_bal, thread and
+ + * nthread with progBal. The entry is only split when it spans more than
+ + * one cj4 group and its maximum possible pair count exceeds the limit.
+ + */
+ +static void split_sci_entry(nbnxn_pairlist_t *nbl,
+ +                            int nsp_max_av, gmx_bool progBal, int nc_bal,
+ +                            int thread, int nthread)
+ +{
+ +    int nsci_est;
+ +    int nsp_max;
+ +    int cj4_start, cj4_end, j4len, cj4;
+ +    int sci;
+ +    int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
+ +    int p;
+ +
+ +    if (progBal)
+ +    {
+ +        /* Estimate the total numbers of ci's of the nblist combined
+ +         * over all threads using the target number of ci's.
+ +         */
+ +        nsci_est = nc_bal*thread/nthread + nbl->nsci;
+ +
+ +        /* The first ci blocks should be larger, to avoid overhead.
+ +         * The last ci blocks should be smaller, to improve load balancing.
+ +         */
+ +        nsp_max = max(1,
+ +                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
+ +    }
+ +    else
+ +    {
+ +        nsp_max = nsp_max_av;
+ +    }
+ +
+ +    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
+ +    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
+ +    j4len     = cj4_end - cj4_start;
+ +
+ +    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
+ +    {
+ +        /* Remove the last sci entry and process the cj4's again */
+ +        nbl->nsci -= 1;
+ +
+ +        sci        = nbl->nsci;
+ +        nsp        = 0;
+ +        nsp_sci    = 0;
+ +        nsp_cj4_e  = 0;
+ +        nsp_cj4    = 0;
+ +        for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
+ +        {
+ +            nsp_cj4_p = nsp_cj4;
+ +            /* Count the number of cluster pairs in this cj4 group */
+ +            nsp_cj4   = 0;
+ +            for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
+ +            {
+ +                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
+ +            }
+ +
+ +            if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
+ +            {
+ +                /* Split the list at cj4 */
+ +                nbl->sci[sci].cj4_ind_end = cj4;
+ +                /* Create a new sci entry */
+ +                sci++;
+ +                nbl->nsci++;
+ +                if (nbl->nsci+1 > nbl->sci_nalloc)
+ +                {
+ +                    nb_realloc_sci(nbl, nbl->nsci+1);
+ +                }
+ +                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
+ +                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
+ +                nbl->sci[sci].cj4_ind_start = cj4;
+ +                nsp_sci                     = nsp;
+ +                nsp_cj4_e                   = nsp_cj4_p;
+ +                nsp                         = 0;
+ +            }
+ +            nsp += nsp_cj4;
+ +        }
+ +
+ +        /* Put the remaining cj4's in the last sci entry */
+ +        nbl->sci[sci].cj4_ind_end = cj4_end;
+ +
+ +        /* Possibly balance out the last two sci's
+ +         * by moving the last cj4 of the second last sci.
+ +         */
+ +        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
+ +        {
+ +            nbl->sci[sci-1].cj4_ind_end--;
+ +            nbl->sci[sci].cj4_ind_start--;
+ +        }
+ +
+ +        nbl->nsci++;
+ +    }
+ +}
+ +
+ +/* Close this super/sub list i entry.
+ + * An entry with an empty cj4 range is dropped by not advancing nbl->nsci.
+ + * Otherwise the j-cluster count is rounded up to whole cj4 groups, the
+ + * entry is counted, and with nsp_max_av > 0 it is passed on to
+ + * split_sci_entry() together with the load-balancing parameters.
+ + */
+ +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
+ +                                    int nsp_max_av,
+ +                                    gmx_bool progBal, int nc_bal,
+ +                                    int thread, int nthread)
+ +{
+ +    int j4len, tlen;
+ +    int nb, b;
+ +
+ +    /* All content of the new ci entry have already been filled correctly,
+ +     * we only need to increase the count here (for non empty lists).
+ +     */
+ +    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
+ +    if (j4len > 0)
+ +    {
+ +        /* We can only have complete blocks of 4 j-entries in a list,
+ +         * so round the count up before closing.
+ +         */
+ +        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
+ +        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
+ +
+ +        nbl->nsci++;
+ +
+ +        if (nsp_max_av > 0)
+ +        {
+ +            /* Measure the size of the new entry and potentially split it */
+ +            split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
+ +        }
+ +    }
+ +}
+ +
+ +/* Syncs the working array before adding another grid pair to the list.
+ + * For non-simple lists the working j-cluster index is set to the first
+ + * slot after the nbl->ncj4 complete cj4 groups and cj4_init is set to
+ + * nbl->ncj4. Simple lists are left untouched.
+ + */
+ +static void sync_work(nbnxn_pairlist_t *nbl)
+ +{
+ +    if (!nbl->bSimple)
+ +    {
+ +        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
+ +        nbl->work->cj4_init = nbl->ncj4;
+ +    }
+ +}
+ +
+ +/* Clears an nbnxn_pairlist_t data structure.
+ + * Only the entry counters and the work pair counters are reset; no memory
+ + * is released here. nexcl is reset to 1 rather than 0 (presumably
+ + * exclusion entry 0 is reserved as a shared default entry - confirm
+ + * against the code that initializes nbl->excl).
+ + */
+ +static void clear_pairlist(nbnxn_pairlist_t *nbl)
+ +{
+ +    nbl->nci           = 0;
+ +    nbl->nsci          = 0;
+ +    nbl->ncj           = 0;
+ +    nbl->ncj4          = 0;
+ +    nbl->nci_tot       = 0;
+ +    nbl->nexcl         = 1;
+ +
+ +    nbl->work->ncj_noq = 0;
+ +    nbl->work->ncj_hlj = 0;
+ +}
+ +
+ +/* Sets a simple list i-cell bounding box, including PBC shift.
+ + * The box of cell ci is read from bb at offset ci*NNBSBB_B and written
+ + * to bb_ci with (shx, shy, shz) added to both the lower (BBL_*) and the
+ + * upper (BBU_*) corner.
+ + */
+ +static void set_icell_bb_simple(const float *bb, int ci,
+ +                                real shx, real shy, real shz,
+ +                                float *bb_ci)
+ +{
+ +    int ia;
+ +
+ +    ia           = ci*NNBSBB_B;
+ +    bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
+ +    bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
+ +    bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
+ +    bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
+ +    bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
+ +    bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
+ +}
+ +
+ +/* Sets a super-cell and sub cell bounding boxes, including PBC shift.
+ + * The boxes of all GPU_NSUBCELL sub-cells of super-cell ci are copied
+ + * from bb to bb_ci with (shx, shy, shz) added to both corners.
+ + * With NBNXN_BBXXXX the boxes are stored in groups of STRIDE_PBB boxes,
+ + * with the same bound of all boxes in a group stored contiguously;
+ + * otherwise each box is stored separately as NNBSBB_B floats.
+ + */
+ +static void set_icell_bb_supersub(const float *bb, int ci,
+ +                                  real shx, real shy, real shz,
+ +                                  float *bb_ci)
+ +{
+ +    int ia, m, i;
+ +
+ +#ifdef NBNXN_BBXXXX
+ +    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+ +    for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
+ +    {
+ +        for (i = 0; i < STRIDE_PBB; i++)
+ +        {
+ +            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+ +            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+ +            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+ +            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+ +            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+ +            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
+ +        }
+ +    }
+ +#else
+ +    ia = ci*GPU_NSUBCELL*NNBSBB_B;
+ +    for (i = 0; i < GPU_NSUBCELL*NNBSBB_B; i += NNBSBB_B)
+ +    {
+ +        bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
+ +        bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
+ +        bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
+ +        bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
+ +        bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
+ +        bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
+ +    }
+ +#endif
+ +}
+ +
+ +/* Copies PBC shifted i-cell atom coordinates x,y,z to working array.
+ + * The NBNXN_CPU_CLUSTER_I_SIZE atoms of cluster ci are read from x, which
+ + * holds stride reals per atom, and stored in work->x_ci with STRIDE_XYZ
+ + * reals per atom. The na_c argument is not used by this variant.
+ + */
+ +static void icell_set_x_simple(int ci,
+ +                               real shx, real shy, real shz,
+ +                               int gmx_unused na_c,
+ +                               int stride, const real *x,
+ +                               nbnxn_list_work_t *work)
+ +{
+ +    int  ia, i;
+ +
+ +    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
+ +
+ +    for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
+ +    {
+ +        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
+ +        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
+ +        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
+ +    }
+ +}
+ +
+ +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array.
+ + * All GPU_NSUBCELL*na_c atoms of super-cell ci are read from x, which
+ + * holds stride reals per atom, and stored in work->x_ci with DIM reals
+ + * per atom.
+ + */
+ +static void icell_set_x_supersub(int ci,
+ +                                 real shx, real shy, real shz,
+ +                                 int na_c,
+ +                                 int stride, const real *x,
+ +                                 nbnxn_list_work_t *work)
+ +{
+ +    int  ia, i;
+ +    real *x_ci;
+ +
+ +    x_ci = work->x_ci;
+ +
+ +    ia = ci*GPU_NSUBCELL*na_c;
+ +    for (i = 0; i < GPU_NSUBCELL*na_c; i++)
+ +    {
+ +        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
+ +        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
+ +        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
+ +    }
+ +}
+ +
+ +#ifdef NBNXN_SEARCH_BB_SSE
+ +/* Copies PBC shifted super-cell packed atom coordinates to working array.
+ + * The atoms of super-cell ci are processed per sub-cell in groups of
+ + * STRIDE_PBB atoms. Within a group work->x_ci holds the STRIDE_PBB
+ + * x values first, then the y values, then the z values. The loop steps
+ + * through na_c in units of STRIDE_PBB, so na_c is presumably a multiple
+ + * of STRIDE_PBB - confirm with the caller.
+ + */
+ +static void icell_set_x_supersub_sse8(int ci,
+ +                                      real shx, real shy, real shz,
+ +                                      int na_c,
+ +                                      int stride, const real *x,
+ +                                      nbnxn_list_work_t *work)
+ +{
+ +    int  si, io, ia, i, j;
+ +    real *x_ci;
+ +
+ +    x_ci = work->x_ci;
+ +
+ +    for (si = 0; si < GPU_NSUBCELL; si++)
+ +    {
+ +        for (i = 0; i < na_c; i += STRIDE_PBB)
+ +        {
+ +            io = si*na_c + i;
+ +            ia = ci*GPU_NSUBCELL*na_c + io;
+ +            for (j = 0; j < STRIDE_PBB; j++)
+ +            {
+ +                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+ +                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+ +                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
+ +            }
+ +        }
+ +    }
+ +}
+ +#endif
+ +
+ +static real nbnxn_rlist_inc_nonloc_fac = 0.6;
+ +
+ +/* Due to the cluster size the effective pair-list is longer than
+ + * that of a simple atom pair-list. This function gives the extra distance.
+ + * The value is (0.5 + nbnxn_rlist_inc_nonloc_fac) times the square of
+ + * (cluster_size - 1)/cluster_size times the cube root of
+ + * cluster_size/atom_density, i.e. it scales with the linear size of the
+ + * volume taken up by one cluster at the given density.
+ + */
+ +real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density)
+ +{
+ +    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density), 1.0/3.0));
+ +}
+ +
+ +/* Estimates the interaction volume^2 for non-local interactions.
+ + * Only zones whose three shift components sum to exactly 1 contribute.
+ + * ls holds the sub-cell dimensions and r the interaction distance.
+ + * The result is the sum over those zones of a per-zone estimate that is
+ + * multiplied by the zone extent along the non-shifted dimensions.
+ + */
+ +static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
+ +{
+ +    int  z, d;
+ +    real cl, ca, za;
+ +    real vold_est;
+ +    real vol2_est_tot;
+ +
+ +    vol2_est_tot = 0;
+ +
+ +    /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
+ +     * not home interaction volume^2. As these volumes are not additive,
+ +     * this is an overestimate, but it would only be significant in the limit
+ +     * of small cells, where we anyhow need to split the lists into
+ +     * as small parts as possible.
+ +     */
+ +
+ +    for (z = 0; z < zones->n; z++)
+ +    {
+ +        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
+ +        {
+ +            /* For the dimensions without shift, accumulate half the
+ +             * summed sub-cell lengths (cl), the sub-cell face area (ca)
+ +             * and the zone face area (za).
+ +             */
+ +            cl = 0;
+ +            ca = 1;
+ +            za = 1;
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                if (zones->shift[z][d] == 0)
+ +                {
+ +                    cl += 0.5*ls[d];
+ +                    ca *= ls[d];
+ +                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
+ +                }
+ +            }
+ +
+ +            /* 4 octants of a sphere */
+ +            vold_est  = 0.25*M_PI*r*r*r*r;
+ +            /* 4 quarter pie slices on the edges */
+ +            vold_est += 4*cl*M_PI/6.0*r*r*r;
+ +            /* One rectangular volume on a face */
+ +            vold_est += ca*0.5*r*r;
+ +
+ +            vol2_est_tot += vold_est*za;
+ +        }
+ +    }
+ +
+ +    return vol2_est_tot;
+ +}
+ +
+ +/* Estimates the average size of a full j-list for super/sub setup.
+ + * The estimate is based on grid 0 of nbs, the pair-list cut-off rlist and
+ + * whether iloc refers to local or non-local interactions.
+ + * Returns -1 when no list splitting is needed, i.e. when
+ + * min_ci_balanced <= 0, or the grid already has at least min_ci_balanced
+ + * cells, or the grid is empty. Otherwise returns the target maximum
+ + * number of sub-cell pairs per list, which is at least 1 plus half
+ + * the number of pairs in a cj4 block.
+ + */
+ +static int get_nsubpair_max(const nbnxn_search_t nbs,
+ +                            int                  iloc,
+ +                            real                 rlist,
+ +                            int                  min_ci_balanced)
+ +{
+ +    const nbnxn_grid_t *grid;
+ +    rvec ls;
+ +    real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
+ +    int  nsubpair_max;
+ +
+ +    grid = &nbs->grid[0];
+ +
+ +    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
+ +    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
+ +    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
+ +
+ +    /* The average squared length of the diagonal of a sub cell */
+ +    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
+ +
+ +    /* The formulas below are a heuristic estimate of the average nsj per si */
+ +    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
+ +
+ +    if (!nbs->DomDec || nbs->zones->n == 1)
+ +    {
+ +        nsp_est_nl = 0;
+ +    }
+ +    else
+ +    {
+ +        nsp_est_nl =
+ +            sqr(grid->atom_density/grid->na_c)*
+ +            nonlocal_vol2(nbs->zones, ls, r_eff_sup);
+ +    }
+ +
+ +    if (LOCAL_I(iloc))
+ +    {
+ +        /* Sub-cell interacts with itself */
+ +        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
+ +        /* 6/2 rectangular volume on the faces */
+ +        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
+ +        /* 12/2 quarter pie slices on the edges */
+ +        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
+ +        /* 4 octants of a sphere */
+ +        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
+ +
+ +        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
+ +
+ +        /* Subtract the non-local pair count */
+ +        nsp_est -= nsp_est_nl;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
+ +                    nsp_est, nsp_est_nl);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        nsp_est = nsp_est_nl;
+ +    }
+ +
+ +    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
+ +    {
+ +        /* We don't need to worry */
+ +        nsubpair_max = -1;
+ +    }
+ +    else
+ +    {
+ +        /* Thus the (average) maximum j-list size should be as follows */
+ +        nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));
+ +
+ +        /* Since the target value is a maximum (this avoids high outliers,
+ +         * which lead to load imbalance), not average, we add half the
+ +         * number of pairs in a cj4 block to get the average about right.
+ +         */
+ +        nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
+ +                nsp_est, nsubpair_max);
+ +    }
+ +
+ +    return nsubpair_max;
+ +}
+ +
+ +/* Debug list print function for simple lists.
+ + * Writes one line to fp per ci entry with its cluster index, shift field
+ + * and number of j-clusters, followed by one line per j-cluster with its
+ + * index and exclusion mask in hexadecimal.
+ + */
+ +static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
+ +{
+ +    int i, j;
+ +
+ +    for (i = 0; i < nbl->nci; i++)
+ +    {
+ +        fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
+ +                nbl->ci[i].ci, nbl->ci[i].shift,
+ +                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
+ +
+ +        for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
+ +        {
+ +            fprintf(fp, "  cj %5d  imask %x\n",
+ +                    nbl->cj[j].cj,
+ +                    nbl->cj[j].excl);
+ +        }
+ +    }
+ +}
+ +
+ +/* Debug list print function for super/sub lists.
+ + * For every sci entry a header line is written to fp, then one line per
+ + * j-cluster of each cj4 group showing the imask of imei[0] of the group,
+ + * and finally a summary line that adds ncp, the number of bits set in
+ + * those masks over all j-clusters and sub-cells of the entry.
+ + */
+ +static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
+ +{
+ +    int i, j4, j, ncp, si;
+ +
+ +    for (i = 0; i < nbl->nsci; i++)
+ +    {
+ +        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
+ +                nbl->sci[i].sci, nbl->sci[i].shift,
+ +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
+ +
+ +        ncp = 0;
+ +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
+ +        {
+ +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
+ +            {
+ +                fprintf(fp, "  sj %5d  imask %x\n",
+ +                        nbl->cj4[j4].cj[j],
+ +                        nbl->cj4[j4].imei[0].imask);
+ +                for (si = 0; si < GPU_NSUBCELL; si++)
+ +                {
+ +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
+ +                    {
+ +                        ncp++;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
+ +                nbl->sci[i].sci, nbl->sci[i].shift,
+ +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
+ +                ncp);
+ +    }
+ +}
+ +
+ +/* Combine the nnbl pair lists in nbl, generated on multiple threads,
+ + * into the single list nblc. The data is appended after the entries
+ + * nblc already holds, and the cj4 and exclusion indices of the copied
+ + * entries are offset to their new positions. Only super/sub lists are
+ + * supported; a simple nblc triggers gmx_incons().
+ + */
+ +static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
+ +                            nbnxn_pairlist_t *nblc)
+ +{
+ +    int nsci, ncj4, nexcl;
+ +    int n, i;
+ +
+ +    if (nblc->bSimple)
+ +    {
+ +        gmx_incons("combine_nblists does not support simple lists");
+ +    }
+ +
+ +    nsci  = nblc->nsci;
+ +    ncj4  = nblc->ncj4;
+ +    nexcl = nblc->nexcl;
+ +    for (i = 0; i < nnbl; i++)
+ +    {
+ +        nsci  += nbl[i]->nsci;
+ +        ncj4  += nbl[i]->ncj4;
+ +        nexcl += nbl[i]->nexcl;
+ +    }
+ +
+ +    if (nsci > nblc->sci_nalloc)
+ +    {
+ +        nb_realloc_sci(nblc, nsci);
+ +    }
+ +    if (ncj4 > nblc->cj4_nalloc)
+ +    {
+ +        nblc->cj4_nalloc = over_alloc_small(ncj4);
+ +        nbnxn_realloc_void((void **)&nblc->cj4,
+ +                           nblc->ncj4*sizeof(*nblc->cj4),
+ +                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
+ +                           nblc->alloc, nblc->free);
+ +    }
+ +    if (nexcl > nblc->excl_nalloc)
+ +    {
+ +        nblc->excl_nalloc = over_alloc_small(nexcl);
+ +        nbnxn_realloc_void((void **)&nblc->excl,
+ +                           nblc->nexcl*sizeof(*nblc->excl),
+ +                           nblc->excl_nalloc*sizeof(*nblc->excl),
+ +                           nblc->alloc, nblc->free);
+ +    }
+ +
+ +    /* Each thread should copy its own data to the combined arrays,
+ +     * as otherwise data will go back and forth between different caches.
+ +     * The counts in nblc are only read inside this loop; they are
+ +     * updated after the loop has completed.
+ +     */
+ +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ +    for (n = 0; n < nnbl; n++)
+ +    {
+ +        int sci_offset;
+ +        int cj4_offset;
+ +        int ci_offset;
+ +        int excl_offset;
+ +        int i, j4;
+ +        const nbnxn_pairlist_t *nbli;
+ +
+ +        /* Determine the offset in the combined data for our thread.
+ +         * Note that ci_offset is accumulated but not used for copying.
+ +         */
+ +        sci_offset  = nblc->nsci;
+ +        cj4_offset  = nblc->ncj4;
+ +        ci_offset   = nblc->nci_tot;
+ +        excl_offset = nblc->nexcl;
+ +
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            sci_offset  += nbl[i]->nsci;
+ +            cj4_offset  += nbl[i]->ncj4;
+ +            ci_offset   += nbl[i]->nci_tot;
+ +            excl_offset += nbl[i]->nexcl;
+ +        }
+ +
+ +        nbli = nbl[n];
+ +
+ +        for (i = 0; i < nbli->nsci; i++)
+ +        {
+ +            nblc->sci[sci_offset+i]                = nbli->sci[i];
+ +            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
+ +            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
+ +        }
+ +
+ +        for (j4 = 0; j4 < nbli->ncj4; j4++)
+ +        {
+ +            nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
+ +            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
+ +            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
+ +        }
+ +
+ +        for (j4 = 0; j4 < nbli->nexcl; j4++)
+ +        {
+ +            nblc->excl[excl_offset+j4] = nbli->excl[j4];
+ +        }
+ +    }
+ +
+ +    for (n = 0; n < nnbl; n++)
+ +    {
+ +        nblc->nsci    += nbl[n]->nsci;
+ +        nblc->ncj4    += nbl[n]->ncj4;
+ +        nblc->nci_tot += nbl[n]->nci_tot;
+ +        nblc->nexcl   += nbl[n]->nexcl;
+ +    }
+ +}
+ +
+ +/* Returns the next ci to be processed by our thread.
+ + * Advances *ci and the position within the current block *ci_b by one.
+ + * When a block of ci_block cells is finished, *ci skips over the blocks
+ + * of the other nth-1 threads. Returns FALSE when *ci has passed the last
+ + * cell, grid->nc*conv. Otherwise *ci_x and *ci_y are advanced to the grid
+ + * column that contains *ci and TRUE is returned. conv scales the grid
+ + * cell indices to the index units of *ci.
+ + */
+ +static gmx_bool next_ci(const nbnxn_grid_t *grid,
+ +                        int conv,
+ +                        int nth, int ci_block,
+ +                        int *ci_x, int *ci_y,
+ +                        int *ci_b, int *ci)
+ +{
+ +    (*ci_b)++;
+ +    (*ci)++;
+ +
+ +    if (*ci_b == ci_block)
+ +    {
+ +        /* Jump to the next block assigned to this task */
+ +        *ci   += (nth - 1)*ci_block;
+ +        *ci_b  = 0;
+ +    }
+ +
+ +    if (*ci >= grid->nc*conv)
+ +    {
+ +        return FALSE;
+ +    }
+ +
+ +    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
+ +    {
+ +        *ci_y += 1;
+ +        if (*ci_y == grid->ncy)
+ +        {
+ +            *ci_x += 1;
+ +            *ci_y  = 0;
+ +        }
+ +    }
+ +
+ +    return TRUE;
+ +}
+ +
+ +/* Returns the distance^2 for which we put cell pairs in the list
+ + * without checking atom pair distances. This is usually < rlist^2.
+ + * The value is the square of rlist minus half the x/y diagonal of the
+ + * average cell size of gridi and gridj, clamped at zero. For non-simple
+ + * lists the sub-cell size is used instead of the cell size.
+ + */
+ +static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
+ +                                        const nbnxn_grid_t *gridj,
+ +                                        real                rlist,
+ +                                        gmx_bool            simple)
+ +{
+ +    /* If the distance between two sub-cell bounding boxes is less
+ +     * than this distance, do not check the distance between
+ +     * all particle pairs in the sub-cell, since then it is likely
+ +     * that the box pair has atom pairs within the cut-off.
+ +     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
+ +     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
+ +     * Using more than 0.5 gains at most 0.5%.
+ +     * If forces are calculated more than twice, the performance gain
+ +     * in the force calculation outweighs the cost of checking.
+ +     * Note that with subcell lists, the atom-pair distance check
+ +     * is only performed when only 1 out of 8 sub-cells is within range,
+ +     * this is because the GPU is much faster than the cpu.
+ +     */
+ +    real bbx, bby;
+ +    real rbb2;
+ +
+ +    bbx = 0.5*(gridi->sx + gridj->sx);
+ +    bby = 0.5*(gridi->sy + gridj->sy);
+ +    if (!simple)
+ +    {
+ +        bbx /= GPU_NSUBCELL_X;
+ +        bby /= GPU_NSUBCELL_Y;
+ +    }
+ +
+ +    rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
+ +
+ +#ifndef GMX_DOUBLE
+ +    return rbb2;
+ +#else
+ +    return (float)((1+GMX_FLOAT_EPS)*rbb2);
+ +#endif
+ +}
+ +
+ +static int get_ci_block_size(const nbnxn_grid_t *gridi,
+ +                             gmx_bool bDomDec, int nth)
+ +{
+ +    const int ci_block_enum      = 5;
+ +    const int ci_block_denom     = 11;
+ +    const int ci_block_min_atoms = 16;
+ +    int ci_block;
+ +
+ +    /* Here we decide how to distribute the blocks over the threads.
+ +     * The returned value is the number of consecutive i-cells that is
+ +     * assigned to a thread at a time.
+ +     * We use prime numbers to try to avoid that the grid size becomes
+ +     * a multiple of the number of threads, which would lead to some
+ +     * threads getting "inner" pairs and others getting boundary pairs,
+ +     * which in turn will lead to load imbalance between threads.
+ +     * Set the block size as 5/11/nth times the average number of cells
+ +     * in a y,z slab (nc/ncx). This should ensure a quite uniform
+ +     * distribution of the grid parts of the different threads along all
+ +     * three grid zone boundaries with 3D domain decomposition. At the
+ +     * same time the blocks will not become too small.
+ +     */
+ +    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
+ +
+ +    /* Ensure the blocks are not too small: avoids cache invalidation.
+ +     * A block should cover at least ci_block_min_atoms atoms.
+ +     */
+ +    if (ci_block*gridi->na_sc < ci_block_min_atoms)
+ +    {
+ +        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
+ +    }
+ +
+ +    /* Without domain decomposition
+ +     * or with less than 3 blocks per task, divide in nth blocks,
+ +     * rounding the block size up so that all nc cells are covered.
+ +     */
+ +    if (!bDomDec || ci_block*3*nth > gridi->nc)
+ +    {
+ +        ci_block = (gridi->nc + nth - 1)/nth;
+ +    }
+ +
+ +    return ci_block;
+ +}
+ +
+ +/* Generates the part of pair-list nbl assigned to our thread.
+ + *
+ + * The i-cells of gridi are visited through next_ci(), starting from
+ + * th*ci_block, so that thread th out of nth handles its own blocks
+ + * of ci_block i-cells. For every i-cell and every periodic shift that
+ + * can be within rlist, the range of gridj columns in x and y is
+ + * determined with get_cell_range() and, per column, the range cf..cl
+ + * of j-cells along z that can be in range. These candidate cells are
+ + * passed to the cluster-list routine selected by nb_kernel_type.
+ + * After all columns for a shift have been processed the topology
+ + * exclusions from excl are set and the i-entry is closed.
+ + *
+ + * nbs              search data: box, ePBC, dd_dim and icell_set_x are read
+ + * gridi, gridj     the grids supplying the i- and j-cells
+ + * work             per-thread work data: cycle counter, buffer flags
+ + *                  and the distance check count (ndistc) are written
+ + * nbat             atom data, coordinates are read from nbat->x
+ + * excl             exclusions, passed on to set_(s)ci_top_excls
+ + * rlist            pair-list cut-off, stored in nbl->rlist
+ + * nb_kernel_type   selects the cluster-list routine and the j-cluster size
+ + * ci_block         i-cell block size; overridden by conv_i - 1 when
+ + *                  a simple list is made from a non-simple i-grid
+ + * bFBufferFlag     when set, flag the buffer blocks touched by this list
+ + *                  with bit th in work->buffer_flags
+ + * nsubpair_max, progBal, min_ci_balanced
+ + *                  only passed on to close_ci_entry_supersub
+ + * th, nth          index of this thread/list and the total count
+ + * nbl              the pair-list that is filled
+ + *
+ + * Calls gmx_incons() when gridj and nbl disagree on bSimple.
+ + */
+ +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
+ +                                     const nbnxn_grid_t *gridi,
+ +                                     const nbnxn_grid_t *gridj,
+ +                                     nbnxn_search_work_t *work,
+ +                                     const nbnxn_atomdata_t *nbat,
+ +                                     const t_blocka *excl,
+ +                                     real rlist,
+ +                                     int nb_kernel_type,
+ +                                     int ci_block,
+ +                                     gmx_bool bFBufferFlag,
+ +                                     int nsubpair_max,
+ +                                     gmx_bool progBal,
+ +                                     int min_ci_balanced,
+ +                                     int th, int nth,
+ +                                     nbnxn_pairlist_t *nbl)
+ +{
+ +    int  na_cj_2log;
+ +    matrix box;
+ +    real rl2;
+ +    float rbb2;
+ +    int  d;
+ +    int  ci_b, ci, ci_x, ci_y, ci_xy, cj;
+ +    ivec shp; /* number of periodic shifts to search in each dimension */
+ +    int  tx, ty, tz;
+ +    int  shift;
+ +    gmx_bool bMakeList; /* NOTE(review): not referenced in this function */
+ +    real shx, shy, shz;
+ +    int  conv_i, cell0_i;
+ +    const float *bb_i, *bbcz_i, *bbcz_j;
+ +    const int *flags_i;
+ +    real bx0, bx1, by0, by1, bz0, bz1;
+ +    real bz1_frac;
+ +    real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
+ +    int  cxf, cxl, cyf, cyf_x, cyl;
+ +    int  cx, cy;
+ +    int  c0, c1, cs, cf, cl;
+ +    int  ndistc;
+ +    int  ncpcheck;
+ +    int  gridi_flag_shift = 0, gridj_flag_shift = 0;
+ +    unsigned *gridj_flag  = NULL;
+ +    int  ncj_old_i, ncj_old_j;
+ +
+ +    nbs_cycle_start(&work->cc[enbsCCsearch]);
+ +
+ +    if (gridj->bSimple != nbl->bSimple)
+ +    {
+ +        gmx_incons("Grid incompatible with pair-list");
+ +    }
+ +
+ +    sync_work(nbl);
+ +    nbl->na_sc = gridj->na_sc;
+ +    nbl->na_ci = gridj->na_c;
+ +    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
+ +    na_cj_2log = get_2log(nbl->na_cj);
+ +
+ +    nbl->rlist  = rlist;
+ +
+ +    if (bFBufferFlag)
+ +    {
+ +        /* Determine conversion of clusters to flag blocks:
+ +         * the smallest shifts for which the i- and j-cluster sizes,
+ +         * shifted left, reach NBNXN_BUFFERFLAG_SIZE.
+ +         */
+ +        gridi_flag_shift = 0;
+ +        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
+ +        {
+ +            gridi_flag_shift++;
+ +        }
+ +        gridj_flag_shift = 0;
+ +        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
+ +        {
+ +            gridj_flag_shift++;
+ +        }
+ +
+ +        gridj_flag = work->buffer_flags.flag;
+ +    }
+ +
+ +    copy_mat(nbs->box, box);
+ +
+ +    rl2 = nbl->rlist*nbl->rlist;
+ +
+ +    rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
+ +    }
+ +
+ +    /* Set the shift range */
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        /* Check if we need periodicity shifts.
+ +         * Without PBC or with domain decomposition we don't need them.
+ +         */
+ +        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
+ +        {
+ +            shp[d] = 0;
+ +        }
+ +        else
+ +        {
+ +            /* Along x two shifts are searched when the box x-length,
+ +             * reduced by the x-components of the y and z box vectors,
+ +             * is smaller than the list cut-off.
+ +             */
+ +            if (d == XX &&
+ +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
+ +            {
+ +                shp[d] = 2;
+ +            }
+ +            else
+ +            {
+ +                shp[d] = 1;
+ +            }
+ +        }
+ +    }
+ +
+ +    if (nbl->bSimple && !gridi->bSimple)
+ +    {
+ +        conv_i  = gridi->na_sc/gridj->na_sc;
+ +        bb_i    = gridi->bb_simple;
+ +        bbcz_i  = gridi->bbcz_simple;
+ +        flags_i = gridi->flags_simple;
+ +    }
+ +    else
+ +    {
+ +        conv_i  = 1;
+ +        bb_i    = gridi->bb;
+ +        bbcz_i  = gridi->bbcz;
+ +        flags_i = gridi->flags;
+ +    }
+ +    cell0_i = gridi->cell0*conv_i;
+ +
+ +    bbcz_j = gridj->bbcz;
+ +
+ +    if (conv_i != 1)
+ +    {
+ +        /* Blocks of the conversion factor - 1 give a large repeat count
+ +         * combined with a small block size. This should result in good
+ +         * load balancing for both small and large domains.
+ +         */
+ +        ci_block = conv_i - 1;
+ +    }
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
+ +                gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
+ +    }
+ +
+ +    ndistc   = 0;
+ +    ncpcheck = 0;
+ +
+ +    /* Initialize ci_b and ci to one before where we want them to start,
+ +     * as they will both be incremented in next_ci.
+ +     */
+ +    ci_b = -1;
+ +    ci   = th*ci_block - 1;
+ +    ci_x = 0;
+ +    ci_y = 0;
+ +    while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
+ +    {
+ +        if (nbl->bSimple && flags_i[ci] == 0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        ncj_old_i = nbl->ncj; /* to detect below if this ci added j-entries */
+ +
+ +        d2cx = 0;
+ +        if (gridj != gridi && shp[XX] == 0)
+ +        {
+ +            if (nbl->bSimple)
+ +            {
+ +                bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
+ +            }
+ +            else
+ +            {
+ +                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
+ +            }
+ +            if (bx1 < gridj->c0[XX])
+ +            {
+ +                d2cx = sqr(gridj->c0[XX] - bx1);
+ +
+ +                if (d2cx >= rl2)
+ +                {
+ +                    continue;
+ +                }
+ +            }
+ +        }
+ +
+ +        ci_xy = ci_x*gridi->ncy + ci_y;
+ +
+ +        /* Loop over shift vectors in three dimensions */
+ +        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
+ +        {
+ +            shz = tz*box[ZZ][ZZ];
+ +
+ +            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
+ +            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
+ +
+ +            if (tz == 0)
+ +            {
+ +                d2z = 0;
+ +            }
+ +            else if (tz < 0)
+ +            {
+ +                d2z = sqr(bz1);
+ +            }
+ +            else
+ +            {
+ +                d2z = sqr(bz0 - box[ZZ][ZZ]);
+ +            }
+ +
+ +            d2z_cx = d2z + d2cx;
+ +
+ +            if (d2z_cx >= rl2)
+ +            {
+ +                continue;
+ +            }
+ +
+ +            bz1_frac =
+ +                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
+ +            if (bz1_frac < 0)
+ +            {
+ +                bz1_frac = 0;
+ +            }
+ +            /* The check with bz1_frac close to or larger than 1 comes later */
+ +
+ +            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
+ +            {
+ +                shy = ty*box[YY][YY] + tz*box[ZZ][YY];
+ +
+ +                if (nbl->bSimple)
+ +                {
+ +                    by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
+ +                    by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
+ +                }
+ +                else
+ +                {
+ +                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
+ +                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
+ +                }
+ +
+ +                get_cell_range(by0, by1,
+ +                               gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
+ +                               d2z_cx, rl2,
+ +                               &cyf, &cyl);
+ +
+ +                if (cyf > cyl)
+ +                {
+ +                    continue;
+ +                }
+ +
+ +                d2z_cy = d2z;
+ +                if (by1 < gridj->c0[YY])
+ +                {
+ +                    d2z_cy += sqr(gridj->c0[YY] - by1);
+ +                }
+ +                else if (by0 > gridj->c1[YY])
+ +                {
+ +                    d2z_cy += sqr(by0 - gridj->c1[YY]);
+ +                }
+ +
+ +                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
+ +                {
+ +                    shift = XYZ2IS(tx, ty, tz);
+ +
+ +#ifdef NBNXN_SHIFT_BACKWARD
+ +                    if (gridi == gridj && shift > CENTRAL)
+ +                    {
+ +                        continue;
+ +                    }
+ +#endif
+ +
+ +                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
+ +
+ +                    if (nbl->bSimple)
+ +                    {
+ +                        bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
+ +                        bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
+ +                    }
+ +                    else
+ +                    {
+ +                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
+ +                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
+ +                    }
+ +
+ +                    get_cell_range(bx0, bx1,
+ +                                   gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
+ +                                   d2z_cy, rl2,
+ +                                   &cxf, &cxl);
+ +
+ +                    if (cxf > cxl)
+ +                    {
+ +                        continue;
+ +                    }
+ +
+ +                    if (nbl->bSimple)
+ +                    {
+ +                        new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci]);
+ +                    }
+ +                    else
+ +                    {
+ +                        new_sci_entry(nbl, cell0_i+ci, shift);
+ +                    }
+ +
+ +#ifndef NBNXN_SHIFT_BACKWARD
+ +                    if (cxf < ci_x)
+ +#else
+ +                    if (shift == CENTRAL && gridi == gridj &&
+ +                        cxf < ci_x)
+ +#endif
+ +                    {
+ +                        /* Leave the pairs with i > j.
+ +                         * x is the major index, so skip half of it.
+ +                         */
+ +                        cxf = ci_x;
+ +                    }
+ +
+ +                    if (nbl->bSimple)
+ +                    {
+ +                        set_icell_bb_simple(bb_i, ci, shx, shy, shz,
+ +                                            nbl->work->bb_ci);
+ +                    }
+ +                    else
+ +                    {
+ +                        set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
+ +                                              nbl->work->bb_ci);
+ +                    }
+ +
+ +                    nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
+ +                                     gridi->na_c, nbat->xstride, nbat->x,
+ +                                     nbl->work);
+ +
+ +                    for (cx = cxf; cx <= cxl; cx++)
+ +                    {
+ +                        d2zx = d2z;
+ +                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
+ +                        {
+ +                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
+ +                        }
+ +                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
+ +                        {
+ +                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
+ +                        }
+ +
+ +#ifndef NBNXN_SHIFT_BACKWARD
+ +                        if (gridi == gridj &&
+ +                            cx == 0 && cyf < ci_y)
+ +#else
+ +                        if (gridi == gridj &&
+ +                            cx == 0 && shift == CENTRAL && cyf < ci_y)
+ +#endif
+ +                        {
+ +                            /* Leave the pairs with i > j.
+ +                             * Skip half of y when i and j have the same x.
+ +                             */
+ +                            cyf_x = ci_y;
+ +                        }
+ +                        else
+ +                        {
+ +                            cyf_x = cyf;
+ +                        }
+ +
+ +                        for (cy = cyf_x; cy <= cyl; cy++)
+ +                        {
+ +                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
+ +                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
+ +#ifdef NBNXN_SHIFT_BACKWARD
+ +                            if (gridi == gridj &&
+ +                                shift == CENTRAL && c0 < ci)
+ +                            {
+ +                                c0 = ci;
+ +                            }
+ +#endif
+ +
+ +                            d2zxy = d2zx;
+ +                            if (gridj->c0[YY] + cy*gridj->sy > by1)
+ +                            {
+ +                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
+ +                            }
+ +                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
+ +                            {
+ +                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
+ +                            }
+ +                            if (c1 > c0 && d2zxy < rl2)
+ +                            {
+ +                                cs = c0 + (int)(bz1_frac*(c1 - c0)); /* start guess within column c0..c1-1 */
+ +                                if (cs >= c1)
+ +                                {
+ +                                    cs = c1 - 1;
+ +                                }
+ +
+ +                                d2xy = d2zxy - d2z;
+ +
+ +                                /* Find the lowest cell that can possibly
+ +                                 * be within range.
+ +                                 */
+ +                                cf = cs;
+ +                                while (cf > c0 &&
+ +                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
+ +                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
+ +                                {
+ +                                    cf--;
+ +                                }
+ +
+ +                                /* Find the highest cell that can possibly
+ +                                 * be within range.
+ +                                 */
+ +                                cl = cs;
+ +                                while (cl < c1-1 &&
+ +                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
+ +                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
+ +                                {
+ +                                    cl++;
+ +                                }
+ +
+ +#ifdef NBNXN_REFCODE
+ +                                {
+ +                                    /* Simple reference code, for debugging,
+ +                                     * overrides the more complex code above.
+ +                                     * NOTE(review): bb is not declared in this
+ +                                     * function; presumably the j-grid bounding
+ +                                     * boxes are meant - confirm before
+ +                                     * building with NBNXN_REFCODE.
+ +                                     */
+ +                                    int k;
+ +                                    cf = c1;
+ +                                    cl = -1;
+ +                                    for (k = c0; k < c1; k++)
+ +                                    {
+ +                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
+ +                                                      bb+k*NNBSBB_B) < rl2 &&
+ +                                            k < cf)
+ +                                        {
+ +                                            cf = k;
+ +                                        }
+ +                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
+ +                                                      bb+k*NNBSBB_B) < rl2 &&
+ +                                            k > cl)
+ +                                        {
+ +                                            cl = k;
+ +                                        }
+ +                                    }
+ +                                }
+ +#endif
+ +
+ +                                if (gridi == gridj)
+ +                                {
+ +                                    /* We want each atom/cell pair only once,
+ +                                     * only use cj >= ci.
+ +                                     */
+ +#ifndef NBNXN_SHIFT_BACKWARD
+ +                                    cf = max(cf, ci);
+ +#else
+ +                                    if (shift == CENTRAL)
+ +                                    {
+ +                                        cf = max(cf, ci);
+ +                                    }
+ +#endif
+ +                                }
+ +
+ +                                if (cf <= cl)
+ +                                {
+ +                                    /* For f buffer flags with simple lists */
+ +                                    ncj_old_j = nbl->ncj;
+ +
+ +                                    switch (nb_kernel_type)
+ +                                    {
+ +                                        case nbnxnk4x4_PlainC:
+ +                                            check_subcell_list_space_simple(nbl, cl-cf+1);
+ +
+ +                                            make_cluster_list_simple(gridj,
+ +                                                                     nbl, ci, cf, cl,
+ +                                                                     (gridi == gridj && shift == CENTRAL),
+ +                                                                     nbat->x,
+ +                                                                     rl2, rbb2,
+ +                                                                     &ndistc);
+ +                                            break;
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +                                        case nbnxnk4xN_SIMD_4xN:
+ +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
+ +                                            make_cluster_list_simd_4xn(gridj,
+ +                                                                       nbl, ci, cf, cl,
+ +                                                                       (gridi == gridj && shift == CENTRAL),
+ +                                                                       nbat->x,
+ +                                                                       rl2, rbb2,
+ +                                                                       &ndistc);
+ +                                            break;
+ +#endif
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +                                        case nbnxnk4xN_SIMD_2xNN:
+ +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
+ +                                            make_cluster_list_simd_2xnn(gridj,
+ +                                                                        nbl, ci, cf, cl,
+ +                                                                        (gridi == gridj && shift == CENTRAL),
+ +                                                                        nbat->x,
+ +                                                                        rl2, rbb2,
+ +                                                                        &ndistc);
+ +                                            break;
+ +#endif
+ +                                        case nbnxnk8x8x8_PlainC:
+ +                                        case nbnxnk8x8x8_CUDA:
+ +                                            check_subcell_list_space_supersub(nbl, cl-cf+1);
+ +                                            for (cj = cf; cj <= cl; cj++)
+ +                                            {
+ +                                                make_cluster_list_supersub(gridi, gridj,
+ +                                                                           nbl, ci, cj,
+ +                                                                           (gridi == gridj && shift == CENTRAL && ci == cj),
+ +                                                                           nbat->xstride, nbat->x,
+ +                                                                           rl2, rbb2,
+ +                                                                           &ndistc);
+ +                                            }
+ +                                            break;
+ +                                    }
+ +                                    ncpcheck += cl - cf + 1;
+ +
+ +                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
+ +                                    {
+ +                                        int cbf, cbl, cb;
+ +
+ +                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
+ +                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
+ +                                        for (cb = cbf; cb <= cbl; cb++)
+ +                                        {
+ +                                            gridj_flag[cb] = 1U<<th; /* mark flag block cb with the bit of list th */
+ +                                        }
+ +                                    }
+ +                                }
+ +                            }
+ +                        }
+ +                    }
+ +
+ +                    /* Set the exclusions for this ci list */
+ +                    if (nbl->bSimple)
+ +                    {
+ +                        set_ci_top_excls(nbs,
+ +                                         nbl,
+ +                                         shift == CENTRAL && gridi == gridj,
+ +                                         gridj->na_c_2log,
+ +                                         na_cj_2log,
+ +                                         &(nbl->ci[nbl->nci]),
+ +                                         excl);
+ +                    }
+ +                    else
+ +                    {
+ +                        set_sci_top_excls(nbs,
+ +                                          nbl,
+ +                                          shift == CENTRAL && gridi == gridj,
+ +                                          gridj->na_c_2log,
+ +                                          &(nbl->sci[nbl->nsci]),
+ +                                          excl);
+ +                    }
+ +
+ +                    /* Close this ci list */
+ +                    if (nbl->bSimple)
+ +                    {
+ +                        close_ci_entry_simple(nbl);
+ +                    }
+ +                    else
+ +                    {
+ +                        close_ci_entry_supersub(nbl,
+ +                                                nsubpair_max,
+ +                                                progBal, min_ci_balanced,
+ +                                                th, nth);
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        if (bFBufferFlag && nbl->ncj > ncj_old_i)
+ +        {
+ +            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
+ +        }
+ +    }
+ +
+ +    work->ndistc = ndistc;
+ +
+ +    nbs_cycle_stop(&work->cc[enbsCCsearch]);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "number of distance checks %d\n", ndistc);
+ +        fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
+ +                ncpcheck);
+ +
+ +        if (nbl->bSimple)
+ +        {
+ +            print_nblist_statistics_simple(debug, nbl, nbs, rlist);
+ +        }
+ +        else
+ +        {
+ +            print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
+ +        }
+ +
+ +    }
+ +}
+ +
+ +/* Combine the per-thread buffer flags into dest: the flags of the
+ + * first nsrc work structures of nbs are OR-ed, entry by entry,
+ + * into dest->flag for all dest->nflag entries.
+ + */
+ +static void reduce_buffer_flags(const nbnxn_search_t        nbs,
+ +                                int                         nsrc,
+ +                                const nbnxn_buffer_flags_t *dest)
+ +{
+ +    int             src, ind;
+ +    const unsigned *src_flag;
+ +
+ +    for (src = 0; src < nsrc; src++)
+ +    {
+ +        src_flag = nbs->work[src].buffer_flags.flag;
+ +
+ +        ind = 0;
+ +        while (ind < dest->nflag)
+ +        {
+ +            dest->flag[ind] = dest->flag[ind] | src_flag[ind];
+ +            ind++;
+ +        }
+ +    }
+ +}
+ +
+ +/* Print, to the debug file, statistics on the buffer flags:
+ + * per flag entry the average number of set output bits (elem) and the
+ + * fractions where only output 0 is set (keep), where a single other
+ + * output is set (copy) and the summed bit count of the remaining
+ + * non-zero entries (red). Only the lowest nout bits are counted.
+ + */
+ +static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
+ +{
+ +    int      n_elem = 0, n_keep = 0, n_copy = 0, n_red = 0;
+ +    int      ind, bit, nset;
+ +    unsigned mask;
+ +    double   nflag;
+ +
+ +    for (ind = 0; ind < flags->nflag; ind++)
+ +    {
+ +        mask = flags->flag[ind];
+ +
+ +        if (mask == 0)
+ +        {
+ +            /* Nothing is set for this entry, it does not contribute */
+ +            continue;
+ +        }
+ +        if (mask == 1)
+ +        {
+ +            /* Only flag 0 is set, no copy or reduction required */
+ +            n_elem++;
+ +            n_keep++;
+ +            continue;
+ +        }
+ +
+ +        /* Count how many of the nout outputs have their bit set */
+ +        nset = 0;
+ +        for (bit = 0; bit < nout; bit++)
+ +        {
+ +            nset += ((mask & (1U<<bit)) != 0) ? 1 : 0;
+ +        }
+ +
+ +        n_elem += nset;
+ +        if (nset == 1)
+ +        {
+ +            n_copy++;
+ +        }
+ +        else
+ +        {
+ +            n_red += nset;
+ +        }
+ +    }
+ +
+ +    nflag = (double)(flags->nflag);
+ +
+ +    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
+ +            flags->nflag, nout,
+ +            n_elem/nflag,
+ +            n_keep/nflag,
+ +            n_copy/nflag,
+ +            n_red/nflag);
+ +}
+ +
+ +/* Counting sort of the sci entries of nbl on their cj4 count,
+ + * largest first, so the smaller lists end up at the end.
+ + * On the GPU large lists are then scheduled and executed first and
+ + * the small lists remain for the end, where load balancing between
+ + * multi-processors happens and small lists balance more effectively.
+ + * Sorting on the cj4 count instead of on the actual pair count is
+ + * faster and also results in better load balancing than sorting on
+ + * the exact load.
+ + * Counts of m = twice the average or more all share the last bin.
+ + * The sorted entries are written to the work buffer, after which
+ + * the buffer and list pointers are swapped to avoid a copy.
+ + */
+ +static void sort_sci(nbnxn_pairlist_t *nbl)
+ +{
+ +    nbnxn_list_work_t *lwork;
+ +    nbnxn_sci_t       *sorted;
+ +    int                m, bin, e, count, offset;
+ +
+ +    /* With nsci = 0 or all sci of size 1 sorting won't change the order */
+ +    if (nbl->ncj4 <= nbl->nsci)
+ +    {
+ +        return;
+ +    }
+ +
+ +    lwork = nbl->work;
+ +
+ +    /* We will distinguish differences up to double the average */
+ +    m = (2*nbl->ncj4)/nbl->nsci;
+ +
+ +    /* Make sure there is room for the m + 1 bins */
+ +    if (m + 1 > lwork->sort_nalloc)
+ +    {
+ +        lwork->sort_nalloc = over_alloc_large(m + 1);
+ +        srenew(lwork->sort, lwork->sort_nalloc);
+ +    }
+ +
+ +    /* The sort buffer should have the same size as the sci list */
+ +    if (lwork->sci_sort_nalloc != nbl->sci_nalloc)
+ +    {
+ +        lwork->sci_sort_nalloc = nbl->sci_nalloc;
+ +        nbnxn_realloc_void((void **)&lwork->sci_sort,
+ +                           0,
+ +                           lwork->sci_sort_nalloc*sizeof(*lwork->sci_sort),
+ +                           nbl->alloc, nbl->free);
+ +    }
+ +
+ +    /* Count the entries of each size */
+ +    for (bin = 0; bin <= m; bin++)
+ +    {
+ +        lwork->sort[bin] = 0;
+ +    }
+ +    for (e = 0; e < nbl->nsci; e++)
+ +    {
+ +        bin = min(m, nbl->sci[e].cj4_ind_end - nbl->sci[e].cj4_ind_start);
+ +        lwork->sort[bin]++;
+ +    }
+ +
+ +    /* Turn the counts into start offsets: bin m starts at 0 and each
+ +     * smaller bin starts after all entries of the larger bins.
+ +     */
+ +    offset = 0;
+ +    for (bin = m; bin >= 0; bin--)
+ +    {
+ +        count            = lwork->sort[bin];
+ +        lwork->sort[bin] = offset;
+ +        offset          += count;
+ +    }
+ +
+ +    /* Sort entries directly into place */
+ +    sorted = lwork->sci_sort;
+ +    for (e = 0; e < nbl->nsci; e++)
+ +    {
+ +        bin = min(m, nbl->sci[e].cj4_ind_end - nbl->sci[e].cj4_ind_start);
+ +        sorted[lwork->sort[bin]] = nbl->sci[e];
+ +        lwork->sort[bin]++;
+ +    }
+ +
+ +    /* Swap the sci pointers so we use the new, sorted list */
+ +    lwork->sci_sort = nbl->sci;
+ +    nbl->sci        = sorted;
+ +}
+ +
+ +/* Make a local or non-local pair-list, depending on iloc */
+ +void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
+ +                         nbnxn_atomdata_t     *nbat,
+ +                         const t_blocka       *excl,
+ +                         real                  rlist,
+ +                         int                   min_ci_balanced,
+ +                         nbnxn_pairlist_set_t *nbl_list,
+ +                         int                   iloc,
+ +                         int                   nb_kernel_type,
+ +                         t_nrnb               *nrnb)
+ +{
+ +    nbnxn_grid_t *gridi, *gridj;
+ +    gmx_bool bGPUCPU;
+ +    int nzi, zi, zj0, zj1, zj;
+ +    int nsubpair_max;
+ +    int th;
+ +    int nnbl;
+ +    nbnxn_pairlist_t **nbl;
+ +    int ci_block;
+ +    gmx_bool CombineNBLists;
+ +    gmx_bool progBal;
+ +    int np_tot, np_noq, np_hlj, nap;
+ +
+ +    /* Check if we are running hybrid GPU + CPU nbnxn mode */
+ +    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
+ +
+ +    nnbl            = nbl_list->nnbl;
+ +    nbl             = nbl_list->nbl;
+ +    CombineNBLists  = nbl_list->bCombined;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "ns making %d nblists\n", nnbl);
+ +    }
+ +
+ +    nbat->bUseBufferFlags = (nbat->nout > 1);
+ +    /* We should re-init the flags before making the first list */
+ +    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
+ +    {
+ +        init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
+ +    }
+ +
+ +    if (nbl_list->bSimple)
+ +    {
+ +        switch (nb_kernel_type)
+ +        {
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +            case nbnxnk4xN_SIMD_4xN:
+ +                nbs->icell_set_x = icell_set_x_simd_4xn;
+ +                break;
+ +#endif
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +            case nbnxnk4xN_SIMD_2xNN:
+ +                nbs->icell_set_x = icell_set_x_simd_2xnn;
+ +                break;
+ +#endif
+ +            default:
+ +                nbs->icell_set_x = icell_set_x_simple;
+ +                break;
+ +        }
+ +    }
+ +    else
+ +    {
+ +#ifdef NBNXN_SEARCH_BB_SSE
+ +        nbs->icell_set_x = icell_set_x_supersub_sse8;
+ +#else
+ +        nbs->icell_set_x = icell_set_x_supersub;
+ +#endif
+ +    }
+ +
+ +    if (LOCAL_I(iloc))
+ +    {
+ +        /* Only zone (grid) 0 vs 0 */
+ +        nzi = 1;
+ +        zj0 = 0;
+ +        zj1 = 1;
+ +    }
+ +    else
+ +    {
+ +        nzi = nbs->zones->nizone;
+ +    }
+ +
+ +    if (!nbl_list->bSimple && min_ci_balanced > 0)
+ +    {
+ +        nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
+ +    }
+ +    else
+ +    {
+ +        nsubpair_max = 0;
+ +    }
+ +
+ +    /* Clear all pair-lists */
+ +    for (th = 0; th < nnbl; th++)
+ +    {
+ +        clear_pairlist(nbl[th]);
+ +    }
+ +
+ +    for (zi = 0; zi < nzi; zi++)
+ +    {
+ +        gridi = &nbs->grid[zi];
+ +
+ +        if (NONLOCAL_I(iloc))
+ +        {
+ +            zj0 = nbs->zones->izone[zi].j0;
+ +            zj1 = nbs->zones->izone[zi].j1;
+ +            if (zi == 0)
+ +            {
+ +                zj0++;
+ +            }
+ +        }
+ +        for (zj = zj0; zj < zj1; zj++)
+ +        {
+ +            gridj = &nbs->grid[zj];
+ +
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
+ +            }
+ +
+ +            nbs_cycle_start(&nbs->cc[enbsCCsearch]);
+ +
+ +            if (nbl[0]->bSimple && !gridi->bSimple)
+ +            {
+ +                /* Hybrid list, determine blocking later */
+ +                ci_block = 0;
+ +            }
+ +            else
+ +            {
+ +                ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
+ +            }
+ +
+ +#pragma omp parallel for num_threads(nnbl) schedule(static)
+ +            for (th = 0; th < nnbl; th++)
+ +            {
+ +                /* Re-init the thread-local work flag data before making
+ +                 * the first list (not an elegant conditional).
+ +                 */
+ +                if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
+ +                                              (bGPUCPU && zi == 0 && zj == 1)))
+ +                {
+ +                    init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
+ +                }
+ +
+ +                if (CombineNBLists && th > 0)
+ +                {
+ +                    clear_pairlist(nbl[th]);
+ +                }
+ +
+ +                /* With GPU: generate progressively smaller lists for
+ +                 * load balancing for local only or non-local with 2 zones.
+ +                 */
+ +                progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
+ +
+ +                /* Divide the i super cell equally over the nblists */
+ +                nbnxn_make_pairlist_part(nbs, gridi, gridj,
+ +                                         &nbs->work[th], nbat, excl,
+ +                                         rlist,
+ +                                         nb_kernel_type,
+ +                                         ci_block,
+ +                                         nbat->bUseBufferFlags,
+ +                                         nsubpair_max,
+ +                                         progBal, min_ci_balanced,
+ +                                         th, nnbl,
+ +                                         nbl[th]);
+ +            }
+ +            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
+ +
+ +            np_tot = 0;
+ +            np_noq = 0;
+ +            np_hlj = 0;
+ +            for (th = 0; th < nnbl; th++)
+ +            {
+ +                inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
+ +
+ +                if (nbl_list->bSimple)
+ +                {
+ +                    np_tot += nbl[th]->ncj;
+ +                    np_noq += nbl[th]->work->ncj_noq;
+ +                    np_hlj += nbl[th]->work->ncj_hlj;
+ +                }
+ +                else
+ +                {
+ +                    /* This count ignores potential subsequent pair pruning */
+ +                    np_tot += nbl[th]->nci_tot;
+ +                }
+ +            }
+ +            nap                   = nbl[0]->na_ci*nbl[0]->na_cj;
+ +            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
+ +            nbl_list->natpair_lj  = np_noq*nap;
+ +            nbl_list->natpair_q   = np_hlj*nap/2;
+ +
+ +            if (CombineNBLists && nnbl > 1)
+ +            {
+ +                nbs_cycle_start(&nbs->cc[enbsCCcombine]);
+ +
+ +                combine_nblists(nnbl-1, nbl+1, nbl[0]);
+ +
+ +                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
+ +            }
+ +        }
+ +    }
+ +
+ +    if (!nbl_list->bSimple)
+ +    {
+ +        /* Sort the entries on size, large ones first */
+ +        if (CombineNBLists || nnbl == 1)
+ +        {
+ +            sort_sci(nbl[0]);
+ +        }
+ +        else
+ +        {
+ +#pragma omp parallel for num_threads(nnbl) schedule(static)
+ +            for (th = 0; th < nnbl; th++)
+ +            {
+ +                sort_sci(nbl[th]);
+ +            }
+ +        }
+ +    }
+ +
+ +    if (nbat->bUseBufferFlags)
+ +    {
+ +        reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
+ +    }
+ +
+ +    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
+ +    if (LOCAL_I(iloc))
+ +    {
+ +        nbs->search_count++;
+ +    }
+ +    if (nbs->print_cycles &&
+ +        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
+ +        nbs->search_count % 100 == 0)
+ +    {
+ +        nbs_cycle_print(stderr, nbs);
+ +    }
+ +
+ +    if (debug && (CombineNBLists && nnbl > 1))
+ +    {
+ +        if (nbl[0]->bSimple)
+ +        {
+ +            print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
+ +        }
+ +        else
+ +        {
+ +            print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
+ +        }
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        if (gmx_debug_at)
+ +        {
+ +            if (nbl[0]->bSimple)
+ +            {
+ +                print_nblist_ci_cj(debug, nbl[0]);
+ +            }
+ +            else
+ +            {
+ +                print_nblist_sci_cj(debug, nbl[0]);
+ +            }
+ +        }
+ +
+ +        if (nbat->bUseBufferFlags)
+ +        {
+ +            print_reduction_cost(&nbat->buffer_flags, nnbl);
+ +        }
+ +    }
+ +}
diff --cc src/gromacs/mdlib/nbnxn_search_simd_2xnn.h

index 8f173cdf652d93e03188478b31e09d3ba9d08f7d,0000000000000000000000000000000000000000..d6e4fea39f60a15f56f053fa94babb6cf43e1c01

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
@@@ -1,317 -1,0 +1,301 @@@
- #if GMX_NBNXN_SIMD_BITWIDTH != 256
- #error "unsupported SIMD width"
- #endif
- 
- #include "gmx_simd_macros.h"
- 
- /* Define a few macros for half-width SIMD */
- #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
- /* Half-width SIMD real type */
- #define gmx_mm_hpr  __m128
- /* Half-width SIMD operations */
- /* Load reals at half-width aligned pointer b into half-width SIMD register a */
- #define gmx_load_hpr(a,b)       a = _mm_load_ps(b)
- #define gmx_set1_hpr                _mm_set1_ps
- /* Load reals at half-width aligned pointer b into two halves of a */
- #define gmx_loaddh_pr(a, b)     a = gmx_mm256_load4_ps(b)
- /* Store half width SIMD registers b and c in ful width register a */
- #define gmx_2hpr_to_pr(a, b, c) a = _mm256_insertf128_ps(_mm256_castps128_ps256(b), c, 0x1)
- #else
- #error "Half-width SIMD macros are not yet defined"
- #endif
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
-     gmx_load_hpr(a_S, a);
++/* Get the half-width SIMD stuff from the kernel utils files */
++#include "nbnxn_kernels/nbnxn_kernel_simd_utils.h"
+ +
+ +
+ +#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+ +#define STRIDE_S  (GMX_SIMD_WIDTH_HERE/2)
+ +#else
+ +#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
+ +#endif
+ +
+ +static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
+ +{ /* Loads one half-width SIMD value from a with gmx_load_hpr and returns
+ +   * the full-width value that gmx_2hpr_to_pr builds when that same half
+ +   * is passed as both inputs.
+ +   * NOTE(review): presumably a must satisfy half-width load alignment -
+ +   * confirm in nbnxn_kernels/nbnxn_kernel_simd_utils.h.
+ +   */
+ +    gmx_mm_hpr a_S;   /* half-width value read from a */
+ +    gmx_mm_pr  a_a_S; /* full-width result built from a_S twice */
+ +
-     gmx_2hpr_to_pr(a_a_S, a_S, a_S);
++    gmx_load_hpr(&a_S, a);
+ +
-     a0_S = gmx_set1_hpr(a[0] + shift);
-     a1_S = gmx_set1_hpr(a[1] + shift);
++    gmx_2hpr_to_pr(a_S, a_S, &a_a_S);
+ +
+ +    return a_a_S;
+ +}
+ +
+ +static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a, real shift)
+ +{ /* Returns the full-width SIMD value that gmx_2hpr_to_pr builds from
+ +   * two half-width values: the first set from a[0] + shift and the
+ +   * second set from a[1] + shift. Only a[0] and a[1] are read.
+ +   * NOTE(review): presumably gmx_set1_hpr fills every element of the
+ +   * half-width register with the given scalar - confirm in
+ +   * nbnxn_kernels/nbnxn_kernel_simd_utils.h.
+ +   */
+ +    gmx_mm_hpr a0_S, a1_S; /* half-width values for a[0] and a[1] */
+ +    gmx_mm_pr  a0_a1_S;    /* full-width result */
+ +
-     gmx_2hpr_to_pr(a0_a1_S, a0_S, a1_S);
++    gmx_set1_hpr(&a0_S, a[0] + shift);
++    gmx_set1_hpr(&a1_S, a[1] + shift);
+ +
- #ifndef GMX_HAVE_SIMD_ANYTRUE
++    gmx_2hpr_to_pr(a0_S, a1_S, &a0_a1_S);
+ +
+ +    return a0_a1_S;
+ +}
+ +
+ +/* Copies PBC shifted i-cell packed atom coordinates to working array.
+ + *
+ + * Reads x starting at index X_IND_CI_SIMD_2XNN(ci), with the x, y and z
+ + * components placed STRIDE_S reals apart, and fills work->x_ci_simd_2xnn.
+ + * The *_SSE0 members are built from elements 0 and 1 of a component and
+ + * the *_SSE2 members from elements 2 and 3, each with the matching
+ + * shift (shx, shy or shz) added by gmx_set_2real_shift_pr.
+ + * na_c and stride are marked gmx_unused and are not read here.
+ + */
+ +static gmx_inline void
+ +icell_set_x_simd_2xnn(int ci,
+ +                      real shx, real shy, real shz,
+ +                      int gmx_unused na_c,
+ +                      int gmx_unused stride, const real *x,
+ +                      nbnxn_list_work_t *work)
+ +{
+ +    int                     ia;
+ +    nbnxn_x_ci_simd_2xnn_t *x_ci;
+ +
+ +    x_ci = work->x_ci_simd_2xnn;
+ +
+ +    ia = X_IND_CI_SIMD_2XNN(ci);
+ +
+ +    x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
+ +    x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
+ +    x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
+ +    x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
+ +    x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
+ +    x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
+ +}
+ +
- gmx_anytrue_2xn_pr(gmx_mm_pr bool_S)
++#ifndef GMX_SIMD_HAVE_ANYTRUE
+ +/* Fallback function in case gmx_anytrue_pr is not present */
+ +static gmx_inline gmx_bool
-     gmx_store_pr(bools, bool_S);
++gmx_anytrue_2xn_pb(gmx_mm_pb bool_S)
+ +{
+ +    real     bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
+ +    gmx_bool any;
+ +    int      s;
+ +
+ +    bools = gmx_simd_align_real(bools_array);
+ +
-     gmx_mm_pr                     wco_SSE0;
-     gmx_mm_pr                     wco_SSE2;
-     gmx_mm_pr                     wco_any_SSE;
++    gmx_store_pb(bools, bool_S);
+ +
+ +    any = FALSE;
+ +    for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
+ +    {
+ +        if (GMX_SIMD_IS_TRUE(s))
+ +        {
+ +            any = TRUE;
+ +        }
+ +    }
+ +
+ +    return any;
+ +}
+ +#endif
+ +
+ +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ + * for coordinates in packed format.
+ + * Checks bounding box distances and possibly atom pair distances.
+ + * This is an accelerated version of make_cluster_list_simple.
+ + *
+ + * cjf and cjl are first converted with CI_TO_CJ_SIMD_2XNN. The first
+ + * loop advances cjf and the second loop lowers cjl until a j-cluster
+ + * is found in range: either its bounding box distance d2 is below rbb2,
+ + * or d2 is below rl2 and at least one atom pair has a squared distance
+ + * below rl2. If no j-cluster is in range the function returns without
+ + * modifying nbl; otherwise every cj in cjf..cjl is appended to nbl->cj
+ + * together with its mask from get_imask_simd_2xnn, and
+ + * nbl->ci[nbl->nci].cj_ind_end is set to the new nbl->ncj.
+ + * *ndistc is incremented by 2 for every bounding box check and by
+ + * 2*GMX_SIMD_WIDTH_HERE for every atom pair distance check.
+ + * The i-cluster coordinates are read from nbl->work->x_ci_simd_2xnn
+ + * and the i bounding box from nbl->work->bb_ci.
+ + */
+ +static gmx_inline void
+ +make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
+ +                            nbnxn_pairlist_t *nbl,
+ +                            int ci, int cjf, int cjl,
+ +                            gmx_bool remove_sub_diag,
+ +                            const real *x_j,
+ +                            real rl2, float rbb2,
+ +                            int *ndistc)
+ +{
+ +    const nbnxn_x_ci_simd_2xnn_t *work;
+ +    const float                  *bb_ci;
+ +
+ +    gmx_mm_pr                     jx_SSE, jy_SSE, jz_SSE;
+ +
+ +    gmx_mm_pr                     dx_SSE0, dy_SSE0, dz_SSE0;
+ +    gmx_mm_pr                     dx_SSE2, dy_SSE2, dz_SSE2;
+ +
+ +    gmx_mm_pr                     rsq_SSE0;
+ +    gmx_mm_pr                     rsq_SSE2;
+ +
-         d2       = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++    gmx_mm_pb                     wco_SSE0;
++    gmx_mm_pb                     wco_SSE2;
++    gmx_mm_pb                     wco_any_SSE;
+ +
+ +    gmx_mm_pr                     rc2_SSE;
+ +
+ +    gmx_bool                      InRange;
+ +    float                         d2;
+ +    int                           xind_f, xind_l, cj;
+ +
+ +    cjf = CI_TO_CJ_SIMD_2XNN(cjf);
+ +    cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
+ +
+ +    work = nbl->work->x_ci_simd_2xnn;
+ +
+ +    bb_ci = nbl->work->bb_ci;
+ +
+ +    rc2_SSE   = gmx_set1_pr(rl2); /* SIMD copy of rl2 used by the atom pair comparisons below */
+ +
+ +    InRange = FALSE; /* pass 1: advance cjf to the first j-cluster in range */
+ +    while (!InRange && cjf <= cjl)
+ +    {
-             wco_any_SSE        = gmx_or_pr(wco_SSE0, wco_SSE2);
++#ifdef NBNXN_SEARCH_BB_SSE
++        d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++#else
++        d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
++#endif
+ +        *ndistc += 2;
+ +
+ +        /* Check if the distance is within the distance where
+ +         * we use only the bounding box distance rbb,
+ +         * or within the cut-off and there is at least one atom pair
+ +         * within the cut-off.
+ +         */
+ +        if (d2 < rbb2)
+ +        {
+ +            InRange = TRUE;
+ +        }
+ +        else if (d2 < rl2)
+ +        {
+ +            xind_f  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
+ +
+ +            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
+ +            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
+ +            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
+ +
+ +            /* Calculate distance */
+ +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ +
+ +            /* rsq = dx*dx+dy*dy+dz*dz */
+ +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ +
+ +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ +
- #ifdef GMX_HAVE_SIMD_ANYTRUE
-             InRange            = gmx_anytrue_pr(wco_any_SSE);
++            wco_any_SSE        = gmx_or_pb(wco_SSE0, wco_SSE2);
+ +
-             InRange            = gmx_anytrue_2xn_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++            InRange            = gmx_anytrue_pb(wco_any_SSE);
+ +#else
-         d2       = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++            InRange            = gmx_anytrue_2xn_pb(wco_any_SSE);
+ +#endif
+ +
+ +            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ +        }
+ +        if (!InRange)
+ +        {
+ +            cjf++;
+ +        }
+ +    }
+ +    if (!InRange)
+ +    {
+ +        return; /* no j-cluster in range: nothing to add */
+ +    }
+ +
+ +    InRange = FALSE; /* pass 2: lower cjl to the last j-cluster in range */
+ +    while (!InRange && cjl > cjf)
+ +    {
-             wco_any_SSE        = gmx_or_pr(wco_SSE0, wco_SSE2);
++#ifdef NBNXN_SEARCH_BB_SSE
++        d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++#else
++        d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
++#endif
+ +        *ndistc += 2;
+ +
+ +        /* Check if the distance is within the distance where
+ +         * we use only the bounding box distance rbb,
+ +         * or within the cut-off and there is at least one atom pair
+ +         * within the cut-off.
+ +         */
+ +        if (d2 < rbb2)
+ +        {
+ +            InRange = TRUE;
+ +        }
+ +        else if (d2 < rl2)
+ +        {
+ +            xind_l  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
+ +
+ +            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
+ +            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
+ +            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
+ +
+ +            /* Calculate distance */
+ +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ +
+ +            /* rsq = dx*dx+dy*dy+dz*dz */
+ +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ +
+ +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ +
- #ifdef GMX_HAVE_SIMD_ANYTRUE
-             InRange            = gmx_anytrue_pr(wco_any_SSE);
++            wco_any_SSE        = gmx_or_pb(wco_SSE0, wco_SSE2);
+ +
-             InRange            = gmx_anytrue_2xn_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++            InRange            = gmx_anytrue_pb(wco_any_SSE);
+ +#else
- 
- #undef gmx_mm_hpr
- #undef gmx_load_hpr
- #undef gmx_set1_hpr
- #undef gmx_2hpr_to_pr
++            InRange            = gmx_anytrue_2xn_pb(wco_any_SSE);
+ +#endif
+ +
+ +            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ +        }
+ +        if (!InRange)
+ +        {
+ +            cjl--;
+ +        }
+ +    }
+ +
+ +    if (cjf <= cjl)
+ +    {
+ +        for (cj = cjf; cj <= cjl; cj++)
+ +        {
+ +            /* Store cj and the interaction mask */
+ +            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
+ +            nbl->cj[nbl->ncj].excl = get_imask_simd_2xnn(remove_sub_diag, ci, cj);
+ +            nbl->ncj++;
+ +        }
+ +        /* Increase the closing index in i super-cell list */
+ +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ +    }
+ +}
+ +
+ +#undef STRIDE_S
diff --cc src/gromacs/mdlib/nbnxn_search_simd_4xn.h

index e96887ef330c96b894dadee3cbf6720509774446,0000000000000000000000000000000000000000..eb4252fbd9a0c019342f0a5f418404570c4272cb

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_search_simd_4xn.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_search_simd_4xn.h
@@@ -1,311 -1,0 +1,311 @@@
- #if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
- #endif
- 
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
- #ifndef GMX_HAVE_SIMD_ANYTRUE
+ +
+ +#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
+ +#define STRIDE_S  (GMX_SIMD_WIDTH_HERE)
+ +#else
+ +#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
+ +#endif
+ +
+ +/* Copies PBC shifted i-cell packed atom coordinates to working array.
+ + *
+ + * Reads x starting at index X_IND_CI_SIMD_4XN(ci), with the x, y and z
+ + * components placed STRIDE_S reals apart, and fills work->x_ci_simd_4xn.
+ + * Each *_SSEk member (k = 0..3) is set with gmx_set1_pr from element k
+ + * of the matching component plus its shift (shx, shy or shz).
+ + * na_c and stride are marked gmx_unused and are not read here.
+ + */
+ +static gmx_inline void
+ +icell_set_x_simd_4xn(int ci,
+ +                     real shx, real shy, real shz,
+ +                     int gmx_unused na_c,
+ +                     int gmx_unused stride, const real *x,
+ +                     nbnxn_list_work_t *work)
+ +{
+ +    int                    ia;
+ +    nbnxn_x_ci_simd_4xn_t *x_ci;
+ +
+ +    x_ci = work->x_ci_simd_4xn;
+ +
+ +    ia = X_IND_CI_SIMD_4XN(ci);
+ +
+ +    x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S    ] + shx);
+ +    x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S    ] + shy);
+ +    x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S    ] + shz);
+ +    x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
+ +    x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
+ +    x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
+ +    x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
+ +    x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
+ +    x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
+ +    x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
+ +    x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
+ +    x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
+ +}
+ +
- gmx_anytrue_4xn_pr(gmx_mm_pr bool_S)
++#ifndef GMX_SIMD_HAVE_ANYTRUE
+ +/* Fallback function in case gmx_anytrue_pb is not present.
+ + *
+ + * Stores the SIMD boolean bool_S with gmx_store_pb into a scratch array
+ + * obtained through gmx_simd_align_real and returns TRUE when
+ + * GMX_SIMD_IS_TRUE holds for at least one of the first
+ + * GMX_SIMD_WIDTH_HERE stored elements, FALSE otherwise.
+ + */
+ +static gmx_inline gmx_bool
-     gmx_store_pr(bools, bool_S);
++gmx_anytrue_4xn_pb(gmx_mm_pb bool_S)
+ +{
+ +    real     bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
+ +    gmx_bool any;
+ +    int      s;
+ +
+ +    bools = gmx_simd_align_real(bools_array);
+ +
-     gmx_mm_pr                    wco_SSE0;
-     gmx_mm_pr                    wco_SSE1;
-     gmx_mm_pr                    wco_SSE2;
-     gmx_mm_pr                    wco_SSE3;
-     gmx_mm_pr                    wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
++    gmx_store_pb(bools, bool_S);
+ +
+ +    any = FALSE;
+ +    for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
+ +    {
+ +        if (GMX_SIMD_IS_TRUE(bools[s]))
+ +        {
+ +            any = TRUE;
+ +        }
+ +    }
+ +
+ +    return any;
+ +}
+ +#endif
+ +
+ +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ + * for coordinates in packed format.
+ + * Checks bounding box distances and possibly atom pair distances.
+ + * This is an accelerated version of make_cluster_list_simple.
+ + *
+ + * cjf and cjl are first converted with CI_TO_CJ_SIMD_4XN. The first
+ + * loop advances cjf and the second loop lowers cjl until a j-cluster
+ + * is found in range: either its bounding box distance d2 is below rbb2,
+ + * or d2 is below rl2 and at least one atom pair has a squared distance
+ + * below rl2. If no j-cluster is in range the function returns without
+ + * modifying nbl; otherwise every cj in cjf..cjl is appended to nbl->cj
+ + * together with its mask from get_imask_simd_4xn, and
+ + * nbl->ci[nbl->nci].cj_ind_end is set to the new nbl->ncj.
+ + * *ndistc is incremented by 2 for every bounding box check and by
+ + * 4*GMX_SIMD_WIDTH_HERE for every atom pair distance check.
+ + * The i-cluster coordinates are read from nbl->work->x_ci_simd_4xn
+ + * and the i bounding box from nbl->work->bb_ci.
+ + */
+ +static gmx_inline void
+ +make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
+ +                           nbnxn_pairlist_t *nbl,
+ +                           int ci, int cjf, int cjl,
+ +                           gmx_bool remove_sub_diag,
+ +                           const real *x_j,
+ +                           real rl2, float rbb2,
+ +                           int *ndistc)
+ +{
+ +    const nbnxn_x_ci_simd_4xn_t *work;
+ +    const float                 *bb_ci;
+ +
+ +    gmx_mm_pr                    jx_SSE, jy_SSE, jz_SSE;
+ +
+ +    gmx_mm_pr                    dx_SSE0, dy_SSE0, dz_SSE0;
+ +    gmx_mm_pr                    dx_SSE1, dy_SSE1, dz_SSE1;
+ +    gmx_mm_pr                    dx_SSE2, dy_SSE2, dz_SSE2;
+ +    gmx_mm_pr                    dx_SSE3, dy_SSE3, dz_SSE3;
+ +
+ +    gmx_mm_pr                    rsq_SSE0;
+ +    gmx_mm_pr                    rsq_SSE1;
+ +    gmx_mm_pr                    rsq_SSE2;
+ +    gmx_mm_pr                    rsq_SSE3;
+ +
-         d2       = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++    gmx_mm_pb                    wco_SSE0;
++    gmx_mm_pb                    wco_SSE1;
++    gmx_mm_pb                    wco_SSE2;
++    gmx_mm_pb                    wco_SSE3;
++    gmx_mm_pb                    wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+ +
+ +    gmx_mm_pr                    rc2_SSE;
+ +
+ +    gmx_bool                     InRange;
+ +    float                        d2;
+ +    int                          xind_f, xind_l, cj;
+ +
+ +    cjf = CI_TO_CJ_SIMD_4XN(cjf);
+ +    cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
+ +
+ +    work = nbl->work->x_ci_simd_4xn;
+ +
+ +    bb_ci = nbl->work->bb_ci;
+ +
+ +    rc2_SSE   = gmx_set1_pr(rl2); /* SIMD copy of rl2 used by the atom pair comparisons below */
+ +
+ +    InRange = FALSE; /* pass 1: advance cjf to the first j-cluster in range */
+ +    while (!InRange && cjf <= cjl)
+ +    {
-             wco_any_SSE01      = gmx_or_pr(wco_SSE0, wco_SSE1);
-             wco_any_SSE23      = gmx_or_pr(wco_SSE2, wco_SSE3);
-             wco_any_SSE        = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
++#ifdef NBNXN_SEARCH_BB_SSE
++        d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++#else
++        d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
++#endif
+ +        *ndistc += 2;
+ +
+ +        /* Check if the distance is within the distance where
+ +         * we use only the bounding box distance rbb,
+ +         * or within the cut-off and there is at least one atom pair
+ +         * within the cut-off.
+ +         */
+ +        if (d2 < rbb2)
+ +        {
+ +            InRange = TRUE;
+ +        }
+ +        else if (d2 < rl2)
+ +        {
+ +            xind_f  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
+ +
+ +            jx_SSE  = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
+ +            jy_SSE  = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
+ +            jz_SSE  = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
+ +
+ +
+ +            /* Calculate distance */
+ +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ +            dx_SSE1            = gmx_sub_pr(work->ix_SSE1, jx_SSE);
+ +            dy_SSE1            = gmx_sub_pr(work->iy_SSE1, jy_SSE);
+ +            dz_SSE1            = gmx_sub_pr(work->iz_SSE1, jz_SSE);
+ +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ +            dx_SSE3            = gmx_sub_pr(work->ix_SSE3, jx_SSE);
+ +            dy_SSE3            = gmx_sub_pr(work->iy_SSE3, jy_SSE);
+ +            dz_SSE3            = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+ +
+ +            /* rsq = dx*dx+dy*dy+dz*dz */
+ +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ +            rsq_SSE1           = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
+ +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ +            rsq_SSE3           = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
+ +
+ +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ +            wco_SSE1           = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
+ +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ +            wco_SSE3           = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
+ +
- #ifdef GMX_HAVE_SIMD_ANYTRUE
-             InRange            = gmx_anytrue_pr(wco_any_SSE);
++            wco_any_SSE01      = gmx_or_pb(wco_SSE0, wco_SSE1);
++            wco_any_SSE23      = gmx_or_pb(wco_SSE2, wco_SSE3);
++            wco_any_SSE        = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
+ +
-             InRange            = gmx_anytrue_4xn_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++            InRange            = gmx_anytrue_pb(wco_any_SSE);
+ +#else
-         d2       = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++            InRange            = gmx_anytrue_4xn_pb(wco_any_SSE);
+ +#endif
+ +
+ +            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
+ +        }
+ +        if (!InRange)
+ +        {
+ +            cjf++;
+ +        }
+ +    }
+ +    if (!InRange)
+ +    {
+ +        return; /* no j-cluster in range: nothing to add */
+ +    }
+ +
+ +    InRange = FALSE; /* pass 2: lower cjl to the last j-cluster in range */
+ +    while (!InRange && cjl > cjf)
+ +    {
-             wco_any_SSE01      = gmx_or_pr(wco_SSE0, wco_SSE1);
-             wco_any_SSE23      = gmx_or_pr(wco_SSE2, wco_SSE3);
-             wco_any_SSE        = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
++#ifdef NBNXN_SEARCH_BB_SSE
++        d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++#else
++        d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
++#endif
+ +        *ndistc += 2;
+ +
+ +        /* Check if the distance is within the distance where
+ +         * we use only the bounding box distance rbb,
+ +         * or within the cut-off and there is at least one atom pair
+ +         * within the cut-off.
+ +         */
+ +        if (d2 < rbb2)
+ +        {
+ +            InRange = TRUE;
+ +        }
+ +        else if (d2 < rl2)
+ +        {
+ +            xind_l  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
+ +
+ +            jx_SSE  = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
+ +            jy_SSE  = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
+ +            jz_SSE  = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
+ +
+ +            /* Calculate distance */
+ +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ +            dx_SSE1            = gmx_sub_pr(work->ix_SSE1, jx_SSE);
+ +            dy_SSE1            = gmx_sub_pr(work->iy_SSE1, jy_SSE);
+ +            dz_SSE1            = gmx_sub_pr(work->iz_SSE1, jz_SSE);
+ +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ +            dx_SSE3            = gmx_sub_pr(work->ix_SSE3, jx_SSE);
+ +            dy_SSE3            = gmx_sub_pr(work->iy_SSE3, jy_SSE);
+ +            dz_SSE3            = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+ +
+ +            /* rsq = dx*dx+dy*dy+dz*dz */
+ +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ +            rsq_SSE1           = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
+ +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ +            rsq_SSE3           = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
+ +
+ +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ +            wco_SSE1           = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
+ +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ +            wco_SSE3           = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
+ +
- #ifdef GMX_HAVE_SIMD_ANYTRUE
-             InRange            = gmx_anytrue_pr(wco_any_SSE);
++            wco_any_SSE01      = gmx_or_pb(wco_SSE0, wco_SSE1);
++            wco_any_SSE23      = gmx_or_pb(wco_SSE2, wco_SSE3);
++            wco_any_SSE        = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
+ +
-             InRange            = gmx_anytrue_4xn_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++            InRange            = gmx_anytrue_pb(wco_any_SSE);
+ +#else
- #undef GMX_USE_HALF_WIDTH_SIMD_HERE
++            InRange            = gmx_anytrue_4xn_pb(wco_any_SSE);
+ +#endif
+ +
+ +            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
+ +        }
+ +        if (!InRange)
+ +        {
+ +            cjl--;
+ +        }
+ +    }
+ +
+ +    if (cjf <= cjl)
+ +    {
+ +        for (cj = cjf; cj <= cjl; cj++)
+ +        {
+ +            /* Store cj and the interaction mask */
+ +            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
+ +            nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj);
+ +            nbl->ncj++;
+ +        }
+ +        /* Increase the closing index in i super-cell list */
+ +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ +    }
+ +}
+ +
+ +#undef STRIDE_S
++
diff --cc src/gromacs/mdlib/pme.c

index 2491100abf66448061f53a31d5a8ea178b2afb29,0000000000000000000000000000000000000000..0f804b37db7df7fc5161b2e22302feee42505901

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/pme.c
--- /dev/null
+++ b/src/gromacs/mdlib/pme.c
@@@ -1,4612 -1,0 +1,4626 @@@
- /* Single precision, with SSE2 or higher available */
- #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROwing Monsters And Cloning Shrimps
+ + */
+ +/* IMPORTANT FOR DEVELOPERS:
+ + *
+ + * Triclinic pme stuff isn't entirely trivial, and we've experienced
+ + * some bugs during development (many of them due to me). To avoid
+ + * this in the future, please check the following things if you make
+ + * changes in this file:
+ + *
+ + * 1. You should obtain identical (at least to the PME precision)
+ + *    energies, forces, and virial for
+ + *    a rectangular box and a triclinic one where the z (or y) axis is
+ + *    tilted a whole box side. For instance you could use these boxes:
+ + *
+ + *    rectangular       triclinic
+ + *     2  0  0           2  0  0
+ + *     0  2  0           0  2  0
+ + *     0  0  6           2  2  6
+ + *
+ + * 2. You should check the energy conservation in a triclinic box.
+ + *
+ + * It might seem an overkill, but better safe than sorry.
+ + * /Erik 001109
+ + */
+ +
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include "gromacs/fft/parallel_3dfft.h"
+ +#include "gromacs/utility/gmxmpi.h"
+ +
+ +#include <stdio.h>
+ +#include <string.h>
+ +#include <math.h>
+ +#include <assert.h>
+ +#include "typedefs.h"
+ +#include "txtdump.h"
+ +#include "vec.h"
+ +#include "gmxcomplex.h"
+ +#include "smalloc.h"
+ +#include "futil.h"
+ +#include "coulomb.h"
+ +#include "gmx_fatal.h"
+ +#include "pme.h"
+ +#include "network.h"
+ +#include "physics.h"
+ +#include "nrnb.h"
+ +#include "gmx_wallcycle.h"
+ +#include "pdbio.h"
+ +#include "gmx_cyclecounter.h"
+ +#include "gmx_omp.h"
+ +#include "macros.h"
+ +
- #include "gmx_x86_simd_single.h"
+ +
- #define PME_SSE
++/* Include the SIMD macro file and then check for support */
++#include "gmx_simd_macros.h"
++#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
++/* Turn on SIMD intrinsics for PME solve */
++#define PME_SIMD
++#endif
+ +
- #ifdef PME_SSE
++/* SIMD spread+gather only in single precision with SSE2 or higher available.
++ * We might want to switch to use gmx_simd_macros.h, but this is somewhat
++ * complicated, as we use unaligned and/or 4-wide only loads.
++ */
++#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
++#define PME_SSE_SPREAD_GATHER
++#include <emmintrin.h>
+ +/* Some old AMD processors could have problems with unaligned loads+stores */
+ +#ifndef GMX_FAHCORE
+ +#define PME_SSE_UNALIGNED
+ +#endif
+ +#endif
+ +
+ +#define DFT_TOL 1e-7
+ +/* #define PRT_FORCE */
+ +/* conditions for on the fly time-measurement */
+ +/* #define TAKETIME (step > 1 && timesteps < 10) */
+ +#define TAKETIME FALSE
+ +
+ +/* #define PME_TIME_THREADS */
+ +
+ +#ifdef GMX_DOUBLE
+ +#define mpi_type MPI_DOUBLE
+ +#else
+ +#define mpi_type MPI_FLOAT
+ +#endif
+ +
+ +/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+ +#define GMX_CACHE_SEP 64
+ +
+ +/* We only define a maximum to be able to use local arrays without allocation.
+ + * An order larger than 12 should never be needed, even for test cases.
+ + * If needed it can be changed here.
+ + */
+ +#define PME_ORDER_MAX 12
+ +
+ +/* Internal datastructures */
+ +/* Index ranges used for a single grid-overlap communication pulse;
+ + * pme_overlap_t.comm_data holds one entry per pulse.
+ + * NOTE(review): each *_index0 / *_nindex pair looks like a start index and
+ + * a count - confirm against the code that fills them in.
+ + */
+ +typedef struct {
+ +    int send_index0;
+ +    int send_nindex;
+ +    int recv_index0;
+ +    int recv_nindex;
+ +    int recv_size;   /* Receive buffer width, used with OpenMP */
+ +} pme_grid_comm_t;
+ +
+ +/* Grid-overlap communication data for one decomposition dimension
+ + * (gmx_pme.overlap[] is indexed on dimension, 0=x, 1=y).
+ + */
+ +typedef struct {
+ +#ifdef GMX_MPI
+ +    MPI_Comm         mpi_comm;
+ +#endif
+ +    int              nnodes, nodeid;
+ +    int             *s2g0;
+ +    int             *s2g1;
+ +    int              noverlap_nodes;     /* Number of communication pulses */
+ +    int             *send_id, *recv_id;  /* Indexed on pulse */
+ +    int              send_size; /* Send buffer width, used with OpenMP */
+ +    pme_grid_comm_t *comm_data;          /* Indexed on pulse */
+ +    real            *sendbuf;
+ +    real            *recvbuf;
+ +} pme_overlap_t;
+ +
+ +/* Particle list produced by one thread in calc_interpolation_idx:
+ + * the particle indices that thread processed, grouped by the thread
+ + * that should spread them, plus the per-thread cumulative counts.
+ + */
+ +typedef struct {
+ +    int *n;      /* Cumulative counts of the number of particles per thread */
+ +    int  nalloc; /* Allocation size of i */
+ +    int *i;      /* Particle indices ordered on thread index (n) */
+ +} thread_plist_t;
+ +
+ +/* Per-thread spline data, one entry per thread in pme_atomcomm_t.spline. */
+ +typedef struct {
+ +    int      *thread_one;
+ +    int       n;            /* Number of entries used in ind */
+ +    int      *ind;          /* Particle indices; identity after (re)allocation,
+ +                             * overwritten by make_thread_local_ind
+ +                             */
+ +    splinevec theta;
+ +    real     *ptr_theta_z;  /* Allocation backing theta[ZZ]; theta[ZZ] points
+ +                             * a few padding elements into this array
+ +                             */
+ +    splinevec dtheta;
+ +    real     *ptr_dtheta_z; /* Allocation backing dtheta[ZZ], see ptr_theta_z */
+ +} splinedata_t;
+ +
+ +/* Atom data and communication setup for one decomposition index
+ + * (see gmx_pme.atc[]): which atoms go to which slab/node and the
+ + * local coordinate, charge, force and spline-index arrays.
+ + */
+ +typedef struct {
+ +    int      dimind;        /* The index of the dimension, 0=x, 1=y */
+ +    int      nslab;
+ +    int      nodeid;
+ +#ifdef GMX_MPI
+ +    MPI_Comm mpi_comm;
+ +#endif
+ +
+ +    int     *node_dest;     /* The nodes to send x and q to with DD */
+ +    int     *node_src;      /* The nodes to receive x and q from with DD */
+ +    int     *buf_index;     /* Index for commnode into the buffers */
+ +
+ +    int      maxshift;
+ +
+ +    int      npd;
+ +    int      pd_nalloc;
+ +    int     *pd;            /* Slab index per atom, set in pme_calc_pidx */
+ +    int     *count;         /* The number of atoms to send to each node */
+ +    int    **count_thread;  /* Per-thread slab counts; summed into
+ +                             * count_thread[0] by pme_calc_pidx_wrapper
+ +                             */
+ +    int     *rcount;        /* The number of atoms to receive */
+ +
+ +    int      n;
+ +    int      nalloc;        /* Allocation size (in atoms) of the arrays below */
+ +    rvec    *x;
+ +    real    *q;
+ +    rvec    *f;
+ +    gmx_bool bSpread;       /* These coordinates are used for spreading */
+ +    int      pme_order;
+ +    ivec    *idx;
+ +    rvec    *fractx;            /* Fractional coordinate relative to the
+ +                                 * lower cell boundary
+ +                                 */
+ +    int             nthread;
+ +    int            *thread_idx; /* Which thread should spread which charge */
+ +    thread_plist_t *thread_plist;
+ +    splinedata_t   *spline;
+ +} pme_atomcomm_t;
+ +
+ +#define FLBS  3
+ +#define FLBSZ 4
+ +
+ +/* A single PME grid; pmegrids_t uses this type both for the full node
+ + * grid and for each thread-local grid.
+ + */
+ +typedef struct {
+ +    ivec  ci;     /* The spatial location of this grid         */
+ +    ivec  n;      /* The used size of *grid, including order-1 */
+ +    ivec  offset; /* The grid offset from the full node grid   */
+ +    int   order;  /* PME spreading order                       */
+ +    ivec  s;      /* The allocated size of *grid, s >= n       */
+ +    real *grid;   /* The grid local thread, size n             */
+ +} pmegrid_t;
+ +
+ +/* The node grid together with its decomposition over threads.
+ + * calc_interpolation_idx obtains a particle's thread index by summing
+ + * the g2t[XX], g2t[YY] and g2t[ZZ] entries for its grid indices.
+ + */
+ +typedef struct {
+ +    pmegrid_t  grid;         /* The full node grid (non thread-local)            */
+ +    int        nthread;      /* The number of threads operating on this grid     */
+ +    ivec       nc;           /* The local spatial decomposition over the threads */
+ +    pmegrid_t *grid_th;      /* Array of grids for each thread                   */
+ +    real      *grid_all;     /* Allocated array for the grids in *grid_th        */
+ +    int      **g2t;          /* The grid to thread index                         */
+ +    ivec       nthread_comm; /* The number of threads to communicate with        */
+ +} pmegrids_t;
+ +
+ +
+ +/* Work data for spreading and gathering (gmx_pme.spline_work).
+ + * It only carries real data when the SSE spread/gather code is compiled in.
+ + */
+ +typedef struct {
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ +    /* Masks for SSE aligned spreading and gathering */
+ +    __m128 mask_SSE0[6], mask_SSE1[6];
+ +#else
+ +    int    dummy; /* C89 requires that struct has at least one member */
+ +#endif
+ +} pme_spline_work_t;
+ +
+ +/* Scratch arrays and accumulated results for solve_pme; gmx_pme.work
+ + * points to the thread-local instances.
+ + */
+ +typedef struct {
+ +    /* work data for solve_pme */
+ +    int      nalloc;
+ +    real *   mhx;
+ +    real *   mhy;
+ +    real *   mhz;
+ +    real *   m2;
+ +    real *   denom;
+ +    real *   tmp1_alloc;
+ +    real *   tmp1;
+ +    real *   eterm;
+ +    real *   m2inv;
+ +
+ +    real     energy;
+ +    matrix   vir;
+ +} pme_work_t;
+ +
+ +/* Top-level PME state: decomposition layout, grids, FFT setup,
+ + * per-dimension atom communication data and assorted work buffers.
+ + */
+ +typedef struct gmx_pme {
+ +    int           ndecompdim; /* The number of decomposition dimensions */
+ +    int           nodeid;     /* Our nodeid in mpi->mpi_comm */
+ +    int           nodeid_major;
+ +    int           nodeid_minor;
+ +    int           nnodes;    /* The number of nodes doing PME */
+ +    int           nnodes_major;
+ +    int           nnodes_minor;
+ +
+ +    MPI_Comm      mpi_comm;
+ +    MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
+ +#ifdef GMX_MPI
+ +    MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
+ +#endif
+ +
+ +    gmx_bool   bUseThreads;   /* Does any of the PME ranks have nthread>1 ?  */
+ +    int        nthread;       /* The number of threads doing PME on our rank */
+ +
+ +    gmx_bool   bPPnode;       /* Node also does particle-particle forces */
+ +    gmx_bool   bFEP;          /* Compute Free energy contribution */
+ +    int        nkx, nky, nkz; /* Grid dimensions */
+ +    gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
+ +    int        pme_order;
+ +    real       epsilon_r;
+ +
+ +    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
+ +    pmegrids_t pmegridB;
+ +    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+ +    int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
+ +    /* pmegrid_nz might be larger than strictly necessary to ensure
+ +     * memory alignment, pmegrid_nz_base gives the real base size.
+ +     */
+ +    int     pmegrid_nz_base;
+ +    /* The local PME grid starting indices */
+ +    int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
+ +
+ +    /* Work data for spreading and gathering */
+ +    pme_spline_work_t    *spline_work;
+ +
+ +    real                 *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
+ +    real                 *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
+ +    int                   fftgrid_nx, fftgrid_ny, fftgrid_nz;
+ +
+ +    t_complex            *cfftgridA;  /* Grids for complex FFT data */
+ +    t_complex            *cfftgridB;
+ +    int                   cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
+ +
+ +    gmx_parallel_3dfft_t  pfft_setupA;
+ +    gmx_parallel_3dfft_t  pfft_setupB;
+ +
+ +    /* Lookup tables indexed by the truncated (shifted) fractional grid
+ +     * coordinate in calc_interpolation_idx: nn* give the local grid index,
+ +     * fsh* a correction added to the fractional part (x and y only).
+ +     */
+ +    int                  *nnx, *nny, *nnz;
+ +    real                 *fshx, *fshy, *fshz;
+ +
+ +    pme_atomcomm_t        atc[2]; /* Indexed on decomposition index */
+ +    matrix                recipbox;
+ +    splinevec             bsp_mod;
+ +
+ +    pme_overlap_t         overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+ +
+ +    pme_atomcomm_t        atc_energy; /* Only for gmx_pme_calc_energy */
+ +
+ +    rvec                 *bufv;       /* Communication buffer */
+ +    real                 *bufr;       /* Communication buffer */
+ +    int                   buf_nalloc; /* The communication buffer size */
+ +
+ +    /* thread local work data for solve_pme */
+ +    pme_work_t *work;
+ +
+ +    /* Work data for PME_redist */
+ +    gmx_bool redist_init;       /* TRUE once the count/displacement arrays are allocated */
+ +    int *    scounts;
+ +    int *    rcounts;
+ +    int *    sdispls;
+ +    int *    rdispls;
+ +    int *    sidx;
+ +    int *    idxa;              /* Set to atc->pd on every pmeredist_pd call */
+ +    real *   redist_buf;
+ +    int      redist_buf_nalloc; /* Allocation size of redist_buf, in rvecs */
+ +
+ +    /* Work data for sum_qgrid */
+ +    real *   sum_qgrid_tmp;
+ +    real *   sum_qgrid_dd_tmp;
+ +} t_gmx_pme;
+ +
+ +
+ +/* For the atoms start <= i < end in atc, compute the PME grid index
+ + * (atc->idx) and the fractional offset within the grid cell (atc->fractx)
+ + * from the coordinates atc->x and the reciprocal box of pme.
+ + *
+ + * When atc->nthread > 1 this additionally stores, for every atom, the
+ + * thread that should handle it (atc->thread_idx) and fills
+ + * atc->thread_plist[thread] with the processed atom indices grouped by
+ + * that thread, together with the cumulative per-thread counts.
+ + */
+ +static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
+ +                                   int start, int end, int thread)
+ +{
+ +    int             i;
+ +    int            *idxptr, tix, tiy, tiz;
+ +    real           *xptr, *fptr, tx, ty, tz;
+ +    real            rxx, ryx, ryy, rzx, rzy, rzz;
+ +    int             nx, ny, nz;
+ +    int             start_ix, start_iy, start_iz;
+ +    int            *g2tx, *g2ty, *g2tz;
+ +    gmx_bool        bThreads;
+ +    int            *thread_idx = NULL;
+ +    thread_plist_t *tpl        = NULL;
+ +    int            *tpl_n      = NULL;
+ +    int             thread_i;
+ +
+ +    nx  = pme->nkx;
+ +    ny  = pme->nky;
+ +    nz  = pme->nkz;
+ +
+ +    start_ix = pme->pmegrid_start_ix;
+ +    start_iy = pme->pmegrid_start_iy;
+ +    start_iz = pme->pmegrid_start_iz;
+ +
+ +    /* Only these six reciprocal box elements enter the expressions below */
+ +    rxx = pme->recipbox[XX][XX];
+ +    ryx = pme->recipbox[YY][XX];
+ +    ryy = pme->recipbox[YY][YY];
+ +    rzx = pme->recipbox[ZZ][XX];
+ +    rzy = pme->recipbox[ZZ][YY];
+ +    rzz = pme->recipbox[ZZ][ZZ];
+ +
+ +    g2tx = pme->pmegridA.g2t[XX];
+ +    g2ty = pme->pmegridA.g2t[YY];
+ +    g2tz = pme->pmegridA.g2t[ZZ];
+ +
+ +    bThreads = (atc->nthread > 1);
+ +    if (bThreads)
+ +    {
+ +        thread_idx = atc->thread_idx;
+ +
+ +        /* Reset the per-destination-thread counters of our own list */
+ +        tpl   = &atc->thread_plist[thread];
+ +        tpl_n = tpl->n;
+ +        for (i = 0; i < atc->nthread; i++)
+ +        {
+ +            tpl_n[i] = 0;
+ +        }
+ +    }
+ +
+ +    for (i = start; i < end; i++)
+ +    {
+ +        xptr   = atc->x[i];
+ +        idxptr = atc->idx[i];
+ +        fptr   = atc->fractx[i];
+ +
+ +        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
+ +        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
+ +        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
+ +        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
+ +
+ +        /* The cast truncates; the values are made positive by the shift above */
+ +        tix = (int)(tx);
+ +        tiy = (int)(ty);
+ +        tiz = (int)(tz);
+ +
+ +        /* Because decomposition only occurs in x and y,
+ +         * we never have a fraction correction in z.
+ +         */
+ +        fptr[XX] = tx - tix + pme->fshx[tix];
+ +        fptr[YY] = ty - tiy + pme->fshy[tiy];
+ +        fptr[ZZ] = tz - tiz;
+ +
+ +        /* Map the shifted integer coordinate to a grid index via lookup tables */
+ +        idxptr[XX] = pme->nnx[tix];
+ +        idxptr[YY] = pme->nny[tiy];
+ +        idxptr[ZZ] = pme->nnz[tiz];
+ +
+ +#ifdef DEBUG
+ +        range_check(idxptr[XX], 0, pme->pmegrid_nx);
+ +        range_check(idxptr[YY], 0, pme->pmegrid_ny);
+ +        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
+ +#endif
+ +
+ +        if (bThreads)
+ +        {
+ +            /* The thread index is the sum of the per-dimension table entries */
+ +            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
+ +            thread_idx[i] = thread_i;
+ +            tpl_n[thread_i]++;
+ +        }
+ +    }
+ +
+ +    if (bThreads)
+ +    {
+ +        /* Make a list of particle indices sorted on thread */
+ +
+ +        /* Get the cumulative count */
+ +        for (i = 1; i < atc->nthread; i++)
+ +        {
+ +            tpl_n[i] += tpl_n[i-1];
+ +        }
+ +        /* The current implementation distributes particles equally
+ +         * over the threads, so we could actually allocate for that
+ +         * in pme_realloc_atomcomm_things.
+ +         */
+ +        if (tpl_n[atc->nthread-1] > tpl->nalloc)
+ +        {
+ +            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
+ +            srenew(tpl->i, tpl->nalloc);
+ +        }
+ +        /* Set tpl_n to the cumulative start */
+ +        for (i = atc->nthread-1; i >= 1; i--)
+ +        {
+ +            tpl_n[i] = tpl_n[i-1];
+ +        }
+ +        tpl_n[0] = 0;
+ +
+ +        /* Fill our thread local array with indices sorted on thread */
+ +        for (i = start; i < end; i++)
+ +        {
+ +            /* Each store advances the start offset of that thread's range */
+ +            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
+ +        }
+ +        /* Now tpl_n contains the cumulative count again */
+ +    }
+ +}
+ +
+ +/* Gather, from the particle list of every thread, the range of entries
+ + * assigned to thread `thread` and concatenate them into spline->ind;
+ + * spline->n receives the total number of gathered indices.
+ + */
+ +static void make_thread_local_ind(pme_atomcomm_t *atc,
+ +                                  int thread, splinedata_t *spline)
+ +{
+ +    const thread_plist_t *list;
+ +    int                   src, k, first, last;
+ +    int                   nfilled = 0;
+ +
+ +    for (src = 0; src < atc->nthread; src++)
+ +    {
+ +        list = &atc->thread_plist[src];
+ +
+ +        /* list->n holds cumulative counts, so our slice of this list is
+ +         * [n[thread-1], n[thread]), starting at 0 for the first thread.
+ +         */
+ +        first = (thread > 0) ? list->n[thread-1] : 0;
+ +        last  = list->n[thread];
+ +
+ +        for (k = first; k < last; k++)
+ +        {
+ +            spline->ind[nfilled] = list->i[k];
+ +            nfilled++;
+ +        }
+ +    }
+ +
+ +    spline->n = nfilled;
+ +}
+ +
+ +
+ +/* Assign each atom start <= i < end to a PME slab along the dimension
+ + * atc->dimind (0 = x, otherwise y): the slab index is stored in
+ + * atc->pd[i] and the number of atoms per slab is accumulated in count,
+ + * which is zeroed first.
+ + *
+ + * Slabs are always equally sized for load balancing reasons
+ + * (the PME grid spacing is not used).
+ + */
+ +static void pme_calc_pidx(int start, int end,
+ +                          matrix recipbox, rvec x[],
+ +                          pme_atomcomm_t *atc, int *count)
+ +{
+ +    const int nslab = atc->nslab;
+ +    int      *pd    = atc->pd;
+ +    int       a, slab;
+ +    real      frac;
+ +
+ +    /* Start from empty slabs */
+ +    for (slab = 0; slab < nslab; slab++)
+ +    {
+ +        count[slab] = 0;
+ +    }
+ +
+ +    if (atc->dimind != 0)
+ +    {
+ +        /* Slab index along y */
+ +        const real ryy = recipbox[YY][YY];
+ +        const real rzy = recipbox[ZZ][YY];
+ +
+ +        for (a = start; a < end; a++)
+ +        {
+ +            /* Fractional coordinate along the box vector, scaled to slabs */
+ +            frac  = nslab*(x[a][YY]*ryy + x[a][ZZ]*rzy);
+ +            /* Shift by 2*nslab before truncating, then wrap into range */
+ +            slab  = (int)(frac + 2*nslab) % nslab;
+ +            pd[a] = slab;
+ +            count[slab]++;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* Slab index along x */
+ +        const real rxx = recipbox[XX][XX];
+ +        const real ryx = recipbox[YY][XX];
+ +        const real rzx = recipbox[ZZ][XX];
+ +
+ +        for (a = start; a < end; a++)
+ +        {
+ +            /* Fractional coordinate along the box vector, scaled to slabs */
+ +            frac  = nslab*(x[a][XX]*rxx + x[a][YY]*ryx + x[a][ZZ]*rzx);
+ +            /* Shift by 2*nslab before truncating, then wrap into range */
+ +            slab  = (int)(frac + 2*nslab) % nslab;
+ +            pd[a] = slab;
+ +            count[slab]++;
+ +        }
+ +    }
+ +}
+ +
+ +static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
+ +                                  pme_atomcomm_t *atc)
+ +{
+ +    int nthread, thread, slab;
+ +
+ +    nthread = atc->nthread;
+ +
+ +#pragma omp parallel for num_threads(nthread) schedule(static)
+ +    for (thread = 0; thread < nthread; thread++)
+ +    {
+ +        pme_calc_pidx(natoms* thread   /nthread,
+ +                      natoms*(thread+1)/nthread,
+ +                      recipbox, x, atc, atc->count_thread[thread]);
+ +    }
+ +    /* Non-parallel reduction, since nslab is small */
+ +
+ +    for (thread = 1; thread < nthread; thread++)
+ +    {
+ +        for (slab = 0; slab < atc->nslab; slab++)
+ +        {
+ +            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
+ +        }
+ +    }
+ +}
+ +
+ +/* Resize the three component arrays of th to nalloc elements.
+ + * The z array is allocated through *ptr_z with extra elements on both
+ + * sides; th[ZZ] points past the leading padding, and both padding
+ + * regions are set to zero.
+ + */
+ +static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
+ +{
+ +    const int padding = 4;
+ +    real     *front, *back;
+ +    int       k;
+ +
+ +    srenew(th[XX], nalloc);
+ +    srenew(th[YY], nalloc);
+ +    /* In z we add padding, this is only required for the aligned SSE code */
+ +    srenew(*ptr_z, nalloc+2*padding);
+ +    th[ZZ] = *ptr_z + padding;
+ +
+ +    /* Walk both padding regions in step and clear them */
+ +    front = *ptr_z;
+ +    back  = th[ZZ] + nalloc;
+ +    for (k = padding; k > 0; k--)
+ +    {
+ +        *front++ = 0;
+ +        *back++  = 0;
+ +    }
+ +}
+ +
+ +/* Resize the arrays of *spline to match atc->nalloc atoms:
+ + * the particle index list (reset to the identity mapping) and the
+ + * theta/dtheta arrays, which get atc->pme_order entries per atom
+ + * in each dimension.
+ + */
+ +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
+ +{
+ +    int i;
+ +
+ +    srenew(spline->ind, atc->nalloc);
+ +    /* Initialize the index to identity so it works without threads */
+ +    for (i = 0; i < atc->nalloc; i++)
+ +    {
+ +        spline->ind[i] = i;
+ +    }
+ +
+ +    realloc_splinevec(spline->theta, &spline->ptr_theta_z,
+ +                      atc->pme_order*atc->nalloc);
+ +    realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
+ +                      atc->pme_order*atc->nalloc);
+ +}
+ +
+ +/* Grow the per-atom arrays of atc when atc->n exceeds the current
+ + * allocation, or when nothing has been allocated yet.
+ + *
+ + * x, q and f are only (re)allocated when there is more than one slab;
+ + * newly added force entries are cleared. The spreading arrays
+ + * (fractx, idx, thread_idx and the per-thread spline data) are only
+ + * (re)allocated when atc->bSpread is set.
+ + */
+ +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
+ +{
+ +    int nalloc_old, i;
+ +
+ +    /* We have to avoid a NULL pointer for atc->x to avoid
+ +     * possible fatal errors in MPI routines.
+ +     */
+ +    if (atc->n > atc->nalloc || atc->nalloc == 0)
+ +    {
+ +        nalloc_old  = atc->nalloc;
+ +        /* Allocate for at least one atom, so the arrays are never empty */
+ +        atc->nalloc = over_alloc_dd(max(atc->n, 1));
+ +
+ +        if (atc->nslab > 1)
+ +        {
+ +            srenew(atc->x, atc->nalloc);
+ +            srenew(atc->q, atc->nalloc);
+ +            srenew(atc->f, atc->nalloc);
+ +            /* Only the newly added tail of f needs clearing */
+ +            for (i = nalloc_old; i < atc->nalloc; i++)
+ +            {
+ +                clear_rvec(atc->f[i]);
+ +            }
+ +        }
+ +        if (atc->bSpread)
+ +        {
+ +            srenew(atc->fractx, atc->nalloc);
+ +            srenew(atc->idx, atc->nalloc);
+ +
+ +            if (atc->nthread > 1)
+ +            {
+ +                srenew(atc->thread_idx, atc->nalloc);
+ +            }
+ +
+ +            for (i = 0; i < atc->nthread; i++)
+ +            {
+ +                pme_realloc_splinedata(&atc->spline[i], atc);
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
+ +                         int n, gmx_bool bXF, rvec *x_f, real *charge,
+ +                         pme_atomcomm_t *atc)
+ +/* Redistribute particle data for PME calculation */
+ +/* domain decomposition by x coordinate           */
+ +{
+ +    int *idxa;
+ +    int  i, ii;
+ +
+ +    if (FALSE == pme->redist_init)
+ +    {
+ +        snew(pme->scounts, atc->nslab);
+ +        snew(pme->rcounts, atc->nslab);
+ +        snew(pme->sdispls, atc->nslab);
+ +        snew(pme->rdispls, atc->nslab);
+ +        snew(pme->sidx, atc->nslab);
+ +        pme->redist_init = TRUE;
+ +    }
+ +    if (n > pme->redist_buf_nalloc)
+ +    {
+ +        pme->redist_buf_nalloc = over_alloc_dd(n);
+ +        srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
+ +    }
+ +
+ +    pme->idxa = atc->pd;
+ +
+ +#ifdef GMX_MPI
+ +    if (forw && bXF)
+ +    {
+ +        /* forward, redistribution from pp to pme */
+ +
+ +        /* Calculate send counts and exchange them with other nodes */
+ +        for (i = 0; (i < atc->nslab); i++)
+ +        {
+ +            pme->scounts[i] = 0;
+ +        }
+ +        for (i = 0; (i < n); i++)
+ +        {
+ +            pme->scounts[pme->idxa[i]]++;
+ +        }
+ +        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
+ +
+ +        /* Calculate send and receive displacements and index into send
+ +           buffer */
+ +        pme->sdispls[0] = 0;
+ +        pme->rdispls[0] = 0;
+ +        pme->sidx[0]    = 0;
+ +        for (i = 1; i < atc->nslab; i++)
+ +        {
+ +            pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
+ +            pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
+ +            pme->sidx[i]    = pme->sdispls[i];
+ +        }
+ +        /* Total # of particles to be received */
+ +        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
+ +
+ +        pme_realloc_atomcomm_things(atc);
+ +
+ +        /* Copy particle coordinates into send buffer and exchange*/
+ +        for (i = 0; (i < n); i++)
+ +        {
+ +            ii = DIM*pme->sidx[pme->idxa[i]];
+ +            pme->sidx[pme->idxa[i]]++;
+ +            pme->redist_buf[ii+XX] = x_f[i][XX];
+ +            pme->redist_buf[ii+YY] = x_f[i][YY];
+ +            pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
+ +        }
+ +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
+ +                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
+ +                      pme->rvec_mpi, atc->mpi_comm);
+ +    }
+ +    if (forw)
+ +    {
+ +        /* Copy charge into send buffer and exchange*/
+ +        for (i = 0; i < atc->nslab; i++)
+ +        {
+ +            pme->sidx[i] = pme->sdispls[i];
+ +        }
+ +        for (i = 0; (i < n); i++)
+ +        {
+ +            ii = pme->sidx[pme->idxa[i]];
+ +            pme->sidx[pme->idxa[i]]++;
+ +            pme->redist_buf[ii] = charge[i];
+ +        }
+ +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
+ +                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
+ +                      atc->mpi_comm);
+ +    }
+ +    else   /* backward, redistribution from pme to pp */
+ +    {
+ +        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
+ +                      pme->redist_buf, pme->scounts, pme->sdispls,
+ +                      pme->rvec_mpi, atc->mpi_comm);
+ +
+ +        /* Copy data from receive buffer */
+ +        for (i = 0; i < atc->nslab; i++)
+ +        {
+ +            pme->sidx[i] = pme->sdispls[i];
+ +        }
+ +        for (i = 0; (i < n); i++)
+ +        {
+ +            ii          = DIM*pme->sidx[pme->idxa[i]];
+ +            x_f[i][XX] += pme->redist_buf[ii+XX];
+ +            x_f[i][YY] += pme->redist_buf[ii+YY];
+ +            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
+ +            pme->sidx[pme->idxa[i]]++;
+ +        }
+ +    }
+ +#endif
+ +}
+ +
+ +static void pme_dd_sendrecv(pme_atomcomm_t *atc,
+ +                            gmx_bool bBackward, int shift,
+ +                            void *buf_s, int nbyte_s,
+ +                            void *buf_r, int nbyte_r)
+ +{
+ +#ifdef GMX_MPI
+ +    int        dest, src;
+ +    MPI_Status stat;
+ +
+ +    if (bBackward == FALSE)
+ +    {
+ +        dest = atc->node_dest[shift];
+ +        src  = atc->node_src[shift];
+ +    }
+ +    else
+ +    {
+ +        dest = atc->node_src[shift];
+ +        src  = atc->node_dest[shift];
+ +    }
+ +
+ +    if (nbyte_s > 0 && nbyte_r > 0)
+ +    {
+ +        MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
+ +                     dest, shift,
+ +                     buf_r, nbyte_r, MPI_BYTE,
+ +                     src, shift,
+ +                     atc->mpi_comm, &stat);
+ +    }
+ +    else if (nbyte_s > 0)
+ +    {
+ +        MPI_Send(buf_s, nbyte_s, MPI_BYTE,
+ +                 dest, shift,
+ +                 atc->mpi_comm);
+ +    }
+ +    else if (nbyte_r > 0)
+ +    {
+ +        MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
+ +                 src, shift,
+ +                 atc->mpi_comm, &stat);
+ +    }
+ +#endif
+ +}
+ +
+ +/* Distribute charges, and with bX also coordinates, of the n local atoms
+ + * to the PME nodes given by atc->pd[], using pairwise exchanges with the
+ + * nodes in atc->node_dest.
+ + *
+ + * With bX the per-node counts are exchanged first, atc->n is updated and
+ + * the atc arrays are resized. It is a fatal error when not all n atoms
+ + * belong to this node or one of the communication partners.
+ + * Without bX, atc->rcount[] is not updated here, so the values from an
+ + * earlier call with bX are used.
+ + */
+ +static void dd_pmeredist_x_q(gmx_pme_t pme,
+ +                             int n, gmx_bool bX, rvec *x, real *charge,
+ +                             pme_atomcomm_t *atc)
+ +{
+ +    int *commnode, *buf_index;
+ +    int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
+ +
+ +    commnode  = atc->node_dest;
+ +    buf_index = atc->buf_index;
+ +
+ +    nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
+ +
+ +    /* Lay out the send buffer: one contiguous range per destination node */
+ +    nsend = 0;
+ +    for (i = 0; i < nnodes_comm; i++)
+ +    {
+ +        buf_index[commnode[i]] = nsend;
+ +        nsend                 += atc->count[commnode[i]];
+ +    }
+ +    if (bX)
+ +    {
+ +        /* Every atom must either stay here or go to a communication partner */
+ +        if (atc->count[atc->nodeid] + nsend != n)
+ +        {
+ +            gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+ +                      "This usually means that your system is not well equilibrated.",
+ +                      n - (atc->count[atc->nodeid] + nsend),
+ +                      pme->nodeid, 'x'+atc->dimind);
+ +        }
+ +
+ +        if (nsend > pme->buf_nalloc)
+ +        {
+ +            pme->buf_nalloc = over_alloc_dd(nsend);
+ +            srenew(pme->bufv, pme->buf_nalloc);
+ +            srenew(pme->bufr, pme->buf_nalloc);
+ +        }
+ +
+ +        /* Start with our own atoms, then add what each partner sends us */
+ +        atc->n = atc->count[atc->nodeid];
+ +        for (i = 0; i < nnodes_comm; i++)
+ +        {
+ +            scount = atc->count[commnode[i]];
+ +            /* Communicate the count */
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+ +                        atc->dimind, atc->nodeid, commnode[i], scount);
+ +            }
+ +            pme_dd_sendrecv(atc, FALSE, i,
+ +                            &scount, sizeof(int),
+ +                            &atc->rcount[i], sizeof(int));
+ +            atc->n += atc->rcount[i];
+ +        }
+ +
+ +        pme_realloc_atomcomm_things(atc);
+ +    }
+ +
+ +    /* Sort the atoms into the local arrays and the send buffers.
+ +     * buf_index[node] is advanced as the running insertion position.
+ +     */
+ +    local_pos = 0;
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        node = atc->pd[i];
+ +        if (node == atc->nodeid)
+ +        {
+ +            /* Copy direct to the receive buffer */
+ +            if (bX)
+ +            {
+ +                copy_rvec(x[i], atc->x[local_pos]);
+ +            }
+ +            atc->q[local_pos] = charge[i];
+ +            local_pos++;
+ +        }
+ +        else
+ +        {
+ +            /* Copy to the send buffer */
+ +            if (bX)
+ +            {
+ +                copy_rvec(x[i], pme->bufv[buf_index[node]]);
+ +            }
+ +            pme->bufr[buf_index[node]] = charge[i];
+ +            buf_index[node]++;
+ +        }
+ +    }
+ +
+ +    /* Received atoms are appended after the local ones (local_pos continues) */
+ +    buf_pos = 0;
+ +    for (i = 0; i < nnodes_comm; i++)
+ +    {
+ +        scount = atc->count[commnode[i]];
+ +        rcount = atc->rcount[i];
+ +        if (scount > 0 || rcount > 0)
+ +        {
+ +            if (bX)
+ +            {
+ +                /* Communicate the coordinates */
+ +                pme_dd_sendrecv(atc, FALSE, i,
+ +                                pme->bufv[buf_pos], scount*sizeof(rvec),
+ +                                atc->x[local_pos], rcount*sizeof(rvec));
+ +            }
+ +            /* Communicate the charges */
+ +            pme_dd_sendrecv(atc, FALSE, i,
+ +                            pme->bufr+buf_pos, scount*sizeof(real),
+ +                            atc->q+local_pos, rcount*sizeof(real));
+ +            buf_pos   += scount;
+ +            local_pos += atc->rcount[i];
+ +        }
+ +    }
+ +}
+ +
+ +/* Collect the forces for the n local atoms: forces for atoms handled on
+ + * this node come from atc->f, the others are received from the
+ + * communication partners (in the backward direction) into pme->bufv.
+ + * With bAddF the forces are added to f, otherwise f is overwritten.
+ + */
+ +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
+ +                           int n, rvec *f,
+ +                           gmx_bool bAddF)
+ +{
+ +    int  *partner  = atc->node_dest;
+ +    int  *recv_off = atc->buf_index;
+ +    int   npulse   = min(2*atc->maxshift, atc->nslab-1);
+ +    int   f_pos    = atc->count[atc->nodeid];
+ +    int   recv_pos = 0;
+ +    int   p, a, nsend, nrecv, owner;
+ +    real *src;
+ +
+ +    /* Forces for atoms we received are stored after our own in atc->f;
+ +     * send them back and receive the forces for the atoms we sent away.
+ +     */
+ +    for (p = 0; p < npulse; p++)
+ +    {
+ +        nsend = atc->rcount[p];
+ +        nrecv = atc->count[partner[p]];
+ +        if (nsend > 0 || nrecv > 0)
+ +        {
+ +            /* Communicate the forces */
+ +            pme_dd_sendrecv(atc, TRUE, p,
+ +                            atc->f[f_pos], nsend*sizeof(rvec),
+ +                            pme->bufv[recv_pos], nrecv*sizeof(rvec));
+ +            f_pos += nsend;
+ +        }
+ +        /* Remember where the forces from this partner start */
+ +        recv_off[partner[p]] = recv_pos;
+ +        recv_pos            += nrecv;
+ +    }
+ +
+ +    /* Hand out the forces in the original atom order */
+ +    f_pos = 0;
+ +    for (a = 0; a < n; a++)
+ +    {
+ +        owner = atc->pd[a];
+ +        if (owner == atc->nodeid)
+ +        {
+ +            /* Take the next entry of the local force array */
+ +            src = atc->f[f_pos];
+ +            f_pos++;
+ +        }
+ +        else
+ +        {
+ +            /* Take the next entry received from the owning node */
+ +            src = pme->bufv[recv_off[owner]];
+ +            recv_off[owner]++;
+ +        }
+ +
+ +        if (bAddF)
+ +        {
+ +            rvec_inc(f[a], src);
+ +        }
+ +        else
+ +        {
+ +            copy_rvec(src, f[a]);
+ +        }
+ +    }
+ +}
+ +
+ +#ifdef GMX_MPI
+ +/* Communicate the overlapping parts of the local grid between ranks.
+ + *
+ + * Two phases are executed, each looping over overlap->noverlap_nodes pulses:
+ + *  1. pme->overlap[1] (called the minor direction below): the y-slices to
+ + *     send are packed into overlap->sendbuf, exchanged with MPI_Sendrecv
+ + *     and unpacked from overlap->recvbuf.
+ + *  2. pme->overlap[0] (called the major direction below): whole x-slabs are
+ + *     contiguous in memory and are sent straight from the grid.
+ + *
+ + * When direction == GMX_SUM_QGRID_FORWARD the received values are added to
+ + * the grid; for any other direction the send and receive roles of each pulse
+ + * are swapped and the received values overwrite the grid.
+ + */
+ +static void
+ +gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
+ +{
+ +    pme_overlap_t *overlap;
+ +    int            send_index0, send_nindex;
+ +    int            recv_index0, recv_nindex;
+ +    MPI_Status     stat;
+ +    int            i, j, k, ix, iy, iz, icnt;
+ +    int            ipulse, send_id, recv_id, datasize;
+ +    real          *p;
+ +    real          *sendptr, *recvptr;
+ +
+ +    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
+ +    overlap = &pme->overlap[1];
+ +
+ +    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
+ +    {
+ +        /* Since we have already (un)wrapped the overlap in the z-dimension,
+ +         * we only have to communicate 0 to nkz (not pmegrid_nz).
+ +         */
+ +        if (direction == GMX_SUM_QGRID_FORWARD)
+ +        {
+ +            send_id       = overlap->send_id[ipulse];
+ +            recv_id       = overlap->recv_id[ipulse];
+ +            send_index0   = overlap->comm_data[ipulse].send_index0;
+ +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
+ +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
+ +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
+ +        }
+ +        else
+ +        {
+ +            /* Reverse direction: swap the send and receive roles */
+ +            send_id       = overlap->recv_id[ipulse];
+ +            recv_id       = overlap->send_id[ipulse];
+ +            send_index0   = overlap->comm_data[ipulse].recv_index0;
+ +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
+ +            recv_index0   = overlap->comm_data[ipulse].send_index0;
+ +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
+ +        }
+ +
+ +        /* Copy data to contiguous send buffer */
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ +                    pme->nodeid, overlap->nodeid, send_id,
+ +                    pme->pmegrid_start_iy,
+ +                    send_index0-pme->pmegrid_start_iy,
+ +                    send_index0-pme->pmegrid_start_iy+send_nindex);
+ +        }
+ +        /* Pack order is x (outer), y, z (inner); unpacking below uses the same order */
+ +        icnt = 0;
+ +        for (i = 0; i < pme->pmegrid_nx; i++)
+ +        {
+ +            ix = i;
+ +            for (j = 0; j < send_nindex; j++)
+ +            {
+ +                iy = j + send_index0 - pme->pmegrid_start_iy;
+ +                for (k = 0; k < pme->nkz; k++)
+ +                {
+ +                    iz = k;
+ +                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
+ +                }
+ +            }
+ +        }
+ +
+ +        /* Number of reals per communicated y-index */
+ +        datasize      = pme->pmegrid_nx * pme->nkz;
+ +
+ +        MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
+ +                     send_id, ipulse,
+ +                     overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
+ +                     recv_id, ipulse,
+ +                     overlap->mpi_comm, &stat);
+ +
+ +        /* Get data from contiguous recv buffer */
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ +                    pme->nodeid, overlap->nodeid, recv_id,
+ +                    pme->pmegrid_start_iy,
+ +                    recv_index0-pme->pmegrid_start_iy,
+ +                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
+ +        }
+ +        icnt = 0;
+ +        for (i = 0; i < pme->pmegrid_nx; i++)
+ +        {
+ +            ix = i;
+ +            for (j = 0; j < recv_nindex; j++)
+ +            {
+ +                iy = j + recv_index0 - pme->pmegrid_start_iy;
+ +                for (k = 0; k < pme->nkz; k++)
+ +                {
+ +                    iz = k;
+ +                    if (direction == GMX_SUM_QGRID_FORWARD)
+ +                    {
+ +                        /* Forward: accumulate the received contribution */
+ +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
+ +                    }
+ +                    else
+ +                    {
+ +                        /* Otherwise: overwrite with the received value */
+ +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Major dimension is easier, no copying required,
+ +     * but we might have to sum to separate array.
+ +     * Since we don't copy, we have to communicate up to pmegrid_nz,
+ +     * not nkz as for the minor direction.
+ +     */
+ +    overlap = &pme->overlap[0];
+ +
+ +    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
+ +    {
+ +        if (direction == GMX_SUM_QGRID_FORWARD)
+ +        {
+ +            send_id       = overlap->send_id[ipulse];
+ +            recv_id       = overlap->recv_id[ipulse];
+ +            send_index0   = overlap->comm_data[ipulse].send_index0;
+ +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
+ +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
+ +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
+ +            /* Receive into the scratch buffer; it is summed into the grid below */
+ +            recvptr       = overlap->recvbuf;
+ +        }
+ +        else
+ +        {
+ +            send_id       = overlap->recv_id[ipulse];
+ +            recv_id       = overlap->send_id[ipulse];
+ +            send_index0   = overlap->comm_data[ipulse].recv_index0;
+ +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
+ +            recv_index0   = overlap->comm_data[ipulse].send_index0;
+ +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
+ +            /* Receive directly into the grid, overwriting the slabs */
+ +            recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
+ +        }
+ +
+ +        sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
+ +        /* Number of reals per communicated x-index (one full y-z slab) */
+ +        datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ +                    pme->nodeid, overlap->nodeid, send_id,
+ +                    pme->pmegrid_start_ix,
+ +                    send_index0-pme->pmegrid_start_ix,
+ +                    send_index0-pme->pmegrid_start_ix+send_nindex);
+ +            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ +                    pme->nodeid, overlap->nodeid, recv_id,
+ +                    pme->pmegrid_start_ix,
+ +                    recv_index0-pme->pmegrid_start_ix,
+ +                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
+ +        }
+ +
+ +        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
+ +                     send_id, ipulse,
+ +                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
+ +                     recv_id, ipulse,
+ +                     overlap->mpi_comm, &stat);
+ +
+ +        /* ADD data from contiguous recv buffer */
+ +        if (direction == GMX_SUM_QGRID_FORWARD)
+ +        {
+ +            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
+ +            for (i = 0; i < recv_nindex*datasize; i++)
+ +            {
+ +                p[i] += overlap->recvbuf[i];
+ +            }
+ +        }
+ +    }
+ +}
+ +#endif
+ +
+ +
+ +/* Copy the part of the PME grid covered by the local FFT data into fftgrid.
+ + *
+ + * The copy extent (local_fft_ndata) and the FFT grid strides (local_fft_size)
+ + * are queried from pme->pfft_setupA; the PME grid strides come from
+ + * pme->pmegrid_nx/ny/nz. When DEBUG_PME is defined, every non-zero grid
+ + * value is also written to pmegrid<nodeid>.pdb and pmegrid<nodeid>.txt.
+ + * Always returns 0.
+ + */
+ +static int
+ +copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
+ +{
+ +    ivec    local_fft_ndata, local_fft_offset, local_fft_size;
+ +    ivec    local_pme_size;
+ +    int     i, ix, iy, iz;
+ +    int     pmeidx, fftidx;
+ +
+ +    /* Dimensions should be identical for A/B grid, so we just use A here */
+ +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ +                                   local_fft_ndata,
+ +                                   local_fft_offset,
+ +                                   local_fft_size);
+ +
+ +    local_pme_size[0] = pme->pmegrid_nx;
+ +    local_pme_size[1] = pme->pmegrid_ny;
+ +    local_pme_size[2] = pme->pmegrid_nz;
+ +
+ +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
+ +       the offset is identical, and the PME grid always has more data (due to overlap)
+ +     */
+ +    {
+ +#ifdef DEBUG_PME
+ +        FILE *fp, *fp2;
+ +        char  fn[STRLEN], format[STRLEN];
+ +        real  val;
+ +        sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
+ +        fp = ffopen(fn, "w");
+ +        sprintf(fn, "pmegrid%d.txt", pme->nodeid);
+ +        fp2 = ffopen(fn, "w");
+ +        sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+ +#endif
+ +
+ +        for (ix = 0; ix < local_fft_ndata[XX]; ix++)
+ +        {
+ +            for (iy = 0; iy < local_fft_ndata[YY]; iy++)
+ +            {
+ +                for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
+ +                {
+ +                    /* Same (ix,iy,iz) in both grids, but with different row strides */
+ +                    pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
+ +                    fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
+ +                    fftgrid[fftidx] = pmegrid[pmeidx];
+ +#ifdef DEBUG_PME
+ +                    val = 100*pmegrid[pmeidx];
+ +                    if (pmegrid[pmeidx] != 0)
+ +                    {
+ +                        fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
+ +                                5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+ +                    }
+ +                    if (pmegrid[pmeidx] != 0)
+ +                    {
+ +                        fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
+ +                                "qgrid",
+ +                                pme->pmegrid_start_ix + ix,
+ +                                pme->pmegrid_start_iy + iy,
+ +                                pme->pmegrid_start_iz + iz,
+ +                                pmegrid[pmeidx]);
+ +                    }
+ +#endif
+ +                }
+ +            }
+ +        }
+ +#ifdef DEBUG_PME
+ +        ffclose(fp);
+ +        ffclose(fp2);
+ +#endif
+ +    }
+ +    return 0;
+ +}
+ +
+ +
+ +/* Return the current value of the cycle counter, to be passed to
+ + * omp_cyc_end() later. Declared with (void) so the definition is a real
+ + * prototype and a call with arguments is diagnosed by the compiler.
+ + */
+ +static gmx_cycles_t omp_cyc_start(void)
+ +{
+ +    return gmx_cycles_read();
+ +}
+ +
+ +/* Return the number of cycles elapsed since the counter value c was read */
+ +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
+ +{
+ +    gmx_cycles_t now = gmx_cycles_read();
+ +
+ +    return now - c;
+ +}
+ +
+ +
+ +static int
+ +copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
+ +                        int nthread, int thread)
+ +{
+ +    ivec          local_fft_ndata, local_fft_offset, local_fft_size;
+ +    ivec          local_pme_size;
+ +    int           ixy0, ixy1, ixy, ix, iy, iz;
+ +    int           pmeidx, fftidx;
+ +#ifdef PME_TIME_THREADS
+ +    gmx_cycles_t  c1;
+ +    static double cs1 = 0;
+ +    static int    cnt = 0;
+ +#endif
+ +
+ +#ifdef PME_TIME_THREADS
+ +    c1 = omp_cyc_start();
+ +#endif
+ +    /* Dimensions should be identical for A/B grid, so we just use A here */
+ +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ +                                   local_fft_ndata,
+ +                                   local_fft_offset,
+ +                                   local_fft_size);
+ +
+ +    local_pme_size[0] = pme->pmegrid_nx;
+ +    local_pme_size[1] = pme->pmegrid_ny;
+ +    local_pme_size[2] = pme->pmegrid_nz;
+ +
+ +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
+ +       the offset is identical, and the PME grid always has more data (due to overlap)
+ +     */
+ +    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+ +    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+ +
+ +    for (ixy = ixy0; ixy < ixy1; ixy++)
+ +    {
+ +        ix = ixy/local_fft_ndata[YY];
+ +        iy = ixy - ix*local_fft_ndata[YY];
+ +
+ +        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
+ +        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
+ +        for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
+ +        {
+ +            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
+ +        }
+ +    }
+ +
+ +#ifdef PME_TIME_THREADS
+ +    c1   = omp_cyc_end(c1);
+ +    cs1 += (double)c1;
+ +    cnt++;
+ +    if (cnt % 20 == 0)
+ +    {
+ +        printf("copy %.2f\n", cs1*1e-9);
+ +    }
+ +#endif
+ +
+ +    return 0;
+ +}
+ +
+ +
+ +/* Fold the periodic overlap region of the PME grid back onto its start.
+ + *
+ + * The pme_order-1 grid lines beyond nkz are always added onto the first
+ + * lines in z. The same is done in y when pme->nnodes_minor == 1 and in x
+ + * when pme->nnodes_major == 1. The order z, y, x is kept so that the
+ + * contributions accumulate exactly as before.
+ + */
+ +static void
+ +wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+ +{
+ +    const int nx   = pme->nkx;
+ +    const int ny   = pme->nky;
+ +    const int nz   = pme->nkz;
+ +    const int pny  = pme->pmegrid_ny;
+ +    const int pnz  = pme->pmegrid_nz;
+ +    const int halo = pme->pme_order - 1;
+ +    int       ix, iy, iz, ny_x;
+ +    real     *dst, *src;
+ +
+ +    /* Add periodic overlap in z */
+ +    for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ +    {
+ +        for (iy = 0; iy < pme->pmegrid_ny; iy++)
+ +        {
+ +            dst = pmegrid + (ix*pny+iy)*pnz;
+ +            for (iz = 0; iz < halo; iz++)
+ +            {
+ +                dst[iz] += dst[nz+iz];
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Add periodic overlap in y, only when y is not decomposed */
+ +    if (pme->nnodes_minor == 1)
+ +    {
+ +        for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ +        {
+ +            for (iy = 0; iy < halo; iy++)
+ +            {
+ +                dst = pmegrid + (ix*pny+iy)*pnz;
+ +                src = pmegrid + (ix*pny+ny+iy)*pnz;
+ +                for (iz = 0; iz < nz; iz++)
+ +                {
+ +                    dst[iz] += src[iz];
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Add periodic overlap in x, only when x is not decomposed */
+ +    if (pme->nnodes_major == 1)
+ +    {
+ +        if (pme->nnodes_minor == 1)
+ +        {
+ +            ny_x = ny;
+ +        }
+ +        else
+ +        {
+ +            ny_x = pme->pmegrid_ny;
+ +        }
+ +
+ +        for (ix = 0; ix < halo; ix++)
+ +        {
+ +            for (iy = 0; iy < ny_x; iy++)
+ +            {
+ +                dst = pmegrid + (ix*pny+iy)*pnz;
+ +                src = pmegrid + ((nx+ix)*pny+iy)*pnz;
+ +                for (iz = 0; iz < nz; iz++)
+ +                {
+ +                    dst[iz] += src[iz];
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +
+ +static void
+ +unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+ +{
+ +    int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
+ +
+ +    nx = pme->nkx;
+ +    ny = pme->nky;
+ +    nz = pme->nkz;
+ +
+ +    pnx = pme->pmegrid_nx;
+ +    pny = pme->pmegrid_ny;
+ +    pnz = pme->pmegrid_nz;
+ +
+ +    overlap = pme->pme_order - 1;
+ +
+ +    if (pme->nnodes_major == 1)
+ +    {
+ +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
+ +
+ +        for (ix = 0; ix < overlap; ix++)
+ +        {
+ +            int iy, iz;
+ +
+ +            for (iy = 0; iy < ny_x; iy++)
+ +            {
+ +                for (iz = 0; iz < nz; iz++)
+ +                {
+ +                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
+ +                        pmegrid[(ix*pny+iy)*pnz+iz];
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    if (pme->nnodes_minor == 1)
+ +    {
+ +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ +        for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ +        {
+ +            int iy, iz;
+ +
+ +            for (iy = 0; iy < overlap; iy++)
+ +            {
+ +                for (iz = 0; iz < nz; iz++)
+ +                {
+ +                    pmegrid[(ix*pny+ny+iy)*pnz+iz] =
+ +                        pmegrid[(ix*pny+iy)*pnz+iz];
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Copy periodic overlap in z */
+ +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ +    for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ +    {
+ +        int iy, iz;
+ +
+ +        for (iy = 0; iy < pme->pmegrid_ny; iy++)
+ +        {
+ +            for (iz = 0; iz < overlap; iz++)
+ +            {
+ +                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
+ +                    pmegrid[(ix*pny+iy)*pnz+iz];
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* This has to be a macro to enable full compiler optimization with xlC (and probably others too).
+ + *
+ + * Spreads one value qn over an order x order x order block of the grid,
+ + * weighted by the per-dimension coefficients thx, thy and thz.
+ + * The macro uses these names from the enclosing scope: grid, pny, pnz,
+ + * i0, j0, k0, qn, thx, thy, thz, and the scratch variables ithx, ithy, ithz,
+ + * index_x, index_xy, index_xyz, valx and valxy.
+ + */
+ +#define DO_BSPLINE(order)                            \
+ +    for (ithx = 0; (ithx < order); ithx++)                    \
+ +    {                                                    \
+ +        index_x = (i0+ithx)*pny*pnz;                     \
+ +        valx    = qn*thx[ithx];                          \
+ +                                                     \
+ +        for (ithy = 0; (ithy < order); ithy++)                \
+ +        {                                                \
+ +            valxy    = valx*thy[ithy];                   \
+ +            index_xy = index_x+(j0+ithy)*pnz;            \
+ +                                                     \
+ +            for (ithz = 0; (ithz < order); ithz++)            \
+ +            {                                            \
+ +                index_xyz        = index_xy+(k0+ithz);   \
+ +                grid[index_xyz] += valxy*thz[ithz];      \
+ +            }                                            \
+ +        }                                                \
+ +    }
+ +
+ +
+ +/* Spread the charges of the atoms listed in spline->ind onto one grid.
+ + *
+ + * The grid (pmegrid->grid, pnx*pny*pnz elements) is zeroed first. Each atom
+ + * with a non-zero charge is then spread using the coefficients stored in
+ + * spline->theta, at the grid index atc->idx[n] shifted by pmegrid->offset.
+ + * For orders 4 and 5 the kernel included from pme_sse_single.h is used when
+ + * PME_SSE_SPREAD_GATHER is defined; otherwise DO_BSPLINE is used.
+ + * NOTE(review): work is not referenced in the code visible here; presumably
+ + * it is used by the included SSE kernel - confirm.
+ + */
+ +static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
+ +                                     pme_atomcomm_t *atc, splinedata_t *spline,
+ +                                     pme_spline_work_t *work)
+ +{
+ +
+ +    /* spread charges from home atoms to local grid */
+ +    real          *grid;
+ +    pme_overlap_t *ol;
+ +    int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
+ +    int       *    idxptr;
+ +    int            order, norder, index_x, index_xy, index_xyz;
+ +    real           valx, valxy, qn;
+ +    real          *thx, *thy, *thz;
+ +    int            localsize, bndsize;
+ +    int            pnx, pny, pnz, ndatatot;
+ +    int            offx, offy, offz;
+ +
+ +    pnx = pmegrid->s[XX];
+ +    pny = pmegrid->s[YY];
+ +    pnz = pmegrid->s[ZZ];
+ +
+ +    offx = pmegrid->offset[XX];
+ +    offy = pmegrid->offset[YY];
+ +    offz = pmegrid->offset[ZZ];
+ +
+ +    /* Clear the whole grid before accumulating into it */
+ +    ndatatot = pnx*pny*pnz;
+ +    grid     = pmegrid->grid;
+ +    for (i = 0; i < ndatatot; i++)
+ +    {
+ +        grid[i] = 0;
+ +    }
+ +
+ +    order = pmegrid->order;
+ +
+ +    for (nn = 0; nn < spline->n; nn++)
+ +    {
+ +        n  = spline->ind[nn];
+ +        qn = atc->q[n];
+ +
+ +        if (qn != 0)
+ +        {
+ +            idxptr = atc->idx[n];
+ +            /* The coefficients are stored per spline entry nn, not per atom n */
+ +            norder = nn*order;
+ +
+ +            i0   = idxptr[XX] - offx;
+ +            j0   = idxptr[YY] - offy;
+ +            k0   = idxptr[ZZ] - offz;
+ +
+ +            thx = spline->theta[XX] + norder;
+ +            thy = spline->theta[YY] + norder;
+ +            thz = spline->theta[ZZ] + norder;
+ +
+ +            switch (order)
+ +            {
+ +                case 4:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ +#ifdef PME_SSE_UNALIGNED
+ +#define PME_SPREAD_SSE_ORDER4
+ +#else
+ +#define PME_SPREAD_SSE_ALIGNED
+ +#define PME_ORDER 4
+ +#endif
+ +#include "pme_sse_single.h"
+ +#else
+ +                    DO_BSPLINE(4);
+ +#endif
+ +                    break;
+ +                case 5:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ +#define PME_SPREAD_SSE_ALIGNED
+ +#define PME_ORDER 5
+ +#include "pme_sse_single.h"
+ +#else
+ +                    DO_BSPLINE(5);
+ +#endif
+ +                    break;
+ +                default:
+ +                    DO_BSPLINE(order);
+ +                    break;
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Round *pmegrid_nz up to a multiple of 4 when the SSE spread/gather
+ + * kernels need it: for pme_order 5, and also for pme_order 4 unless
+ + * PME_SSE_UNALIGNED is defined. Without PME_SSE_SPREAD_GATHER this
+ + * function does nothing.
+ + */
+ +static void set_grid_alignment(int *pmegrid_nz, int pme_order)
+ +{
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ +    if (pme_order == 5
+ +#ifndef PME_SSE_UNALIGNED
+ +        || pme_order == 4
+ +#endif
+ +        )
+ +    {
+ +        /* Round nz up to a multiple of 4 to ensure alignment */
+ +        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
+ +    }
+ +#endif
+ +}
+ +
+ +/* Enlarge *gridsize by 4 elements for pme_order 4 when the aligned SSE
+ + * spread/gather kernels are used (PME_SSE_SPREAD_GATHER defined and
+ + * PME_SSE_UNALIGNED not defined). In all other cases *gridsize is left
+ + * unchanged.
+ + */
+ +static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
+ +{
-         /* Allocate an aligned pointer for SSE operations, including 3 extra
-          * elements at the end since SSE operates on 4 elements at a time.
++#ifdef PME_SSE_SPREAD_GATHER
+ +#ifndef PME_SSE_UNALIGNED
+ +    if (pme_order == 4)
+ +    {
+ +        /* Add extra elements to ensure aligned operations do not go
+ +         * beyond the allocated grid size.
+ +         * Note that for pme_order=5, the pme grid z-size alignment
+ +         * ensures that we will not go beyond the grid size.
+ +         */
+ +        *gridsize += 4;
+ +    }
+ +#endif
+ +#endif
+ +}
+ +
+ +/* Set up one (sub)grid covering cells [x0,x1) x [y0,y1) x [z0,z1).
+ + *
+ + * cx, cy, cz are stored as the grid's cell index and x0, y0, z0 as its
+ + * offset. The logical size n adds pme_order-1 lines per dimension; the
+ + * storage size s equals n, except that with set_alignment the z size is
+ + * taken from set_grid_alignment(). Without set_alignment an already
+ + * unaligned z size is reported through gmx_incons(). When ptr is NULL a
+ + * 16-byte aligned buffer is allocated, otherwise ptr is used as storage.
+ + */
+ +static void pmegrid_init(pmegrid_t *grid,
+ +                         int cx, int cy, int cz,
+ +                         int x0, int y0, int z0,
+ +                         int x1, int y1, int z1,
+ +                         gmx_bool set_alignment,
+ +                         int pme_order,
+ +                         real *ptr)
+ +{
+ +    int aligned_nz, nelem;
+ +
+ +    grid->ci[XX] = cx;
+ +    grid->ci[YY] = cy;
+ +    grid->ci[ZZ] = cz;
+ +
+ +    grid->offset[XX] = x0;
+ +    grid->offset[YY] = y0;
+ +    grid->offset[ZZ] = z0;
+ +
+ +    grid->n[XX] = x1 - x0 + pme_order - 1;
+ +    grid->n[YY] = y1 - y0 + pme_order - 1;
+ +    grid->n[ZZ] = z1 - z0 + pme_order - 1;
+ +    copy_ivec(grid->n, grid->s);
+ +
+ +    /* Determine which z size the alignment rules would require */
+ +    aligned_nz = grid->s[ZZ];
+ +    set_grid_alignment(&aligned_nz, pme_order);
+ +    if (!set_alignment)
+ +    {
+ +        if (aligned_nz != grid->s[ZZ])
+ +        {
+ +            gmx_incons("pmegrid_init call with an unaligned z size");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        grid->s[ZZ] = aligned_nz;
+ +    }
+ +
+ +    grid->order = pme_order;
+ +
+ +    if (ptr != NULL)
+ +    {
+ +        /* Use the storage supplied by the caller */
+ +        grid->grid = ptr;
+ +        return;
+ +    }
+ +
+ +    nelem = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
+ +    set_gridsize_alignment(&nelem, pme_order);
+ +    snew_aligned(grid->grid, nelem, 16);
+ +}
+ +
+ +/* Integer division of enumerator by denominator, rounded up
+ + * (for a non-negative enumerator and a positive denominator).
+ + */
+ +static int div_round_up(int enumerator, int denominator)
+ +{
+ +    const int biased = enumerator + denominator - 1;
+ +
+ +    return biased/denominator;
+ +}
+ +
+ +/* Choose how nthread threads are divided over the three grid dimensions.
+ + *
+ + * n       - grid size per dimension
+ + * ovl     - number of overlap lines added to each thread-local grid
+ + * nthread - total number of threads; nsub[XX]*nsub[YY]*nsub[ZZ] must equal it
+ + * nsub    - output: number of subdivisions per dimension
+ + *
+ + * All factorizations of nthread are tried and the one with the fewest grid
+ + * points per thread is kept; ties are broken in favour of fewer cuts in z,
+ + * then in y. The environment variable GMX_PME_THREAD_DIVISION ("nx ny nz")
+ + * overrides the automatic choice. It must contain three positive integers,
+ + * otherwise a fatal error is raised instead of silently combining the
+ + * parsed values with the automatically chosen ones.
+ + */
+ +static void make_subgrid_division(const ivec n, int ovl, int nthread,
+ +                                  ivec nsub)
+ +{
+ +    int gsize_opt, gsize;
+ +    int nsx, nsy, nsz;
+ +    char *env;
+ +
+ +    gsize_opt = -1;
+ +    for (nsx = 1; nsx <= nthread; nsx++)
+ +    {
+ +        if (nthread % nsx == 0)
+ +        {
+ +            for (nsy = 1; nsy <= nthread; nsy++)
+ +            {
+ +                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
+ +                {
+ +                    nsz = nthread/(nsx*nsy);
+ +
+ +                    /* Determine the number of grid points per thread */
+ +                    gsize =
+ +                        (div_round_up(n[XX], nsx) + ovl)*
+ +                        (div_round_up(n[YY], nsy) + ovl)*
+ +                        (div_round_up(n[ZZ], nsz) + ovl);
+ +
+ +                    /* Minimize the number of grids points per thread
+ +                     * and, secondarily, the number of cuts in minor dimensions.
+ +                     * nsub is only read after a first candidate was stored.
+ +                     */
+ +                    if (gsize_opt == -1 ||
+ +                        gsize < gsize_opt ||
+ +                        (gsize == gsize_opt &&
+ +                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
+ +                    {
+ +                        nsub[XX]  = nsx;
+ +                        nsub[YY]  = nsy;
+ +                        nsub[ZZ]  = nsz;
+ +                        gsize_opt = gsize;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    env = getenv("GMX_PME_THREAD_DIVISION");
+ +    if (env != NULL)
+ +    {
+ +        /* sscanf returns the number of converted fields; anything but 3
+ +         * means the override is incomplete or malformed.
+ +         */
+ +        if (sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]) != 3)
+ +        {
+ +            gmx_fatal(FARGS, "Could not read three integers from the environment variable GMX_PME_THREAD_DIVISION ('%s')", env);
+ +        }
+ +        /* Negative counts could still multiply to nthread, so reject them here */
+ +        if (nsub[XX] < 1 || nsub[YY] < 1 || nsub[ZZ] < 1)
+ +        {
+ +            gmx_fatal(FARGS, "The PME grid thread division set through GMX_PME_THREAD_DIVISION (%d x %d x %d) should consist of positive numbers", nsub[XX], nsub[YY], nsub[ZZ]);
+ +        }
+ +    }
+ +
+ +    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
+ +    {
+ +        gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
+ +    }
+ +}
+ +
+ +/* Initialize the full PME grid and, optionally, the thread-local grids.
+ + *
+ + * nx, ny, nz are the full grid sizes including pme_order-1 overlap lines;
+ + * nz_base replaces the z size when the thread division is chosen.
+ + * The full grid (grids->grid) is always allocated through pmegrid_init.
+ + * With bUseThreads, one pmegrid_t per thread is set up in grids->grid_th;
+ + * all of them use sections of the single aligned buffer grids->grid_all,
+ + * separated by GMX_CACHE_SEP elements. Without bUseThreads grids->grid_th
+ + * is set to NULL and grids->grid_all is not touched.
+ + * Finally grids->g2t (grid line to thread index contribution, per
+ + * dimension) and grids->nthread_comm are filled; overlap_x and overlap_y
+ + * give the number of lines to communicate in x and y.
+ + */
+ +static void pmegrids_init(pmegrids_t *grids,
+ +                          int nx, int ny, int nz, int nz_base,
+ +                          int pme_order,
+ +                          gmx_bool bUseThreads,
+ +                          int nthread,
+ +                          int overlap_x,
+ +                          int overlap_y)
+ +{
+ +    ivec n, n_base, g0, g1;
+ +    int t, x, y, z, d, i, tfac;
+ +    int max_comm_lines = -1;
+ +
+ +    /* Grid sizes without the overlap lines */
+ +    n[XX] = nx - (pme_order - 1);
+ +    n[YY] = ny - (pme_order - 1);
+ +    n[ZZ] = nz - (pme_order - 1);
+ +
+ +    copy_ivec(n, n_base);
+ +    n_base[ZZ] = nz_base;
+ +
+ +    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
+ +                 NULL);
+ +
+ +    grids->nthread = nthread;
+ +
+ +    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
+ +
+ +    if (bUseThreads)
+ +    {
+ +        ivec nst;
+ +        int gridsize;
+ +
+ +        /* Maximum size of a thread-local grid, including overlap */
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
+ +        }
+ +        set_grid_alignment(&nst[ZZ], pme_order);
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
+ +                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
+ +            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
+ +                    nx, ny, nz,
+ +                    nst[XX], nst[YY], nst[ZZ]);
+ +        }
+ +
+ +        snew(grids->grid_th, grids->nthread);
+ +        t        = 0;
+ +        gridsize = nst[XX]*nst[YY]*nst[ZZ];
+ +        set_gridsize_alignment(&gridsize, pme_order);
+ +        /* One buffer for all threads, with GMX_CACHE_SEP elements of
+ +         * separation before, between and after the thread grids.
+ +         */
+ +        snew_aligned(grids->grid_all,
+ +                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
+ +                     16);
+ +
+ +        for (x = 0; x < grids->nc[XX]; x++)
+ +        {
+ +            for (y = 0; y < grids->nc[YY]; y++)
+ +            {
+ +                for (z = 0; z < grids->nc[ZZ]; z++)
+ +                {
+ +                    /* The thread grids do not own their memory:
+ +                     * they point into grids->grid_all.
+ +                     */
+ +                    pmegrid_init(&grids->grid_th[t],
+ +                                 x, y, z,
+ +                                 (n[XX]*(x  ))/grids->nc[XX],
+ +                                 (n[YY]*(y  ))/grids->nc[YY],
+ +                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
+ +                                 (n[XX]*(x+1))/grids->nc[XX],
+ +                                 (n[YY]*(y+1))/grids->nc[YY],
+ +                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
+ +                                 TRUE,
+ +                                 pme_order,
+ +                                 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
+ +                    t++;
+ +                }
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        grids->grid_th = NULL;
+ +    }
+ +
+ +    snew(grids->g2t, DIM);
+ +    /* tfac is the stride of dimension d in the thread index, which runs
+ +     * fastest in z (matching the loop order of the pmegrid_init calls).
+ +     */
+ +    tfac = 1;
+ +    for (d = DIM-1; d >= 0; d--)
+ +    {
+ +        snew(grids->g2t[d], n[d]);
+ +        t = 0;
+ +        for (i = 0; i < n[d]; i++)
+ +        {
+ +            /* The second check should match the parameters
+ +             * of the pmegrid_init call above.
+ +             */
+ +            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
+ +            {
+ +                t++;
+ +            }
+ +            grids->g2t[d][i] = t*tfac;
+ +        }
+ +
+ +        tfac *= grids->nc[d];
+ +
+ +        switch (d)
+ +        {
+ +            case XX: max_comm_lines = overlap_x;     break;
+ +            case YY: max_comm_lines = overlap_y;     break;
+ +            case ZZ: max_comm_lines = pme_order - 1; break;
+ +        }
+ +        /* Count the thread grids, up to nc[d], whose start lies below max_comm_lines */
+ +        grids->nthread_comm[d] = 0;
+ +        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
+ +               grids->nthread_comm[d] < grids->nc[d])
+ +        {
+ +            grids->nthread_comm[d]++;
+ +        }
+ +        if (debug != NULL)
+ +        {
+ +            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
+ +                    'x'+d, grids->nthread_comm[d]);
+ +        }
+ +        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
+ +         * work, but this is not a problematic restriction.
+ +         */
+ +        /* NOTE(review): the loop above stops at nthread_comm[d] == nc[d],
+ +         * so the '>' comparison below can never be true - confirm whether
+ +         * '>=' was intended.
+ +         */
+ +        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
+ +        {
+ +            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Release the memory allocated by pmegrids_init.
+ + *
+ + * Nothing is done when grids->grid.grid is NULL, i.e. when the grids were
+ + * never initialized or were already destroyed.
+ + *
+ + * The frees mirror the allocations in pmegrids_init:
+ + *  - grids->grid.grid and grids->grid_all come from snew_aligned and must
+ + *    be released with sfree_aligned, not sfree;
+ + *  - the thread-local grids grid_th[t].grid point into grids->grid_all and
+ + *    must not be freed individually;
+ + *  - grids->grid_th is NULL when no thread-local grids are in use, even
+ + *    though grids->nthread is positive, so it is the pointer that is tested;
+ + *  - the g2t lookup tables are released as well.
+ + */
+ +static void pmegrids_destroy(pmegrids_t *grids)
+ +{
+ +    int d;
+ +
+ +    if (grids->grid.grid != NULL)
+ +    {
+ +        sfree_aligned(grids->grid.grid);
+ +        grids->grid.grid = NULL;
+ +
+ +        if (grids->grid_th != NULL)
+ +        {
+ +            /* Frees the storage of all thread-local grids at once */
+ +            sfree_aligned(grids->grid_all);
+ +            grids->grid_all = NULL;
+ +            sfree(grids->grid_th);
+ +            grids->grid_th = NULL;
+ +        }
+ +
+ +        if (grids->g2t != NULL)
+ +        {
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                sfree(grids->g2t[d]);
+ +            }
+ +            sfree(grids->g2t);
+ +            grids->g2t = NULL;
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Grow the per-thread work arrays so that they can hold nkx elements.
+ + *
+ + * Nothing happens when nkx <= work->nalloc. Otherwise mhx, mhy, mhz, m2 and
+ + * m2inv are resized with srenew, while denom, tmp1 and eterm are freed and
+ + * allocated anew with ALIGN_HERE extra elements of padding, so their
+ + * previous contents are not carried over. ALIGN_HERE is GMX_SIMD_WIDTH_HERE
+ + * when PME_SIMD is defined and 4 otherwise; it is a macro defined inside
+ + * this function and remains defined for the rest of the file.
+ + */
+ +static void realloc_work(pme_work_t *work, int nkx)
+ +{
+ +    if (nkx > work->nalloc)
+ +    {
+ +        work->nalloc = nkx;
+ +        srenew(work->mhx, work->nalloc);
+ +        srenew(work->mhy, work->nalloc);
+ +        srenew(work->mhz, work->nalloc);
+ +        srenew(work->m2, work->nalloc);
-         snew_aligned(work->denom, work->nalloc+3, 16);
-         snew_aligned(work->tmp1, work->nalloc+3, 16);
-         snew_aligned(work->eterm, work->nalloc+3, 16);
++        /* Allocate an aligned pointer for SIMD operations, including extra
++         * elements at the end for padding.
+ +         */
++#ifdef PME_SIMD
++#define ALIGN_HERE  GMX_SIMD_WIDTH_HERE
++#else
++/* We can use any alignment, apart from 0, so we use 4 */
++#define ALIGN_HERE  4
++#endif
+ +        sfree_aligned(work->denom);
+ +        sfree_aligned(work->tmp1);
+ +        sfree_aligned(work->eterm);
- #ifdef PME_SSE
- /* Calculate exponentials through SSE in float precision */
- inline static void calc_exponentials(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
++        snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
++        snew_aligned(work->tmp1,  work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
++        snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+ +        srenew(work->m2inv, work->nalloc);
+ +    }
+ +}
+ +
+ +
+ +/* Release all arrays held by a pme_work_t: the aligned buffers denom, tmp1
+ + * and eterm with sfree_aligned, the others with sfree.
+ + */
+ +static void free_work(pme_work_t *work)
+ +{
+ +    /* Buffers allocated with snew_aligned */
+ +    sfree_aligned(work->denom);
+ +    sfree_aligned(work->tmp1);
+ +    sfree_aligned(work->eterm);
+ +
+ +    /* Buffers allocated with srenew */
+ +    sfree(work->mhx);
+ +    sfree(work->mhy);
+ +    sfree(work->mhz);
+ +    sfree(work->m2);
+ +    sfree(work->m2inv);
+ +}
+ +
+ +
-         const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-         __m128 f_sse;
-         __m128 lu;
-         __m128 tmp_d1, d_inv, tmp_r, tmp_e;
++#ifdef PME_SIMD
++/* Calculate exponentials through SIMD.
++ *
++ * Computes e_aligned[kx] = f*exp(r_aligned[kx])/d_aligned[kx].
++ * The loop always starts at index 0, not at start, and advances in steps of
++ * GMX_SIMD_WIDTH_HERE, so up to GMX_SIMD_WIDTH_HERE-1 elements beyond end
++ * are read and written; the arrays need to be padded accordingly.
++ * Unlike the non-SIMD variant, d_aligned and r_aligned are not modified.
++ */
++inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+ +{
+ +    {
-         f_sse = _mm_load1_ps(&f);
-         for (kx = 0; kx < end; kx += 4)
++        const gmx_mm_pr two = gmx_set1_pr(2.0);
++        gmx_mm_pr f_simd;
++        gmx_mm_pr lu;
++        gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
+ +        int kx;
-             tmp_d1   = _mm_load_ps(d_aligned+kx);
-             lu       = _mm_rcp_ps(tmp_d1);
-             d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
-             tmp_r    = _mm_load_ps(r_aligned+kx);
-             tmp_r    = gmx_mm_exp_ps(tmp_r);
-             tmp_e    = _mm_mul_ps(f_sse, d_inv);
-             tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
-             _mm_store_ps(e_aligned+kx, tmp_e);
++        f_simd = gmx_load1_pr(&f);
++        for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+ +        {
- #ifdef PME_SSE
++            tmp_d1   = gmx_load_pr(d_aligned+kx);
++            d_inv    = gmx_inv_pr(tmp_d1);
++            tmp_r    = gmx_load_pr(r_aligned+kx);
++            tmp_r    = gmx_exp_pr(tmp_r);
++            tmp_e    = gmx_mul_pr(f_simd, d_inv);
++            tmp_e    = gmx_mul_pr(tmp_e, tmp_r);
++            gmx_store_pr(e_aligned+kx, tmp_e);
+ +        }
+ +    }
+ +}
+ +#else
+ +/* Plain-C variant of calc_exponentials.
+ + * For kx in [start, end) this sets e[kx] = f*exp(r[kx])/d[kx].
+ + * As a side effect d[kx] is overwritten with 1/d[kx] and r[kx] with
+ + * exp(r[kx]). The work is done in three separate passes over the range.
+ + */
+ +inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+ +{
+ +    int kx;
+ +    for (kx = start; kx < end; kx++)
+ +    {
+ +        d[kx] = 1.0/d[kx];
+ +    }
+ +    for (kx = start; kx < end; kx++)
+ +    {
+ +        r[kx] = exp(r[kx]);
+ +    }
+ +    for (kx = start; kx < end; kx++)
+ +    {
+ +        e[kx] = f*r[kx]*d[kx];
+ +    }
+ +}
+ +#endif
+ +
+ +
+ +/* Reciprocal-space solve on the complex grid for this thread's share of
+ + * the local (y,z) lines (y major, z middle, x minor).
+ + * Every local grid element, except k = (0,0,0), is multiplied in place by
+ + *   eterm = elfac*exp(-factor*m2)/(m2*bz*by*bsp_mod[XX][kx])
+ + * where m2 is the squared length of the reciprocal vector built from
+ + * pme->recipbox, factor = pi^2/ewaldcoeff^2, by = pi*vol*bsp_mod[YY][ky]
+ + * and bz = bsp_mod[ZZ][kz].
+ + * When bEnerVir is set, this thread's energy and virial contributions are
+ + * accumulated as well and stored in pme->work[thread].
+ + * Returns local_ndata[YY]*local_ndata[XX].
+ + */
+ +static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
+ +                         real ewaldcoeff, real vol,
+ +                         gmx_bool bEnerVir,
+ +                         int nthread, int thread)
+ +{
+ +    /* do recip sum over local cells in grid */
+ +    /* y major, z middle, x minor or continuous */
+ +    t_complex *p0;
+ +    int     kx, ky, kz, maxkx, maxky, maxkz;
+ +    int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
+ +    real    mx, my, mz;
+ +    real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
+ +    real    ets2, struct2, vfactor, ets2vf;
+ +    real    d1, d2, energy = 0;
+ +    real    by, bz;
+ +    real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
+ +    real    rxx, ryx, ryy, rzx, rzy, rzz;
+ +    pme_work_t *work;
+ +    real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
+ +    real    mhxk, mhyk, mhzk, m2k;
+ +    real    corner_fac;
+ +    ivec    complex_order;
+ +    ivec    local_ndata, local_offset, local_size;
+ +    real    elfac;
+ +
+ +    elfac = ONE_4PI_EPS0/pme->epsilon_r;
+ +
+ +    nx = pme->nkx;
+ +    ny = pme->nky;
+ +    nz = pme->nkz;
+ +
+ +    /* Dimensions should be identical for A/B grid, so we just use A here */
+ +    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
+ +                                      complex_order,
+ +                                      local_ndata,
+ +                                      local_offset,
+ +                                      local_size);
+ +
+ +    rxx = pme->recipbox[XX][XX];
+ +    ryx = pme->recipbox[YY][XX];
+ +    ryy = pme->recipbox[YY][YY];
+ +    rzx = pme->recipbox[ZZ][XX];
+ +    rzy = pme->recipbox[ZZ][YY];
+ +    rzz = pme->recipbox[ZZ][ZZ];
+ +
+ +    maxkx = (nx+1)/2;
+ +    maxky = (ny+1)/2;
+ +    maxkz = nz/2+1;
+ +
+ +    /* Scratch arrays of this thread; they are indexed by the global kx */
+ +    work  = &pme->work[thread];
+ +    mhx   = work->mhx;
+ +    mhy   = work->mhy;
+ +    mhz   = work->mhz;
+ +    m2    = work->m2;
+ +    denom = work->denom;
+ +    tmp1  = work->tmp1;
+ +    eterm = work->eterm;
+ +    m2inv = work->m2inv;
+ +
+ +    /* Divide the local y*z lines evenly over the threads */
+ +    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
+ +    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
+ +
+ +    for (iyz = iyz0; iyz < iyz1; iyz++)
+ +    {
+ +        iy = iyz/local_ndata[ZZ];
+ +        iz = iyz - iy*local_ndata[ZZ];
+ +
+ +        ky = iy + local_offset[YY];
+ +
+ +        /* Indices at or above maxky map to negative frequencies */
+ +        if (ky < maxky)
+ +        {
+ +            my = ky;
+ +        }
+ +        else
+ +        {
+ +            my = (ky - ny);
+ +        }
+ +
+ +        by = M_PI*vol*pme->bsp_mod[YY][ky];
+ +
+ +        kz = iz + local_offset[ZZ];
+ +
+ +        mz = kz;
+ +
+ +        bz = pme->bsp_mod[ZZ][kz];
+ +
+ +        /* 0.5 correction for corner points */
+ +        corner_fac = 1;
+ +        if (kz == 0 || kz == (nz+1)/2)
+ +        {
+ +            corner_fac = 0.5;
+ +        }
+ +
+ +        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ +
+ +        /* We should skip the k-space point (0,0,0) */
+ +        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
+ +        {
+ +            kxstart = local_offset[XX];
+ +        }
+ +        else
+ +        {
+ +            kxstart = local_offset[XX] + 1;
+ +            p0++;
+ +        }
+ +        kxend = local_offset[XX] + local_ndata[XX];
+ +
+ +        if (bEnerVir)
+ +        {
+ +            /* More expensive inner loop, especially because of the storage
+ +             * of the mh elements in arrays.
+ +             * Because x is the minor grid index, all mh elements
+ +             * depend on kx for triclinic unit cells.
+ +             */
+ +
+ +            /* Two explicit loops to avoid a conditional inside the loop */
+ +            for (kx = kxstart; kx < maxkx; kx++)
+ +            {
+ +                mx = kx;
+ +
+ +                mhxk      = mx * rxx;
+ +                mhyk      = mx * ryx + my * ryy;
+ +                mhzk      = mx * rzx + my * rzy + mz * rzz;
+ +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ +                mhx[kx]   = mhxk;
+ +                mhy[kx]   = mhyk;
+ +                mhz[kx]   = mhzk;
+ +                m2[kx]    = m2k;
+ +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ +                tmp1[kx]  = -factor*m2k;
+ +            }
+ +
+ +            for (kx = maxkx; kx < kxend; kx++)
+ +            {
+ +                mx = (kx - nx);
+ +
+ +                mhxk      = mx * rxx;
+ +                mhyk      = mx * ryx + my * ryy;
+ +                mhzk      = mx * rzx + my * rzy + mz * rzz;
+ +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ +                mhx[kx]   = mhxk;
+ +                mhy[kx]   = mhyk;
+ +                mhz[kx]   = mhzk;
+ +                m2[kx]    = m2k;
+ +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ +                tmp1[kx]  = -factor*m2k;
+ +            }
+ +
+ +            for (kx = kxstart; kx < kxend; kx++)
+ +            {
+ +                m2inv[kx] = 1.0/m2[kx];
+ +            }
+ +
+ +            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+ +
+ +            /* Scale the grid and reuse tmp1 for eterm times the
+ +             * structure factor term, needed for energy and virial.
+ +             */
+ +            for (kx = kxstart; kx < kxend; kx++, p0++)
+ +            {
+ +                d1      = p0->re;
+ +                d2      = p0->im;
+ +
+ +                p0->re  = d1*eterm[kx];
+ +                p0->im  = d2*eterm[kx];
+ +
+ +                struct2 = 2.0*(d1*d1+d2*d2);
+ +
+ +                tmp1[kx] = eterm[kx]*struct2;
+ +            }
+ +
+ +            for (kx = kxstart; kx < kxend; kx++)
+ +            {
+ +                ets2     = corner_fac*tmp1[kx];
+ +                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
+ +                energy  += ets2;
+ +
+ +                ets2vf   = ets2*vfactor;
+ +                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
+ +                virxy   += ets2vf*mhx[kx]*mhy[kx];
+ +                virxz   += ets2vf*mhx[kx]*mhz[kx];
+ +                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
+ +                viryz   += ets2vf*mhy[kx]*mhz[kx];
+ +                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            /* We don't need to calculate the energy and the virial.
+ +             * In this case the triclinic overhead is small.
+ +             */
+ +
+ +            /* Two explicit loops to avoid a conditional inside the loop */
+ +
+ +            for (kx = kxstart; kx < maxkx; kx++)
+ +            {
+ +                mx = kx;
+ +
+ +                mhxk      = mx * rxx;
+ +                mhyk      = mx * ryx + my * ryy;
+ +                mhzk      = mx * rzx + my * rzy + mz * rzz;
+ +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ +                tmp1[kx]  = -factor*m2k;
+ +            }
+ +
+ +            for (kx = maxkx; kx < kxend; kx++)
+ +            {
+ +                mx = (kx - nx);
+ +
+ +                mhxk      = mx * rxx;
+ +                mhyk      = mx * ryx + my * ryy;
+ +                mhzk      = mx * rzx + my * rzy + mz * rzz;
+ +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ +                tmp1[kx]  = -factor*m2k;
+ +            }
+ +
+ +            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+ +
+ +            for (kx = kxstart; kx < kxend; kx++, p0++)
+ +            {
+ +                d1      = p0->re;
+ +                d2      = p0->im;
+ +
+ +                p0->re  = d1*eterm[kx];
+ +                p0->im  = d2*eterm[kx];
+ +            }
+ +        }
+ +    }
+ +
+ +    if (bEnerVir)
+ +    {
+ +        /* Update virial with local values.
+ +         * The virial is symmetric by definition.
+ +         * this virial seems ok for isotropic scaling, but I'm
+ +         * experiencing problems on semiisotropic membranes.
+ +         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
+ +         */
+ +        work->vir[XX][XX] = 0.25*virxx;
+ +        work->vir[YY][YY] = 0.25*viryy;
+ +        work->vir[ZZ][ZZ] = 0.25*virzz;
+ +        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
+ +        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
+ +        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+ +
+ +        /* This energy should be corrected for a charged system */
+ +        work->energy = 0.5*energy;
+ +    }
+ +
+ +    /* Return the loop count.
+ +     * NOTE(review): this is the y*x product of the local grid size, while
+ +     * the loops above run over y*z lines; confirm what callers expect.
+ +     */
+ +    return local_ndata[YY]*local_ndata[XX];
+ +}
+ +
+ +/* Sum the energy and virial stored in pme->work[0..nthread-1] into
+ + * *mesh_energy and vir. nthread must be at least 1, since the entry of
+ + * thread 0 is always read.
+ + */
+ +static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
+ +                             real *mesh_energy, matrix vir)
+ +{
+ +    /* This function sums output over threads
+ +     * and should therefore only be called after thread synchronization.
+ +     */
+ +    int thread;
+ +
+ +    *mesh_energy = pme->work[0].energy;
+ +    copy_mat(pme->work[0].vir, vir);
+ +
+ +    for (thread = 1; thread < nthread; thread++)
+ +    {
+ +        *mesh_energy += pme->work[thread].energy;
+ +        m_add(vir, pme->work[thread].vir, vir);
+ +    }
+ +}
+ +
+ +/* Accumulate the force interpolation sums fx, fy and fz for one atom over
+ + * the order^3 grid points starting at (i0, j0, k0).
+ + * The expansion site must provide grid, pny, pnz, i0, j0, k0, the spline
+ + * arrays thx/thy/thz and dthx/dthy/dthz and the scratch variables ithx,
+ + * ithy, ithz, index_x, index_xy, tx, ty, dx, dy, gval, fxy1 and fz1.
+ + * fx, fy and fz are only added to, so the caller has to initialise them.
+ + */
+ +#define DO_FSPLINE(order)                      \
+ +    for (ithx = 0; (ithx < order); ithx++)              \
+ +    {                                              \
+ +        index_x = (i0+ithx)*pny*pnz;               \
+ +        tx      = thx[ithx];                       \
+ +        dx      = dthx[ithx];                      \
+ +                                               \
+ +        for (ithy = 0; (ithy < order); ithy++)          \
+ +        {                                          \
+ +            index_xy = index_x+(j0+ithy)*pnz;      \
+ +            ty       = thy[ithy];                  \
+ +            dy       = dthy[ithy];                 \
+ +            fxy1     = fz1 = 0;                    \
+ +                                               \
+ +            for (ithz = 0; (ithz < order); ithz++)      \
+ +            {                                      \
+ +                gval  = grid[index_xy+(k0+ithz)];  \
+ +                fxy1 += thz[ithz]*gval;            \
+ +                fz1  += dthz[ithz]*gval;           \
+ +            }                                      \
+ +            fx += dx*ty*fxy1;                      \
+ +            fy += tx*dy*fxy1;                      \
+ +            fz += tx*ty*fz1;                       \
+ +        }                                          \
+ +    }
+ +
+ +
+ +/* Interpolate the forces on the atoms listed in spline->ind from grid
+ + * and add them to atc->f.
+ + * For each atom with non-zero scaled charge qn = scale*atc->q[n] the
+ + * B-spline weighted sums fx, fy, fz over order^3 grid points are computed
+ + * (through DO_FSPLINE or the included SSE kernel for order 4 and 5) and
+ + * converted to forces using the grid dimensions and pme->recipbox.
+ + * With bClearF set, atc->f[n] is zeroed first, also for zero charges.
+ + */
+ +static void gather_f_bsplines(gmx_pme_t pme, real *grid,
+ +                              gmx_bool bClearF, pme_atomcomm_t *atc,
+ +                              splinedata_t *spline,
+ +                              real scale)
+ +{
+ +    /* sum forces for local particles */
+ +    int     nn, n, ithx, ithy, ithz, i0, j0, k0;
+ +    int     index_x, index_xy;
+ +    int     nx, ny, nz, pnx, pny, pnz;
+ +    int *   idxptr;
+ +    real    tx, ty, dx, dy, qn;
+ +    real    fx, fy, fz, gval;
+ +    real    fxy1, fz1;
+ +    real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
+ +    int     norder;
+ +    real    rxx, ryx, ryy, rzx, rzy, rzz;
+ +    int     order;
+ +
+ +    pme_spline_work_t *work;
+ +
+ +    /* NOTE(review): work is not referenced in the code visible here; it
+ +     * is presumably used by the included pme_sse_single.h - confirm.
+ +     */
+ +    work = pme->spline_work;
+ +
+ +    order = pme->pme_order;
+ +    thx   = spline->theta[XX];
+ +    thy   = spline->theta[YY];
+ +    thz   = spline->theta[ZZ];
+ +    dthx  = spline->dtheta[XX];
+ +    dthy  = spline->dtheta[YY];
+ +    dthz  = spline->dtheta[ZZ];
+ +    nx    = pme->nkx;
+ +    ny    = pme->nky;
+ +    nz    = pme->nkz;
+ +    pnx   = pme->pmegrid_nx;
+ +    pny   = pme->pmegrid_ny;
+ +    pnz   = pme->pmegrid_nz;
+ +
+ +    rxx   = pme->recipbox[XX][XX];
+ +    ryx   = pme->recipbox[YY][XX];
+ +    ryy   = pme->recipbox[YY][YY];
+ +    rzx   = pme->recipbox[ZZ][XX];
+ +    rzy   = pme->recipbox[ZZ][YY];
+ +    rzz   = pme->recipbox[ZZ][ZZ];
+ +
+ +    for (nn = 0; nn < spline->n; nn++)
+ +    {
+ +        n  = spline->ind[nn];
+ +        qn = scale*atc->q[n];
+ +
+ +        if (bClearF)
+ +        {
+ +            atc->f[n][XX] = 0;
+ +            atc->f[n][YY] = 0;
+ +            atc->f[n][ZZ] = 0;
+ +        }
+ +        if (qn != 0)
+ +        {
+ +            fx     = 0;
+ +            fy     = 0;
+ +            fz     = 0;
+ +            idxptr = atc->idx[n];
+ +            /* The spline data is indexed by the list position nn,
+ +             * the grid index and charge by the atom index n.
+ +             */
+ +            norder = nn*order;
+ +
+ +            i0   = idxptr[XX];
+ +            j0   = idxptr[YY];
+ +            k0   = idxptr[ZZ];
+ +
+ +            /* Pointer arithmetic alert, next six statements */
+ +            thx  = spline->theta[XX] + norder;
+ +            thy  = spline->theta[YY] + norder;
+ +            thz  = spline->theta[ZZ] + norder;
+ +            dthx = spline->dtheta[XX] + norder;
+ +            dthy = spline->dtheta[YY] + norder;
+ +            dthz = spline->dtheta[ZZ] + norder;
+ +
+ +            switch (order)
+ +            {
+ +                case 4:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ +#ifdef PME_SSE_UNALIGNED
+ +#define PME_GATHER_F_SSE_ORDER4
+ +#else
+ +#define PME_GATHER_F_SSE_ALIGNED
+ +#define PME_ORDER 4
+ +#endif
+ +#include "pme_sse_single.h"
+ +#else
+ +                    DO_FSPLINE(4);
+ +#endif
+ +                    break;
+ +                case 5:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ +#define PME_GATHER_F_SSE_ALIGNED
+ +#define PME_ORDER 5
+ +#include "pme_sse_single.h"
+ +#else
+ +                    DO_FSPLINE(5);
+ +#endif
+ +                    break;
+ +                default:
+ +                    DO_FSPLINE(order);
+ +                    break;
+ +            }
+ +
+ +            atc->f[n][XX] += -qn*( fx*nx*rxx );
+ +            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
+ +            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+ +        }
+ +    }
+ +    /* Since the energy and not forces are interpolated
+ +     * the net force might not be exactly zero.
+ +     * This can be solved by also interpolating F, but
+ +     * that comes at a cost.
+ +     * A better hack is to remove the net force every
+ +     * step, but that must be done at a higher level
+ +     * since this routine doesn't see all atoms if running
+ +     * in parallel. Don't know how important it is?  EL 990726
+ +     */
+ +}
+ +
+ +
+ +/* Return the sum over all atc->n atoms of the charge times the potential
+ + * interpolated from grid at the atom position.
+ + * The interpolation uses the B-spline coefficients in atc->spline[0],
+ + * indexed by atom number, over order^3 grid points starting at
+ + * atc->idx[n]. Atoms with zero charge are skipped.
+ + */
+ +static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
+ +                                   pme_atomcomm_t *atc)
+ +{
+ +    splinedata_t *spline;
+ +    int     n, ithx, ithy, ithz, i0, j0, k0;
+ +    int     index_x, index_xy;
+ +    int *   idxptr;
+ +    real    energy, pot, tx, ty, qn, gval;
+ +    real    *thx, *thy, *thz;
+ +    int     norder;
+ +    int     order;
+ +
+ +    spline = &atc->spline[0];
+ +
+ +    order = pme->pme_order;
+ +
+ +    energy = 0;
+ +    for (n = 0; (n < atc->n); n++)
+ +    {
+ +        qn      = atc->q[n];
+ +
+ +        if (qn != 0)
+ +        {
+ +            idxptr = atc->idx[n];
+ +            norder = n*order;
+ +
+ +            i0   = idxptr[XX];
+ +            j0   = idxptr[YY];
+ +            k0   = idxptr[ZZ];
+ +
+ +            /* Pointer arithmetic alert, next three statements */
+ +            thx  = spline->theta[XX] + norder;
+ +            thy  = spline->theta[YY] + norder;
+ +            thz  = spline->theta[ZZ] + norder;
+ +
+ +            pot = 0;
+ +            for (ithx = 0; (ithx < order); ithx++)
+ +            {
+ +                index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
+ +                tx      = thx[ithx];
+ +
+ +                for (ithy = 0; (ithy < order); ithy++)
+ +                {
+ +                    index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
+ +                    ty       = thy[ithy];
+ +
+ +                    for (ithz = 0; (ithz < order); ithz++)
+ +                    {
+ +                        gval  = grid[index_xy+(k0+ithz)];
+ +                        pot  += tx*ty*thz[ithz]*gval;
+ +                    }
+ +
+ +                }
+ +            }
+ +
+ +            energy += pot*qn;
+ +        }
+ +    }
+ +
+ +    return energy;
+ +}
+ +
+ +/* Macro to force loop unrolling by fixing order.
+ + * This gives a significant performance gain.
+ + * For each of the DIM dimensions j it builds the B-spline coefficients
+ + * (data) and their derivatives (ddata) for the fractional offset xptr[j]
+ + * and stores them in theta[j] and dtheta[j] at offset i*order.
+ + * The expansion site must provide i, xptr, theta and dtheta; order may
+ + * not exceed PME_ORDER_MAX, the size of the local buffers.
+ + */
+ +#define CALC_SPLINE(order)                     \
+ +    {                                              \
+ +        int j, k, l;                                 \
+ +        real dr, div;                               \
+ +        real data[PME_ORDER_MAX];                  \
+ +        real ddata[PME_ORDER_MAX];                 \
+ +                                               \
+ +        for (j = 0; (j < DIM); j++)                     \
+ +        {                                          \
+ +            dr  = xptr[j];                         \
+ +                                               \
+ +            /* dr is relative offset from lower cell limit */ \
+ +            data[order-1] = 0;                     \
+ +            data[1]       = dr;                          \
+ +            data[0]       = 1 - dr;                      \
+ +                                               \
+ +            for (k = 3; (k < order); k++)               \
+ +            {                                      \
+ +                div       = 1.0/(k - 1.0);               \
+ +                data[k-1] = div*dr*data[k-2];      \
+ +                for (l = 1; (l < (k-1)); l++)           \
+ +                {                                  \
+ +                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
+ +                                       data[k-l-1]);                \
+ +                }                                  \
+ +                data[0] = div*(1-dr)*data[0];      \
+ +            }                                      \
+ +            /* differentiate */                    \
+ +            ddata[0] = -data[0];                   \
+ +            for (k = 1; (k < order); k++)               \
+ +            {                                      \
+ +                ddata[k] = data[k-1] - data[k];    \
+ +            }                                      \
+ +                                               \
+ +            div           = 1.0/(order - 1);                 \
+ +            data[order-1] = div*dr*data[order-2];  \
+ +            for (l = 1; (l < (order-1)); l++)           \
+ +            {                                      \
+ +                data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
+ +                                       (order-l-dr)*data[order-l-1]); \
+ +            }                                      \
+ +            data[0] = div*(1 - dr)*data[0];        \
+ +                                               \
+ +            for (k = 0; k < order; k++)                 \
+ +            {                                      \
+ +                theta[j][i*order+k]  = data[k];    \
+ +                dtheta[j][i*order+k] = ddata[k];   \
+ +            }                                      \
+ +        }                                          \
+ +    }
+ +
+ +/* Compute the B-spline coefficients and derivatives for nr local atoms.
+ + * For list entry i the atom index is ii = ind[i]; its fractional
+ + * coordinates fractx[ii] are used and the results are written to theta
+ + * and dtheta at offset i*order (list position, not atom index).
+ + * Entries whose atom has charge[ii] == 0 are skipped, and their spline
+ + * data left untouched, unless bFreeEnergy is set.
+ + */
+ +void make_bsplines(splinevec theta, splinevec dtheta, int order,
+ +                   rvec fractx[], int nr, int ind[], real charge[],
+ +                   gmx_bool bFreeEnergy)
+ +{
+ +    /* construct splines for local atoms */
+ +    int  i, ii;
+ +    real *xptr;
+ +
+ +    for (i = 0; i < nr; i++)
+ +    {
+ +        /* With free energy we do not use the charge check.
+ +         * In most cases this will be more efficient than calling make_bsplines
+ +         * twice, since usually more than half the particles have charges.
+ +         */
+ +        ii = ind[i];
+ +        if (bFreeEnergy || charge[ii] != 0.0)
+ +        {
+ +            xptr = fractx[ii];
+ +            /* Constant orders let the compiler unroll the macro loops */
+ +            switch (order)
+ +            {
+ +                case 4:  CALC_SPLINE(4);     break;
+ +                case 5:  CALC_SPLINE(5);     break;
+ +                default: CALC_SPLINE(order); break;
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Compute the squared modulus of the discrete Fourier transform of
+ + * data[0..ndata-1] and store it in mod[0..ndata-1]:
+ + *   mod[i] = |sum_j data[j]*exp(2*pi*I*i*j/ndata)|^2
+ + * Entries that come out (nearly) zero, i.e. below 1e-7, are replaced by
+ + * the average of their two neighbours. The transform is periodic in i
+ + * with period ndata, so the neighbours are taken periodically; this
+ + * avoids reading mod[-1] or mod[ndata] when the first or last entry is
+ + * small. For all interior entries the result is unchanged.
+ + */
+ +void make_dft_mod(real *mod, real *data, int ndata)
+ +{
+ +    int i, j, iprev, inext;
+ +    real sc, ss, arg;
+ +
+ +    for (i = 0; i < ndata; i++)
+ +    {
+ +        sc = ss = 0;
+ +        for (j = 0; j < ndata; j++)
+ +        {
+ +            arg = (2.0*M_PI*i*j)/ndata;
+ +            sc += data[j]*cos(arg);
+ +            ss += data[j]*sin(arg);
+ +        }
+ +        mod[i] = sc*sc+ss*ss;
+ +    }
+ +    for (i = 0; i < ndata; i++)
+ +    {
+ +        if (mod[i] < 1e-7)
+ +        {
+ +            /* Wrap around at both ends to stay inside mod[] */
+ +            iprev  = (i == 0)       ? ndata-1 : i-1;
+ +            inext  = (i == ndata-1) ? 0       : i+1;
+ +            mod[i] = (mod[iprev]+mod[inext])*0.5;
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Fill bsp_mod[XX], bsp_mod[YY] and bsp_mod[ZZ] with the B-spline moduli
+ + * for grid sizes nx, ny and nz.
+ + * The B-spline coefficients of the given order are built with the same
+ + * recursion as in CALC_SPLINE, evaluated at offset zero, copied to
+ + * bsp_data[1..order] (all other entries zero) and passed to
+ + * make_dft_mod() for each dimension.
+ + * NOTE(review): bsp_data has max(nx,ny,nz) elements and is written up to
+ + * index order, so this presumably relies on the largest grid dimension
+ + * exceeding order - confirm the callers guarantee that.
+ + */
+ +static void make_bspline_moduli(splinevec bsp_mod,
+ +                                int nx, int ny, int nz, int order)
+ +{
+ +    int nmax = max(nx, max(ny, nz));
+ +    real *data, *ddata, *bsp_data;
+ +    int i, k, l;
+ +    real div;
+ +
+ +    snew(data, order);
+ +    snew(ddata, order);
+ +    snew(bsp_data, nmax);
+ +
+ +    data[order-1] = 0;
+ +    data[1]       = 0;
+ +    data[0]       = 1;
+ +
+ +    for (k = 3; k < order; k++)
+ +    {
+ +        div       = 1.0/(k-1.0);
+ +        data[k-1] = 0;
+ +        for (l = 1; l < (k-1); l++)
+ +        {
+ +            data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
+ +        }
+ +        data[0] = div*data[0];
+ +    }
+ +    /* differentiate */
+ +    /* NOTE(review): ddata is computed but not read again in this function */
+ +    ddata[0] = -data[0];
+ +    for (k = 1; k < order; k++)
+ +    {
+ +        ddata[k] = data[k-1]-data[k];
+ +    }
+ +    div           = 1.0/(order-1);
+ +    data[order-1] = 0;
+ +    for (l = 1; l < (order-1); l++)
+ +    {
+ +        data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
+ +    }
+ +    data[0] = div*data[0];
+ +
+ +    for (i = 0; i < nmax; i++)
+ +    {
+ +        bsp_data[i] = 0;
+ +    }
+ +    for (i = 1; i <= order; i++)
+ +    {
+ +        bsp_data[i] = data[i-1];
+ +    }
+ +
+ +    make_dft_mod(bsp_mod[XX], bsp_data, nx);
+ +    make_dft_mod(bsp_mod[YY], bsp_data, ny);
+ +    make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
+ +
+ +    sfree(data);
+ +    sfree(ddata);
+ +    sfree(bsp_data);
+ +}
+ +
+ +
+ +/* Return the P3M optimal influence function, a polynomial in z^2 that
+ + * depends on the interpolation order.
+ + * Orders 2 up to and including 8 are supported; for any other order
+ + * 0.0 is returned.
+ + */
+ +static double do_p3m_influence(double z, int order)
+ +{
+ +    double z2, z4;
+ +
+ +    z2 = z*z;
+ +    z4 = z2*z2;
+ +
+ +    /* The formula and most constants can be found in:
+ +     * Ballenegger et al., JCTC 8, 936 (2012)
+ +     */
+ +    switch (order)
+ +    {
+ +        case 2:
+ +            return 1.0 - 2.0*z2/3.0;
+ +        case 3:
+ +            return 1.0 - z2 + 2.0*z4/15.0;
+ +        case 4:
+ +            return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
+ +        case 5:
+ +            return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
+ +        case 6:
+ +            return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
+ +        case 7:
+ +            return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
+ +        case 8:
+ +            return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
+ +    }
+ +
+ +    /* Unsupported order */
+ +    return 0.0;
+ +}
+ +
+ +/* Calculate the P3M B-spline moduli for one dimension.
+ + * Fills bsp_mod[0..n-1] for a grid of n points: entry 0 is set to 1 and
+ + * every other entry to infl^2*(sin(zai)/zai)^(-2*order), with
+ + * zai = pi*i/n and infl = do_p3m_influence(sin(zai), order). The upper
+ + * part of the array is filled using negative i, stored at index n+i.
+ + * Orders above 8 cause a fatal error.
+ + * NOTE(review): orders below 2 are not rejected here although
+ + * do_p3m_influence() returns 0.0 for them - confirm callers exclude them.
+ + */
+ +static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
+ +{
+ +    double zarg, zai, sinzai, infl;
+ +    int    maxk, i;
+ +
+ +    if (order > 8)
+ +    {
+ +        gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
+ +    }
+ +
+ +    zarg = M_PI/n;
+ +
+ +    maxk = (n + 1)/2;
+ +
+ +    for (i = -maxk; i < 0; i++)
+ +    {
+ +        zai          = zarg*i;
+ +        sinzai       = sin(zai);
+ +        infl         = do_p3m_influence(sinzai, order);
+ +        bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
+ +    }
+ +    /* i = 0 is set directly, the expression above would divide by zero */
+ +    bsp_mod[0] = 1.0;
+ +    for (i = 1; i < maxk; i++)
+ +    {
+ +        zai        = zarg*i;
+ +        sinzai     = sin(zai);
+ +        infl       = do_p3m_influence(sinzai, order);
+ +        bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
+ +    }
+ +}
+ +
+ +/* Calculate the P3M B-spline moduli.
+ + * Fills bsp_mod[XX], bsp_mod[YY] and bsp_mod[ZZ] for grid sizes nx, ny
+ + * and nz by calling make_p3m_bspline_moduli_dim() once per dimension.
+ + */
+ +static void make_p3m_bspline_moduli(splinevec bsp_mod,
+ +                                    int nx, int ny, int nz, int order)
+ +{
+ +    make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
+ +    make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
+ +    make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
+ +}
+ +
+ +
+ +/* Fill atc->node_dest and atc->node_src with the communication partners
+ + * of this node among the atc->nslab slabs.
+ + * Partners are added in order of increasing distance i, alternating the
+ + * forward node (nodeid+i) and the backward node (nodeid-i), both modulo
+ + * nslab, until nslab-1 entries have been stored.
+ + */
+ +static void setup_coordinate_communication(pme_atomcomm_t *atc)
+ +{
+ +    int nslab, n, i;
+ +    int fw, bw;
+ +
+ +    nslab = atc->nslab;
+ +
+ +    n = 0;
+ +    for (i = 1; i <= nslab/2; i++)
+ +    {
+ +        fw = (atc->nodeid + i) % nslab;
+ +        bw = (atc->nodeid - i + nslab) % nslab;
+ +        if (n < nslab - 1)
+ +        {
+ +            atc->node_dest[n] = fw;
+ +            atc->node_src[n]  = bw;
+ +            n++;
+ +        }
+ +        /* For even nslab and i == nslab/2, fw equals bw and the limit
+ +         * on n prevents adding that node a second time.
+ +         */
+ +        if (n < nslab - 1)
+ +        {
+ +            atc->node_dest[n] = bw;
+ +            atc->node_src[n]  = fw;
+ +            n++;
+ +        }
+ +    }
+ +}
+ +
+ +/* Free the PME data structure *pmedata and set *pmedata to NULL.
+ + * When log is not NULL a message is written to it first.
+ + * Released here: the nnx/nny/nnz index arrays, the A grids with their
+ + * FFT setup, the B grids with their FFT setup (only when the B grid was
+ + * allocated), the work data of every thread and the struct itself.
+ + * Always returns 0.
+ + */
+ +int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
+ +{
+ +    int thread;
+ +
+ +    if (NULL != log)
+ +    {
+ +        fprintf(log, "Destroying PME data structures.\n");
+ +    }
+ +
+ +    sfree((*pmedata)->nnx);
+ +    sfree((*pmedata)->nny);
+ +    sfree((*pmedata)->nnz);
+ +
+ +    pmegrids_destroy(&(*pmedata)->pmegridA);
+ +
+ +    sfree((*pmedata)->fftgridA);
+ +    sfree((*pmedata)->cfftgridA);
+ +    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
+ +
+ +    /* The B grid data is only released when the B grid was allocated */
+ +    if ((*pmedata)->pmegridB.grid.grid != NULL)
+ +    {
+ +        pmegrids_destroy(&(*pmedata)->pmegridB);
+ +        sfree((*pmedata)->fftgridB);
+ +        sfree((*pmedata)->cfftgridB);
+ +        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
+ +    }
+ +    for (thread = 0; thread < (*pmedata)->nthread; thread++)
+ +    {
+ +        free_work(&(*pmedata)->work[thread]);
+ +    }
+ +    sfree((*pmedata)->work);
+ +
+ +    sfree(*pmedata);
+ +    *pmedata = NULL;
+ +
+ +    return 0;
+ +}
+ +
+ +/* Return n rounded up to a multiple of f, using integer division
+ + * (e.g. n = 10, f = 4 gives 12; a multiple of f is returned unchanged).
+ + */
+ +static int mult_up(int n, int f)
+ +{
+ +    return ((n + f - 1)/f)*f;
+ +}
+ +
+ +
+ +/* Return a load estimate for the PME grid decomposition:
+ + *   (n1 + n2 + 3*n3)/(6*nkx*nky*nkz)
+ + * where n1, n2 and n3 are grid sizes in which two of the three
+ + * dimensions are rounded up to multiples of nnodes_major and
+ + * nnodes_minor (see mult_up()).
+ + * All products are evaluated in double precision: the original int
+ + * arithmetic, in particular 6*nkx*nky*nkz, overflows for large grids.
+ + * Results are identical whenever the int products did not overflow.
+ + */
+ +static double pme_load_imbalance(gmx_pme_t pme)
+ +{
+ +    int    nma, nmi;
+ +    double n1, n2, n3;
+ +
+ +    nma = pme->nnodes_major;
+ +    nmi = pme->nnodes_minor;
+ +
+ +    /* The leading cast makes each product evaluate in double */
+ +    n1 = (double)mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
+ +    n2 = (double)mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
+ +    n3 = (double)mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
+ +
+ +    /* pme_solve is roughly double the cost of an fft */
+ +
+ +    return (n1 + n2 + 3*n3)/(6.0*pme->nkx*pme->nky*pme->nkz);
+ +}
+ +
+ +/* Initialise the atom communication setup *atc for decomposition index
+ + * dimind.
+ + * Without MPI, or with a single PME node, nslab is 1 and nodeid is 0;
+ + * otherwise both are taken from the communicator pme->mpi_comm_d[dimind].
+ + * The communication arrays are only allocated when nslab > 1. The
+ + * per-thread spline data is always allocated, and the per-thread
+ + * particle lists only when more than one thread is used.
+ + * NOTE(review): the locals nk, k and s are not used in this function.
+ + */
+ +static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
+ +                          int dimind, gmx_bool bSpread)
+ +{
+ +    int nk, k, s, thread;
+ +
+ +    atc->dimind    = dimind;
+ +    atc->nslab     = 1;
+ +    atc->nodeid    = 0;
+ +    atc->pd_nalloc = 0;
+ +#ifdef GMX_MPI
+ +    if (pme->nnodes > 1)
+ +    {
+ +        atc->mpi_comm = pme->mpi_comm_d[dimind];
+ +        MPI_Comm_size(atc->mpi_comm, &atc->nslab);
+ +        MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
+ +    }
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
+ +    }
+ +#endif
+ +
+ +    atc->bSpread   = bSpread;
+ +    atc->pme_order = pme->pme_order;
+ +
+ +    if (atc->nslab > 1)
+ +    {
+ +        /* These three allocations are not required for particle decomp. */
+ +        snew(atc->node_dest, atc->nslab);
+ +        snew(atc->node_src, atc->nslab);
+ +        setup_coordinate_communication(atc);
+ +
+ +        snew(atc->count_thread, pme->nthread);
+ +        for (thread = 0; thread < pme->nthread; thread++)
+ +        {
+ +            snew(atc->count_thread[thread], atc->nslab);
+ +        }
+ +        /* count aliases the counts of thread 0 */
+ +        atc->count = atc->count_thread[0];
+ +        snew(atc->rcount, atc->nslab);
+ +        snew(atc->buf_index, atc->nslab);
+ +    }
+ +
+ +    atc->nthread = pme->nthread;
+ +    if (atc->nthread > 1)
+ +    {
+ +        snew(atc->thread_plist, atc->nthread);
+ +    }
+ +    snew(atc->spline, atc->nthread);
+ +    for (thread = 0; thread < atc->nthread; thread++)
+ +    {
+ +        if (atc->nthread > 1)
+ +        {
+ +            /* Allocate GMX_CACHE_SEP extra elements on both sides and
+ +             * let n point past the leading ones (presumably padding to
+ +             * keep the lists of different threads apart in memory).
+ +             */
+ +            snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
+ +            atc->thread_plist[thread].n += GMX_CACHE_SEP;
+ +        }
+ +        snew(atc->spline[thread].thread_one, pme->nthread);
+ +        atc->spline[thread].thread_one[thread] = 1;
+ +    }
+ +}
+ +
+ +/* Set up the grid overlap communication data for one decomposition
+ + * dimension and store it in ol.
+ + *
+ + * ol            overlap structure to fill; its arrays are allocated here
+ + * norder        interpolation order, determines the slab overlap width
+ + * comm          (GMX_MPI only) communicator stored in ol and used to
+ + *               exchange the buffer sizes
+ + * nnodes        number of nodes along this dimension
+ + * nodeid        index of this node along this dimension
+ + * ndata         number of grid lines along this dimension
+ + * commplainsize number of elements in one communicated grid plane;
+ + *               the send and receive buffers hold norder such planes
+ + */
+ +static void
+ +init_overlap_comm(pme_overlap_t *  ol,
+ +                  int              norder,
+ +#ifdef GMX_MPI
+ +                  MPI_Comm         comm,
+ +#endif
+ +                  int              nnodes,
+ +                  int              nodeid,
+ +                  int              ndata,
+ +                  int              commplainsize)
+ +{
+ +    int              b, i;
+ +    pme_grid_comm_t *pgc;
+ +    gmx_bool         bCont;
+ +    int              fft_start, fft_end, send_index1, recv_index1;
+ +#ifdef GMX_MPI
+ +    MPI_Status       stat;
+ +
+ +    ol->mpi_comm = comm;
+ +#endif
+ +
+ +    ol->nnodes = nnodes;
+ +    ol->nodeid = nodeid;
+ +
+ +    /* Linear translation of the PME grid won't affect reciprocal space
+ +     * calculations, so to optimize we only interpolate "upwards",
+ +     * which also means we only have to consider overlap in one direction.
+ +     * I.e., particles on this node might also be spread to grid indices
+ +     * that belong to higher nodes (modulo nnodes)
+ +     */
+ +
+ +    /* s2g0 gets one extra entry so that s2g0[i+1] is valid for the last slab */
+ +    snew(ol->s2g0, ol->nnodes+1);
+ +    snew(ol->s2g1, ol->nnodes);
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "PME slab boundaries:");
+ +    }
+ +    for (i = 0; i < nnodes; i++)
+ +    {
+ +        /* s2g0 the local interpolation grid start.
+ +         * s2g1 the local interpolation grid end.
+ +         * Because grid overlap communication only goes forward,
+ +         * the grid slabs for the FFTs should be rounded down.
+ +         */
+ +        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
+ +        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
+ +        }
+ +    }
+ +    ol->s2g0[nnodes] = ndata;
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "\n");
+ +    }
+ +
+ +    /* Determine with how many nodes we need to communicate the grid overlap.
+ +     * b is increased until no slab end reaches past the start of the slab
+ +     * b nodes further on (with periodic wrapping), or until b == nnodes.
+ +     */
+ +    b = 0;
+ +    do
+ +    {
+ +        b++;
+ +        bCont = FALSE;
+ +        for (i = 0; i < nnodes; i++)
+ +        {
+ +            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
+ +                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
+ +            {
+ +                bCont = TRUE;
+ +            }
+ +        }
+ +    }
+ +    while (bCont && b < nnodes);
+ +    ol->noverlap_nodes = b - 1;
+ +
+ +    /* Pulse b sends to the node b+1 positions up and receives from
+ +     * the node b+1 positions down, both modulo nnodes.
+ +     */
+ +    snew(ol->send_id, ol->noverlap_nodes);
+ +    snew(ol->recv_id, ol->noverlap_nodes);
+ +    for (b = 0; b < ol->noverlap_nodes; b++)
+ +    {
+ +        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
+ +        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
+ +    }
+ +    snew(ol->comm_data, ol->noverlap_nodes);
+ +
+ +    ol->send_size = 0;
+ +    for (b = 0; b < ol->noverlap_nodes; b++)
+ +    {
+ +        pgc = &ol->comm_data[b];
+ +        /* Send */
+ +        fft_start        = ol->s2g0[ol->send_id[b]];
+ +        fft_end          = ol->s2g0[ol->send_id[b]+1];
+ +        if (ol->send_id[b] < nodeid)
+ +        {
+ +            /* The target slab lies across the periodic boundary */
+ +            fft_start += ndata;
+ +            fft_end   += ndata;
+ +        }
+ +        send_index1       = ol->s2g1[nodeid];
+ +        send_index1       = min(send_index1, fft_end);
+ +        pgc->send_index0  = fft_start;
+ +        pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
+ +        ol->send_size    += pgc->send_nindex;
+ +
+ +        /* We always start receiving to the first index of our slab */
+ +        fft_start        = ol->s2g0[ol->nodeid];
+ +        fft_end          = ol->s2g0[ol->nodeid+1];
+ +        recv_index1      = ol->s2g1[ol->recv_id[b]];
+ +        if (ol->recv_id[b] > nodeid)
+ +        {
+ +            /* The source slab lies across the periodic boundary */
+ +            recv_index1 -= ndata;
+ +        }
+ +        recv_index1      = min(recv_index1, fft_end);
+ +        pgc->recv_index0 = fft_start;
+ +        pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
+ +    }
+ +
+ +#ifdef GMX_MPI
+ +    /* Communicate the buffer sizes to receive */
+ +    for (b = 0; b < ol->noverlap_nodes; b++)
+ +    {
+ +        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
+ +                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
+ +                     ol->mpi_comm, &stat);
+ +    }
+ +#endif
+ +
+ +    /* For non-divisible grid we need pme_order iso pme_order-1 */
+ +    snew(ol->sendbuf, norder*commplainsize);
+ +    snew(ol->recvbuf, norder*commplainsize);
+ +}
+ +
+ +/* Allocate and fill two lookup tables of 5*n entries each, indexed by a
+ + * (shifted) global grid index i:
+ + *   (*global_to_local)[i]  the local grid index, (i - local_start) modulo n
+ + *   (*fraction_shift)[i]   the shift (-1, 0 or +1) to apply to the fraction
+ + *                          when the local index was corrected
+ + *
+ + * n            number of grid points along this dimension
+ + * local_start  first grid index of the local grid
+ + * local_range  number of grid points in the local grid
+ + */
+ +static void
+ +make_gridindex5_to_localindex(int n, int local_start, int local_range,
+ +                              int **global_to_local,
+ +                              real **fraction_shift)
+ +{
+ +    /* Index corrections are only needed when the local grid does not
+ +     * cover the whole dimension.
+ +     */
+ +    const int bPartialGrid = (local_range < n);
+ +    int       i;
+ +    int      *gtl;
+ +    real     *fsh;
+ +
+ +    snew(gtl, 5*n);
+ +    snew(fsh, 5*n);
+ +
+ +    for (i = 0; i < 5*n; i++)
+ +    {
+ +        /* The global to local grid index; for coordinates that fall
+ +         * within the local grid no fraction shift is required.
+ +         */
+ +        int  local = (i - local_start + n) % n;
+ +        real shift = 0;
+ +
+ +        /* Rounding can put i one grid point below or above the local
+ +         * grid. Such an index is moved onto the nearest local grid point
+ +         * and the fraction is shifted by the same amount in the opposite
+ +         * direction, which leaves the weights unaffected up to values
+ +         * close to the precision of a real.
+ +         * With local_range=0 the entry i=local_start must not be changed.
+ +         */
+ +        if (bPartialGrid && i % n != local_start)
+ +        {
+ +            if (local == n-1)
+ +            {
+ +                /* One point below the local grid */
+ +                local = 0;
+ +                shift = -1;
+ +            }
+ +            else if (local == local_range)
+ +            {
+ +                /* One point above the local grid */
+ +                local = local_range - 1;
+ +                shift = 1;
+ +            }
+ +        }
+ +
+ +        gtl[i] = local;
+ +        fsh[i] = shift;
+ +    }
+ +
+ +    *global_to_local = gtl;
+ +    *fraction_shift  = fsh;
+ +}
+ +
+ +/* Create the work data used by the SSE spread/gather kernels.
+ + * Returns NULL when PME_SSE_SPREAD_GATHER is not defined.
+ + */
+ +static pme_spline_work_t *make_pme_spline_work(int order)
+ +{
+ +    pme_spline_work_t *work;
+ +
++#ifdef PME_SSE_SPREAD_GATHER
+ +    const __m128 zero_SSE = _mm_setzero_ps();
+ +    const int    noffset  = 8 - (order - 1);
+ +    float        lane[8];
+ +    int          offset, k;
+ +
+ +    snew_aligned(work, 1, 16);
+ +
+ +    /* For every possible offset build a pair of bit masks that selects
+ +     * the order grid entries in use out of the 8 entries that are loaded
+ +     * into 2 SSE float registers; the other entries are masked out.
+ +     */
+ +    for (offset = 0; offset < noffset; offset++)
+ +    {
+ +        for (k = 0; k < 8; k++)
+ +        {
+ +            if (k >= offset && k < offset + order)
+ +            {
+ +                lane[k] = 1;
+ +            }
+ +            else
+ +            {
+ +                lane[k] = 0;
+ +            }
+ +        }
+ +        /* The comparison turns the 1/0 floats into all-ones/all-zeros masks */
+ +        work->mask_SSE0[offset] = _mm_cmpgt_ps(_mm_loadu_ps(lane), zero_SSE);
+ +        work->mask_SSE1[offset] = _mm_cmpgt_ps(_mm_loadu_ps(lane + 4), zero_SSE);
+ +    }
+ +#else
+ +    work = NULL;
+ +#endif
+ +
+ +    return work;
+ +}
+ +
+ +/* Check the PME settings against the restrictions of the PME code.
+ + *
+ + * pme_order       interpolation order
+ + * nkx, nky, nkz   PME grid size
+ + * nnodes_major    number of PME nodes along the major (x) dimension
+ + * nnodes_minor    number of PME nodes along the minor (y) dimension
+ + * bUseThreads     whether threads are used; enables the grid lines
+ + *                 per node check along x
+ + * bFatal          when TRUE a violated restriction results in a fatal
+ + *                 error; when FALSE the result is only reported
+ + * bValidSettings  when not NULL, set to TRUE when all checks pass and
+ + *                 to FALSE (with bFatal FALSE) when a check fails;
+ + *                 may be NULL, in which case nothing is stored
+ + */
+ +void gmx_pme_check_restrictions(int pme_order,
+ +                                int nkx, int nky, int nkz,
+ +                                int nnodes_major,
+ +                                int nnodes_minor,
+ +                                gmx_bool bUseThreads,
+ +                                gmx_bool bFatal,
+ +                                gmx_bool *bValidSettings)
+ +{
+ +    if (pme_order > PME_ORDER_MAX)
+ +    {
+ +        if (!bFatal)
+ +        {
+ +            /* The caller is allowed to pass NULL, see the end of this routine */
+ +            if (bValidSettings != NULL)
+ +            {
+ +                *bValidSettings = FALSE;
+ +            }
+ +            return;
+ +        }
+ +        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+ +                  pme_order, PME_ORDER_MAX);
+ +    }
+ +
+ +    if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
+ +        nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
+ +        nkz <= pme_order)
+ +    {
+ +        if (!bFatal)
+ +        {
+ +            if (bValidSettings != NULL)
+ +            {
+ +                *bValidSettings = FALSE;
+ +            }
+ +            return;
+ +        }
+ +        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
+ +                  pme_order);
+ +    }
+ +
+ +    /* Check for a limitation of the (current) sum_fftgrid_dd code.
+ +     * We only allow multiple communication pulses in dim 1, not in dim 0.
+ +     */
+ +    if (bUseThreads && (nkx < nnodes_major*pme_order &&
+ +                        nkx != nnodes_major*(pme_order - 1)))
+ +    {
+ +        if (!bFatal)
+ +        {
+ +            if (bValidSettings != NULL)
+ +            {
+ +                *bValidSettings = FALSE;
+ +            }
+ +            return;
+ +        }
+ +        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+ +                  nkx/(double)nnodes_major, pme_order);
+ +    }
+ +
+ +    if (bValidSettings != NULL)
+ +    {
+ +        *bValidSettings = TRUE;
+ +    }
+ +
+ +    return;
+ +}
+ +
+ +int gmx_pme_init(gmx_pme_t *         pmedata,
+ +                 t_commrec *         cr,
+ +                 int                 nnodes_major,
+ +                 int                 nnodes_minor,
+ +                 t_inputrec *        ir,
+ +                 int                 homenr,
+ +                 gmx_bool            bFreeEnergy,
+ +                 gmx_bool            bReproducible,
+ +                 int                 nthread)
+ +{
+ +    gmx_pme_t pme = NULL;
+ +
+ +    int  use_threads, sum_use_threads;
+ +    ivec ndata;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Creating PME data structures.\n");
+ +    }
+ +    snew(pme, 1);
+ +
+ +    pme->redist_init         = FALSE;
+ +    pme->sum_qgrid_tmp       = NULL;
+ +    pme->sum_qgrid_dd_tmp    = NULL;
+ +    pme->buf_nalloc          = 0;
+ +    pme->redist_buf_nalloc   = 0;
+ +
+ +    pme->nnodes              = 1;
+ +    pme->bPPnode             = TRUE;
+ +
+ +    pme->nnodes_major        = nnodes_major;
+ +    pme->nnodes_minor        = nnodes_minor;
+ +
+ +#ifdef GMX_MPI
+ +    if (nnodes_major*nnodes_minor > 1)
+ +    {
+ +        pme->mpi_comm = cr->mpi_comm_mygroup;
+ +
+ +        MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
+ +        MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
+ +        if (pme->nnodes != nnodes_major*nnodes_minor)
+ +        {
+ +            gmx_incons("PME node count mismatch");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        pme->mpi_comm = MPI_COMM_NULL;
+ +    }
+ +#endif
+ +
+ +    if (pme->nnodes == 1)
+ +    {
+ +#ifdef GMX_MPI
+ +        pme->mpi_comm_d[0] = MPI_COMM_NULL;
+ +        pme->mpi_comm_d[1] = MPI_COMM_NULL;
+ +#endif
+ +        pme->ndecompdim   = 0;
+ +        pme->nodeid_major = 0;
+ +        pme->nodeid_minor = 0;
+ +#ifdef GMX_MPI
+ +        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
+ +#endif
+ +    }
+ +    else
+ +    {
+ +        if (nnodes_minor == 1)
+ +        {
+ +#ifdef GMX_MPI
+ +            pme->mpi_comm_d[0] = pme->mpi_comm;
+ +            pme->mpi_comm_d[1] = MPI_COMM_NULL;
+ +#endif
+ +            pme->ndecompdim   = 1;
+ +            pme->nodeid_major = pme->nodeid;
+ +            pme->nodeid_minor = 0;
+ +
+ +        }
+ +        else if (nnodes_major == 1)
+ +        {
+ +#ifdef GMX_MPI
+ +            pme->mpi_comm_d[0] = MPI_COMM_NULL;
+ +            pme->mpi_comm_d[1] = pme->mpi_comm;
+ +#endif
+ +            pme->ndecompdim   = 1;
+ +            pme->nodeid_major = 0;
+ +            pme->nodeid_minor = pme->nodeid;
+ +        }
+ +        else
+ +        {
+ +            if (pme->nnodes % nnodes_major != 0)
+ +            {
+ +                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+ +            }
+ +            pme->ndecompdim = 2;
+ +
+ +#ifdef GMX_MPI
+ +            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
+ +                           pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
+ +            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
+ +                           pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
+ +
+ +            MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
+ +            MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
+ +            MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
+ +            MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
+ +#endif
+ +        }
+ +        pme->bPPnode = (cr->duty & DUTY_PP);
+ +    }
+ +
+ +    pme->nthread = nthread;
+ +
+ +    /* Check if any of the PME MPI ranks uses threads */
+ +    use_threads = (pme->nthread > 1 ? 1 : 0);
+ +#ifdef GMX_MPI
+ +    if (pme->nnodes > 1)
+ +    {
+ +        MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
+ +                      MPI_SUM, pme->mpi_comm);
+ +    }
+ +    else
+ +#endif
+ +    {
+ +        sum_use_threads = use_threads;
+ +    }
+ +    pme->bUseThreads = (sum_use_threads > 0);
+ +
+ +    if (ir->ePBC == epbcSCREW)
+ +    {
+ +        gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
+ +    }
+ +
+ +    pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
+ +    pme->nkx         = ir->nkx;
+ +    pme->nky         = ir->nky;
+ +    pme->nkz         = ir->nkz;
+ +    pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
+ +    pme->pme_order   = ir->pme_order;
+ +    pme->epsilon_r   = ir->epsilon_r;
+ +
+ +    /* If we violate restrictions, generate a fatal error here */
+ +    gmx_pme_check_restrictions(pme->pme_order,
+ +                               pme->nkx, pme->nky, pme->nkz,
+ +                               pme->nnodes_major,
+ +                               pme->nnodes_minor,
+ +                               pme->bUseThreads,
+ +                               TRUE,
+ +                               NULL);
+ +
+ +    if (pme->nnodes > 1)
+ +    {
+ +        double imbal;
+ +
+ +#ifdef GMX_MPI
+ +        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
+ +        MPI_Type_commit(&(pme->rvec_mpi));
+ +#endif
+ +
+ +        /* Note that the charge spreading and force gathering, which usually
+ +         * takes about the same amount of time as FFT+solve_pme,
+ +         * is always fully load balanced
+ +         * (unless the charge distribution is inhomogeneous).
+ +         */
+ +
+ +        imbal = pme_load_imbalance(pme);
+ +        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
+ +        {
+ +            fprintf(stderr,
+ +                    "\n"
+ +                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
+ +                    "      For optimal PME load balancing\n"
+ +                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
+ +                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+ +                    "\n",
+ +                    (int)((imbal-1)*100 + 0.5),
+ +                    pme->nkx, pme->nky, pme->nnodes_major,
+ +                    pme->nky, pme->nkz, pme->nnodes_minor);
+ +        }
+ +    }
+ +
+ +    /* For non-divisible grid we need pme_order iso pme_order-1 */
+ +    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
+ +     * y is always copied through a buffer: we don't need padding in z,
+ +     * but we do need the overlap in x because of the communication order.
+ +     */
+ +    init_overlap_comm(&pme->overlap[0], pme->pme_order,
+ +#ifdef GMX_MPI
+ +                      pme->mpi_comm_d[0],
+ +#endif
+ +                      pme->nnodes_major, pme->nodeid_major,
+ +                      pme->nkx,
+ +                      (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
+ +
+ +    /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
+ +     * We do this with an offset buffer of equal size, so we need to allocate
+ +     * extra for the offset. That's what the (+1)*pme->nkz is for.
+ +     */
+ +    init_overlap_comm(&pme->overlap[1], pme->pme_order,
+ +#ifdef GMX_MPI
+ +                      pme->mpi_comm_d[1],
+ +#endif
+ +                      pme->nnodes_minor, pme->nodeid_minor,
+ +                      pme->nky,
+ +                      (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
+ +
+ +    /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
+ +     * Note that gmx_pme_check_restrictions checked for this already.
+ +     */
+ +    if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
+ +    {
+ +        gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
+ +    }
+ +
+ +    snew(pme->bsp_mod[XX], pme->nkx);
+ +    snew(pme->bsp_mod[YY], pme->nky);
+ +    snew(pme->bsp_mod[ZZ], pme->nkz);
+ +
+ +    /* The required size of the interpolation grid, including overlap.
+ +     * The allocated size (pmegrid_n?) might be slightly larger.
+ +     */
+ +    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
+ +        pme->overlap[0].s2g0[pme->nodeid_major];
+ +    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
+ +        pme->overlap[1].s2g0[pme->nodeid_minor];
+ +    pme->pmegrid_nz_base = pme->nkz;
+ +    pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
+ +    set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
+ +
+ +    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
+ +    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
+ +    pme->pmegrid_start_iz = 0;
+ +
+ +    make_gridindex5_to_localindex(pme->nkx,
+ +                                  pme->pmegrid_start_ix,
+ +                                  pme->pmegrid_nx - (pme->pme_order-1),
+ +                                  &pme->nnx, &pme->fshx);
+ +    make_gridindex5_to_localindex(pme->nky,
+ +                                  pme->pmegrid_start_iy,
+ +                                  pme->pmegrid_ny - (pme->pme_order-1),
+ +                                  &pme->nny, &pme->fshy);
+ +    make_gridindex5_to_localindex(pme->nkz,
+ +                                  pme->pmegrid_start_iz,
+ +                                  pme->pmegrid_nz_base,
+ +                                  &pme->nnz, &pme->fshz);
+ +
+ +    pmegrids_init(&pme->pmegridA,
+ +                  pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+ +                  pme->pmegrid_nz_base,
+ +                  pme->pme_order,
+ +                  pme->bUseThreads,
+ +                  pme->nthread,
+ +                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+ +                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+ +
+ +    pme->spline_work = make_pme_spline_work(pme->pme_order);
+ +
+ +    ndata[0] = pme->nkx;
+ +    ndata[1] = pme->nky;
+ +    ndata[2] = pme->nkz;
+ +
+ +    /* This routine will allocate the grid data to fit the FFTs */
+ +    gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
+ +                            &pme->fftgridA, &pme->cfftgridA,
+ +                            pme->mpi_comm_d,
+ +                            bReproducible, pme->nthread);
+ +
+ +    if (bFreeEnergy)
+ +    {
+ +        pmegrids_init(&pme->pmegridB,
+ +                      pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+ +                      pme->pmegrid_nz_base,
+ +                      pme->pme_order,
+ +                      pme->bUseThreads,
+ +                      pme->nthread,
+ +                      pme->nkx % pme->nnodes_major != 0,
+ +                      pme->nky % pme->nnodes_minor != 0);
+ +
+ +        gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
+ +                                &pme->fftgridB, &pme->cfftgridB,
+ +                                pme->mpi_comm_d,
+ +                                bReproducible, pme->nthread);
+ +    }
+ +    else
+ +    {
+ +        pme->pmegridB.grid.grid = NULL;
+ +        pme->fftgridB           = NULL;
+ +        pme->cfftgridB          = NULL;
+ +    }
+ +
+ +    if (!pme->bP3M)
+ +    {
+ +        /* Use plain SPME B-spline interpolation */
+ +        make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
+ +    }
+ +    else
+ +    {
+ +        /* Use the P3M grid-optimized influence function */
+ +        make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
+ +    }
+ +
+ +    /* Use atc[0] for spreading */
+ +    init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
+ +    if (pme->ndecompdim >= 2)
+ +    {
+ +        init_atomcomm(pme, &pme->atc[1], 1, FALSE);
+ +    }
+ +
+ +    if (pme->nnodes == 1)
+ +    {
+ +        pme->atc[0].n = homenr;
+ +        pme_realloc_atomcomm_things(&pme->atc[0]);
+ +    }
+ +
+ +    {
+ +        int thread;
+ +
+ +        /* Use fft5d, order after FFT is y major, z, x minor */
+ +
+ +        snew(pme->work, pme->nthread);
+ +        for (thread = 0; thread < pme->nthread; thread++)
+ +        {
+ +            realloc_work(&pme->work[thread], pme->nkx);
+ +        }
+ +    }
+ +
+ +    *pmedata = pme;
+ +
+ +    return 0;
+ +}
+ +
+ +/* Let the grids in new use the memory of the grids in old.
+ + * Nothing is changed when new is larger than old along any dimension.
+ + * The per-thread grids are only taken over when new has thread grids
+ + * and both use the same number of threads.
+ + */
+ +static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
+ +{
+ +    int dim, th;
+ +
+ +    /* The old grid should be able to hold the new grid */
+ +    for (dim = 0; dim < DIM; dim++)
+ +    {
+ +        if (old->grid.n[dim] < new->grid.n[dim])
+ +        {
+ +            return;
+ +        }
+ +    }
+ +
+ +    /* Release the newly allocated grid and point to the old one */
+ +    sfree_aligned(new->grid.grid);
+ +    new->grid.grid = old->grid.grid;
+ +
+ +    if (new->grid_th == NULL || new->nthread != old->nthread)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Do the same for the thread local grids */
+ +    sfree_aligned(new->grid_all);
+ +    for (th = 0; th < new->nthread; th++)
+ +    {
+ +        new->grid_th[th].grid = old->grid_th[th].grid;
+ +    }
+ +}
+ +
+ +/* Create a new PME data structure in *pmedata with grid size grid_size,
+ + * taking the node counts, free-energy setting and thread count from
+ + * pme_src and all other settings from ir.
+ + * Returns the return value of gmx_pme_init.
+ + */
+ +int gmx_pme_reinit(gmx_pme_t *         pmedata,
+ +                   t_commrec *         cr,
+ +                   gmx_pme_t           pme_src,
+ +                   const t_inputrec *  ir,
+ +                   ivec                grid_size)
+ +{
+ +    t_inputrec ir_grid;
+ +    int        natoms_home;
+ +    int        status;
+ +
+ +    /* Work on a copy of the input record with the new grid size */
+ +    ir_grid     = *ir;
+ +    ir_grid.nkx = grid_size[XX];
+ +    ir_grid.nky = grid_size[YY];
+ +    ir_grid.nkz = grid_size[ZZ];
+ +
+ +    /* The home atom count is only taken from pme_src with a single PME node */
+ +    natoms_home = (pme_src->nnodes == 1 ? pme_src->atc[0].n : -1);
+ +
+ +    status = gmx_pme_init(pmedata, cr,
+ +                          pme_src->nnodes_major, pme_src->nnodes_minor,
+ +                          &ir_grid, natoms_home,
+ +                          pme_src->bFEP, FALSE, pme_src->nthread);
+ +    if (status != 0)
+ +    {
+ +        return status;
+ +    }
+ +
+ +    /* We can easily reuse the allocated pme grids in pme_src.
+ +     * We would like to reuse the fft grids, but that's harder.
+ +     */
+ +    reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
+ +
+ +    return status;
+ +}
+ +
+ +
+ +/* Copy the part of the grid of thread thread that does not overlap
+ + * with the grids of other threads into fftgrid.
+ + * The values in fftgrid are overwritten, not summed.
+ + */
+ +static void copy_local_grid(gmx_pme_t pme,
+ +                            pmegrids_t *pmegrids, int thread, real *fftgrid)
+ +{
+ +    ivec        local_fft_ndata, local_fft_offset, local_fft_size;
+ +    ivec        ncopy;
+ +    int         fft_stride_y, fft_stride_z;
+ +    int         th_stride_y, th_stride_z;
+ +    int         dim, ix, iy, iz;
+ +    int         dest0, src0;
+ +    pmegrid_t  *grid_th;
+ +    const real *src;
+ +
+ +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ +                                   local_fft_ndata,
+ +                                   local_fft_offset,
+ +                                   local_fft_size);
+ +
+ +    /* Allocated dimensions of the fft grid and of the thread grid */
+ +    fft_stride_y = local_fft_size[YY];
+ +    fft_stride_z = local_fft_size[ZZ];
+ +
+ +    grid_th     = &pmegrids->grid_th[thread];
+ +    th_stride_y = grid_th->s[YY];
+ +    th_stride_z = grid_th->s[ZZ];
+ +    src         = grid_th->grid;
+ +
+ +    /* The number of points to copy: the thread grid without its
+ +     * order-1 overlap points, limited to the end of the local fft grid.
+ +     */
+ +    for (dim = 0; dim < DIM; dim++)
+ +    {
+ +        ncopy[dim] = min(grid_th->n[dim] - (grid_th->order - 1),
+ +                         local_fft_ndata[dim] - grid_th->offset[dim]);
+ +    }
+ +
+ +    /* Directly copy the non-overlapping parts of the local grids.
+ +     * This also initializes the full grid.
+ +     */
+ +    for (ix = 0; ix < ncopy[XX]; ix++)
+ +    {
+ +        for (iy = 0; iy < ncopy[YY]; iy++)
+ +        {
+ +            dest0 = ((grid_th->offset[XX] + ix)*fft_stride_y +
+ +                     (grid_th->offset[YY] + iy))*fft_stride_z +
+ +                grid_th->offset[ZZ];
+ +            src0  = (ix*th_stride_y + iy)*th_stride_z;
+ +            for (iz = 0; iz < ncopy[ZZ]; iz++)
+ +            {
+ +                fftgrid[dest0+iz] = src[src0+iz];
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +static void
+ +reduce_threadgrid_overlap(gmx_pme_t pme,
+ +                          const pmegrids_t *pmegrids, int thread,
+ +                          real *fftgrid, real *commbuf_x, real *commbuf_y)
+ +{
+ +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
+ +    int  fft_nx, fft_ny, fft_nz;
+ +    int  fft_my, fft_mz;
+ +    int  buf_my = -1;
+ +    int  nsx, nsy, nsz;
+ +    ivec ne;
+ +    int  offx, offy, offz, x, y, z, i0, i0t;
+ +    int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
+ +    gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
+ +    gmx_bool bCommX, bCommY;
+ +    int  d;
+ +    int  thread_f;
+ +    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
+ +    const real *grid_th;
+ +    real *commbuf = NULL;
+ +
+ +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ +                                   local_fft_ndata,
+ +                                   local_fft_offset,
+ +                                   local_fft_size);
+ +    fft_nx = local_fft_ndata[XX];
+ +    fft_ny = local_fft_ndata[YY];
+ +    fft_nz = local_fft_ndata[ZZ];
+ +
+ +    fft_my = local_fft_size[YY];
+ +    fft_mz = local_fft_size[ZZ];
+ +
+ +    /* This routine is called when all thread have finished spreading.
+ +     * Here each thread sums grid contributions calculated by other threads
+ +     * to the thread local grid volume.
+ +     * To minimize the number of grid copying operations,
+ +     * this routines sums immediately from the pmegrid to the fftgrid.
+ +     *
+ +     * Arguments as used below:
+ +     * pmegrids   the thread grids (grid_th) and their division (nc,
+ +     *            nthread_comm); only read here
+ +     * thread     index into pmegrids->grid_th of the calling thread
+ +     * fftgrid    contributions that do not wrap over a decomposed
+ +     *            dimension are added to this grid
+ +     * commbuf_x  receives the contributions that wrap along x only
+ +     *            with nnodes_major > 1
+ +     * commbuf_y  receives the contributions that wrap along y with
+ +     *            nnodes_minor > 1; those that also wrap along x are
+ +     *            stored at an offset in this buffer
+ +     */
+ +
+ +    /* Determine which part of the full node grid we should operate on,
+ +     * this is our thread local part of the full grid.
+ +     */
+ +    pmegrid = &pmegrids->grid_th[thread];
+ +
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ +                    local_fft_ndata[d]);
+ +    }
+ +
+ +    offx = pmegrid->offset[XX];
+ +    offy = pmegrid->offset[YY];
+ +    offz = pmegrid->offset[ZZ];
+ +
+ +
+ +    bClearBufX  = TRUE; /* TRUE until the first write to the x-only buffer */
+ +    bClearBufY  = TRUE; /* TRUE until the first write to the y-only buffer */
+ +    bClearBufXY = TRUE; /* TRUE until the first write to the x+y corner part */
+ +
+ +    /* Now loop over all the thread data blocks that contribute
+ +     * to the grid region we (our thread) are operating on.
+ +     * The shifts sx, sy, sz run from 0 down to -nthread_comm along each
+ +     * dimension. When the shifted block index drops below zero it is
+ +     * wrapped by nc and the grid offset is lowered by the local fft
+ +     * grid size along that dimension.
+ +     */
+ +    /* Note that fft_nx/y is equal to the number of grid points
+ +     * between the first point of our node grid and the one of the next node.
+ +     */
+ +    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
+ +    {
+ +        fx     = pmegrid->ci[XX] + sx;
+ +        ox     = 0;
+ +        bCommX = FALSE;
+ +        if (fx < 0)
+ +        {
+ +            fx    += pmegrids->nc[XX];
+ +            ox    -= fft_nx;
+ +            bCommX = (pme->nnodes_major > 1);
+ +        }
+ +        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
+ +        ox       += pmegrid_g->offset[XX];
+ +        if (!bCommX)
+ +        {
+ +            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+ +        }
+ +        else
+ +        {
+ +            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
+ +        }
+ +
+ +        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
+ +        {
+ +            fy     = pmegrid->ci[YY] + sy;
+ +            oy     = 0;
+ +            bCommY = FALSE;
+ +            if (fy < 0)
+ +            {
+ +                fy    += pmegrids->nc[YY];
+ +                oy    -= fft_ny;
+ +                bCommY = (pme->nnodes_minor > 1);
+ +            }
+ +            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
+ +            oy       += pmegrid_g->offset[YY];
+ +            if (!bCommY)
+ +            {
+ +                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+ +            }
+ +            else
+ +            {
+ +                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
+ +            }
+ +
+ +            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
+ +            {
+ +                fz = pmegrid->ci[ZZ] + sz;
+ +                oz = 0;
+ +                if (fz < 0)
+ +                {
+ +                    fz += pmegrids->nc[ZZ];
+ +                    oz -= fft_nz;
+ +                }
+ +                pmegrid_g = &pmegrids->grid_th[fz];
+ +                oz       += pmegrid_g->offset[ZZ];
+ +                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+ +
+ +                if (sx == 0 && sy == 0 && sz == 0)
+ +                {
+ +                    /* We have already added our local contribution
+ +                     * before calling this routine, so skip it here.
+ +                     */
+ +                    continue;
+ +                }
+ +
+ +                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz; /* index of the contributing thread grid */
+ +
+ +                pmegrid_f = &pmegrids->grid_th[thread_f];
+ +
+ +                grid_th = pmegrid_f->grid;
+ +
+ +                nsx = pmegrid_f->s[XX];
+ +                nsy = pmegrid_f->s[YY];
+ +                nsz = pmegrid_f->s[ZZ];
+ +
+ +#ifdef DEBUG_PME_REDUCE
+ +                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
+ +                       pme->nodeid, thread, thread_f,
+ +                       pme->pmegrid_start_ix,
+ +                       pme->pmegrid_start_iy,
+ +                       pme->pmegrid_start_iz,
+ +                       sx, sy, sz,
+ +                       offx-ox, tx1-ox, offx, tx1,
+ +                       offy-oy, ty1-oy, offy, ty1,
+ +                       offz-oz, tz1-oz, offz, tz1);
+ +#endif
+ +
+ +                if (!(bCommX || bCommY))
+ +                {
+ +                    /* Copy from the thread local grid to the node grid */
+ +                    for (x = offx; x < tx1; x++)
+ +                    {
+ +                        for (y = offy; y < ty1; y++)
+ +                        {
+ +                            i0  = (x*fft_my + y)*fft_mz;
+ +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+ +                            for (z = offz; z < tz1; z++)
+ +                            {
+ +                                fftgrid[i0+z] += grid_th[i0t+z];
+ +                            }
+ +                        }
+ +                    }
+ +                }
+ +                else
+ +                {
+ +                    /* The order of this conditional decides
+ +                     * where the corner volume gets stored with x+y decomp.
+ +                     */
+ +                    if (bCommY)
+ +                    {
+ +                        commbuf = commbuf_y;
+ +                        buf_my  = ty1 - offy;
+ +                        if (bCommX)
+ +                        {
+ +                            /* We index commbuf modulo the local grid size */
+ +                            commbuf += buf_my*fft_nx*fft_nz;
+ +
+ +                            bClearBuf   = bClearBufXY;
+ +                            bClearBufXY = FALSE;
+ +                        }
+ +                        else
+ +                        {
+ +                            bClearBuf  = bClearBufY;
+ +                            bClearBufY = FALSE;
+ +                        }
+ +                    }
+ +                    else
+ +                    {
+ +                        commbuf    = commbuf_x;
+ +                        buf_my     = fft_ny;
+ +                        bClearBuf  = bClearBufX;
+ +                        bClearBufX = FALSE;
+ +                    }
+ +
+ +                    /* Copy to the communication buffer */
+ +                    for (x = offx; x < tx1; x++)
+ +                    {
+ +                        for (y = offy; y < ty1; y++)
+ +                        {
+ +                            i0  = (x*buf_my + y)*fft_nz;
+ +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+ +
+ +                            if (bClearBuf)
+ +                            {
+ +                                /* First access of commbuf, initialize it */
+ +                                for (z = offz; z < tz1; z++)
+ +                                {
+ +                                    commbuf[i0+z]  = grid_th[i0t+z];
+ +                                }
+ +                            }
+ +                            else
+ +                            {
+ +                                for (z = offz; z < tz1; z++)
+ +                                {
+ +                                    commbuf[i0+z] += grid_th[i0t+z];
+ +                                }
+ +                            }
+ +                        }
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Exchange and sum the overlapping parts of the FFT grid between ranks.
+ + *
+ + * Stage 1 (pme->nnodes_minor > 1): for every pulse of pme->overlap[1],
+ + * data is sent from overlap->sendbuf and received into overlap->recvbuf,
+ + * after which the received values are added to fftgrid. When
+ + * pme->nnodes_major > 1 as well, the extra size_yx x-rows of the received
+ + * buffer are accumulated into pme->overlap[0].sendbuf.
+ +
+ + * Stage 2 (pme->nnodes_major > 1): a single pulse over pme->overlap[0],
+ + * again followed by adding the received values to fftgrid.
+ + *
+ + * The MPI_Sendrecv calls are only compiled with GMX_MPI; the summation
+ + * loops are compiled unconditionally.
+ + *
+ + * NOTE(review): gridsize is assigned below but never read in this function.
+ + */
+ +static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+ +{
+ +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
+ +    pme_overlap_t *overlap;
+ +    int  send_index0, send_nindex;
+ +    int  recv_nindex;
+ +#ifdef GMX_MPI
+ +    MPI_Status stat;
+ +#endif
+ +    int  send_size_y, recv_size_y;
+ +    int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
+ +    real *sendptr, *recvptr;
+ +    int  x, y, z, indg, indb;
+ +
+ +    /* Note that this routine is only used for forward communication.
+ +     * Since the force gathering, unlike the charge spreading,
+ +     * can be trivially parallelized over the particles,
+ +     * the backwards process is much simpler and can use the "old"
+ +     * communication setup.
+ +     */
+ +
+ +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ +                                   local_fft_ndata,
+ +                                   local_fft_offset,
+ +                                   local_fft_size);
+ +
+ +    if (pme->nnodes_minor > 1)
+ +    {
+ +        /* Minor dimension */
+ +        overlap = &pme->overlap[1];
+ +
+ +        /* Number of extra x-rows that are passed on to the stage over
+ +         * overlap[0]; zero when that stage is not executed.
+ +         */
+ +        if (pme->nnodes_major > 1)
+ +        {
+ +            size_yx = pme->overlap[0].comm_data[0].send_nindex;
+ +        }
+ +        else
+ +        {
+ +            size_yx = 0;
+ +        }
+ +        /* Number of reals per y-index in the communication buffers */
+ +        datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
+ +
+ +        send_size_y = overlap->send_size;
+ +
+ +        for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
+ +        {
+ +            send_id       = overlap->send_id[ipulse];
+ +            recv_id       = overlap->recv_id[ipulse];
+ +            /* Send offset relative to the start of the first pulse */
+ +            send_index0   =
+ +                overlap->comm_data[ipulse].send_index0 -
+ +                overlap->comm_data[0].send_index0;
+ +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
+ +            /* We don't use recv_index0, as we always receive starting at 0 */
+ +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
+ +            recv_size_y   = overlap->comm_data[ipulse].recv_size;
+ +
+ +            sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
+ +            recvptr = overlap->recvbuf;
+ +
+ +#ifdef GMX_MPI
+ +            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
+ +                         send_id, ipulse,
+ +                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
+ +                         recv_id, ipulse,
+ +                         overlap->mpi_comm, &stat);
+ +#endif
+ +
+ +            /* Add the received rows to the local FFT grid.
+ +             * indg uses the strides of fftgrid (local_fft_size), indb
+ +             * uses the strides of the receive buffer.
+ +             */
+ +            for (x = 0; x < local_fft_ndata[XX]; x++)
+ +            {
+ +                for (y = 0; y < recv_nindex; y++)
+ +                {
+ +                    indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
+ +                    indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
+ +                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
+ +                    {
+ +                        fftgrid[indg+z] += recvptr[indb+z];
+ +                    }
+ +                }
+ +            }
+ +
+ +            if (pme->nnodes_major > 1)
+ +            {
+ +                /* Copy from the received buffer to the send buffer for dim 0 */
+ +                sendptr = pme->overlap[0].sendbuf;
+ +                for (x = 0; x < size_yx; x++)
+ +                {
+ +                    for (y = 0; y < recv_nindex; y++)
+ +                    {
+ +                        indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+ +                        /* The extra rows follow the local_fft_ndata[XX]
+ +                         * rows that were summed into fftgrid above.
+ +                         */
+ +                        indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
+ +                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
+ +                        {
+ +                            sendptr[indg+z] += recvptr[indb+z];
+ +                        }
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    /* We only support a single pulse here.
+ +     * This is not a severe limitation, as this code is only used
+ +     * with OpenMP and with OpenMP the (PME) domains can be larger.
+ +     */
+ +    if (pme->nnodes_major > 1)
+ +    {
+ +        /* Major dimension */
+ +        overlap = &pme->overlap[0];
+ +
+ +        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
+ +        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
+ +
+ +        ipulse = 0;
+ +
+ +        send_id       = overlap->send_id[ipulse];
+ +        recv_id       = overlap->recv_id[ipulse];
+ +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
+ +        /* We don't use recv_index0, as we always receive starting at 0 */
+ +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
+ +
+ +        sendptr = overlap->sendbuf;
+ +        recvptr = overlap->recvbuf;
+ +
+ +        if (debug != NULL)
+ +        {
+ +            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+ +                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
+ +        }
+ +
+ +#ifdef GMX_MPI
+ +        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
+ +                     send_id, ipulse,
+ +                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
+ +                     recv_id, ipulse,
+ +                     overlap->mpi_comm, &stat);
+ +#endif
+ +
+ +        /* Add the received x-rows to the start of the local FFT grid */
+ +        for (x = 0; x < recv_nindex; x++)
+ +        {
+ +            for (y = 0; y < local_fft_ndata[YY]; y++)
+ +            {
+ +                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
+ +                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+ +                for (z = 0; z < local_fft_ndata[ZZ]; z++)
+ +                {
+ +                    fftgrid[indg+z] += recvptr[indb+z];
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Compute spline data for the atoms in atc and/or spread their charges,
+ + * using pme->nthread OpenMP threads.
+ + *
+ + * bCalcSplines  compute the grid indices (calc_interpolation_idx) and the
+ + *               B-spline coefficients (make_bsplines)
+ + * bSpread       spread the charges with spread_q_bsplines_thread; when
+ + *               pme->bUseThreads is set, copy_local_grid and
+ + *               reduce_threadgrid_overlap are then called with fftgrid,
+ + *               followed by sum_fftgrid_dd when pme->nnodes > 1
+ + *
+ + * NOTE(review): when grids is NULL and bSpread is set, &grids->grid is
+ + * formed from a NULL pointer; the call in gmx_pme_calc_energy passes NULL
+ + * together with bSpread = FALSE.
+ + * NOTE(review): the timing code under PME_TIME_SPREAD uses ct1a, which is
+ + * only declared under PME_TIME_THREADS and is shared by all threads, and
+ + * indexes cs1a (6 entries) with the thread number.
+ + */
+ +static void spread_on_grid(gmx_pme_t pme,
+ +                           pme_atomcomm_t *atc, pmegrids_t *grids,
+ +                           gmx_bool bCalcSplines, gmx_bool bSpread,
+ +                           real *fftgrid)
+ +{
+ +    int nthread, thread;
+ +#ifdef PME_TIME_THREADS
+ +    gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
+ +    static double cs1     = 0, cs2 = 0, cs3 = 0;
+ +    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
+ +    static int cnt        = 0;
+ +#endif
+ +
+ +    nthread = pme->nthread;
+ +    assert(nthread > 0);
+ +
+ +#ifdef PME_TIME_THREADS
+ +    c1 = omp_cyc_start();
+ +#endif
+ +    if (bCalcSplines)
+ +    {
+ +#pragma omp parallel for num_threads(nthread) schedule(static)
+ +        for (thread = 0; thread < nthread; thread++)
+ +        {
+ +            int start, end;
+ +
+ +            /* Each thread handles a contiguous range of the atc->n atoms */
+ +            start = atc->n* thread   /nthread;
+ +            end   = atc->n*(thread+1)/nthread;
+ +
+ +            /* Compute fftgrid index for all atoms,
+ +             * with help of some extra variables.
+ +             */
+ +            calc_interpolation_idx(pme, atc, start, end, thread);
+ +        }
+ +    }
+ +#ifdef PME_TIME_THREADS
+ +    c1   = omp_cyc_end(c1);
+ +    cs1 += (double)c1;
+ +#endif
+ +
+ +#ifdef PME_TIME_THREADS
+ +    c2 = omp_cyc_start();
+ +#endif
+ +#pragma omp parallel for num_threads(nthread) schedule(static)
+ +    for (thread = 0; thread < nthread; thread++)
+ +    {
+ +        splinedata_t *spline;
+ +        pmegrid_t *grid = NULL;
+ +
+ +        /* make local bsplines  */
+ +        if (grids == NULL || !pme->bUseThreads)
+ +        {
+ +            /* No thread-local grids: use spline set 0 for all atoms */
+ +            spline = &atc->spline[0];
+ +
+ +            spline->n = atc->n;
+ +
+ +            if (bSpread)
+ +            {
+ +                grid = &grids->grid;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            spline = &atc->spline[thread];
+ +
+ +            if (grids->nthread == 1)
+ +            {
+ +                /* One thread, we operate on all charges */
+ +                spline->n = atc->n;
+ +            }
+ +            else
+ +            {
+ +                /* Get the indices our thread should operate on */
+ +                make_thread_local_ind(atc, thread, spline);
+ +            }
+ +
+ +            grid = &grids->grid_th[thread];
+ +        }
+ +
+ +        if (bCalcSplines)
+ +        {
+ +            make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
+ +                          atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
+ +        }
+ +
+ +        if (bSpread)
+ +        {
+ +            /* put local atoms on grid. */
+ +#ifdef PME_TIME_SPREAD
+ +            ct1a = omp_cyc_start();
+ +#endif
+ +            spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+ +
+ +            if (pme->bUseThreads)
+ +            {
+ +                copy_local_grid(pme, grids, thread, fftgrid);
+ +            }
+ +#ifdef PME_TIME_SPREAD
+ +            ct1a          = omp_cyc_end(ct1a);
+ +            cs1a[thread] += (double)ct1a;
+ +#endif
+ +        }
+ +    }
+ +#ifdef PME_TIME_THREADS
+ +    c2   = omp_cyc_end(c2);
+ +    cs2 += (double)c2;
+ +#endif
+ +
+ +    if (bSpread && pme->bUseThreads)
+ +    {
+ +#ifdef PME_TIME_THREADS
+ +        c3 = omp_cyc_start();
+ +#endif
+ +        /* This loop runs over grids->nthread, not over nthread */
+ +#pragma omp parallel for num_threads(grids->nthread) schedule(static)
+ +        for (thread = 0; thread < grids->nthread; thread++)
+ +        {
+ +            reduce_threadgrid_overlap(pme, grids, thread,
+ +                                      fftgrid,
+ +                                      pme->overlap[0].sendbuf,
+ +                                      pme->overlap[1].sendbuf);
+ +        }
+ +#ifdef PME_TIME_THREADS
+ +        c3   = omp_cyc_end(c3);
+ +        cs3 += (double)c3;
+ +#endif
+ +
+ +        if (pme->nnodes > 1)
+ +        {
+ +            /* Communicate the overlapping part of the fftgrid.
+ +             * For this communication call we need to check pme->bUseThreads
+ +             * to have all ranks communicate here, regardless of pme->nthread.
+ +             */
+ +            sum_fftgrid_dd(pme, fftgrid);
+ +        }
+ +    }
+ +
+ +#ifdef PME_TIME_THREADS
+ +    /* Print the accumulated timings on every 20th call */
+ +    cnt++;
+ +    if (cnt % 20 == 0)
+ +    {
+ +        printf("idx %.2f spread %.2f red %.2f",
+ +               cs1*1e-9, cs2*1e-9, cs3*1e-9);
+ +#ifdef PME_TIME_SPREAD
+ +        for (thread = 0; thread < nthread; thread++)
+ +        {
+ +            printf(" %.2f", cs1a[thread]*1e-9);
+ +        }
+ +#endif
+ +        printf("\n");
+ +    }
+ +#endif
+ +}
+ +
+ +
+ +/* Print an nx x ny x nz block of the flat grid g to fp, one point per line.
+ + *
+ + * sx, sy, sz  offsets added to the printed x, y and z indices
+ + * my, mz      y and z strides of g: point (x,y,z) is g[(x*my + y)*mz + z]
+ + */
+ +static void dump_grid(FILE *fp,
+ +                      int sx, int sy, int sz, int nx, int ny, int nz,
+ +                      int my, int mz, const real *g)
+ +{
+ +    int ix, iy, iz;
+ +    int row;
+ +
+ +    for (ix = 0; ix < nx; ix++)
+ +    {
+ +        for (iy = 0; iy < ny; iy++)
+ +        {
+ +            /* Index of the first z-element of this (ix, iy) row */
+ +            row = (ix*my + iy)*mz;
+ +
+ +            for (iz = 0; iz < nz; iz++)
+ +            {
+ +                fprintf(fp, "%2d %2d %2d %6.3f\n",
+ +                        sx+ix, sy+iy, sz+iz, g[row + iz]);
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Print the local part of fftgrid to stderr using dump_grid.
+ + *
+ + * The printed extent in each dimension is the pmegrid size minus
+ + * pme_order plus one; the y and z strides come from the FFT grid
+ + * sizes reported by gmx_parallel_3dfft_real_limits for pfft_setupA.
+ + */
+ +static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
+ +{
+ +    ivec fft_ndata, fft_offset, fft_size;
+ +    int  ext_x, ext_y, ext_z;
+ +
+ +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ +                                   fft_ndata, fft_offset, fft_size);
+ +
+ +    ext_x = pme->pmegrid_nx-pme->pme_order+1;
+ +    ext_y = pme->pmegrid_ny-pme->pme_order+1;
+ +    ext_z = pme->pmegrid_nz-pme->pme_order+1;
+ +
+ +    dump_grid(stderr,
+ +              pme->pmegrid_start_ix, pme->pmegrid_start_iy, pme->pmegrid_start_iz,
+ +              ext_x, ext_y, ext_z,
+ +              fft_size[YY], fft_size[ZZ],
+ +              fftgrid);
+ +}
+ +
+ +
+ +/* Compute the mesh energy of n charges q at positions x and store it in *V.
+ + *
+ + * Only the spline coefficients are computed for the given atoms (no
+ + * spreading is done); the energy is then gathered from the existing
+ + * A-charge grid pme->pmegridA with gather_energy_bsplines.
+ + *
+ + * Reports an inconsistency through gmx_incons when called with
+ + * pme->nnodes > 1 or with free-energy perturbation (pme->bFEP) active,
+ + * since only the A-charge grid is used here.
+ + */
+ +void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
+ +{
+ +    pme_atomcomm_t *atc;
+ +    pmegrids_t *grid;
+ +
+ +    if (pme->nnodes > 1)
+ +    {
+ +        gmx_incons("gmx_pme_calc_energy called in parallel");
+ +    }
+ +    /* bFEP is a boolean flag: test it for truth. The former comparison
+ +     * "> 1" could never be true for TRUE, which disabled this check.
+ +     */
+ +    if (pme->bFEP)
+ +    {
+ +        gmx_incons("gmx_pme_calc_energy with free energy");
+ +    }
+ +
+ +    /* Set up the dedicated single-thread atom communication struct */
+ +    atc            = &pme->atc_energy;
+ +    atc->nthread   = 1;
+ +    if (atc->spline == NULL)
+ +    {
+ +        snew(atc->spline, atc->nthread);
+ +    }
+ +    atc->nslab     = 1;
+ +    atc->bSpread   = TRUE;
+ +    atc->pme_order = pme->pme_order;
+ +    atc->n         = n;
+ +    pme_realloc_atomcomm_things(atc);
+ +    /* The caller's coordinate and charge arrays are used directly */
+ +    atc->x         = x;
+ +    atc->q         = q;
+ +
+ +    /* We only use the A-charges grid */
+ +    grid = &pme->pmegridA;
+ +
+ +    /* Only calculate the spline coefficients, don't actually spread */
+ +    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
+ +
+ +    *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
+ +}
+ +
+ +
+ +/* Reset the performance counters and restart the run timer at step.
+ + *
+ + * Stops ewcRUN, clears all wallcycle counters and the nrnb counters,
+ + * moves ir->init_step to step (reducing a non-negative ir->nsteps by the
+ + * number of steps skipped) and starts ewcRUN again.
+ + */
+ +static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+ +                                   t_nrnb *nrnb, t_inputrec *ir,
+ +                                   gmx_large_int_t step)
+ +{
+ +    gmx_large_int_t steps_done;
+ +
+ +    /* Throw away everything that was counted over the run so far */
+ +    wallcycle_stop(wcycle, ewcRUN);
+ +    wallcycle_reset_all(wcycle);
+ +    init_nrnb(nrnb);
+ +
+ +    if (ir->nsteps >= 0)
+ +    {
+ +        /* ir->nsteps is not read here; it is only kept consistent
+ +         * with the new init_step.
+ +         */
+ +        steps_done  = step - ir->init_step;
+ +        ir->nsteps -= steps_done;
+ +    }
+ +    ir->init_step = step;
+ +
+ +    wallcycle_start(wcycle, ewcRUN);
+ +}
+ +
+ +
+ +/* Select the PME data structure whose grid matches grid_size.
+ + *
+ + * The list *pmedata (length *npmedata) is searched for an entry with
+ + * nkx/nky/nkz equal to grid_size. When none is found, the list is
+ + * extended by one entry that is created with gmx_pme_reinit from the
+ + * last entry examined. The selected entry is returned in *pme_ret.
+ + */
+ +static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
+ +                               ivec grid_size,
+ +                               t_commrec *cr, t_inputrec *ir,
+ +                               gmx_pme_t *pme_ret)
+ +{
+ +    int       ind;
+ +    gmx_pme_t pme = NULL;
+ +
+ +    /* Look for an existing structure with the requested grid size */
+ +    for (ind = 0; ind < *npmedata; ind++)
+ +    {
+ +        pme = (*pmedata)[ind];
+ +        if (pme->nkx == grid_size[XX] &&
+ +            pme->nky == grid_size[YY] &&
+ +            pme->nkz == grid_size[ZZ])
+ +        {
+ +            *pme_ret = pme;
+ +
+ +            return;
+ +        }
+ +    }
+ +
+ +    /* Not found: ind now equals the old list length, i.e. the index
+ +     * of the new slot that is appended here.
+ +     */
+ +    (*npmedata)++;
+ +    srenew(*pmedata, *npmedata);
+ +
+ +    /* Generate a new PME data structure, copying part of the old pointers */
+ +    gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
+ +
+ +    *pme_ret = (*pmedata)[ind];
+ +}
+ +
+ +
+ +/* Receive/compute/send loop for PME work (presumably run on ranks that
+ + * only do PME - confirm against the caller).
+ + *
+ + * Each iteration receives charges, coordinates and settings through
+ + * gmx_pme_recv_q_x, handling grid-switch and counter-reset requests in
+ + * the inner loop, then calls gmx_pme_do with GMX_PME_DO_ALL_F (plus
+ + * GMX_PME_CALC_ENER_VIR when bEnerVir was received as set) and sends the
+ + * forces, virial, energy, dvdlambda and the measured ewcPMEMESH cycle
+ + * count back with gmx_pme_send_force_vir_ener.
+ + * The loop ends when pmerecvqxFINISH is received. Always returns 0.
+ + *
+ + * NOTE(review): step_rel is assigned but not read in this function, and
+ + * pmedata / pme_pp are not released before returning.
+ + */
+ +int gmx_pmeonly(gmx_pme_t pme,
+ +                t_commrec *cr,    t_nrnb *nrnb,
+ +                gmx_wallcycle_t wcycle,
+ +                real ewaldcoeff,
+ +                t_inputrec *ir)
+ +{
+ +    int npmedata;
+ +    gmx_pme_t *pmedata;
+ +    gmx_pme_pp_t pme_pp;
+ +    int  ret;
+ +    int  natoms;
+ +    matrix box;
+ +    rvec *x_pp      = NULL, *f_pp = NULL;
+ +    real *chargeA   = NULL, *chargeB = NULL;
+ +    real lambda     = 0;
+ +    int  maxshift_x = 0, maxshift_y = 0;
+ +    real energy, dvdlambda;
+ +    matrix vir;
+ +    float cycles;
+ +    int  count;
+ +    gmx_bool bEnerVir;
+ +    gmx_large_int_t step, step_rel;
+ +    ivec grid_switch;
+ +
+ +    /* This data is only used with PME tuning, i.e. switching PME grids */
+ +    npmedata = 1;
+ +    snew(pmedata, npmedata);
+ +    pmedata[0] = pme;
+ +
+ +    pme_pp = gmx_pme_pp_init(cr);
+ +
+ +    init_nrnb(nrnb);
+ +
+ +    count = 0;
+ +    do /****** this is a quasi-loop over time steps! */
+ +    {
+ +        /* The reason for having a loop here is PME grid tuning/switching */
+ +        do
+ +        {
+ +            /* Domain decomposition */
+ +            ret = gmx_pme_recv_q_x(pme_pp,
+ +                                   &natoms,
+ +                                   &chargeA, &chargeB, box, &x_pp, &f_pp,
+ +                                   &maxshift_x, &maxshift_y,
+ +                                   &pme->bFEP, &lambda,
+ +                                   &bEnerVir,
+ +                                   &step,
+ +                                   grid_switch, &ewaldcoeff);
+ +
+ +            if (ret == pmerecvqxSWITCHGRID)
+ +            {
+ +                /* Switch the PME grid to grid_switch */
+ +                gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
+ +            }
+ +
+ +            if (ret == pmerecvqxRESETCOUNTERS)
+ +            {
+ +                /* Reset the cycle and flop counters */
+ +                reset_pmeonly_counters(wcycle, nrnb, ir, step);
+ +            }
+ +        }
+ +        while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
+ +
+ +        if (ret == pmerecvqxFINISH)
+ +        {
+ +            /* We should stop: break out of the loop */
+ +            break;
+ +        }
+ +
+ +        step_rel = step - ir->init_step;
+ +
+ +        /* The run timer is started once, at the first computed step */
+ +        if (count == 0)
+ +        {
+ +            wallcycle_start(wcycle, ewcRUN);
+ +        }
+ +
+ +        wallcycle_start(wcycle, ewcPMEMESH);
+ +
+ +        dvdlambda = 0;
+ +        clear_mat(vir);
+ +        gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
+ +                   cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
+ +                   &energy, lambda, &dvdlambda,
+ +                   GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
+ +
+ +        cycles = wallcycle_stop(wcycle, ewcPMEMESH);
+ +
+ +        gmx_pme_send_force_vir_ener(pme_pp,
+ +                                    f_pp, vir, energy, dvdlambda,
+ +                                    cycles);
+ +
+ +        count++;
+ +    } /***** end of quasi-loop, we stop with the break above */
+ +    while (TRUE);
+ +
+ +    return 0;
+ +}
+ +
+ +/* Perform the PME mesh calculation steps selected by flags.
+ + *
+ + * start, homenr  offset and number of the local atoms in x, f and the
+ + *                charge arrays
+ + * chargeA/B      charges; chargeB is only used when pme->bFEP is set
+ + * box            box matrix, inverted into pme->recipbox
+ + * maxshift_x/y   stored in atc->maxshift for the matching dimension
+ + * vir            the computed virial is added to this matrix
+ + * energy         set to the mesh energy, or to 0 when
+ + *                GMX_PME_CALC_ENER_VIR is not in flags
+ + * dvdlambda      incremented by energy_B - energy_A when both pme->bFEP
+ + *                and GMX_PME_CALC_ENER_VIR are set
+ + * flags          GMX_PME_SPREAD_Q, GMX_PME_SOLVE, GMX_PME_CALC_F and
+ + *                GMX_PME_CALC_ENER_VIR select the stages that are run
+ + *
+ + * Always returns 0.
+ + *
+ + * NOTE(review): nx, ny, nz, local_ny and ptr are declared but not used.
+ + */
+ +int gmx_pme_do(gmx_pme_t pme,
+ +               int start,       int homenr,
+ +               rvec x[],        rvec f[],
+ +               real *chargeA,   real *chargeB,
+ +               matrix box, t_commrec *cr,
+ +               int  maxshift_x, int maxshift_y,
+ +               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
+ +               matrix vir,      real ewaldcoeff,
+ +               real *energy,    real lambda,
+ +               real *dvdlambda, int flags)
+ +{
+ +    int     q, d, i, j, ntot, npme;
+ +    int     nx, ny, nz;
+ +    int     n_d, local_ny;
+ +    pme_atomcomm_t *atc = NULL;
+ +    pmegrids_t *pmegrid = NULL;
+ +    real    *grid       = NULL;
+ +    real    *ptr;
+ +    rvec    *x_d, *f_d;
+ +    real    *charge = NULL, *q_d;
+ +    real    energy_AB[2];
+ +    matrix  vir_AB[2];
+ +    gmx_bool bClearF;
+ +    gmx_parallel_3dfft_t pfft_setup;
+ +    real *  fftgrid;
+ +    t_complex * cfftgrid;
+ +    int     thread;
+ +    const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
+ +    const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
+ +
+ +    assert(pme->nnodes > 0);
+ +    assert(pme->nnodes == 1 || pme->ndecompdim > 0);
+ +
+ +    if (pme->nnodes > 1)
+ +    {
+ +        atc      = &pme->atc[0];
+ +        atc->npd = homenr;
+ +        if (atc->npd > atc->pd_nalloc)
+ +        {
+ +            atc->pd_nalloc = over_alloc_dd(atc->npd);
+ +            srenew(atc->pd, atc->pd_nalloc);
+ +        }
+ +        atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ +    }
+ +    else
+ +    {
+ +        /* This could be necessary for TPI */
+ +        pme->atc[0].n = homenr;
+ +    }
+ +
+ +    /* One pass for the A charges, plus a second pass for the B charges
+ +     * when pme->bFEP is set.
+ +     */
+ +    for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
+ +    {
+ +        if (q == 0)
+ +        {
+ +            pmegrid    = &pme->pmegridA;
+ +            fftgrid    = pme->fftgridA;
+ +            cfftgrid   = pme->cfftgridA;
+ +            pfft_setup = pme->pfft_setupA;
+ +            charge     = chargeA+start;
+ +        }
+ +        else
+ +        {
+ +            pmegrid    = &pme->pmegridB;
+ +            fftgrid    = pme->fftgridB;
+ +            cfftgrid   = pme->cfftgridB;
+ +            pfft_setup = pme->pfft_setupB;
+ +            charge     = chargeB+start;
+ +        }
+ +        grid = pmegrid->grid.grid;
+ +        /* Unpack structure */
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+ +                    cr->nnodes, cr->nodeid);
+ +            fprintf(debug, "Grid = %p\n", (void*)grid);
+ +            if (grid == NULL)
+ +            {
+ +                gmx_fatal(FARGS, "No grid!");
+ +            }
+ +        }
+ +        where();
+ +
+ +        m_inv_ur0(box, pme->recipbox);
+ +
+ +        if (pme->nnodes == 1)
+ +        {
+ +            /* Single rank: operate directly on the caller's arrays */
+ +            atc = &pme->atc[0];
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                atc->n = homenr;
+ +                pme_realloc_atomcomm_things(atc);
+ +            }
+ +            atc->x = x;
+ +            atc->q = charge;
+ +            atc->f = f;
+ +        }
+ +        else
+ +        {
+ +            wallcycle_start(wcycle, ewcPME_REDISTXF);
+ +            /* Redistribute starting from the last decomposition dimension;
+ +             * each earlier dimension takes its input from the atc that
+ +             * was filled in the previous iteration.
+ +             */
+ +            for (d = pme->ndecompdim-1; d >= 0; d--)
+ +            {
+ +                if (d == pme->ndecompdim-1)
+ +                {
+ +                    n_d = homenr;
+ +                    x_d = x + start;
+ +                    q_d = charge;
+ +                }
+ +                else
+ +                {
+ +                    n_d = pme->atc[d+1].n;
+ +                    x_d = atc->x;
+ +                    q_d = atc->q;
+ +                }
+ +                atc      = &pme->atc[d];
+ +                atc->npd = n_d;
+ +                if (atc->npd > atc->pd_nalloc)
+ +                {
+ +                    atc->pd_nalloc = over_alloc_dd(atc->npd);
+ +                    srenew(atc->pd, atc->pd_nalloc);
+ +                }
+ +                atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ +                pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
+ +                where();
+ +
+ +                /* Redistribute x (only once) and qA or qB */
+ +                if (DOMAINDECOMP(cr))
+ +                {
+ +                    dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
+ +                }
+ +                else
+ +                {
+ +                    pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
+ +                }
+ +            }
+ +            where();
+ +
+ +            wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ +        }
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+ +                    cr->nodeid, atc->n);
+ +        }
+ +
+ +        if (flags & GMX_PME_SPREAD_Q)
+ +        {
+ +            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ +
+ +            /* Spread the charges on a grid */
+ +            /* Splines are only computed in the first (q == 0) pass */
+ +            spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
+ +
+ +            if (q == 0)
+ +            {
+ +                inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+ +            }
+ +            inc_nrnb(nrnb, eNR_SPREADQBSP,
+ +                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+ +
+ +            if (!pme->bUseThreads)
+ +            {
+ +                wrap_periodic_pmegrid(pme, grid);
+ +
+ +                /* sum contributions to local grid from other nodes */
+ +#ifdef GMX_MPI
+ +                if (pme->nnodes > 1)
+ +                {
+ +                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+ +                    where();
+ +                }
+ +#endif
+ +
+ +                copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
+ +            }
+ +
+ +            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+ +
+ +            /*
+ +               dump_local_fftgrid(pme,fftgrid);
+ +               exit(0);
+ +             */
+ +        }
+ +
+ +        /* Here we start a large thread parallel region */
+ +#pragma omp parallel num_threads(pme->nthread) private(thread)
+ +        {
+ +            thread = gmx_omp_get_thread_num();
+ +            if (flags & GMX_PME_SOLVE)
+ +            {
+ +                int loop_count;
+ +
+ +                /* do 3d-fft */
+ +                /* Wallcycle and flop counters are only touched by thread 0 */
+ +                if (thread == 0)
+ +                {
+ +                    wallcycle_start(wcycle, ewcPME_FFT);
+ +                }
+ +                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+ +                                           thread, wcycle);
+ +                if (thread == 0)
+ +                {
+ +                    wallcycle_stop(wcycle, ewcPME_FFT);
+ +                }
+ +                where();
+ +
+ +                /* solve in k-space for our local cells */
+ +                if (thread == 0)
+ +                {
+ +                    wallcycle_start(wcycle, ewcPME_SOLVE);
+ +                }
+ +                loop_count =
+ +                    solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
+ +                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ +                                  bCalcEnerVir,
+ +                                  pme->nthread, thread);
+ +                if (thread == 0)
+ +                {
+ +                    wallcycle_stop(wcycle, ewcPME_SOLVE);
+ +                    where();
+ +                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+ +                }
+ +            }
+ +
+ +            if (bCalcF)
+ +            {
+ +                /* do 3d-invfft */
+ +                if (thread == 0)
+ +                {
+ +                    where();
+ +                    wallcycle_start(wcycle, ewcPME_FFT);
+ +                }
+ +                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+ +                                           thread, wcycle);
+ +                if (thread == 0)
+ +                {
+ +                    wallcycle_stop(wcycle, ewcPME_FFT);
+ +
+ +                    where();
+ +
+ +                    if (pme->nodeid == 0)
+ +                    {
+ +                        /* Flop estimate: 2 * ntot*log2(ntot) */
+ +                        ntot  = pme->nkx*pme->nky*pme->nkz;
+ +                        npme  = ntot*log((real)ntot)/log(2.0);
+ +                        inc_nrnb(nrnb, eNR_FFT, 2*npme);
+ +                    }
+ +
+ +                    /* This counter is stopped after the force gathering
+ +                     * further down, outside the parallel region.
+ +                     */
+ +                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ +                }
+ +
+ +                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
+ +            }
+ +        }
+ +        /* End of thread parallel section.
+ +         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
+ +         */
+ +
+ +        if (bCalcF)
+ +        {
+ +            /* distribute local grid to all nodes */
+ +#ifdef GMX_MPI
+ +            if (pme->nnodes > 1)
+ +            {
+ +                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+ +            }
+ +#endif
+ +            where();
+ +
+ +            unwrap_periodic_pmegrid(pme, grid);
+ +
+ +            /* interpolate forces for our local atoms */
+ +
+ +            where();
+ +
+ +            /* If we are running without parallelization,
+ +             * atc->f is the actual force array, not a buffer,
+ +             * therefore we should not clear it.
+ +             */
+ +            bClearF = (q == 0 && PAR(cr));
+ +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ +            for (thread = 0; thread < pme->nthread; thread++)
+ +            {
+ +                /* The last argument scales the forces: 1-lambda for the
+ +                 * A pass and lambda for the B pass with bFEP, else 1.
+ +                 */
+ +                gather_f_bsplines(pme, grid, bClearF, atc,
+ +                                  &atc->spline[thread],
+ +                                  pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
+ +            }
+ +
+ +            where();
+ +
+ +            inc_nrnb(nrnb, eNR_GATHERFBSP,
+ +                     pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ +            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+ +        }
+ +
+ +        if (bCalcEnerVir)
+ +        {
+ +            /* This should only be called on the master thread
+ +             * and after the threads have synchronized.
+ +             */
+ +            get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+ +        }
+ +    } /* of q-loop */
+ +
+ +    if (bCalcF && pme->nnodes > 1)
+ +    {
+ +        /* Send the forces back, traversing the decomposition dimensions
+ +         * in the opposite order of the coordinate redistribution.
+ +         */
+ +        wallcycle_start(wcycle, ewcPME_REDISTXF);
+ +        for (d = 0; d < pme->ndecompdim; d++)
+ +        {
+ +            atc = &pme->atc[d];
+ +            if (d == pme->ndecompdim - 1)
+ +            {
+ +                n_d = homenr;
+ +                f_d = f + start;
+ +            }
+ +            else
+ +            {
+ +                n_d = pme->atc[d+1].n;
+ +                f_d = pme->atc[d+1].f;
+ +            }
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                dd_pmeredist_f(pme, atc, n_d, f_d,
+ +                               d == pme->ndecompdim-1 && pme->bPPnode);
+ +            }
+ +            else
+ +            {
+ +                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
+ +            }
+ +        }
+ +
+ +        wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ +    }
+ +    where();
+ +
+ +    if (bCalcEnerVir)
+ +    {
+ +        if (!pme->bFEP)
+ +        {
+ +            *energy = energy_AB[0];
+ +            m_add(vir, vir_AB[0], vir);
+ +        }
+ +        else
+ +        {
+ +            /* Linear interpolation in lambda between the A and B results */
+ +            *energy     = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
+ +            *dvdlambda += energy_AB[1] - energy_AB[0];
+ +            for (i = 0; i < DIM; i++)
+ +            {
+ +                for (j = 0; j < DIM; j++)
+ +                {
+ +                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
+ +                        lambda*vir_AB[1][i][j];
+ +                }
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        *energy = 0;
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "PME mesh energy: %g\n", *energy);
+ +    }
+ +
+ +    return 0;
+ +}
diff --cc src/gromacs/mdlib/sim_util.c

index 36fd8b2b32d5be4564d7c97d2abcb706a3c3a571,0000000000000000000000000000000000000000..538d1caf61f60ad4f1e046eaf2f83caf42fe42aa

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/sim_util.c
--- /dev/null
+++ b/src/gromacs/mdlib/sim_util.c
@@@ -1,2724 -1,0 +1,2718 @@@
-         /* Do the actual neighbour searching and if twin range electrostatics
-          * also do the calculation of long range forces and energies.
-          */
-         for (i = 0; i < efptNR; i++)
-         {
-             dvdlambda[i] = 0;
-         }
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROwing Monsters And Cloning Shrimps
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#ifdef GMX_CRAY_XT3
+ +#include <catamount/dclock.h>
+ +#endif
+ +
+ +
+ +#include <stdio.h>
+ +#include <time.h>
+ +#ifdef HAVE_SYS_TIME_H
+ +#include <sys/time.h>
+ +#endif
+ +#include <math.h>
+ +#include "typedefs.h"
+ +#include "string2.h"
+ +#include "gmxfio.h"
+ +#include "smalloc.h"
+ +#include "names.h"
+ +#include "confio.h"
+ +#include "mvdata.h"
+ +#include "txtdump.h"
+ +#include "pbc.h"
+ +#include "chargegroup.h"
+ +#include "vec.h"
+ +#include <time.h>
+ +#include "nrnb.h"
+ +#include "mshift.h"
+ +#include "mdrun.h"
+ +#include "sim_util.h"
+ +#include "update.h"
+ +#include "physics.h"
+ +#include "main.h"
+ +#include "mdatoms.h"
+ +#include "force.h"
+ +#include "bondf.h"
+ +#include "pme.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "network.h"
+ +#include "calcmu.h"
+ +#include "constr.h"
+ +#include "xvgr.h"
+ +#include "trnio.h"
+ +#include "xtcio.h"
+ +#include "copyrite.h"
+ +#include "pull_rotation.h"
+ +#include "gmx_random.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +#include "gmx_wallcycle.h"
+ +#include "genborn.h"
+ +#include "nbnxn_atomdata.h"
+ +#include "nbnxn_search.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_ref.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+ +
+ +#include "gromacs/utility/gmxmpi.h"
+ +
+ +#include "adress.h"
+ +#include "qmmm.h"
+ +
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +#include "nbnxn_cuda/nbnxn_cuda.h"
+ +
+ +/* Return the current wall-clock time in seconds as a double.
+ + *
+ + * With HAVE_GETTIMEOFDAY the value is assembled from the seconds and
+ + * microseconds fields filled in by gettimeofday(); otherwise the result
+ + * of time() is returned, converted to double.
+ + */
+ +double
+ +gmx_gettime()
+ +{
+ +#ifdef HAVE_GETTIMEOFDAY
+ +    struct timeval now;
+ +
+ +    gettimeofday(&now, NULL);
+ +
+ +    return (double) now.tv_sec + 1e-6*(double)now.tv_usec;
+ +#else
+ +    return (double) time(NULL);
+ +#endif
+ +}
+ +
+ +
+ +#define difftime(end, start) ((double)(end)-(double)(start))
+ +
+ +/* Print a one-line progress report for step \p step to \p out.
+ + *
+ + * The line always contains the step number. Once \p step has reached
+ + * ir->nstlist, the average time per step since runtime->real is stored
+ + * in runtime->time_per_step and used to print either
+ + *  - the expected finish date (remaining time of 300 s or more),
+ + *  - the remaining run time in seconds (less than 300 s), or
+ + *  - the performance in ns/day when ir->nsteps is negative.
+ + *
+ + * Without GMX_THREAD_MPI a carriage return is only emitted for
+ + * non-parallel runs and a newline is appended for parallel runs; with
+ + * GMX_THREAD_MPI the carriage return is always emitted.
+ + */
+ +void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
+ +                t_inputrec *ir, t_commrec gmx_unused *cr)
+ +{
+ +    time_t finish;
+ +    char   timebuf[STRLEN];
+ +    double dt;
+ +    char   buf[48];
+ +    size_t len;
+ +
+ +#ifndef GMX_THREAD_MPI
+ +    if (!PAR(cr))
+ +#endif
+ +    {
+ +        fprintf(out, "\r");
+ +    }
+ +    fprintf(out, "step %s", gmx_step_str(step, buf));
+ +    if (step >= ir->nstlist)
+ +    {
+ +        runtime->last          = gmx_gettime();
+ +        dt                     = difftime(runtime->last, runtime->real);
+ +        runtime->time_per_step = dt/(step - ir->init_step + 1);
+ +
+ +        /* Estimated time still needed for the remaining steps */
+ +        dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
+ +
+ +        if (ir->nsteps >= 0)
+ +        {
+ +            if (dt >= 300)
+ +            {
+ +                finish = (time_t) (runtime->last + dt);
+ +                gmx_ctime_r(&finish, timebuf, STRLEN);
+ +                /* Drop the trailing newline in place. This avoids copying
+ +                 * a STRLEN-sized string into the smaller step buffer and
+ +                 * indexing at -1 when the string is empty.
+ +                 */
+ +                len = strlen(timebuf);
+ +                if (len > 0 && timebuf[len-1] == '\n')
+ +                {
+ +                    timebuf[len-1] = '\0';
+ +                }
+ +                fprintf(out, ", will finish %s", timebuf);
+ +            }
+ +            else
+ +            {
+ +                fprintf(out, ", remaining runtime: %5d s          ", (int)dt);
+ +            }
+ +        }
+ +        else
+ +        {
+ +            fprintf(out, " performance: %.1f ns/day    ",
+ +                    ir->delta_t/1000*24*60*60/runtime->time_per_step);
+ +        }
+ +    }
+ +#ifndef GMX_THREAD_MPI
+ +    if (PAR(cr))
+ +    {
+ +        fprintf(out, "\n");
+ +    }
+ +#endif
+ +
+ +    fflush(out);
+ +}
+ +
+ +#ifdef NO_CLOCK
+ +#define clock() -1
+ +#endif
+ +
+ +/* Refresh runtime->proc with the current process clock and return the
+ + * time in seconds that elapsed since the previously stored value.
+ + *
+ + * With GMX_CRAY_XT3 the clock is dclock(); otherwise clock() is used and
+ + * the tick difference is divided by CLOCKS_PER_SEC. A negative difference
+ + * is reported as 0.
+ + */
+ +static double set_proctime(gmx_runtime_t *runtime)
+ +{
+ +    double elapsed;
+ +#ifdef GMX_CRAY_XT3
+ +    double previous = runtime->proc;
+ +
+ +    runtime->proc = dclock();
+ +    elapsed       = runtime->proc - previous;
+ +#else
+ +    clock_t previous = runtime->proc;
+ +
+ +    runtime->proc = clock();
+ +    elapsed       = (double)(runtime->proc - previous)/(double)CLOCKS_PER_SEC;
+ +#endif
+ +
+ +    /* The counter has probably looped, ignore this data */
+ +    return (elapsed < 0) ? 0 : elapsed;
+ +}
+ +
+ +/* Reset \p runtime at the start of a timed section: record the current
+ + * wall-clock time, prime the process clock and zero all accumulators.
+ + */
+ +void runtime_start(gmx_runtime_t *runtime)
+ +{
+ +    runtime->real = gmx_gettime();
+ +
+ +    /* Prime runtime->proc with the current process clock; the returned
+ +     * difference is deliberately discarded.
+ +     */
+ +    runtime->proc = 0;
+ +    set_proctime(runtime);
+ +
+ +    runtime->time_per_step = 0;
+ +    runtime->last          = 0;
+ +    runtime->proctime      = 0;
+ +    runtime->realtime      = 0;
+ +}
+ +
+ +/* Close a timed section: add the process time used since the last update
+ + * to runtime->proctime, store the wall-clock time elapsed since
+ + * runtime->real in runtime->realtime, and move runtime->real to now.
+ + */
+ +void runtime_end(gmx_runtime_t *runtime)
+ +{
+ +    double t_now = gmx_gettime();
+ +    double t_cpu = set_proctime(runtime);
+ +
+ +    runtime->proctime += t_cpu;
+ +    runtime->realtime  = t_now - runtime->real;
+ +    runtime->real      = t_now;
+ +}
+ +
+ +/* Add the process time consumed since the previous update to the running
+ + * total in runtime->proctime.
+ + */
+ +void runtime_upd_proc(gmx_runtime_t *runtime)
+ +{
+ +    double t_cpu = set_proctime(runtime);
+ +
+ +    runtime->proctime += t_cpu;
+ +}
+ +
+ +/* Write "<title> on node <nodeid> <date>" to \p fplog.
+ + *
+ + * The date is taken from runtime->real when \p runtime is given and from
+ + * the current time otherwise. Nothing is printed when \p fplog is NULL.
+ + */
+ +void print_date_and_time(FILE *fplog, int nodeid, const char *title,
+ +                         const gmx_runtime_t *runtime)
+ +{
+ +    char   raw[STRLEN];
+ +    char   stamp[STRLEN];
+ +    time_t when;
+ +    int    k;
+ +
+ +    if (!fplog)
+ +    {
+ +        return;
+ +    }
+ +
+ +    if (runtime != NULL)
+ +    {
+ +        when = (time_t) runtime->real;
+ +    }
+ +    else
+ +    {
+ +        when = (time_t) gmx_gettime();
+ +    }
+ +    gmx_ctime_r(&when, raw, STRLEN);
+ +
+ +    /* Copy up to the first control character (this drops the newline) */
+ +    k = 0;
+ +    while (raw[k] >= ' ')
+ +    {
+ +        stamp[k] = raw[k];
+ +        k++;
+ +    }
+ +    stamp[k] = '\0';
+ +
+ +    fprintf(fplog, "%s on node %d %s\n", title, nodeid, stamp);
+ +}
+ +
+ +/* Add flr[i] to f[i] for every index i in [start, end).
+ + * When gmx_debug_at is set, both input ranges are dumped to the debug
+ + * file before the addition.
+ + */
+ +static void sum_forces(int start, int end, rvec f[], rvec flr[])
+ +{
+ +    int atom = start;
+ +
+ +    if (gmx_debug_at)
+ +    {
+ +        pr_rvecs(debug, 0, "fsr", f+start, end-start);
+ +        pr_rvecs(debug, 0, "flr", flr+start, end-start);
+ +    }
+ +    while (atom < end)
+ +    {
+ +        rvec_inc(f[atom], flr[atom]);
+ +        atom++;
+ +    }
+ +}
+ +
+ +/*
+ + * calc_f_el calculates forces due to an electric field.
+ + *
+ + * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
+ + *
+ + * Et[] contains the parameters for the time dependent
+ + * part of the field: a cosine, multiplied by a Gaussian envelope
+ + * when three parameters are given.
+ + * Ex[] contains the parameters for
+ + * the spatially dependent part of the field. You can have cool periodic
+ + * fields in principle, but only a constant field is supported
+ + * now.
+ + * The function should compute the energy due to the electric field
+ + * (if any), but for now it returns nothing.
+ + *
+ + * WARNING:
+ + * There can be problems with the virial.
+ + * Since the field is not self-consistent this is unavoidable.
+ + * For neutral molecules the virial is correct within this approximation.
+ + * For neutral systems with many charged molecules the error is small.
+ + * But for systems with a net charge or a few charged molecules
+ + * the error can be significant when the field is high.
+ + * Solution: implement a self-consistent electric field into PME.
+ + */
+ +/* Add the force of an external electric field at time t to f[] for the
+ + * atoms start .. start+homenr-1, and optionally log the field to fp.
+ + *
+ + * Per dimension the time factor is 1 when Et has no parameters, a cosine
+ + * times a Gaussian when it has exactly three, and a plain cosine
+ + * otherwise. The factor is scaled by Ex[].a[0]*FIELDFAC; a dimension
+ + * without Ex parameters contributes no field.
+ + */
+ +static void calc_f_el(FILE *fp, int  start, int homenr,
+ +                      real charge[], rvec f[],
+ +                      t_cosines Ex[], t_cosines Et[], double t)
+ +{
+ +    rvec field;
+ +    real t_centre;
+ +    int  atom, dim;
+ +
+ +    for (dim = 0; dim < DIM; dim++)
+ +    {
+ +        /* Time dependent factor */
+ +        if (Et[dim].n == 3)
+ +        {
+ +            t_centre   = Et[dim].a[1];
+ +            field[dim] = cos(Et[dim].a[0]*(t-t_centre))*exp(-sqr(t-t_centre)/(2.0*sqr(Et[dim].a[2])));
+ +        }
+ +        else if (Et[dim].n > 0)
+ +        {
+ +            field[dim] = cos(Et[dim].a[0]*t);
+ +        }
+ +        else
+ +        {
+ +            field[dim] = 1.0;
+ +        }
+ +
+ +        if (Ex[dim].n <= 0)
+ +        {
+ +            /* No field along this dimension */
+ +            field[dim] = 0;
+ +            continue;
+ +        }
+ +
+ +        /* Convert the field strength from V/nm to MD-units */
+ +        field[dim] *= Ex[dim].a[0]*FIELDFAC;
+ +        for (atom = start; atom < start+homenr; atom++)
+ +        {
+ +            f[atom][dim] += charge[atom]*field[dim];
+ +        }
+ +    }
+ +
+ +    if (fp != NULL)
+ +    {
+ +        fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
+ +                field[XX]/FIELDFAC, field[YY]/FIELDFAC, field[ZZ]/FIELDFAC);
+ +    }
+ +}
+ +
+ +/* Compute the partial virial for the local atoms into vir_part.
+ + *
+ + * vir_part is cleared and then filled with, in this order: the
+ + * shift-vector contribution (calc_vir), the contribution of atoms
+ + * start .. start+homenr-1 (f_calc_vir), the diagonal position restraint
+ + * term fr->vir_diag_posres and the wall term fr->vir_wall_z, which is
+ + * added to the ZZ column. The flop counters in nrnb are updated for the
+ + * two virial calls.
+ + */
+ +static void calc_virial(int start, int homenr, rvec x[], rvec f[],
+ +                        tensor vir_part, t_graph *graph, matrix box,
+ +                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
+ +{
+ +    int i;
+ +
+ +    /* The short-range virial from surrounding boxes */
+ +    clear_mat(vir_part);
+ +    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
+ +    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
+ +
+ +    /* Calculate partial virial, for local atoms only, based on short range.
+ +     * Total virial is computed in global_stat, called from do_md
+ +     */
+ +    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
+ +    inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+ +
+ +    for (i = 0; i < DIM; i++)
+ +    {
+ +        /* Add position restraint contribution */
+ +        vir_part[i][i] += fr->vir_diag_posres[i];
+ +        /* Add wall contribution */
+ +        vir_part[i][ZZ] += fr->vir_wall_z[i];
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+ +    }
+ +}
+ +
+ +/* Evaluate the position restraints and store the results.
+ + *
+ + * The restraint energy is added to enerd->term[F_POSRES] and its lambda
+ + * derivative to enerd->dvdl_nonlin[efptRESTRAINT]; forces and the diagonal
+ + * virial go to fr->f_novirsum and fr->vir_diag_posres. With bSepDVDL the
+ + * energy and derivative are also printed to fplog.
+ + *
+ + * When foreign lambdas are present and GMX_FORCE_DHDL is set in flags,
+ + * the energy is re-evaluated (without forces or virial) for every entry
+ + * of enerd->enerpart_lambda.
+ + */
+ +static void posres_wrapper(FILE *fplog,
+ +                           int flags,
+ +                           gmx_bool bSepDVDL,
+ +                           t_inputrec *ir,
+ +                           t_nrnb *nrnb,
+ +                           gmx_localtop_t *top,
+ +                           matrix box, rvec x[],
+ +                           gmx_enerdata_t *enerd,
+ +                           real *lambda,
+ +                           t_forcerec *fr)
+ +{
+ +    t_pbc pbc;
+ +    real  v, dvdl;
+ +    int   i;
+ +
+ +    /* Position restraints always require full pbc */
+ +    set_pbc(&pbc, ir->ePBC, box);
+ +    dvdl = 0;
+ +    v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+ +                  top->idef.iparams_posres,
+ +                  (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+ +                  ir->ePBC == epbcNONE ? NULL : &pbc,
+ +                  lambda[efptRESTRAINT], &dvdl,
+ +                  fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+ +    if (bSepDVDL)
+ +    {
+ +        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
+ +    }
+ +    enerd->term[F_POSRES] += v;
+ +    /* If just the force constant changes, the FEP term is linear,
+ +     * but if k changes, it is not.
+ +     */
+ +    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
+ +    inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
+ +
+ +    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
+ +    {
+ +        for (i = 0; i < enerd->n_lambda; i++)
+ +        {
+ +            real dvdl_dum, lambda_dum;
+ +
+ +            /* Entry 0 is the current lambda, the rest are foreign lambdas */
+ +            lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
+ +            /* The derivative is not needed here; use a scratch variable
+ +             * instead of overwriting dvdl. It is zeroed first in case
+ +             * posres() accumulates into it.
+ +             */
+ +            dvdl_dum   = 0;
+ +            v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+ +                                top->idef.iparams_posres,
+ +                                (const rvec*)x, NULL, NULL,
+ +                                ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl_dum,
+ +                                fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+ +            enerd->enerpart_lambda[i] += v;
+ +        }
+ +    }
+ +}
+ +
+ +/* Evaluate the COM pulling potential and store the results.
+ + *
+ + * The energy returned by pull_potential() is added to
+ + * enerd->term[F_COM_PULL] and its lambda derivative to
+ + * enerd->dvdl_lin[efptRESTRAINT]. With bSepDVDL the accumulated energy
+ + * term and the derivative are printed to fplog.
+ + */
+ +static void pull_potential_wrapper(FILE *fplog,
+ +                                   gmx_bool bSepDVDL,
+ +                                   t_commrec *cr,
+ +                                   t_inputrec *ir,
+ +                                   matrix box, rvec x[],
+ +                                   rvec f[],
+ +                                   tensor vir_force,
+ +                                   t_mdatoms *mdatoms,
+ +                                   gmx_enerdata_t *enerd,
+ +                                   real *lambda,
+ +                                   double t)
+ +{
+ +    real   dvdl_pull = 0;
+ +    t_pbc  pbc;
+ +
+ +    /* Calculate the center of mass forces, this requires communication,
+ +     * which is why pull_potential is called close to other communication.
+ +     * The virial contribution is calculated directly,
+ +     * which is why we call pull_potential after calc_virial.
+ +     */
+ +    set_pbc(&pbc, ir->ePBC, box);
+ +    enerd->term[F_COM_PULL] +=
+ +        pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
+ +                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl_pull);
+ +    if (bSepDVDL)
+ +    {
+ +        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl_pull);
+ +    }
+ +    enerd->dvdl_lin[efptRESTRAINT] += dvdl_pull;
+ +}
+ +
+ +/* Receive the PME mesh results on this node.
+ + *
+ + * Stops the ewcPPDURINGPME cycle counter and registers its count with the
+ + * domain decomposition, then calls gmx_pme_receive_f() to obtain forces
+ + * (into fr->f_novirsum), virial (into fr->vir_el_recip), energy and
+ + * dV/dlambda. The energy is added to enerd->term[F_COUL_RECIP] and the
+ + * derivative to enerd->dvdl_lin[efptCOUL]. The wait is timed under
+ + * ewcPP_PMEWAITRECVF.
+ + */
+ +static void pme_receive_force_ener(FILE           *fplog,
+ +                                   gmx_bool        bSepDVDL,
+ +                                   t_commrec      *cr,
+ +                                   gmx_wallcycle_t wcycle,
+ +                                   gmx_enerdata_t *enerd,
+ +                                   t_forcerec     *fr)
+ +{
+ +    real   e_mesh, dvdl_mesh;
+ +    float  cycles_ppdpme, cycles_seppme;
+ +
+ +    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
+ +    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
+ +
+ +    /* In case of node-splitting, the PP nodes receive the long-range
+ +     * forces, virial and energy from the PME nodes here.
+ +     */
+ +    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
+ +    dvdl_mesh = 0;
+ +    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_mesh, &dvdl_mesh,
+ +                      &cycles_seppme);
+ +    if (bSepDVDL)
+ +    {
+ +        gmx_print_sepdvdl(fplog, "PME mesh", e_mesh, dvdl_mesh);
+ +    }
+ +    enerd->term[F_COUL_RECIP] += e_mesh;
+ +    enerd->dvdl_lin[efptCOUL] += dvdl_mesh;
+ +    if (wcycle)
+ +    {
+ +        /* Only register the PME cycle count when cycle counting is on */
+ +        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
+ +    }
+ +    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
+ +}
+ +
+ +/* Report every home atom whose force norm is at least pforce, or NaN.
+ + *
+ + * For each atom in md->start .. md->start+md->homenr-1 that qualifies,
+ + * one line with the step, the atom number from ddglatnr(), the position
+ + * and the force norm is written to fp.
+ + */
+ +static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
+ +                               gmx_large_int_t step, real pforce, rvec *x, rvec *f)
+ +{
+ +    char buf[STEPSTRSIZE];
+ +    real limit2, fnorm2;
+ +    int  atom, last;
+ +
+ +    /* Compare squared norms so the square root is only taken when printing */
+ +    limit2 = sqr(pforce);
+ +    last   = md->start+md->homenr;
+ +    for (atom = md->start; atom < last; atom++)
+ +    {
+ +        fnorm2 = norm2(f[atom]);
+ +        /* We also catch NAN, if the compiler does not optimize this away. */
+ +        if (fnorm2 >= limit2 || fnorm2 != fnorm2)
+ +        {
+ +            fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
+ +                    gmx_step_str(step, buf),
+ +                    ddglatnr(cr->dd, atom), x[atom][XX], x[atom][YY], x[atom][ZZ], sqrt(fnorm2));
+ +        }
+ +    }
+ +}
+ +
+ +/* Finish the force calculation for this step.
+ + *
+ + * When forces are kept in the separate fr->f_novirsum buffer
+ + * (fr->bF_NoVirSum): spread the virtual-site part of that buffer onto the
+ + * constructing atoms and, if the virial is requested, add the buffer to
+ + * f and the mesh virial fr->vir_el_recip to vir_force (the latter only
+ + * for EEL_FULL electrostatics).
+ + * When fr->print_force is non-negative, large forces are reported on
+ + * stderr.
+ + */
+ +static void post_process_forces(t_commrec *cr,
+ +                                gmx_large_int_t step,
+ +                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +                                gmx_localtop_t *top,
+ +                                matrix box, rvec x[],
+ +                                rvec f[],
+ +                                tensor vir_force,
+ +                                t_mdatoms *mdatoms,
+ +                                t_graph *graph,
+ +                                t_forcerec *fr, gmx_vsite_t *vsite,
+ +                                int flags)
+ +{
+ +    int first, last;
+ +
+ +    if (fr->bF_NoVirSum)
+ +    {
+ +        if (vsite)
+ +        {
+ +            /* Spread the mesh force on virtual sites to the other particles...
+ +             * This is parallellized. MPI communication is performed
+ +             * if the constructing atoms aren't local.
+ +             */
+ +            wallcycle_start(wcycle, ewcVSITESPREAD);
+ +            spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
+ +                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
+ +                           nrnb,
+ +                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +            wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +        }
+ +        if (flags & GMX_FORCE_VIRIAL)
+ +        {
+ +            /* Now add the forces, this is local */
+ +            if (fr->bDomDec)
+ +            {
+ +                first = 0;
+ +                last  = fr->f_novirsum_n;
+ +            }
+ +            else
+ +            {
+ +                first = mdatoms->start;
+ +                last  = mdatoms->start+mdatoms->homenr;
+ +            }
+ +            sum_forces(first, last, f, fr->f_novirsum);
+ +
+ +            if (EEL_FULL(fr->eeltype))
+ +            {
+ +                /* Add the mesh contribution to the virial */
+ +                m_add(vir_force, fr->vir_el_recip, vir_force);
+ +            }
+ +            if (debug)
+ +            {
+ +                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+ +            }
+ +        }
+ +    }
+ +
+ +    if (fr->print_force >= 0)
+ +    {
+ +        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+ +    }
+ +}
+ +
+ +/* Run (or, for CUDA, launch) the Verlet-scheme non-bonded kernel for the
+ + * interaction locality ilocality and update the flop counters.
+ + *
+ + * Returns immediately when GMX_FORCE_NONBONDED is not set in flags. The
+ + * kernel is selected by the kernel_type of fr->nbv->grp[ilocality]; an
+ + * unknown type or a cut-off scheme other than ecutsVERLET is reported
+ + * through gmx_incons(). After the kernel call the pair counts of the
+ + * group's pair lists are added to nrnb under the counter matching the
+ + * electrostatics flavour, using the F+E counters when GMX_FORCE_ENERGY
+ + * is set.
+ + *
+ + * NOTE(review): wcycle is used below but is neither a parameter nor a
+ + * local of this function in the code visible here - confirm it is
+ + * available at file scope, or extend the signature and all callers.
+ + */
+ +static void do_nb_verlet(t_forcerec *fr,
+ +                         interaction_const_t *ic,
+ +                         gmx_enerdata_t *enerd,
+ +                         int flags, int ilocality,
+ +                         int clearF,
+ +                         t_nrnb *nrnb)
+ +{
+ +    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
+ +    nonbonded_verlet_group_t  *nbvg;
+ +    gmx_bool                   bCUDA;
+ +
+ +    if (!(flags & GMX_FORCE_NONBONDED))
+ +    {
+ +        /* skip non-bonded calculation */
+ +        return;
+ +    }
+ +
+ +    nbvg = &fr->nbv->grp[ilocality];
+ +
+ +    if (fr->cutoff_scheme != ecutsVERLET)
+ +    {
+ +        gmx_incons("Invalid cut-off scheme passed!");
+ +    }
+ +
+ +    bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
+ +
+ +    /* CUDA kernel launch overhead is already timed separately */
+ +    if (!bCUDA)
+ +    {
+ +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+ +    }
+ +    switch (nbvg->kernel_type)
+ +    {
+ +        case nbnxnk4x4_PlainC:
+ +            nbnxn_kernel_ref(&nbvg->nbl_lists,
+ +                             nbvg->nbat, ic,
+ +                             fr->shift_vec,
+ +                             flags,
+ +                             clearF,
+ +                             fr->fshift[0],
+ +                             enerd->grpp.ener[egCOULSR],
+ +                             fr->bBHAM ?
+ +                             enerd->grpp.ener[egBHAMSR] :
+ +                             enerd->grpp.ener[egLJSR]);
+ +            break;
+ +
+ +        case nbnxnk4xN_SIMD_4xN:
+ +            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+ +                                  nbvg->nbat, ic,
+ +                                  nbvg->ewald_excl,
+ +                                  fr->shift_vec,
+ +                                  flags,
+ +                                  clearF,
+ +                                  fr->fshift[0],
+ +                                  enerd->grpp.ener[egCOULSR],
+ +                                  fr->bBHAM ?
+ +                                  enerd->grpp.ener[egBHAMSR] :
+ +                                  enerd->grpp.ener[egLJSR]);
+ +            break;
+ +        case nbnxnk4xN_SIMD_2xNN:
+ +            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+ +                                   nbvg->nbat, ic,
+ +                                   nbvg->ewald_excl,
+ +                                   fr->shift_vec,
+ +                                   flags,
+ +                                   clearF,
+ +                                   fr->fshift[0],
+ +                                   enerd->grpp.ener[egCOULSR],
+ +                                   fr->bBHAM ?
+ +                                   enerd->grpp.ener[egBHAMSR] :
+ +                                   enerd->grpp.ener[egLJSR]);
+ +            break;
+ +
+ +        case nbnxnk8x8x8_CUDA:
+ +            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+ +            break;
+ +
+ +        case nbnxnk8x8x8_PlainC:
+ +            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
+ +                                 nbvg->nbat, ic,
+ +                                 fr->shift_vec,
+ +                                 flags,
+ +                                 clearF,
+ +                                 nbvg->nbat->out[0].f,
+ +                                 fr->fshift[0],
+ +                                 enerd->grpp.ener[egCOULSR],
+ +                                 fr->bBHAM ?
+ +                                 enerd->grpp.ener[egBHAMSR] :
+ +                                 enerd->grpp.ener[egLJSR]);
+ +            break;
+ +
+ +        default:
+ +            gmx_incons("Invalid nonbonded kernel type passed!");
+ +
+ +    }
+ +    if (!bCUDA)
+ +    {
+ +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+ +    }
+ +
+ +    /* Pick the flop counter that matches the electrostatics treatment */
+ +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ +    {
+ +        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
+ +    }
+ +    else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
+ +             (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
+ +    {
+ +        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
+ +    }
+ +    else
+ +    {
+ +        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
+ +    }
+ +    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
+ +    if (flags & GMX_FORCE_ENERGY)
+ +    {
+ +        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
+ +        enr_nbnxn_kernel_ljc += 1;
+ +        enr_nbnxn_kernel_lj  += 1;
+ +    }
+ +
+ +    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
+ +             nbvg->nbl_lists.natpair_ljq);
+ +    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
+ +             nbvg->nbl_lists.natpair_lj);
+ +    /* The Coulomb-only counter is found by offsetting the LJ+Coulomb
+ +     * counter from eNR_NBNXN_LJ_RF to eNR_NBNXN_RF.
+ +     */
+ +    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
+ +             nbvg->nbl_lists.natpair_q);
+ +}
+ +
+ +void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
+ +                         t_inputrec *inputrec,
+ +                         gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +                         gmx_localtop_t *top,
+ +                         gmx_groups_t gmx_unused *groups,
+ +                         matrix box, rvec x[], history_t *hist,
+ +                         rvec f[],
+ +                         tensor vir_force,
+ +                         t_mdatoms *mdatoms,
+ +                         gmx_enerdata_t *enerd, t_fcdata *fcd,
+ +                         real *lambda, t_graph *graph,
+ +                         t_forcerec *fr, interaction_const_t *ic,
+ +                         gmx_vsite_t *vsite, rvec mu_tot,
+ +                         double t, FILE *field, gmx_edsam_t ed,
+ +                         gmx_bool bBornRadii,
+ +                         int flags)
+ +{
+ +    int                 cg0, cg1, i, j;
+ +    int                 start, homenr;
+ +    int                 nb_kernel_type;
+ +    double              mu[2*DIM];
+ +    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ +    gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
+ +    gmx_bool            bDiffKernels = FALSE;
+ +    matrix              boxs;
+ +    rvec                vzero, box_diag;
+ +    real                e, v, dvdl;
+ +    float               cycles_pme, cycles_force;
+ +    nonbonded_verlet_t *nbv;
+ +
+ +    cycles_force   = 0;
+ +    nbv            = fr->nbv;
+ +    nb_kernel_type = fr->nbv->grp[0].kernel_type;
+ +
+ +    start  = mdatoms->start;
+ +    homenr = mdatoms->homenr;
+ +
+ +    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+ +
+ +    clear_mat(vir_force);
+ +
+ +    cg0 = 0;
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        cg1 = cr->dd->ncg_tot;
+ +    }
+ +    else
+ +    {
+ +        cg1 = top->cgs.nr;
+ +    }
+ +    if (fr->n_tpi > 0)
+ +    {
+ +        cg1--;
+ +    }
+ +
+ +    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+ +    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ +    bFillGrid     = (bNS && bStateChanged);
+ +    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
+ +    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
+ +    bDoForces     = (flags & GMX_FORCE_FORCES);
+ +    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
+ +    bUseGPU       = fr->nbv->bUseGPU;
+ +    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
+ +
+ +    if (bStateChanged)
+ +    {
+ +        update_forcerec(fr, box);
+ +
+ +        if (NEED_MUTOT(*inputrec))
+ +        {
+ +            /* Calculate total (local) dipole moment in a temporary common array.
+ +             * This makes it possible to sum them over nodes faster.
+ +             */
+ +            calc_mu(start, homenr,
+ +                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+ +                    mu, mu+DIM);
+ +        }
+ +    }
+ +
+ +    if (fr->ePBC != epbcNONE)
+ +    {
+ +        /* Compute shift vectors every step,
+ +         * because of pressure coupling or box deformation!
+ +         */
+ +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ +        {
+ +            calc_shifts(box, fr->shift_vec);
+ +        }
+ +
+ +        if (bCalcCGCM)
+ +        {
+ +            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
+ +            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
+ +        }
+ +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+ +        {
+ +            unshift_self(graph, box, x);
+ +        }
+ +    }
+ +
+ +    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
+ +                                 fr->shift_vec, nbv->grp[0].nbat);
+ +
+ +#ifdef GMX_MPI
+ +    if (!(cr->duty & DUTY_PME))
+ +    {
+ +        /* Send particle coordinates to the pme nodes.
+ +         * Since this is only implemented for domain decomposition
+ +         * and domain decomposition does not use the graph,
+ +         * we do not need to worry about shifting.
+ +         */
+ +
+ +        wallcycle_start(wcycle, ewcPP_PMESENDX);
+ +
+ +        bBS = (inputrec->nwall == 2);
+ +        if (bBS)
+ +        {
+ +            copy_mat(box, boxs);
+ +            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+ +        }
+ +
+ +        gmx_pme_send_x(cr, bBS ? boxs : box, x,
+ +                       mdatoms->nChargePerturbed, lambda[efptCOUL],
+ +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+ +
+ +        wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ +    }
+ +#endif /* GMX_MPI */
+ +
+ +    /* do gridding for pair search */
+ +    if (bNS)
+ +    {
+ +        if (graph && bStateChanged)
+ +        {
+ +            /* Calculate intramolecular shift vectors to make molecules whole */
+ +            mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        }
+ +
+ +        clear_rvec(vzero);
+ +        box_diag[XX] = box[XX][XX];
+ +        box_diag[YY] = box[YY][YY];
+ +        box_diag[ZZ] = box[ZZ][ZZ];
+ +
+ +        wallcycle_start(wcycle, ewcNS);
+ +        if (!fr->bDomDec)
+ +        {
+ +            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
+ +            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
+ +                              0, vzero, box_diag,
+ +                              0, mdatoms->homenr, -1, fr->cginfo, x,
+ +                              0, NULL,
+ +                              nbv->grp[eintLocal].kernel_type,
+ +                              nbv->grp[eintLocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
+ +        }
+ +        else
+ +        {
+ +            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
+ +            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
+ +                                       fr->cginfo, x,
+ +                                       nbv->grp[eintNonlocal].kernel_type,
+ +                                       nbv->grp[eintNonlocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
+ +        }
+ +
+ +        if (nbv->ngrp == 1 ||
+ +            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
+ +        {
+ +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
+ +                               nbv->nbs, mdatoms, fr->cginfo);
+ +        }
+ +        else
+ +        {
+ +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
+ +                               nbv->nbs, mdatoms, fr->cginfo);
+ +            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
+ +                               nbv->nbs, mdatoms, fr->cginfo);
+ +        }
+ +        wallcycle_stop(wcycle, ewcNS);
+ +    }
+ +
+ +    /* initialize the GPU atom data and copy shift vector */
+ +    if (bUseGPU)
+ +    {
+ +        if (bNS)
+ +        {
+ +            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ +            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +        }
+ +
+ +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ +        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +    }
+ +
+ +    /* do local pair search */
+ +    if (bNS)
+ +    {
+ +        wallcycle_start_nocount(wcycle, ewcNS);
+ +        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
+ +        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
+ +                            &top->excls,
+ +                            ic->rlist,
+ +                            nbv->min_ci_balanced,
+ +                            &nbv->grp[eintLocal].nbl_lists,
+ +                            eintLocal,
+ +                            nbv->grp[eintLocal].kernel_type,
+ +                            nrnb);
+ +        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
+ +
+ +        if (bUseGPU)
+ +        {
+ +            /* initialize local pair-list on the GPU */
+ +            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ +                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
+ +                                     eintLocal);
+ +        }
+ +        wallcycle_stop(wcycle, ewcNS);
+ +    }
+ +    else
+ +    {
+ +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ +        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
+ +                                        nbv->grp[eintLocal].nbat);
+ +        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +    }
+ +
+ +    if (bUseGPU)
+ +    {
+ +        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ +        /* launch local nonbonded F on GPU */
+ +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ +                     nrnb);
+ +        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +    }
+ +
+ +    /* Communicate coordinates and sum dipole if necessary +
+ +       do non-local pair search */
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
+ +                        nbv->grp[eintLocal].kernel_type);
+ +
+ +        if (bDiffKernels)
+ +        {
+ +            /* With GPU+CPU non-bonded calculations we need to copy
+ +             * the local coordinates to the non-local nbat struct
+ +             * (in CPU format) as the non-local kernel call also
+ +             * calculates the local - non-local interactions.
+ +             */
+ +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
+ +                                            nbv->grp[eintNonlocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ +            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        }
+ +
+ +        if (bNS)
+ +        {
+ +            wallcycle_start_nocount(wcycle, ewcNS);
+ +            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+ +
+ +            if (bDiffKernels)
+ +            {
+ +                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
+ +            }
+ +
+ +            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
+ +                                &top->excls,
+ +                                ic->rlist,
+ +                                nbv->min_ci_balanced,
+ +                                &nbv->grp[eintNonlocal].nbl_lists,
+ +                                eintNonlocal,
+ +                                nbv->grp[eintNonlocal].kernel_type,
+ +                                nrnb);
+ +
+ +            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+ +
+ +            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
+ +            {
+ +                /* initialize non-local pair-list on the GPU */
+ +                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ +                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+ +                                         eintNonlocal);
+ +            }
+ +            wallcycle_stop(wcycle, ewcNS);
+ +        }
+ +        else
+ +        {
+ +            wallcycle_start(wcycle, ewcMOVEX);
+ +            dd_move_x(cr->dd, box, x);
+ +
+ +            /* When we don't need the total dipole we sum it in global_stat */
+ +            if (bStateChanged && NEED_MUTOT(*inputrec))
+ +            {
+ +                gmx_sumd(2*DIM, mu, cr);
+ +            }
+ +            wallcycle_stop(wcycle, ewcMOVEX);
+ +
+ +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
+ +                                            nbv->grp[eintNonlocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        }
+ +
+ +        if (bUseGPU && !bDiffKernels)
+ +        {
+ +            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ +            /* launch non-local nonbonded F on GPU */
+ +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ +                         nrnb);
+ +            cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +        }
+ +    }
+ +
+ +    if (bUseGPU)
+ +    {
+ +        /* launch D2H copy-back F */
+ +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +        if (DOMAINDECOMP(cr) && !bDiffKernels)
+ +        {
+ +            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
+ +                                      flags, eatNonlocal);
+ +        }
+ +        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
+ +                                  flags, eatLocal);
+ +        cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +    }
+ +
+ +    if (bStateChanged && NEED_MUTOT(*inputrec))
+ +    {
+ +        if (PAR(cr))
+ +        {
+ +            gmx_sumd(2*DIM, mu, cr);
+ +        }
+ +
+ +        for (i = 0; i < 2; i++)
+ +        {
+ +            for (j = 0; j < DIM; j++)
+ +            {
+ +                fr->mu_tot[i][j] = mu[i*DIM + j];
+ +            }
+ +        }
+ +    }
+ +    if (fr->efep == efepNO)
+ +    {
+ +        copy_rvec(fr->mu_tot[0], mu_tot);
+ +    }
+ +    else
+ +    {
+ +        for (j = 0; j < DIM; j++)
+ +        {
+ +            mu_tot[j] =
+ +                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
+ +                lambda[efptCOUL]*fr->mu_tot[1][j];
+ +        }
+ +    }
+ +
+ +    /* Reset energies */
+ +    reset_enerdata(fr, bNS, enerd, MASTER(cr));
+ +    clear_rvecs(SHIFTS, fr->fshift);
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        if (!(cr->duty & DUTY_PME))
+ +        {
+ +            wallcycle_start(wcycle, ewcPPDURINGPME);
+ +            dd_force_flop_start(cr->dd, nrnb);
+ +        }
+ +    }
+ +
+ +    if (inputrec->bRot)
+ +    {
+ +        /* Enforced rotation has its own cycle counter that starts after the collective
+ +         * coordinates have been communicated. It is added to ddCyclF to allow
+ +         * for proper load-balancing */
+ +        wallcycle_start(wcycle, ewcROT);
+ +        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ +        wallcycle_stop(wcycle, ewcROT);
+ +    }
+ +
+ +    /* Start the force cycle counter.
+ +     * This counter is stopped in do_force_lowlevel.
+ +     * No parallel communication should occur while this counter is running,
+ +     * since that will interfere with the dynamic load balancing.
+ +     */
+ +    wallcycle_start(wcycle, ewcFORCE);
+ +    if (bDoForces)
+ +    {
+ +        /* Reset forces for which the virial is calculated separately:
+ +         * PME/Ewald forces if necessary */
+ +        if (fr->bF_NoVirSum)
+ +        {
+ +            if (flags & GMX_FORCE_VIRIAL)
+ +            {
+ +                fr->f_novirsum = fr->f_novirsum_alloc;
+ +                if (fr->bDomDec)
+ +                {
+ +                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+ +                }
+ +                else
+ +                {
+ +                    clear_rvecs(homenr, fr->f_novirsum+start);
+ +                }
+ +            }
+ +            else
+ +            {
+ +                /* We are not calculating the pressure so we do not need
+ +                 * a separate array for forces that do not contribute
+ +                 * to the pressure.
+ +                 */
+ +                fr->f_novirsum = f;
+ +            }
+ +        }
+ +
+ +        /* Clear the short- and long-range forces */
+ +        clear_rvecs(fr->natoms_force_constr, f);
+ +        if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+ +        }
+ +
+ +        clear_rvec(fr->vir_diag_posres);
+ +    }
+ +
+ +    if (inputrec->ePull == epullCONSTRAINT)
+ +    {
+ +        clear_pull_forces(inputrec->pull);
+ +    }
+ +
+ +    /* We calculate the non-bonded forces, when done on the CPU, here.
+ +     * We do this before calling do_force_lowlevel, as in there bondeds
+ +     * forces are calculated before PME, which does communication.
+ +     * With this order, non-bonded and bonded force calculation imbalance
+ +     * can be balanced out by the domain decomposition load balancing.
+ +     */
+ +
+ +    if (!bUseOrEmulGPU)
+ +    {
+ +        /* Maybe we should move this into do_force_lowlevel */
+ +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ +                     nrnb);
+ +    }
+ +
+ +    if (!bUseOrEmulGPU || bDiffKernels)
+ +    {
+ +        int aloc;
+ +
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
+ +                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
+ +                         nrnb);
+ +        }
+ +
+ +        if (!bUseOrEmulGPU)
+ +        {
+ +            aloc = eintLocal;
+ +        }
+ +        else
+ +        {
+ +            aloc = eintNonlocal;
+ +        }
+ +
+ +        /* Add all the non-bonded force to the normal force array.
+ +         * This can be split into a local and a non-local part when overlapping
+ +         * communication with calculation with domain decomposition.
+ +         */
+ +        cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ +        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
+ +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ +        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_start_nocount(wcycle, ewcFORCE);
+ +
+ +        /* if there are multiple fshift output buffers reduce them */
+ +        if ((flags & GMX_FORCE_VIRIAL) &&
+ +            nbv->grp[aloc].nbl_lists.nnbl > 1)
+ +        {
+ +            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+ +                                                     fr->fshift);
+ +        }
+ +    }
+ +
+ +    /* update QMMMrec, if necessary */
+ +    if (fr->bQMMM)
+ +    {
+ +        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ +    }
+ +
+ +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ +    {
+ +        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ +                       enerd, lambda, fr);
+ +    }
+ +
+ +    /* Compute the bonded and non-bonded energies and optionally forces */
+ +    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +                      cr, nrnb, wcycle, mdatoms,
+ +                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
+ +                      &(top->atomtypes), bBornRadii, box,
+ +                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
+ +                      flags, &cycles_pme);
+ +
+ +    if (bSepLRF)
+ +    {
+ +        if (do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            /* Add the long range forces to the short range forces */
+ +            for (i = 0; i < fr->natoms_force_constr; i++)
+ +            {
+ +                rvec_add(fr->f_twin[i], f[i], f[i]);
+ +            }
+ +        }
+ +    }
+ +
+ +    cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ +
+ +    if (ed)
+ +    {
+ +        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ +    }
+ +
+ +    if (bUseOrEmulGPU && !bDiffKernels)
+ +    {
+ +        /* wait for non-local forces (or calculate in emulation mode) */
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            if (bUseGPU)
+ +            {
+ +                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
+ +                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ +                                    nbv->grp[eintNonlocal].nbat,
+ +                                    flags, eatNonlocal,
+ +                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ +                                    fr->fshift);
+ +                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ +            }
+ +            else
+ +            {
+ +                wallcycle_start_nocount(wcycle, ewcFORCE);
+ +                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ +                             nrnb);
+ +                cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ +            }
+ +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ +            /* skip the reduction if there was no non-local work to do */
+ +            if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ +            {
+ +                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
+ +                                               nbv->grp[eintNonlocal].nbat, f);
+ +            }
+ +            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        }
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        /* Communicate the forces */
+ +        if (PAR(cr))
+ +        {
+ +            wallcycle_start(wcycle, ewcMOVEF);
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                dd_move_f(cr->dd, f, fr->fshift);
+ +                /* Do we need to communicate the separate force array
+ +                 * for terms that do not contribute to the single sum virial?
+ +                 * Position restraints and electric fields do not introduce
+ +                 * inter-cg forces, only full electrostatics methods do.
+ +                 * When we do not calculate the virial, fr->f_novirsum = f,
+ +                 * so we have already communicated these forces.
+ +                 */
+ +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ +                    (flags & GMX_FORCE_VIRIAL))
+ +                {
+ +                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ +                }
+ +                if (bSepLRF)
+ +                {
+ +                    /* We should not update the shift forces here,
+ +                     * since f_twin is already included in f.
+ +                     */
+ +                    dd_move_f(cr->dd, fr->f_twin, NULL);
+ +                }
+ +            }
+ +            wallcycle_stop(wcycle, ewcMOVEF);
+ +        }
+ +    }
+ +
+ +    if (bUseOrEmulGPU)
+ +    {
+ +        /* wait for local forces (or calculate in emulation mode) */
+ +        if (bUseGPU)
+ +        {
+ +            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ +            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ +                                nbv->grp[eintLocal].nbat,
+ +                                flags, eatLocal,
+ +                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ +                                fr->fshift);
+ +            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ +
+ +            /* now clear the GPU outputs while we finish the step on the CPU */
+ +
+ +            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ +            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +        }
+ +        else
+ +        {
+ +            wallcycle_start_nocount(wcycle, ewcFORCE);
+ +            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ +                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+ +                         nrnb);
+ +            wallcycle_stop(wcycle, ewcFORCE);
+ +        }
+ +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ +        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ +        {
+ +            /* skip the reduction if there was no local work to do */
+ +            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ +                                           nbv->grp[eintLocal].nbat, f);
+ +        }
+ +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        dd_force_flop_stop(cr->dd, nrnb);
+ +        if (wcycle)
+ +        {
+ +            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ +        }
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        if (IR_ELEC_FIELD(*inputrec))
+ +        {
+ +            /* Compute forces due to electric field */
+ +            calc_f_el(MASTER(cr) ? field : NULL,
+ +                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
+ +                      inputrec->ex, inputrec->et, t);
+ +        }
+ +
+ +        /* If we have NoVirSum forces, but we do not calculate the virial,
+ +         * we sum fr->f_novirsum=f later.
+ +         */
+ +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ +        {
+ +            wallcycle_start(wcycle, ewcVSITESPREAD);
+ +            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ +                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +            wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +
+ +            if (bSepLRF)
+ +            {
+ +                wallcycle_start(wcycle, ewcVSITESPREAD);
+ +                spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
+ +                               nrnb,
+ +                               &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +                wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +            }
+ +        }
+ +
+ +        if (flags & GMX_FORCE_VIRIAL)
+ +        {
+ +            /* Calculation of the virial must be done after vsites! */
+ +            calc_virial(mdatoms->start, mdatoms->homenr, x, f,
+ +                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ +        }
+ +    }
+ +
+ +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ +    {
+ +        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +                               f, vir_force, mdatoms, enerd, lambda, t);
+ +    }
+ +
+ +    /* Add the forces from enforced rotation potentials (if any) */
+ +    if (inputrec->bRot)
+ +    {
+ +        wallcycle_start(wcycle, ewcROTadd);
+ +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ +        wallcycle_stop(wcycle, ewcROTadd);
+ +    }
+ +
+ +    if (PAR(cr) && !(cr->duty & DUTY_PME))
+ +    {
+ +        /* In case of node-splitting, the PP nodes receive the long-range
+ +         * forces, virial and energy from the PME nodes here.
+ +         */
+ +        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        post_process_forces(cr, step, nrnb, wcycle,
+ +                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ +                            flags);
+ +    }
+ +
+ +    /* Sum the potential energy terms from group contributions */
+ +    sum_epot(&(enerd->grpp), enerd->term);
+ +}
+ +
+ +void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
+ +                        t_inputrec *inputrec,
+ +                        gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +                        gmx_localtop_t *top,
+ +                        gmx_groups_t *groups,
+ +                        matrix box, rvec x[], history_t *hist,
+ +                        rvec f[],
+ +                        tensor vir_force,
+ +                        t_mdatoms *mdatoms,
+ +                        gmx_enerdata_t *enerd, t_fcdata *fcd,
+ +                        real *lambda, t_graph *graph,
+ +                        t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
+ +                        double t, FILE *field, gmx_edsam_t ed,
+ +                        gmx_bool bBornRadii,
+ +                        int flags)
+ +{
+ +    int        cg0, cg1, i, j;
+ +    int        start, homenr;
+ +    double     mu[2*DIM];
+ +    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ +    gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
+ +    gmx_bool   bDoAdressWF;
+ +    matrix     boxs;
+ +    rvec       vzero, box_diag;
+ +    real       e, v, dvdlambda[efptNR];
+ +    t_pbc      pbc;
+ +    float      cycles_pme, cycles_force;
+ +
+ +    start  = mdatoms->start;
+ +    homenr = mdatoms->homenr;
+ +
+ +    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+ +
+ +    clear_mat(vir_force);
+ +
+ +    if (PARTDECOMP(cr))
+ +    {
+ +        pd_cg_range(cr, &cg0, &cg1);
+ +    }
+ +    else
+ +    {
+ +        cg0 = 0;
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            cg1 = cr->dd->ncg_tot;
+ +        }
+ +        else
+ +        {
+ +            cg1 = top->cgs.nr;
+ +        }
+ +        if (fr->n_tpi > 0)
+ +        {
+ +            cg1--;
+ +        }
+ +    }
+ +
+ +    bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
+ +    bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ +    /* Should we update the long-range neighborlists at this step? */
+ +    bDoLongRangeNS = fr->bTwinRange && bNS;
+ +    /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
+ +    bFillGrid      = (bNS && bStateChanged);
+ +    bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
+ +    bDoForces      = (flags & GMX_FORCE_FORCES);
+ +    bDoPotential   = (flags & GMX_FORCE_ENERGY);
+ +    bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
+ +                      (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
+ +
+ +    /* should probably move this to the forcerec since it doesn't change */
+ +    bDoAdressWF   = ((fr->adress_type != eAdressOff));
+ +
+ +    if (bStateChanged)
+ +    {
+ +        update_forcerec(fr, box);
+ +
+ +        if (NEED_MUTOT(*inputrec))
+ +        {
+ +            /* Calculate total (local) dipole moment in a temporary common array.
+ +             * This makes it possible to sum them over nodes faster.
+ +             */
+ +            calc_mu(start, homenr,
+ +                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+ +                    mu, mu+DIM);
+ +        }
+ +    }
+ +
+ +    if (fr->ePBC != epbcNONE)
+ +    {
+ +        /* Compute shift vectors every step,
+ +         * because of pressure coupling or box deformation!
+ +         */
+ +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ +        {
+ +            calc_shifts(box, fr->shift_vec);
+ +        }
+ +
+ +        if (bCalcCGCM)
+ +        {
+ +            put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
+ +                                     &(top->cgs), x, fr->cg_cm);
+ +            inc_nrnb(nrnb, eNR_CGCM, homenr);
+ +            inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
+ +        }
+ +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+ +        {
+ +            unshift_self(graph, box, x);
+ +        }
+ +    }
+ +    else if (bCalcCGCM)
+ +    {
+ +        calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
+ +        inc_nrnb(nrnb, eNR_CGCM, homenr);
+ +    }
+ +
+ +    if (bCalcCGCM)
+ +    {
+ +        if (PAR(cr))
+ +        {
+ +            move_cgcm(fplog, cr, fr->cg_cm);
+ +        }
+ +        if (gmx_debug_at)
+ +        {
+ +            pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
+ +        }
+ +    }
+ +
+ +#ifdef GMX_MPI
+ +    if (!(cr->duty & DUTY_PME))
+ +    {
+ +        /* Send particle coordinates to the pme nodes.
+ +         * Since this is only implemented for domain decomposition
+ +         * and domain decomposition does not use the graph,
+ +         * we do not need to worry about shifting.
+ +         */
+ +
+ +        wallcycle_start(wcycle, ewcPP_PMESENDX);
+ +
+ +        bBS = (inputrec->nwall == 2);
+ +        if (bBS)
+ +        {
+ +            copy_mat(box, boxs);
+ +            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+ +        }
+ +
+ +        gmx_pme_send_x(cr, bBS ? boxs : box, x,
+ +                       mdatoms->nChargePerturbed, lambda[efptCOUL],
+ +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+ +
+ +        wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ +    }
+ +#endif /* GMX_MPI */
+ +
+ +    /* Communicate coordinates and sum dipole if necessary */
+ +    if (PAR(cr))
+ +    {
+ +        wallcycle_start(wcycle, ewcMOVEX);
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            dd_move_x(cr->dd, box, x);
+ +        }
+ +        else
+ +        {
+ +            move_x(cr, x, nrnb);
+ +        }
+ +        wallcycle_stop(wcycle, ewcMOVEX);
+ +    }
+ +
+ +    /* update adress weight beforehand */
+ +    if (bStateChanged && bDoAdressWF)
+ +    {
+ +        /* need pbc for adress weight calculation with pbc_dx */
+ +        set_pbc(&pbc, inputrec->ePBC, box);
+ +        if (fr->adress_site == eAdressSITEcog)
+ +        {
+ +            update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
+ +                                      inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +        else if (fr->adress_site == eAdressSITEcom)
+ +        {
+ +            update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ +                                      inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +        else if (fr->adress_site == eAdressSITEatomatom)
+ +        {
+ +            update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ +                                                inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +        else
+ +        {
+ +            update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ +                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +    }
+ +
+ +    if (NEED_MUTOT(*inputrec))
+ +    {
+ +
+ +        if (bStateChanged)
+ +        {
+ +            if (PAR(cr))
+ +            {
+ +                gmx_sumd(2*DIM, mu, cr);
+ +            }
+ +            for (i = 0; i < 2; i++)
+ +            {
+ +                for (j = 0; j < DIM; j++)
+ +                {
+ +                    fr->mu_tot[i][j] = mu[i*DIM + j];
+ +                }
+ +            }
+ +        }
+ +        if (fr->efep == efepNO)
+ +        {
+ +            copy_rvec(fr->mu_tot[0], mu_tot);
+ +        }
+ +        else
+ +        {
+ +            for (j = 0; j < DIM; j++)
+ +            {
+ +                mu_tot[j] =
+ +                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Reset energies */
+ +    reset_enerdata(fr, bNS, enerd, MASTER(cr));
+ +    clear_rvecs(SHIFTS, fr->fshift);
+ +
+ +    if (bNS)
+ +    {
+ +        wallcycle_start(wcycle, ewcNS);
+ +
+ +        if (graph && bStateChanged)
+ +        {
+ +            /* Calculate intramolecular shift vectors to make molecules whole */
+ +            mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        }
+ +
++        /* Do the actual neighbour searching */
+ +        ns(fplog, fr, box,
+ +           groups, top, mdatoms,
+ +           cr, nrnb, bFillGrid,
+ +           bDoLongRangeNS);
+ +
+ +        wallcycle_stop(wcycle, ewcNS);
+ +    }
+ +
+ +    if (inputrec->implicit_solvent && bNS)
+ +    {
+ +        make_gb_nblist(cr, inputrec->gb_algorithm,
+ +                       x, box, fr, &top->idef, graph, fr->born);
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        if (!(cr->duty & DUTY_PME))
+ +        {
+ +            wallcycle_start(wcycle, ewcPPDURINGPME);
+ +            dd_force_flop_start(cr->dd, nrnb);
+ +        }
+ +    }
+ +
+ +    if (inputrec->bRot)
+ +    {
+ +        /* Enforced rotation has its own cycle counter that starts after the collective
+ +         * coordinates have been communicated. It is added to ddCyclF to allow
+ +         * for proper load-balancing */
+ +        wallcycle_start(wcycle, ewcROT);
+ +        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ +        wallcycle_stop(wcycle, ewcROT);
+ +    }
+ +
+ +    /* Start the force cycle counter.
+ +     * This counter is stopped in do_force_lowlevel.
+ +     * No parallel communication should occur while this counter is running,
+ +     * since that will interfere with the dynamic load balancing.
+ +     */
+ +    wallcycle_start(wcycle, ewcFORCE);
+ +
+ +    if (bDoForces)
+ +    {
+ +        /* Reset forces for which the virial is calculated separately:
+ +         * PME/Ewald forces if necessary */
+ +        if (fr->bF_NoVirSum)
+ +        {
+ +            if (flags & GMX_FORCE_VIRIAL)
+ +            {
+ +                fr->f_novirsum = fr->f_novirsum_alloc;
+ +                if (fr->bDomDec)
+ +                {
+ +                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+ +                }
+ +                else
+ +                {
+ +                    clear_rvecs(homenr, fr->f_novirsum+start);
+ +                }
+ +            }
+ +            else
+ +            {
+ +                /* We are not calculating the pressure so we do not need
+ +                 * a separate array for forces that do not contribute
+ +                 * to the pressure.
+ +                 */
+ +                fr->f_novirsum = f;
+ +            }
+ +        }
+ +
+ +        /* Clear the short- and long-range forces */
+ +        clear_rvecs(fr->natoms_force_constr, f);
+ +        if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+ +        }
+ +
+ +        clear_rvec(fr->vir_diag_posres);
+ +    }
+ +    if (inputrec->ePull == epullCONSTRAINT)
+ +    {
+ +        clear_pull_forces(inputrec->pull);
+ +    }
+ +
+ +    /* update QMMMrec, if necessary */
+ +    if (fr->bQMMM)
+ +    {
+ +        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ +    }
+ +
+ +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ +    {
+ +        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ +                       enerd, lambda, fr);
+ +    }
+ +
+ +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+ +    {
+ +        /* Flat-bottomed position restraints always require full pbc */
+ +        if (!(bStateChanged && bDoAdressWF))
+ +        {
+ +            set_pbc(&pbc, inputrec->ePBC, box);
+ +        }
+ +        v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
+ +                     top->idef.iparams_fbposres,
+ +                     (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+ +                     inputrec->ePBC == epbcNONE ? NULL : &pbc,
+ +                     fr->rc_scaling, fr->ePBC, fr->posres_com);
+ +        enerd->term[F_FBPOSRES] += v;
+ +        inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
+ +    }
+ +
+ +    /* Compute the bonded and non-bonded energies and optionally forces */
+ +    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +                      cr, nrnb, wcycle, mdatoms,
+ +                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
+ +                      &(top->atomtypes), bBornRadii, box,
+ +                      inputrec->fepvals, lambda,
+ +                      graph, &(top->excls), fr->mu_tot,
+ +                      flags,
+ +                      &cycles_pme);
+ +
+ +    if (bSepLRF)
+ +    {
+ +        if (do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            /* Add the long range forces to the short range forces */
+ +            for (i = 0; i < fr->natoms_force_constr; i++)
+ +            {
+ +                rvec_add(fr->f_twin[i], f[i], f[i]);
+ +            }
+ +        }
+ +    }
+ +
+ +    cycles_force = wallcycle_stop(wcycle, ewcFORCE);
+ +
+ +    if (ed)
+ +    {
+ +        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        dd_force_flop_stop(cr->dd, nrnb);
+ +        if (wcycle)
+ +        {
+ +            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ +        }
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        if (IR_ELEC_FIELD(*inputrec))
+ +        {
+ +            /* Compute forces due to electric field */
+ +            calc_f_el(MASTER(cr) ? field : NULL,
+ +                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
+ +                      inputrec->ex, inputrec->et, t);
+ +        }
+ +
+ +        if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
+ +        {
+ +            /* Compute thermodynamic force in hybrid AdResS region */
+ +            adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
+ +                                inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +
+ +        /* Communicate the forces */
+ +        if (PAR(cr))
+ +        {
+ +            wallcycle_start(wcycle, ewcMOVEF);
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                dd_move_f(cr->dd, f, fr->fshift);
+ +                /* Do we need to communicate the separate force array
+ +                 * for terms that do not contribute to the single sum virial?
+ +                 * Position restraints and electric fields do not introduce
+ +                 * inter-cg forces, only full electrostatics methods do.
+ +                 * When we do not calculate the virial, fr->f_novirsum = f,
+ +                 * so we have already communicated these forces.
+ +                 */
+ +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ +                    (flags & GMX_FORCE_VIRIAL))
+ +                {
+ +                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ +                }
+ +                if (bSepLRF)
+ +                {
+ +                    /* We should not update the shift forces here,
+ +                     * since f_twin is already included in f.
+ +                     */
+ +                    dd_move_f(cr->dd, fr->f_twin, NULL);
+ +                }
+ +            }
+ +            else
+ +            {
+ +                pd_move_f(cr, f, nrnb);
+ +                if (bSepLRF)
+ +                {
+ +                    pd_move_f(cr, fr->f_twin, nrnb);
+ +                }
+ +            }
+ +            wallcycle_stop(wcycle, ewcMOVEF);
+ +        }
+ +
+ +        /* If we have NoVirSum forces, but we do not calculate the virial,
+ +         * we sum fr->f_novirum=f later.
+ +         */
+ +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ +        {
+ +            wallcycle_start(wcycle, ewcVSITESPREAD);
+ +            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ +                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +            wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +
+ +            if (bSepLRF)
+ +            {
+ +                wallcycle_start(wcycle, ewcVSITESPREAD);
+ +                spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
+ +                               nrnb,
+ +                               &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +                wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +            }
+ +        }
+ +
+ +        if (flags & GMX_FORCE_VIRIAL)
+ +        {
+ +            /* Calculation of the virial must be done after vsites! */
+ +            calc_virial(mdatoms->start, mdatoms->homenr, x, f,
+ +                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ +        }
+ +    }
+ +
+ +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ +    {
+ +        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +                               f, vir_force, mdatoms, enerd, lambda, t);
+ +    }
+ +
+ +    /* Add the forces from enforced rotation potentials (if any) */
+ +    if (inputrec->bRot)
+ +    {
+ +        wallcycle_start(wcycle, ewcROTadd);
+ +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ +        wallcycle_stop(wcycle, ewcROTadd);
+ +    }
+ +
+ +    if (PAR(cr) && !(cr->duty & DUTY_PME))
+ +    {
+ +        /* In case of node-splitting, the PP nodes receive the long-range
+ +         * forces, virial and energy from the PME nodes here.
+ +         */
+ +        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        post_process_forces(cr, step, nrnb, wcycle,
+ +                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ +                            flags);
+ +    }
+ +
+ +    /* Sum the potential energy terms from group contributions */
+ +    sum_epot(&(enerd->grpp), enerd->term);
+ +}
+ +
+ +/* Top-level force entry point.
+ + *
+ + * Forwards all arguments to the force routine that matches
+ + * inputrec->cutoff_scheme (Verlet or group). Before dispatching, the
+ + * GMX_FORCE_NONBONDED bit is removed from flags when fr->bNonbonded is
+ + * not set. Any other cut-off scheme value is reported via gmx_incons().
+ + */
+ +void do_force(FILE *fplog, t_commrec *cr,
+ +              t_inputrec *inputrec,
+ +              gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +              gmx_localtop_t *top,
+ +              gmx_groups_t *groups,
+ +              matrix box, rvec x[], history_t *hist,
+ +              rvec f[],
+ +              tensor vir_force,
+ +              t_mdatoms *mdatoms,
+ +              gmx_enerdata_t *enerd, t_fcdata *fcd,
+ +              real *lambda, t_graph *graph,
+ +              t_forcerec *fr,
+ +              gmx_vsite_t *vsite, rvec mu_tot,
+ +              double t, FILE *field, gmx_edsam_t ed,
+ +              gmx_bool bBornRadii,
+ +              int flags)
+ +{
+ +    if (!fr->bNonbonded)
+ +    {
+ +        /* Non-bonded interactions are switched off: mask them out */
+ +        flags &= ~GMX_FORCE_NONBONDED;
+ +    }
+ +
+ +    if (inputrec->cutoff_scheme == ecutsVERLET)
+ +    {
+ +        /* The Verlet path additionally receives fr->ic */
+ +        do_force_cutsVERLET(fplog, cr, inputrec,
+ +                            step, nrnb, wcycle,
+ +                            top,
+ +                            groups,
+ +                            box, x, hist,
+ +                            f, vir_force,
+ +                            mdatoms,
+ +                            enerd, fcd,
+ +                            lambda, graph,
+ +                            fr, fr->ic,
+ +                            vsite, mu_tot,
+ +                            t, field, ed,
+ +                            bBornRadii,
+ +                            flags);
+ +    }
+ +    else if (inputrec->cutoff_scheme == ecutsGROUP)
+ +    {
+ +        do_force_cutsGROUP(fplog, cr, inputrec,
+ +                           step, nrnb, wcycle,
+ +                           top,
+ +                           groups,
+ +                           box, x, hist,
+ +                           f, vir_force,
+ +                           mdatoms,
+ +                           enerd, fcd,
+ +                           lambda, graph,
+ +                           fr, vsite, mu_tot,
+ +                           t, field, ed,
+ +                           bBornRadii,
+ +                           flags);
+ +    }
+ +    else
+ +    {
+ +        gmx_incons("Invalid cut-off scheme passed!");
+ +    }
+ +}
+ +
+ +
+ +/* Apply the constraints to the starting configuration.
+ + *
+ + * Three stages, each a call to constrain():
+ + *  1. the positions in state->x are constrained in place;
+ + *  2. for velocity-Verlet integrators (EI_VV) the velocities in
+ + *     state->v are constrained in place;
+ + *  3. for integrators with state velocities other than eiVV, the
+ + *     positions one time step back (x - dt*v) are built in a scratch
+ + *     array and constrained against state->x, with state->v passed to
+ + *     constrain() as the velocity array.
+ + *
+ + * fplog may be NULL, in which case nothing is logged.
+ + */
+ +void do_constrain_first(FILE *fplog, gmx_constr_t constr,
+ +                        t_inputrec *ir, t_mdatoms *md,
+ +                        t_state *state, t_commrec *cr, t_nrnb *nrnb,
+ +                        t_forcerec *fr, gmx_localtop_t *top)
+ +{
+ +    int             i, m, start, end;
+ +    gmx_large_int_t step;
+ +    real            dt = ir->delta_t;
+ +    real            dvdl_dum;
+ +    rvec           *savex;
+ +
+ +    /* Range of home atoms handled by this rank */
+ +    start = md->start;
+ +    end   = md->homenr + start;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
+ +                start, md->homenr, end);
+ +    }
+ +    /* Do a first constrain to reset particles... */
+ +    step = ir->init_step;
+ +    if (fplog)
+ +    {
+ +        char buf[STEPSTRSIZE];
+ +        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
+ +                gmx_step_str(step, buf));
+ +    }
+ +    dvdl_dum = 0;
+ +
+ +    /* constrain the current position */
+ +    constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ +              ir, NULL, cr, step, 0, md,
+ +              state->x, state->x, NULL,
+ +              fr->bMolPBC, state->box,
+ +              state->lambda[efptBONDED], &dvdl_dum,
+ +              NULL, NULL, nrnb, econqCoord,
+ +              ir->epc == epcMTTK, state->veta, state->veta);
+ +    if (EI_VV(ir->eI))
+ +    {
+ +        /* constrain the initial velocity, and save it */
+ +        /* also may be useful if we need the ekin from the halfstep for velocity verlet */
+ +        /* might not yet treat veta correctly */
+ +        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ +                  ir, NULL, cr, step, 0, md,
+ +                  state->x, state->v, state->v,
+ +                  fr->bMolPBC, state->box,
+ +                  state->lambda[efptBONDED], &dvdl_dum,
+ +                  NULL, NULL, nrnb, econqVeloc,
+ +                  ir->epc == epcMTTK, state->veta, state->veta);
+ +    }
+ +    /* constrain the initial velocities at t-dt/2 */
+ +    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
+ +    {
+ +        /* The scratch coordinates are only needed in this branch,
+ +         * so allocate (and free) them here.
+ +         */
+ +        snew(savex, state->natoms);
+ +
+ +        for (i = start; (i < end); i++)
+ +        {
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                /* Reverse the velocity */
+ +                state->v[i][m] = -state->v[i][m];
+ +                /* Store the position at t-dt in savex */
+ +                savex[i][m] = state->x[i][m] + dt*state->v[i][m];
+ +            }
+ +        }
+ +        /* Shake the positions at t=-dt with the positions at t=0
+ +         * as reference coordinates.
+ +         */
+ +        if (fplog)
+ +        {
+ +            char buf[STEPSTRSIZE];
+ +            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
+ +                    gmx_step_str(step, buf));
+ +        }
+ +        dvdl_dum = 0;
+ +        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ +                  ir, NULL, cr, step, -1, md,
+ +                  state->x, savex, NULL,
+ +                  fr->bMolPBC, state->box,
+ +                  state->lambda[efptBONDED], &dvdl_dum,
+ +                  state->v, NULL, nrnb, econqCoord,
+ +                  ir->epc == epcMTTK, state->veta, state->veta);
+ +
+ +        for (i = start; i < end; i++)
+ +        {
+ +            for (m = 0; m < DIM; m++)
+ +            {
+ +                /* Re-reverse the velocities */
+ +                state->v[i][m] = -state->v[i][m];
+ +            }
+ +        }
+ +
+ +        sfree(savex);
+ +    }
+ +}
+ +
+ +/* Compute the dispersion-correction integrals and store them in fr.
+ + *
+ + * The six fields fr->enershiftsix/twelve, fr->enerdiffsix/twelve and
+ + * fr->virdiffsix/twelve are first reset to zero; they stay zero when
+ + * eDispCorr == edispcNO. Otherwise the r^-6 (index 0) and r^-12
+ + * (index 1) contributions are accumulated in eners[] and virs[]:
+ + *  - switch/shift VdW: integrated from the tabulated cubic spline in
+ + *    fr->nblists[0].table_vdw between rvdw_switch and rvdw, plus the
+ + *    analytical tail beyond the (table-rounded) rvdw_switch;
+ + *  - plain cut-off or user tables: analytical tail beyond rvdw, plus
+ + *    an extra term when the potential-shift modifier is used.
+ + * Any other vdw-type is a fatal error. fplog is only used for the
+ + * user-table warning and may be NULL.
+ + */
+ +void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
+ +{
+ +    double eners[2], virs[2], enersum, virsum, y0, f, g, h;
+ +    double r0, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
+ +    double invscale, invscale2, invscale3;
+ +    int    ri0, ri1, ri, i, offstart, offset;
+ +    real   scale, *vdwtab, tabfactor;
+ +
+ +    fr->enershiftsix    = 0;
+ +    fr->enershifttwelve = 0;
+ +    fr->enerdiffsix     = 0;
+ +    fr->enerdifftwelve  = 0;
+ +    fr->virdiffsix      = 0;
+ +    fr->virdifftwelve   = 0;
+ +
+ +    if (eDispCorr != edispcNO)
+ +    {
+ +        for (i = 0; i < 2; i++)
+ +        {
+ +            eners[i] = 0;
+ +            virs[i]  = 0;
+ +        }
+ +        if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
+ +        {
+ +            if (fr->rvdw_switch == 0)
+ +            {
+ +                gmx_fatal(FARGS,
+ +                          "With dispersion correction rvdw-switch can not be zero "
+ +                          "for vdw-type = %s", evdw_names[fr->vdwtype]);
+ +            }
+ +
+ +            scale  = fr->nblists[0].table_elec_vdw.scale;
+ +            vdwtab = fr->nblists[0].table_vdw.data;
+ +
+ +            /* Round the cut-offs to exact table values for precision */
+ +            ri0  = floor(fr->rvdw_switch*scale);
+ +            ri1  = ceil(fr->rvdw*scale);
+ +            r0   = ri0/scale;
+ +            rc3  = r0*r0*r0;
+ +            rc9  = rc3*rc3*rc3;
+ +
+ +            if (fr->vdwtype == evdwSHIFT)
+ +            {
+ +                /* Determine the constant energy shift below rvdw_switch.
+ +                 * Table has a scale factor since we have scaled it down to compensate
+ +                 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
+ +                 */
+ +                fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
+ +                fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
+ +            }
+ +            /* Add the constant part from 0 to rvdw_switch.
+ +             * This integration from 0 to rvdw_switch overcounts the number
+ +             * of interactions by 1, as it also counts the self interaction.
+ +             * We will correct for this later.
+ +             */
+ +            eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
+ +            eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
+ +
+ +            invscale  = 1.0/(scale);
+ +            invscale2 = invscale*invscale;
+ +            invscale3 = invscale*invscale2;
+ +
+ +            /* following summation derived from cubic spline definition,
+ +               Numerical Recipes in C, second edition, p. 113-116.  Exact
+ +               for the cubic spline.  We first calculate the negative of
+ +               the energy from rvdw to rvdw_switch, assuming that g(r)=1,
+ +               and then add the more standard, abrupt cutoff correction to
+ +               that result, yielding the long-range correction for a
+ +               switched function.  We perform both the pressure and energy
+ +               loops at the same time for simplicity, as the computational
+ +               cost is low. */
+ +
+ +            /* i == 0: dispersion (r^-6) table, i == 1: repulsion (r^-12) table */
+ +            for (i = 0; i < 2; i++)
+ +            {
+ +                enersum = 0.0; virsum = 0.0;
+ +                if (i == 0)
+ +                {
+ +                    offstart = 0;
+ +                    /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
+ +                     * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
+ +                     * up (to save flops in kernels), we need to correct for this.
+ +                     */
+ +                    tabfactor = 6.0;
+ +                }
+ +                else
+ +                {
+ +                    offstart  = 4;
+ +                    tabfactor = 12.0;
+ +                }
+ +                /* Sum the contribution of every table interval in [ri0, ri1) */
+ +                for (ri = ri0; ri < ri1; ri++)
+ +                {
+ +                    r  = ri*invscale;
+ +                    ea = invscale3;
+ +                    eb = 2.0*invscale2*r;
+ +                    ec = invscale*r*r;
+ +
+ +                    pa = invscale3;
+ +                    pb = 3.0*invscale2*r;
+ +                    pc = 3.0*invscale*r*r;
+ +                    pd = r*r*r;
+ +
+ +                    /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
+ +                    offset = 8*ri + offstart;
+ +                    y0     = vdwtab[offset];
+ +                    f      = vdwtab[offset+1];
+ +                    g      = vdwtab[offset+2];
+ +                    h      = vdwtab[offset+3];
+ +
+ +                    enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
+ +                    virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
+ +                }
+ +
+ +                enersum  *= 4.0*M_PI*tabfactor;
+ +                virsum   *= 4.0*M_PI*tabfactor;
+ +                eners[i] -= enersum;
+ +                virs[i]  -= virsum;
+ +            }
+ +
+ +            /* now add the correction for rvdw_switch to infinity */
+ +            eners[0] += -4.0*M_PI/(3.0*rc3);
+ +            eners[1] +=  4.0*M_PI/(9.0*rc9);
+ +            virs[0]  +=  8.0*M_PI/rc3;
+ +            virs[1]  += -16.0*M_PI/(3.0*rc9);
+ +        }
+ +        else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
+ +        {
+ +            if (fr->vdwtype == evdwUSER && fplog)
+ +            {
+ +                fprintf(fplog,
+ +                        "WARNING: using dispersion correction with user tables\n");
+ +            }
+ +            rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
+ +            rc9  = rc3*rc3*rc3;
+ +            /* Contribution beyond the cut-off */
+ +            eners[0] += -4.0*M_PI/(3.0*rc3);
+ +            eners[1] +=  4.0*M_PI/(9.0*rc9);
+ +            if (fr->vdw_modifier == eintmodPOTSHIFT)
+ +            {
+ +                /* Contribution within the cut-off */
+ +                eners[0] += -4.0*M_PI/(3.0*rc3);
+ +                eners[1] +=  4.0*M_PI/(3.0*rc9);
+ +            }
+ +            /* Contribution beyond the cut-off */
+ +            virs[0]  +=  8.0*M_PI/rc3;
+ +            virs[1]  += -16.0*M_PI/(3.0*rc9);
+ +        }
+ +        else
+ +        {
+ +            gmx_fatal(FARGS,
+ +                      "Dispersion correction is not implemented for vdw-type = %s",
+ +                      evdw_names[fr->vdwtype]);
+ +        }
+ +        fr->enerdiffsix    = eners[0];
+ +        fr->enerdifftwelve = eners[1];
+ +        /* The 0.5 is due to the Gromacs definition of the virial */
+ +        fr->virdiffsix     = 0.5*virs[0];
+ +        fr->virdifftwelve  = 0.5*virs[1];
+ +    }
+ +}
+ +
+ +/* Evaluate the long-range dispersion correction for the current box.
+ + *
+ + * Outputs (*prescorr, *enercorr, *dvdlcorr, virial, pres) are always
+ + * reset first and are left at zero when ir->eDispCorr == edispcNO.
+ + * Otherwise the energy correction is built from the precomputed
+ + * integrals in fr (enerdiff*, enershift*, virdiff*) and the average
+ + * C6/C12 values in fr->avcsix / fr->avctwelve, interpolated linearly
+ + * in lambda when free-energy perturbation is active. The pressure and
+ + * virial corrections are only added for the *Pres correction modes,
+ + * and the r^-12 terms only for the All* modes.
+ + */
+ +void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
+ +                   gmx_large_int_t step, int natoms,
+ +                   matrix box, real lambda, tensor pres, tensor virial,
+ +                   real *prescorr, real *enercorr, real *dvdlcorr)
+ +{
+ +    gmx_bool bCorrAll, bCorrPres;
+ +    real     invvol, dens, ninter;
+ +    real     avcsix, avctwelve;
+ +    real     enerdiff, dvdlambda;
+ +    real     svir  = 0;
+ +    real     spres = 0;
+ +    int      d;
+ +
+ +    *prescorr = 0;
+ +    *enercorr = 0;
+ +    *dvdlcorr = 0;
+ +
+ +    clear_mat(virial);
+ +    clear_mat(pres);
+ +
+ +    if (ir->eDispCorr == edispcNO)
+ +    {
+ +        /* No correction requested: the zeroed outputs are the result */
+ +        return;
+ +    }
+ +
+ +    bCorrAll  = (ir->eDispCorr == edispcAllEner ||
+ +                 ir->eDispCorr == edispcAllEnerPres);
+ +    bCorrPres = (ir->eDispCorr == edispcEnerPres ||
+ +                 ir->eDispCorr == edispcAllEnerPres);
+ +
+ +    invvol = 1/det(box);
+ +    if (fr->n_tpi)
+ +    {
+ +        /* Only correct for the interactions with the inserted molecule */
+ +        dens   = (natoms - fr->n_tpi)*invvol;
+ +        ninter = fr->n_tpi;
+ +    }
+ +    else
+ +    {
+ +        dens   = natoms*invvol;
+ +        ninter = 0.5*natoms;
+ +    }
+ +
+ +    /* State A averages, replaced by the lambda interpolation with FEP */
+ +    avcsix    = fr->avcsix[0];
+ +    avctwelve = fr->avctwelve[0];
+ +    if (ir->efep != efepNO)
+ +    {
+ +        avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
+ +        avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
+ +    }
+ +
+ +    /* r^-6 energy term and its lambda derivative */
+ +    enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
+ +    *enercorr += avcsix*enerdiff;
+ +    dvdlambda  = 0.0;
+ +    if (ir->efep != efepNO)
+ +    {
+ +        dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
+ +    }
+ +
+ +    /* r^-12 energy term, only for the "All" correction modes */
+ +    if (bCorrAll)
+ +    {
+ +        enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
+ +        *enercorr += avctwelve*enerdiff;
+ +        if (fr->efep != efepNO)
+ +        {
+ +            dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
+ +        }
+ +    }
+ +
+ +    if (bCorrPres)
+ +    {
+ +        svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
+ +        /* With bCorrPres set, bCorrAll is true exactly for edispcAllEnerPres */
+ +        if (bCorrAll)
+ +        {
+ +            svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
+ +        }
+ +        /* The factor 2 is because of the Gromacs virial definition */
+ +        spres = -2.0*invvol*svir*PRESFAC;
+ +
+ +        /* The correction only enters the diagonal of both tensors */
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            virial[d][d] += svir;
+ +            pres[d][d]   += spres;
+ +        }
+ +        *prescorr += spres;
+ +    }
+ +
+ +    /* Printing cannot be controlled from here yet, so only report when debugging */
+ +    if (debug)
+ +    {
+ +        if (bCorrAll)
+ +        {
+ +            fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+ +                    avcsix, avctwelve);
+ +        }
+ +        if (bCorrPres)
+ +        {
+ +            fprintf(debug,
+ +                    "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
+ +                    *enercorr, spres, svir);
+ +        }
+ +        else
+ +        {
+ +            fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
+ +        }
+ +    }
+ +
+ +    if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
+ +    {
+ +        gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
+ +    }
+ +    if (fr->efep != efepNO)
+ +    {
+ +        *dvdlcorr += dvdlambda;
+ +    }
+ +}
+ +
+ +/* Make the molecules described by graph whole in x for the first time.
+ + *
+ + * The shift vectors in fr are always recomputed from box. When a graph
+ + * is supplied, the coordinates are shifted in place according to it and
+ + * the graph shifts are then rebuilt for the shifted coordinates.
+ + * Progress messages go to fplog when it is not NULL.
+ + */
+ +void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
+ +                  t_graph *graph, rvec x[])
+ +{
+ +    if (fplog != NULL)
+ +    {
+ +        fprintf(fplog, "Removing pbc first time\n");
+ +    }
+ +
+ +    calc_shifts(box, fr->shift_vec);
+ +
+ +    if (graph != NULL)
+ +    {
+ +        /* First pass: determine the shifts and apply them to x */
+ +        mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        if (gmx_debug_at)
+ +        {
+ +            p_graph(debug, "do_pbc_first 1", graph);
+ +        }
+ +        shift_self(graph, box, x);
+ +
+ +        /* Second pass: by doing an extra mk_mshift the molecules that
+ +         * are broken because they were e.g. imported from another
+ +         * software will be made whole again. Such are the healing
+ +         * powers of GROMACS.
+ +         */
+ +        mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        if (gmx_debug_at)
+ +        {
+ +            p_graph(debug, "do_pbc_first 2", graph);
+ +        }
+ +    }
+ +
+ +    if (fplog != NULL)
+ +    {
+ +        fprintf(fplog, "Done rmpbc\n");
+ +    }
+ +}
+ +
+ +/* Make every molecule in mtop whole in x, one molecule block at a time.
+ + *
+ + * A temporary graph is built per molecule type and applied to each
+ + * molecule of the block at its offset in x. Blocks of single-atom
+ + * molecules are skipped; when bFirst is FALSE, blocks whose molecule
+ + * type has a single charge group are skipped as well. The "first time"
+ + * message is only written when bFirst is set and fplog is not NULL.
+ + */
+ +static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+ +                            gmx_mtop_t *mtop, rvec x[],
+ +                            gmx_bool bFirst)
+ +{
+ +    t_graph        *graph;
+ +    gmx_molblock_t *molb;
+ +    int             iblock, imol;
+ +    int             atom_offset = 0; /* index in x of the current molecule */
+ +
+ +    if (bFirst && fplog)
+ +    {
+ +        fprintf(fplog, "Removing pbc first time\n");
+ +    }
+ +
+ +    snew(graph, 1);
+ +    for (iblock = 0; iblock < mtop->nmolblock; iblock++)
+ +    {
+ +        molb = &mtop->molblock[iblock];
+ +
+ +        if (molb->natoms_mol == 1 ||
+ +            (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
+ +        {
+ +            /* Just one atom or charge group in the molecule, no PBC
+ +             * required: step over the whole block.
+ +             */
+ +            atom_offset += molb->nmol*molb->natoms_mol;
+ +            continue;
+ +        }
+ +
+ +        /* Pass NULL iso fplog to avoid graph prints for each molecule type */
+ +        mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
+ +                       0, molb->natoms_mol, FALSE, FALSE, graph);
+ +
+ +        for (imol = 0; imol < molb->nmol; imol++)
+ +        {
+ +            mk_mshift(fplog, graph, ePBC, box, x+atom_offset);
+ +
+ +            /* The molecule is whole after this shift.
+ +             * We don't need the second mk_mshift call as in do_pbc_first,
+ +             * since we no longer need this graph.
+ +             */
+ +            shift_self(graph, box, x+atom_offset);
+ +
+ +            atom_offset += molb->natoms_mol;
+ +        }
+ +        done_graph(graph);
+ +    }
+ +    sfree(graph);
+ +}
+ +
+ +/* Make all molecules in mtop whole in x; "first time" variant that
+ + * calls low_do_pbc_mtop() with bFirst set to TRUE.
+ + */
+ +void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
+ +                       gmx_mtop_t *mtop, rvec x[])
+ +{
+ +    const gmx_bool bFirstTime = TRUE;
+ +
+ +    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, bFirstTime);
+ +}
+ +
+ +/* Make all molecules in mtop whole in x; variant that calls
+ + * low_do_pbc_mtop() with bFirst set to FALSE.
+ + */
+ +void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+ +                 gmx_mtop_t *mtop, rvec x[])
+ +{
+ +    const gmx_bool bFirstTime = FALSE;
+ +
+ +    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, bFirstTime);
+ +}
+ +
+ +void finish_run(FILE *fplog, t_commrec *cr,
+ +                t_inputrec *inputrec,
+ +                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
+ +                gmx_runtime_t *runtime,
+ +                wallclock_gpu_t *gputimes,
+ +                gmx_bool bWriteStat)
+ +{
+ +    int     i, j;
+ +    t_nrnb *nrnb_tot = NULL;
+ +    real    delta_t;
+ +    double  nbfs, mflop;
+ +
+ +    wallcycle_sum(cr, wcycle);
+ +
+ +    if (cr->nnodes > 1)
+ +    {
+ +        snew(nrnb_tot, 1);
+ +#ifdef GMX_MPI
+ +        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
+ +                      cr->mpi_comm_mysim);
+ +#endif
+ +    }
+ +    else
+ +    {
+ +        nrnb_tot = nrnb;
+ +    }
+ +
+ +#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
+ +    if (cr->nnodes > 1)
+ +    {
+ +        /* reduce nodetime over all MPI processes in the current simulation */
+ +        double sum;
+ +        MPI_Allreduce(&runtime->proctime, &sum, 1, MPI_DOUBLE, MPI_SUM,
+ +                      cr->mpi_comm_mysim);
+ +        runtime->proctime = sum;
+ +    }
+ +#endif
+ +
+ +    if (SIMMASTER(cr))
+ +    {
+ +        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
+ +    }
+ +    if (cr->nnodes > 1)
+ +    {
+ +        sfree(nrnb_tot);
+ +    }
+ +
+ +    if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+ +    {
+ +        print_dd_statistics(cr, inputrec, fplog);
+ +    }
+ +
+ +#ifdef GMX_MPI
+ +    if (PARTDECOMP(cr))
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            t_nrnb     *nrnb_all;
+ +            int         s;
+ +            MPI_Status  stat;
+ +
+ +            snew(nrnb_all, cr->nnodes);
+ +            nrnb_all[0] = *nrnb;
+ +            for (s = 1; s < cr->nnodes; s++)
+ +            {
+ +                MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
+ +                         cr->mpi_comm_mysim, &stat);
+ +            }
+ +            pr_load(fplog, cr, nrnb_all);
+ +            sfree(nrnb_all);
+ +        }
+ +        else
+ +        {
+ +            MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
+ +                     cr->mpi_comm_mysim);
+ +        }
+ +    }
+ +#endif
+ +
+ +    if (SIMMASTER(cr))
+ +    {
+ +        wallcycle_print(fplog, cr->nnodes, cr->npmenodes, runtime->realtime,
+ +                        wcycle, gputimes);
+ +
+ +        if (EI_DYNAMICS(inputrec->eI))
+ +        {
+ +            delta_t = inputrec->delta_t;
+ +        }
+ +        else
+ +        {
+ +            delta_t = 0;
+ +        }
+ +
+ +        if (fplog)
+ +        {
+ +            print_perf(fplog, runtime->proctime, runtime->realtime,
+ +                       runtime->nsteps_done, delta_t, nbfs, mflop);
+ +        }
+ +        if (bWriteStat)
+ +        {
+ +            print_perf(stderr, runtime->proctime, runtime->realtime,
+ +                       runtime->nsteps_done, delta_t, nbfs, mflop);
+ +        }
+ +    }
+ +}
+ +
+ +/* Set the initial lambda vector (and optionally lam0) from the input record.
+ + *
+ + * Without free-energy perturbation and without simulated tempering all
+ + * efptNR components are set to zero and the function returns without
+ + * touching *fep_state or the log. Otherwise *fep_state is taken from
+ + * fep->init_fep_state and each component comes from fep->init_lambda
+ + * when that is >= 0, or else from fep->all_lambda at the current state.
+ + * With simulated tempering, every positive reference temperature is
+ + * replaced by the temperature of the current state. lam0 and fplog may
+ + * be NULL.
+ + */
+ +extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
+ +{
+ +    /* this function works, but could probably use a logic rewrite to keep all the different
+ +       types of efep straight. */
+ +
+ +    int       k;
+ +    t_lambda *fep = ir->fepvals;
+ +
+ +    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+ +    {
+ +        /* Nothing is perturbed: all components are zero */
+ +        for (k = 0; k < efptNR; k++)
+ +        {
+ +            lambda[k] = 0.0;
+ +            if (lam0)
+ +            {
+ +                lam0[k] = 0.0;
+ +            }
+ +        }
+ +        return;
+ +    }
+ +
+ +    /* this might overwrite the checkpoint if checkpoint is set --
+ +     * a kludge is in for now to prevent this.
+ +     */
+ +    *fep_state = fep->init_fep_state;
+ +
+ +    for (k = 0; k < efptNR; k++)
+ +    {
+ +        /* overwrite lambda state with init_lambda for now for backwards compatibility */
+ +        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
+ +        {
+ +            lambda[k] = fep->init_lambda;
+ +        }
+ +        else
+ +        {
+ +            lambda[k] = fep->all_lambda[k][*fep_state];
+ +        }
+ +        /* Both cases record the chosen value as the starting lambda */
+ +        if (lam0)
+ +        {
+ +            lam0[k] = lambda[k];
+ +        }
+ +    }
+ +
+ +    if (ir->bSimTemp)
+ +    {
+ +        /* need to rescale control temperatures to match current state */
+ +        for (k = 0; k < ir->opts.ngtc; k++)
+ +        {
+ +            if (ir->opts.ref_t[k] > 0)
+ +            {
+ +                ir->opts.ref_t[k] = ir->simtempvals->temperatures[*fep_state];
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Send to the log the information on the current lambdas */
+ +    if (fplog != NULL)
+ +    {
+ +        fprintf(fplog, "Initial vector of lambda components:[ ");
+ +        for (k = 0; k < efptNR; k++)
+ +        {
+ +            fprintf(fplog, "%10.4f ", lambda[k]);
+ +        }
+ +        fprintf(fplog, "]\n");
+ +    }
+ +}
+ +
+ +
+ +/* Initialise the bookkeeping needed before an MD run starts.
+ + *
+ + * Sets *t and *t0 to ir->init_t, determines whether any temperature
+ + * coupling group is annealed (*bSimAnn) and, if so, updates the
+ + * annealing target temperatures for the initial time. It then
+ + * initialises the lambda vector, the update and COM-motion data (only
+ + * when upd / vcm are not NULL), the flop counters and, unless
+ + * nfile == -1, the output files and the energy-output data. Finally
+ + * force_vir, shake_vir and mu_tot are cleared.
+ + * Thermostat citations are only requested for dynamical integrators
+ + * when not appending to existing files.
+ + */
+ +void init_md(FILE *fplog,
+ +             t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
+ +             double *t, double *t0,
+ +             real *lambda, int *fep_state, double *lam0,
+ +             t_nrnb *nrnb, gmx_mtop_t *mtop,
+ +             gmx_update_t *upd,
+ +             int nfile, const t_filenm fnm[],
+ +             gmx_mdoutf_t **outf, t_mdebin **mdebin,
+ +             tensor force_vir, tensor shake_vir, rvec mu_tot,
+ +             gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
+ +{
+ +    int  i;
+ +
+ +    /* Initial values */
+ +    *t = *t0       = ir->init_t;
+ +
+ +    *bSimAnn = FALSE;
+ +    for (i = 0; i < ir->opts.ngtc; i++)
+ +    {
+ +        /* set bSimAnn if any group is being annealed */
+ +        if (ir->opts.annealing[i] != eannNO)
+ +        {
+ +            *bSimAnn = TRUE;
+ +        }
+ +    }
+ +    if (*bSimAnn)
+ +    {
+ +        update_annealing_target_temp(&(ir->opts), ir->init_t);
+ +    }
+ +
+ +    /* Initialize lambda variables */
+ +    initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
+ +
+ +    if (upd)
+ +    {
+ +        *upd = init_update(ir);
+ +    }
+ +
+ +
+ +    if (vcm != NULL)
+ +    {
+ +        *vcm = init_vcm(fplog, &mtop->groups, ir);
+ +    }
+ +
+ +    if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
+ +    {
+ +        if (ir->etc == etcBERENDSEN)
+ +        {
+ +            please_cite(fplog, "Berendsen84a");
+ +        }
+ +        if (ir->etc == etcVRESCALE)
+ +        {
+ +            please_cite(fplog, "Bussi2007a");
+ +        }
+ +    }
+ +
+ +    init_nrnb(nrnb);
+ +
+ +    /* nfile == -1 means the caller does not want output files set up */
+ +    if (nfile != -1)
+ +    {
+ +        *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
+ +
+ +        /* When appending, no energy file pointer is handed to init_mdebin */
+ +        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
+ +                              mtop, ir, (*outf)->fp_dhdl);
+ +    }
+ +
+ +    if (ir->bAdress)
+ +    {
+ +        please_cite(fplog, "Fritsch12");
+ +        please_cite(fplog, "Junghans10");
+ +    }
+ +    /* Initiate variables */
+ +    clear_mat(force_vir);
+ +    clear_mat(shake_vir);
+ +    clear_rvec(mu_tot);
+ +
+ +    debug_gmx();
+ +}
author	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 26 Jul 2013 14:47:13 +0000 (16:47 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 26 Jul 2013 14:47:13 +0000 (16:47 +0200)
		1	2
CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/CMakeLists.txt	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxana/dlist.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxana/gmx_enemat.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxlib/bondfree.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxpreprocess/calc_verletbuf.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxpreprocess/gen_vsite.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/gmx_simd_macros.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/gmx_simd_math_double.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/legacyheaders/gmx_simd_math_single.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/legacyheaders/gmx_simd_ref.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/legacyheaders/gmx_simd_vec.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/legacyheaders/types/nb_verlet.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/types/nbnxn_pairlist.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/linearalgebra/CMakeLists.txt	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/forcerec.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_atomdata.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_consts.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_internal.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h	patch \|	\|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_search.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_search_simd_2xnn.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_search_simd_4xn.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/pme.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/sim_util.c	patch \|	diff1 \|	\|	blob \| history