--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2010,2011,2012,2013, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+set(LIBGROMACS_SOURCES)
+
+add_subdirectory(legacyheaders)
+add_subdirectory(gmxlib)
+add_subdirectory(mdlib)
+add_subdirectory(gmxpreprocess)
+add_subdirectory(gmxana)
+add_subdirectory(analysisdata)
+add_subdirectory(commandline)
+add_subdirectory(fft)
+add_subdirectory(linearalgebra)
+add_subdirectory(onlinehelp)
+add_subdirectory(options)
+add_subdirectory(selection)
+add_subdirectory(trajectoryanalysis)
+add_subdirectory(utility)
+
+file(GLOB LIBGROMACS_HEADERS *.h)
+install(FILES ${LIBGROMACS_HEADERS} DESTINATION ${INCL_INSTALL_DIR}/gromacs
+ COMPONENT development)
+
+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES})
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/version.h.cmakein ${CMAKE_CURRENT_BINARY_DIR}/version.h)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.h
+ DESTINATION ${INCL_INSTALL_DIR}/gromacs
+ COMPONENT development)
+
+# Add target that generates gitversion.c every time make is run
+# if git version info is requested
+# This code is here instead of utility/CMakeLists.txt because CMake
+# ignores set_source_file_properties from subdirectories.
+if (GMX_GIT_VERSION_INFO)
+ set(GENERATED_VERSION_FILE ${CMAKE_CURRENT_BINARY_DIR}/utility/gitversion.c)
+ add_custom_target(gmx_version ALL
+ COMMAND ${CMAKE_COMMAND}
+ -D GIT_EXECUTABLE="${GIT_EXECUTABLE}"
+ -D GIT_VERSION="${GIT_VERSION}"
+ -D PROJECT_VERSION="${PROJECT_VERSION}"
+ -D PROJECT_SOURCE_DIR="${PROJECT_SOURCE_DIR}"
+ -D VERSION_C_CMAKEIN="${CMAKE_CURRENT_SOURCE_DIR}/utility/gitversion.c.cmakein"
+ -D VERSION_C_OUT=${GENERATED_VERSION_FILE}
+ -P ${CMAKE_SOURCE_DIR}/cmake/gmxGenerateVersionInfo.cmake
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/utility/gitversion.c.cmakein
+ COMMENT "Generating git version information")
+ set_source_files_properties(${GENERATED_VERSION_FILE}
+ PROPERTIES GENERATED true)
+ list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
+endif()
+
+# apply gcc 4.4.x bug workaround
+if(GMX_USE_GCC44_BUG_WORKAROUND)
+ include(gmxGCC44O3BugWorkaround)
+ gmx_apply_gcc44_bug_workaround("gmxlib/bondfree.c")
+ gmx_apply_gcc44_bug_workaround("mdlib/force.c")
+ gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
+endif()
+
+add_library(libgromacs ${LIBGROMACS_SOURCES})
+if (GMX_GIT_VERSION_INFO)
+ add_dependencies(libgromacs gmx_version)
+endif ()
+
+if(GMX_BUILD_OWN_FFTW)
+    # This dependency has to be declared here rather than in the
+    # CMakeLists.txt that does the FFTW build, because of the order in which
+    # add_subdirectory() calls are made in the top-level CMakeLists.txt; the
+    # md library target does not necessarily exist yet there. Declaring it
+    # here also keeps the dependency correct when GMX_BUILD_OWN_FFTW is
+    # enabled or disabled.
+ add_dependencies(libgromacs gmxfftw)
+endif()
+
+target_link_libraries(libgromacs ${GMX_GPU_LIBRARIES}
+ ${GMX_EXTRA_LIBRARIES}
-                      ${FFT_LIBRARIES} ${XML_LIBRARIES} ${GSL_LIBRARIES}
++                     ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
++ ${XML_LIBRARIES} ${GSL_LIBRARIES}
+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS})
+set_target_properties(libgromacs PROPERTIES
+ OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
+ SOVERSION ${SOVERSION}
+ COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+install(TARGETS libgromacs DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgromacs.pc.cmakein
+ ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc
+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+ RENAME "libgromacs${GMX_LIBS_SUFFIX}.pc"
+ COMPONENT development)
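+# Downstream builds can then locate the installed library via pkg-config;
+# an illustrative (not normative) invocation, with the module name taken
+# from the RENAME above:
+#   pkg-config --cflags --libs libgromacs${GMX_LIBS_SUFFIX}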
+
+if (INSTALL_CUDART_LIB) #can be set manually by the user
+ if (GMX_GPU)
+ foreach(CUDA_LIB ${CUDA_LIBRARIES})
+ string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+ if(IS_CUDART) #libcuda should not be installed
+                #also install the name-links (the linker uses those)
+ file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+ install(FILES ${CUDA_LIBS} DESTINATION
+ ${LIB_INSTALL_DIR} COMPONENT libraries)
+ endif()
+ endforeach()
+ else()
+ message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
+ endif()
+endif ()
--- /dev/null
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Green Red Orange Magenta Azure Cyan Skyblue
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+
+#include "string2.h"
+#include "smalloc.h"
+#include "gstat.h"
+#include "gmx_fatal.h"
+#include "index.h"
+
+t_dlist *mk_dlist(FILE *log,
+ t_atoms *atoms, int *nlist,
+ gmx_bool bPhi, gmx_bool bPsi, gmx_bool bChi, gmx_bool bHChi,
+ int maxchi, int r0, gmx_residuetype_t rt)
+{
+ int ires, i, j, k, ii;
+ t_dihatms atm, prev;
+ int nl = 0, nc[edMax];
+ char *thisres;
+ t_dlist *dl;
+
+ snew(dl, atoms->nres+1);
+ prev.C = prev.Cn[1] = -1; /* Keep the compiler quiet */
+ for (i = 0; (i < edMax); i++)
+ {
+ nc[i] = 0;
+ }
+ ires = -1;
+ i = 0;
+ while (i < atoms->nr)
+ {
+ ires = atoms->atom[i].resind;
+
+ /* Initiate all atom numbers to -1 */
+ atm.minC = atm.H = atm.N = atm.C = atm.O = atm.minCalpha = -1;
+ for (j = 0; (j < MAXCHI+3); j++)
+ {
+ atm.Cn[j] = -1;
+ }
+
+ /* Look for atoms in this residue */
+ /* maybe should allow for chis to hydrogens? */
+ while ((i < atoms->nr) && (atoms->atom[i].resind == ires))
+ {
+ if ((strcmp(*(atoms->atomname[i]), "H") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "H1") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "HN") == 0) )
+ {
+ atm.H = i;
+ }
+ else if (strcmp(*(atoms->atomname[i]), "N") == 0)
+ {
+ atm.N = i;
+ }
+ else if (strcmp(*(atoms->atomname[i]), "C") == 0)
+ {
+ atm.C = i;
+ }
+ else if ((strcmp(*(atoms->atomname[i]), "O") == 0) ||
-                     (strcmp(*(atoms->atomname[i]), "O1") == 0))
++                     (strcmp(*(atoms->atomname[i]), "O1") == 0) ||
++ (strcmp(*(atoms->atomname[i]), "OC1") == 0) ||
++ (strcmp(*(atoms->atomname[i]), "OT1") == 0))
+ {
+ atm.O = i;
+ }
+ else if (strcmp(*(atoms->atomname[i]), "CA") == 0)
+ {
+ atm.Cn[1] = i;
+ }
+ else if (strcmp(*(atoms->atomname[i]), "CB") == 0)
+ {
+ atm.Cn[2] = i;
+ }
+ else if ((strcmp(*(atoms->atomname[i]), "CG") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "CG1") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "OG") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "OG1") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "SG") == 0))
+ {
+ atm.Cn[3] = i;
+ }
+ else if ((strcmp(*(atoms->atomname[i]), "CD") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "CD1") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "SD") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "OD1") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "ND1") == 0))
+ {
+ atm.Cn[4] = i;
+ }
+ /* by grs - split the Cn[4] into 2 bits to check allowing dih to H */
+ else if (bHChi && ((strcmp(*(atoms->atomname[i]), "HG") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "HG1") == 0)) )
+ {
+ atm.Cn[4] = i;
+ }
+ else if ((strcmp(*(atoms->atomname[i]), "CE") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "CE1") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "OE1") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "NE") == 0))
+ {
+ atm.Cn[5] = i;
+ }
+ else if ((strcmp(*(atoms->atomname[i]), "CZ") == 0) ||
+ (strcmp(*(atoms->atomname[i]), "NZ") == 0))
+ {
+ atm.Cn[6] = i;
+ }
+ /* HChi flag here too */
+ else if (bHChi && (strcmp(*(atoms->atomname[i]), "NH1") == 0))
+ {
+ atm.Cn[7] = i;
+ }
+ i++;
+ }
+
+ thisres = *(atoms->resinfo[ires].name);
+
+ /* added by grs - special case for aromatics, whose chis above 2 are
+ not real and produce rubbish output - so set back to -1 */
+ if (strcmp(thisres, "PHE") == 0 ||
+ strcmp(thisres, "TYR") == 0 ||
+ strcmp(thisres, "PTR") == 0 ||
+ strcmp(thisres, "TRP") == 0 ||
+ strcmp(thisres, "HIS") == 0 ||
+ strcmp(thisres, "HISA") == 0 ||
+ strcmp(thisres, "HISB") == 0)
+ {
+ for (ii = 5; ii <= 7; ii++)
+ {
+ atm.Cn[ii] = -1;
+ }
+ }
+ /* end fixing aromatics */
+
+ /* Special case for Pro, has no H */
+ if (strcmp(thisres, "PRO") == 0)
+ {
+ atm.H = atm.Cn[4];
+ }
+ /* Carbon from previous residue */
+ if (prev.C != -1)
+ {
+ atm.minC = prev.C;
+ }
+ /* Alpha-carbon from previous residue */
+ if (prev.Cn[1] != -1)
+ {
+ atm.minCalpha = prev.Cn[1];
+ }
+ prev = atm;
+
+ /* Check how many dihedrals we have */
+ if ((atm.N != -1) && (atm.Cn[1] != -1) && (atm.C != -1) &&
+ (atm.O != -1) && ((atm.H != -1) || (atm.minC != -1)))
+ {
+ dl[nl].resnr = ires+1;
+ dl[nl].atm = atm;
+ dl[nl].atm.Cn[0] = atm.N;
+ if ((atm.Cn[3] != -1) && (atm.Cn[2] != -1) && (atm.Cn[1] != -1))
+ {
+ nc[0]++;
+ if (atm.Cn[4] != -1)
+ {
+ nc[1]++;
+ if (atm.Cn[5] != -1)
+ {
+ nc[2]++;
+ if (atm.Cn[6] != -1)
+ {
+ nc[3]++;
+ if (atm.Cn[7] != -1)
+ {
+ nc[4]++;
+ if (atm.Cn[8] != -1)
+ {
+ nc[5]++;
+ }
+ }
+ }
+ }
+ }
+ }
+ if ((atm.minC != -1) && (atm.minCalpha != -1))
+ {
+ nc[6]++;
+ }
+ dl[nl].index = gmx_residuetype_get_index(rt, thisres);
+
+ sprintf(dl[nl].name, "%s%d", thisres, ires+r0);
+ nl++;
+ }
+ else if (debug)
+ {
+ fprintf(debug, "Could not find N atom but could find other atoms"
+ " in residue %s%d\n", thisres, ires+r0);
+ }
+ }
+ fprintf(stderr, "\n");
+ fprintf(log, "\n");
+ fprintf(log, "There are %d residues with dihedrals\n", nl);
+ j = 0;
+ if (bPhi)
+ {
+ j += nl;
+ }
+ if (bPsi)
+ {
+ j += nl;
+ }
+ if (bChi)
+ {
+ for (i = 0; (i < maxchi); i++)
+ {
+ j += nc[i];
+ }
+ }
+ fprintf(log, "There are %d dihedrals\n", j);
+ fprintf(log, "Dihedral: ");
+ if (bPhi)
+ {
+ fprintf(log, " Phi ");
+ }
+ if (bPsi)
+ {
+ fprintf(log, " Psi ");
+ }
+ if (bChi)
+ {
+ for (i = 0; (i < maxchi); i++)
+ {
+ fprintf(log, "Chi%d ", i+1);
+ }
+ }
+ fprintf(log, "\nNumber: ");
+ if (bPhi)
+ {
+ fprintf(log, "%4d ", nl);
+ }
+ if (bPsi)
+ {
+ fprintf(log, "%4d ", nl);
+ }
+ if (bChi)
+ {
+ for (i = 0; (i < maxchi); i++)
+ {
+ fprintf(log, "%4d ", nc[i]);
+ }
+ }
+ fprintf(log, "\n");
+
+ *nlist = nl;
+
+ return dl;
+}
+
+gmx_bool has_dihedral(int Dih, t_dlist *dl)
+{
+ gmx_bool b = FALSE;
+ int ddd;
+
+ switch (Dih)
+ {
+ case edPhi:
+ b = ((dl->atm.H != -1) && (dl->atm.N != -1) && (dl->atm.Cn[1] != -1) && (dl->atm.C != -1));
+ break;
+ case edPsi:
+ b = ((dl->atm.N != -1) && (dl->atm.Cn[1] != -1) && (dl->atm.C != -1) && (dl->atm.O != -1));
+ break;
+ case edOmega:
+ b = ((dl->atm.minCalpha != -1) && (dl->atm.minC != -1) && (dl->atm.N != -1) && (dl->atm.Cn[1] != -1));
+ break;
+ case edChi1:
+ case edChi2:
+ case edChi3:
+ case edChi4:
+ case edChi5:
+ case edChi6:
+ ddd = Dih - edChi1;
+ b = ((dl->atm.Cn[ddd] != -1) && (dl->atm.Cn[ddd+1] != -1) &&
+ (dl->atm.Cn[ddd+2] != -1) && (dl->atm.Cn[ddd+3] != -1));
+ break;
+ default:
+ pr_dlist(stdout, 1, dl, 1, 0, TRUE, TRUE, TRUE, TRUE, MAXCHI);
+ gmx_fatal(FARGS, "Non existant dihedral %d in file %s, line %d",
+ Dih, __FILE__, __LINE__);
+ }
+ return b;
+}
+
+static void pr_one_ro(FILE *fp, t_dlist *dl, int nDih, real gmx_unused dt)
+{
+ int k;
+ for (k = 0; k < NROT; k++)
+ {
+ fprintf(fp, " %6.2f", dl->rot_occ[nDih][k]);
+ }
+ fprintf(fp, "\n");
+}
+
+static void pr_ntr_s2(FILE *fp, t_dlist *dl, int nDih, real dt)
+{
+ fprintf(fp, " %6.2f %6.2f\n", (dt == 0) ? 0 : dl->ntr[nDih]/dt, dl->S2[nDih]);
+}
+
+void pr_dlist(FILE *fp, int nl, t_dlist dl[], real dt, int printtype,
+ gmx_bool bPhi, gmx_bool bPsi, gmx_bool bChi, gmx_bool bOmega, int maxchi)
+{
+ int i, Xi;
+
+ void (*pr_props)(FILE *, t_dlist *, int, real);
+
+ /* Analysis of dihedral transitions etc */
+
+ if (printtype == edPrintST)
+ {
+ pr_props = pr_ntr_s2;
+ fprintf(stderr, "Now printing out transitions and OPs...\n");
+ }
+ else
+ {
+ pr_props = pr_one_ro;
+ fprintf(stderr, "Now printing out rotamer occupancies...\n");
+ fprintf(fp, "\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n\n");
+ }
+
+ /* change atom numbers from 0 based to 1 based */
+ for (i = 0; (i < nl); i++)
+ {
+ fprintf(fp, "Residue %s\n", dl[i].name);
+ if (printtype == edPrintST)
+ {
+ fprintf(fp, " Angle [ AI, AJ, AK, AL] #tr/ns S^2D \n"
+ "--------------------------------------------\n");
+ }
+ else
+ {
+ fprintf(fp, " Angle [ AI, AJ, AK, AL] rotamers 0 g(-) t g(+)\n"
+ "--------------------------------------------\n");
+ }
+ if (bPhi)
+ {
+ fprintf(fp, " Phi [%5d,%5d,%5d,%5d]",
+ (dl[i].atm.H == -1) ? 1+dl[i].atm.minC : 1+dl[i].atm.H,
+ 1+dl[i].atm.N, 1+dl[i].atm.Cn[1], 1+dl[i].atm.C);
+ pr_props(fp, &dl[i], edPhi, dt);
+ }
+ if (bPsi)
+ {
+ fprintf(fp, " Psi [%5d,%5d,%5d,%5d]", 1+dl[i].atm.N, 1+dl[i].atm.Cn[1],
+ 1+dl[i].atm.C, 1+dl[i].atm.O);
+ pr_props(fp, &dl[i], edPsi, dt);
+ }
+ if (bOmega && has_dihedral(edOmega, &(dl[i])))
+ {
+ fprintf(fp, " Omega [%5d,%5d,%5d,%5d]", 1+dl[i].atm.minCalpha, 1+dl[i].atm.minC,
+ 1+dl[i].atm.N, 1+dl[i].atm.Cn[1]);
+ pr_props(fp, &dl[i], edOmega, dt);
+ }
+ for (Xi = 0; Xi < MAXCHI; Xi++)
+ {
+ if (bChi && (Xi < maxchi) && (dl[i].atm.Cn[Xi+3] != -1) )
+ {
+ fprintf(fp, " Chi%d[%5d,%5d,%5d,%5d]", Xi+1, 1+dl[i].atm.Cn[Xi],
+ 1+dl[i].atm.Cn[Xi+1], 1+dl[i].atm.Cn[Xi+2],
+ 1+dl[i].atm.Cn[Xi+3]);
+ pr_props(fp, &dl[i], Xi+edChi1, dt); /* Xi+2 was wrong here */
+ }
+ }
+ fprintf(fp, "\n");
+ }
+}
+
+
+
+int pr_trans(FILE *fp, int nl, t_dlist dl[], real dt, int Xi)
+{
+ /* never called at the moment */
+
+ int i, nn, nz;
+
+ nz = 0;
+ fprintf(fp, "\\begin{table}[h]\n");
+ fprintf(fp, "\\caption{Number of dihedral transitions per nanosecond}\n");
+ fprintf(fp, "\\begin{tabular}{|l|l|}\n");
+ fprintf(fp, "\\hline\n");
+ fprintf(fp, "Residue\t&$\\chi_%d$\t\\\\\n", Xi+1);
+ for (i = 0; (i < nl); i++)
+ {
+ nn = dl[i].ntr[Xi]/dt;
+
+ if (nn == 0)
+ {
+ fprintf(fp, "%s\t&\\HL{%d}\t\\\\\n", dl[i].name, nn);
+ nz++;
+ }
+ else if (nn > 0)
+ {
+ fprintf(fp, "%s\t&\\%d\t\\\\\n", dl[i].name, nn);
+ }
+ }
+ fprintf(fp, "\\hline\n");
+ fprintf(fp, "\\end{tabular}\n");
+ fprintf(fp, "\\end{table}\n\n");
+
+ return nz;
+}
--- /dev/null
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Green Red Orange Magenta Azure Cyan Skyblue
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <math.h>
+
+#include "string2.h"
+#include "typedefs.h"
+#include "gmx_fatal.h"
+#include "vec.h"
+#include "smalloc.h"
+#include "enxio.h"
+#include "statutil.h"
+#include "names.h"
+#include "macros.h"
+#include "xvgr.h"
+#include "gstat.h"
+#include "physics.h"
+#include "matio.h"
+#include "strdb.h"
+#include "gmx_ana.h"
+
+
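+/* Look up key in str[0..nstr-1], comparing case-insensitively and only
+ * up to (not including) the first digit in key. E.g. (sketch): a key
+ * "SOL15" is compared on "SOL" alone, so it matches an entry "sol".
+ * Returns the index of the first match, or -1 if there is none.
+ */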
+static int search_str2(int nstr, char **str, char *key)
+{
+ int i, n;
+ int keylen = strlen(key);
+ /* Linear search */
+ n = 0;
+ while ( (n < keylen) && ((key[n] < '0') || (key[n] > '9')) )
+ {
+ n++;
+ }
+ for (i = 0; (i < nstr); i++)
+ {
+ if (gmx_strncasecmp(str[i], key, n) == 0)
+ {
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+int gmx_enemat(int argc, char *argv[])
+{
+ const char *desc[] = {
+ "[TT]g_enemat[tt] extracts an energy matrix from the energy file ([TT]-f[tt]).",
+ "With [TT]-groups[tt] a file must be supplied with on each",
+ "line a group of atoms to be used. For these groups matrix of",
+ "interaction energies will be extracted from the energy file",
+ "by looking for energy groups with names corresponding to pairs",
+ "of groups of atoms, e.g. if your [TT]-groups[tt] file contains:[BR]",
+ "[TT]2[tt][BR]",
+ "[TT]Protein[tt][BR]",
+ "[TT]SOL[tt][BR]",
+ "then energy groups with names like 'Coul-SR:Protein-SOL' and ",
+ "'LJ:Protein-SOL' are expected in the energy file (although",
+ "[TT]g_enemat[tt] is most useful if many groups are analyzed",
+ "simultaneously). Matrices for different energy types are written",
+ "out separately, as controlled by the",
+ "[TT]-[no]coul[tt], [TT]-[no]coulr[tt], [TT]-[no]coul14[tt], ",
+ "[TT]-[no]lj[tt], [TT]-[no]lj14[tt], ",
+ "[TT]-[no]bham[tt] and [TT]-[no]free[tt] options.",
+ "Finally, the total interaction energy energy per group can be ",
+ "calculated ([TT]-etot[tt]).[PAR]",
+
+ "An approximation of the free energy can be calculated using:",
+ "[MATH]E[SUB]free[sub] = E[SUB]0[sub] + kT [LOG][CHEVRON][EXP](E-E[SUB]0[sub])/kT[exp][chevron][log][math], where '[MATH][CHEVRON][chevron][math]'",
+ "stands for time-average. A file with reference free energies",
+ "can be supplied to calculate the free energy difference",
+ "with some reference state. Group names (e.g. residue names)",
+ "in the reference file should correspond to the group names",
+ "as used in the [TT]-groups[tt] file, but a appended number",
+ "(e.g. residue number) in the [TT]-groups[tt] will be ignored",
+ "in the comparison."
+ };
+ static gmx_bool bSum = FALSE;
+ static gmx_bool bMeanEmtx = TRUE;
+ static int skip = 0, nlevels = 20;
+ static real cutmax = 1e20, cutmin = -1e20, reftemp = 300.0;
+ static gmx_bool bCoulSR = TRUE, bCoulLR = FALSE, bCoul14 = FALSE;
+ static gmx_bool bLJSR = TRUE, bLJLR = FALSE, bLJ14 = FALSE, bBhamSR = FALSE, bBhamLR = FALSE,
+ bFree = TRUE;
+ t_pargs pa[] = {
+ { "-sum", FALSE, etBOOL, {&bSum},
+ "Sum the energy terms selected rather than display them all" },
+ { "-skip", FALSE, etINT, {&skip},
+ "Skip number of frames between data points" },
+ { "-mean", FALSE, etBOOL, {&bMeanEmtx},
+ "with [TT]-groups[tt] extracts matrix of mean energies instead of "
+ "matrix for each timestep" },
+ { "-nlevels", FALSE, etINT, {&nlevels}, "number of levels for matrix colors"},
+ { "-max", FALSE, etREAL, {&cutmax}, "max value for energies"},
+ { "-min", FALSE, etREAL, {&cutmin}, "min value for energies"},
+ { "-coulsr", FALSE, etBOOL, {&bCoulSR}, "extract Coulomb SR energies"},
+ { "-coullr", FALSE, etBOOL, {&bCoulLR}, "extract Coulomb LR energies"},
+ { "-coul14", FALSE, etBOOL, {&bCoul14}, "extract Coulomb 1-4 energies"},
+ { "-ljsr", FALSE, etBOOL, {&bLJSR}, "extract Lennard-Jones SR energies"},
+ { "-ljlr", FALSE, etBOOL, {&bLJLR}, "extract Lennard-Jones LR energies"},
+ { "-lj14", FALSE, etBOOL, {&bLJ14}, "extract Lennard-Jones 1-4 energies"},
+ { "-bhamsr", FALSE, etBOOL, {&bBhamSR}, "extract Buckingham SR energies"},
+ { "-bhamlr", FALSE, etBOOL, {&bBhamLR}, "extract Buckingham LR energies"},
+ { "-free", FALSE, etBOOL, {&bFree}, "calculate free energy"},
+ { "-temp", FALSE, etREAL, {&reftemp},
+ "reference temperature for free energy calculation"}
+ };
+    /* We define egSP extra energy-group slots beyond egNR:
+       egTotal (total energy) */
+#define egTotal egNR
+#define egSP 1
+ gmx_bool egrp_use[egNR+egSP];
+ ener_file_t in;
+ FILE *out;
+ int timecheck = 0;
+ gmx_enxnm_t *enm = NULL;
+ t_enxframe *fr;
+ int teller = 0;
+ real sum;
+ gmx_bool bCont, bRef;
+ gmx_bool bCutmax, bCutmin;
+ real **eneset, *time = NULL;
+ int *set, i, j, k, prevk, m = 0, n, nre, nset, nenergy;
+ char **groups = NULL;
+ char groupname[255], fn[255];
+ int ngroups;
+ t_rgb rlo, rhi, rmid;
+ real emax, emid, emin;
+ real ***emat, **etot, *groupnr;
+ double beta, expE, **e, *eaver, *efree = NULL, edum;
+ char label[234];
+ char **ereflines, **erefres = NULL;
+ real *eref = NULL, *edif = NULL;
+ int neref = 0;
+ output_env_t oenv;
+
+ t_filenm fnm[] = {
+ { efEDR, "-f", NULL, ffOPTRD },
+ { efDAT, "-groups", "groups.dat", ffREAD },
+ { efDAT, "-eref", "eref.dat", ffOPTRD },
+ { efXPM, "-emat", "emat", ffWRITE },
+ { efXVG, "-etot", "energy", ffWRITE }
+ };
+#define NFILE asize(fnm)
+
+ parse_common_args(&argc, argv, PCA_CAN_VIEW | PCA_CAN_TIME | PCA_BE_NICE,
+ NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, NULL, &oenv);
+
++ for (i = 0; (i < egNR+egSP); i++)
++ {
++ egrp_use[i] = FALSE;
++ }
+ egrp_use[egCOULSR] = bCoulSR;
+ egrp_use[egLJSR] = bLJSR;
+ egrp_use[egBHAMSR] = bBhamSR;
+ egrp_use[egCOULLR] = bCoulLR;
+ egrp_use[egLJLR] = bLJLR;
+ egrp_use[egBHAMLR] = bBhamLR;
+ egrp_use[egCOUL14] = bCoul14;
+ egrp_use[egLJ14] = bLJ14;
+ egrp_use[egTotal] = TRUE;
+
+ bRef = opt2bSet("-eref", NFILE, fnm);
+ in = open_enx(ftp2fn(efEDR, NFILE, fnm), "r");
+ do_enxnms(in, &nre, &enm);
+
+ if (nre == 0)
+ {
+ gmx_fatal(FARGS, "No energies!\n");
+ }
+
+ bCutmax = opt2parg_bSet("-max", asize(pa), pa);
+ bCutmin = opt2parg_bSet("-min", asize(pa), pa);
+
+ nenergy = 0;
+
+    /* Read group names from the input file and construct the selection of
+       energy groups from it */
+
+ fprintf(stderr, "Will read groupnames from inputfile\n");
+ ngroups = get_lines(opt2fn("-groups", NFILE, fnm), &groups);
+ fprintf(stderr, "Read %d groups\n", ngroups);
+ snew(set, sqr(ngroups)*egNR/2);
+ n = 0;
+ prevk = 0;
+ for (i = 0; (i < ngroups); i++)
+ {
+ fprintf(stderr, "\rgroup %d", i);
+ for (j = i; (j < ngroups); j++)
+ {
+ for (m = 0; (m < egNR); m++)
+ {
+ if (egrp_use[m])
+ {
+ sprintf(groupname, "%s:%s-%s", egrp_nm[m], groups[i], groups[j]);
+#ifdef DEBUG
+ fprintf(stderr, "\r%-15s %5d", groupname, n);
+#endif
+ for (k = prevk; (k < prevk+nre); k++)
+ {
+ if (strcmp(enm[k%nre].name, groupname) == 0)
+ {
+ set[n++] = k;
+ break;
+ }
+ }
+ if (k == prevk+nre)
+ {
+                        fprintf(stderr, "WARNING! could not find group %s (%d,%d)"
+                                " in energy file\n", groupname, i, j);
+ }
+ else
+ {
+ prevk = k;
+ }
+ }
+ }
+ }
+ }
+ fprintf(stderr, "\n");
+ nset = n;
+ snew(eneset, nset+1);
+ fprintf(stderr, "Will select half-matrix of energies with %d elements\n", n);
+
+ /* Start reading energy frames */
+ snew(fr, 1);
+ do
+ {
+ do
+ {
+ bCont = do_enx(in, fr);
+ if (bCont)
+ {
+ timecheck = check_times(fr->t);
+ }
+ }
+ while (bCont && (timecheck < 0));
+
+ if (timecheck == 0)
+ {
+#define DONTSKIP(cnt) ((skip) ? (((cnt) % (skip)) == 0) : TRUE)
+
+ if (bCont)
+ {
+ fprintf(stderr, "\rRead frame: %d, Time: %.3f", teller, fr->t);
+
+ if ((nenergy % 1000) == 0)
+ {
+ srenew(time, nenergy+1000);
+ for (i = 0; (i <= nset); i++)
+ {
+ srenew(eneset[i], nenergy+1000);
+ }
+ }
+ time[nenergy] = fr->t;
+ sum = 0;
+ for (i = 0; (i < nset); i++)
+ {
+ eneset[i][nenergy] = fr->ener[set[i]].e;
+ sum += fr->ener[set[i]].e;
+ }
+ if (bSum)
+ {
+ eneset[nset][nenergy] = sum;
+ }
+ nenergy++;
+ }
+ teller++;
+ }
+ }
+ while (bCont && (timecheck == 0));
+
+ fprintf(stderr, "\n");
+
+ fprintf(stderr, "Will build energy half-matrix of %d groups, %d elements, "
+ "over %d frames\n", ngroups, nset, nenergy);
+
+ snew(emat, egNR+egSP);
+ for (j = 0; (j < egNR+egSP); j++)
+ {
+        if (egrp_use[j])
+ {
+ snew(emat[j], ngroups);
+ for (i = 0; (i < ngroups); i++)
+ {
+ snew(emat[j][i], ngroups);
+ }
+ }
+ }
+ snew(groupnr, ngroups);
+ for (i = 0; (i < ngroups); i++)
+ {
+ groupnr[i] = i+1;
+ }
+ rlo.r = 1.0, rlo.g = 0.0, rlo.b = 0.0;
+ rmid.r = 1.0, rmid.g = 1.0, rmid.b = 1.0;
+ rhi.r = 0.0, rhi.g = 0.0, rhi.b = 1.0;
+ if (bMeanEmtx)
+ {
+ snew(e, ngroups);
+ for (i = 0; (i < ngroups); i++)
+ {
+ snew(e[i], nenergy);
+ }
+ n = 0;
+ for (i = 0; (i < ngroups); i++)
+ {
+ for (j = i; (j < ngroups); j++)
+ {
+ for (m = 0; (m < egNR); m++)
+ {
+ if (egrp_use[m])
+ {
+ for (k = 0; (k < nenergy); k++)
+ {
+ emat[m][i][j] += eneset[n][k];
+ e[i][k] += eneset[n][k]; /* *0.5; */
+ e[j][k] += eneset[n][k]; /* *0.5; */
+ }
+ n++;
+ emat[egTotal][i][j] += emat[m][i][j];
+ emat[m][i][j] /= nenergy;
+ emat[m][j][i] = emat[m][i][j];
+ }
+ }
+ emat[egTotal][i][j] /= nenergy;
+ emat[egTotal][j][i] = emat[egTotal][i][j];
+ }
+ }
+ if (bFree)
+ {
+ if (bRef)
+ {
+ fprintf(stderr, "Will read reference energies from inputfile\n");
+ neref = get_lines(opt2fn("-eref", NFILE, fnm), &ereflines);
+ fprintf(stderr, "Read %d reference energies\n", neref);
+ snew(eref, neref);
+ snew(erefres, neref);
+ for (i = 0; (i < neref); i++)
+ {
+ snew(erefres[i], 5);
+ sscanf(ereflines[i], "%s %lf", erefres[i], &edum);
+ eref[i] = edum;
+ }
+ }
+ snew(eaver, ngroups);
+ for (i = 0; (i < ngroups); i++)
+ {
+ for (k = 0; (k < nenergy); k++)
+ {
+ eaver[i] += e[i][k];
+ }
+ eaver[i] /= nenergy;
+ }
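+            /* Exponential averaging as in the tool description:
+             * E_free = E_avg + kT*log(<exp((E - E_avg)/kT)>), where E_avg
+             * is subtracted inside the exponential to keep it in range.
+             */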
+ beta = 1.0/(BOLTZ*reftemp);
+ snew(efree, ngroups);
+ snew(edif, ngroups);
+ for (i = 0; (i < ngroups); i++)
+ {
+ expE = 0;
+ for (k = 0; (k < nenergy); k++)
+ {
+ expE += exp(beta*(e[i][k]-eaver[i]));
+ }
+ efree[i] = log(expE/nenergy)/beta + eaver[i];
+ if (bRef)
+ {
+ n = search_str2(neref, erefres, groups[i]);
+ if (n != -1)
+ {
+ edif[i] = efree[i]-eref[n];
+ }
+ else
+ {
+ edif[i] = efree[i];
+ fprintf(stderr, "WARNING: group %s not found "
+ "in reference energies.\n", groups[i]);
+ }
+ }
+ else
+ {
+ edif[i] = 0;
+ }
+ }
+ }
+
+ emid = 0.0; /*(emin+emax)*0.5;*/
+ egrp_nm[egTotal] = "total";
+ for (m = 0; (m < egNR+egSP); m++)
+ {
+ if (egrp_use[m])
+ {
+ emin = 1e10;
+ emax = -1e10;
+ for (i = 0; (i < ngroups); i++)
+ {
+ for (j = i; (j < ngroups); j++)
+ {
+ if (emat[m][i][j] > emax)
+ {
+ emax = emat[m][i][j];
+ }
+ else if (emat[m][i][j] < emin)
+ {
+ emin = emat[m][i][j];
+ }
+ }
+ }
+ if (emax == emin)
+ {
+ fprintf(stderr, "Matrix of %s energy is uniform at %f "
+ "(will not produce output).\n", egrp_nm[m], emax);
+ }
+ else
+ {
+ fprintf(stderr, "Matrix of %s energy ranges from %f to %f\n",
+ egrp_nm[m], emin, emax);
+ if ((bCutmax) || (emax > cutmax))
+ {
+ emax = cutmax;
+ }
+ if ((bCutmin) || (emin < cutmin))
+ {
+ emin = cutmin;
+ }
+ if ((emax == cutmax) || (emin == cutmin))
+ {
+ fprintf(stderr, "Energy range adjusted: %f to %f\n", emin, emax);
+ }
+
+ sprintf(fn, "%s%s", egrp_nm[m], ftp2fn(efXPM, NFILE, fnm));
+ sprintf(label, "%s Interaction Energies", egrp_nm[m]);
+ out = ffopen(fn, "w");
+ if (emin >= emid)
+ {
+ write_xpm(out, 0, label, "Energy (kJ/mol)",
+ "Residue Index", "Residue Index",
+ ngroups, ngroups, groupnr, groupnr, emat[m],
+ emid, emax, rmid, rhi, &nlevels);
+ }
+ else if (emax <= emid)
+ {
+ write_xpm(out, 0, label, "Energy (kJ/mol)",
+ "Residue Index", "Residue Index",
+ ngroups, ngroups, groupnr, groupnr, emat[m],
+ emin, emid, rlo, rmid, &nlevels);
+ }
+ else
+ {
+ write_xpm3(out, 0, label, "Energy (kJ/mol)",
+ "Residue Index", "Residue Index",
+ ngroups, ngroups, groupnr, groupnr, emat[m],
+ emin, emid, emax, rlo, rmid, rhi, &nlevels);
+ }
+ ffclose(out);
+ }
+ }
+ }
+ snew(etot, egNR+egSP);
+ for (m = 0; (m < egNR+egSP); m++)
+ {
+ snew(etot[m], ngroups);
+ for (i = 0; (i < ngroups); i++)
+ {
+ for (j = 0; (j < ngroups); j++)
+ {
+ etot[m][i] += emat[m][i][j];
+ }
+ }
+ }
+
+ out = xvgropen(ftp2fn(efXVG, NFILE, fnm), "Mean Energy", "Residue", "kJ/mol",
+ oenv);
+ xvgr_legend(out, 0, NULL, oenv);
+ j = 0;
+ for (m = 0; (m < egNR+egSP); m++)
+ {
+ if (egrp_use[m])
+ {
+ fprintf(out, "@ legend string %d \"%s\"\n", j++, egrp_nm[m]);
+ }
+ }
+ if (bFree)
+ {
+ fprintf(out, "@ legend string %d \"%s\"\n", j++, "Free");
+ }
+ if (bFree)
+ {
+ fprintf(out, "@ legend string %d \"%s\"\n", j++, "Diff");
+ }
+ fprintf(out, "@TYPE xy\n");
+ fprintf(out, "#%3s", "grp");
+ for (m = 0; (m < egNR+egSP); m++)
+ {
+ if (egrp_use[m])
+ {
+ fprintf(out, " %9s", egrp_nm[m]);
+ }
+ }
+ if (bFree)
+ {
+ fprintf(out, " %9s", "Free");
+ }
+ if (bFree)
+ {
+ fprintf(out, " %9s", "Diff");
+ }
+ fprintf(out, "\n");
+ for (i = 0; (i < ngroups); i++)
+ {
+ fprintf(out, "%3.0f", groupnr[i]);
+ for (m = 0; (m < egNR+egSP); m++)
+ {
+ if (egrp_use[m])
+ {
+ fprintf(out, " %9.5g", etot[m][i]);
+ }
+ }
+ if (bFree)
+ {
+ fprintf(out, " %9.5g", efree[i]);
+ }
+ if (bRef)
+ {
+ fprintf(out, " %9.5g", edif[i]);
+ }
+ fprintf(out, "\n");
+ }
+ ffclose(out);
+ }
+ else
+ {
+ fprintf(stderr, "While typing at your keyboard, suddenly...\n"
+ "...nothing happens.\nWARNING: Not Implemented Yet\n");
+/*
+ out=ftp2FILE(efMAT,NFILE,fnm,"w");
+ n=0;
+ emin=emax=0.0;
+ for (k=0; (k<nenergy); k++) {
+ for (i=0; (i<ngroups); i++)
+ for (j=i+1; (j<ngroups); j++)
+ emat[i][j]=eneset[n][k];
+ sprintf(label,"t=%.0f ps",time[k]);
+ write_matrix(out,ngroups,1,ngroups,groupnr,emat,label,emin,emax,nlevels);
+ n++;
+ }
+ ffclose(out);
+ */
+ }
+ close_enx(in);
+
+ return 0;
+}
--- /dev/null
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROningen Mixture of Alchemy and Childrens' Stories
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "physics.h"
+#include "vec.h"
+#include "maths.h"
+#include "txtdump.h"
+#include "bondf.h"
+#include "smalloc.h"
+#include "pbc.h"
+#include "ns.h"
+#include "macros.h"
+#include "names.h"
+#include "gmx_fatal.h"
+#include "mshift.h"
+#include "main.h"
+#include "disre.h"
+#include "orires.h"
+#include "force.h"
+#include "nonbonded.h"
+
- #ifdef GMX_X86_SSE2
- #define SIMD_BONDEDS
-
- /* Below are 3 SIMD vector operations.
- * Currently these are only used here, but they should be moved to
- * a general SIMD include file when used elsewhere.
- */
-
- /* SIMD inner-product of multiple vectors */
- static gmx_inline gmx_mm_pr
- gmx_iprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
- gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz)
- {
- gmx_mm_pr ret;
-
- ret = gmx_mul_pr(ax, bx);
- ret = gmx_madd_pr(ay, by, ret);
- ret = gmx_madd_pr(az, bz, ret);
-
- return ret;
- }
-
- /* SIMD norm squared of multiple vectors */
- static gmx_inline gmx_mm_pr
- gmx_norm2_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az)
- {
- gmx_mm_pr ret;
-
- ret = gmx_mul_pr(ax, ax);
- ret = gmx_madd_pr(ay, ay, ret);
- ret = gmx_madd_pr(az, az, ret);
-
- return ret;
- }
-
- /* SIMD cross-product of multiple vectors */
- static gmx_inline void
- gmx_cprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
- gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz,
- gmx_mm_pr *cx, gmx_mm_pr *cy, gmx_mm_pr *cz)
- {
- *cx = gmx_mul_pr(ay, bz);
- *cx = gmx_nmsub_pr(az, by, *cx);
-
- *cy = gmx_mul_pr(az, bx);
- *cy = gmx_nmsub_pr(ax, bz, *cy);
-
- *cz = gmx_mul_pr(ax, by);
- *cz = gmx_nmsub_pr(ay, bx, *cz);
- }
-
++/* Include the SIMD macro file and then check for support */
+#include "gmx_simd_macros.h"
++#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_TRIGONOMETRIC
++#define SIMD_BONDEDS
++#include "gmx_simd_vec.h"
+#endif
+
+/* Coefficient matrix for bicubic spline interpolation (used for cmap
+ * interactions). Find a better place for this?
+ */
+const int cmap_coeff_matrix[] = {
+ 1, 0, -3, 2, 0, 0, 0, 0, -3, 0, 9, -6, 2, 0, -6, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, -9, 6, -2, 0, 6, -4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -6, 0, 0, -6, 4,
+ 0, 0, 3, -2, 0, 0, 0, 0, 0, 0, -9, 6, 0, 0, 6, -4,
+ 0, 0, 0, 0, 1, 0, -3, 2, -2, 0, 6, -4, 1, 0, -3, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 3, -2, 1, 0, -3, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 2, 0, 0, 3, -2,
+ 0, 0, 0, 0, 0, 0, 3, -2, 0, 0, -6, 4, 0, 0, 3, -2,
+ 0, 1, -2, 1, 0, 0, 0, 0, 0, -3, 6, -3, 0, 2, -4, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -6, 3, 0, -2, 4, -2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 2, -2,
+ 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 3, -3, 0, 0, -2, 2,
+ 0, 0, 0, 0, 0, 1, -2, 1, 0, -2, 4, -2, 0, 1, -2, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 2, -1, 0, 1, -2, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 1,
+ 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 2, -2, 0, 0, -1, 1
+};
+
+
+
+int glatnr(int *global_atom_index, int i)
+{
+ int atnr;
+
+ if (global_atom_index == NULL)
+ {
+ atnr = i + 1;
+ }
+ else
+ {
+ atnr = global_atom_index[i] + 1;
+ }
+
+ return atnr;
+}
+
+static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
+{
+ if (pbc)
+ {
+ return pbc_dx_aiuc(pbc, xi, xj, dx);
+ }
+ else
+ {
+ rvec_sub(xi, xj, dx);
+ return CENTRAL;
+ }
+}
+
+#ifdef SIMD_BONDEDS
+
- /* Using -0.0 should lead to only the sign bit being set */
- gmx_mm_pr sign_mask_S = gmx_set1_pr(-0.0);
+/* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
+typedef struct {
+ gmx_mm_pr inv_bzz;
+ gmx_mm_pr inv_byy;
+ gmx_mm_pr inv_bxx;
+ gmx_mm_pr bzx;
+ gmx_mm_pr bzy;
+ gmx_mm_pr bzz;
+ gmx_mm_pr byx;
+ gmx_mm_pr byy;
+ gmx_mm_pr bxx;
+} pbc_simd_t;
+
+/* Set the SIMD pbc data from a normal t_pbc struct */
+static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
+{
+ rvec inv_bdiag;
+ int d;
+
+ /* Setting inv_bdiag to 0 effectively turns off PBC */
+ clear_rvec(inv_bdiag);
+ if (pbc != NULL)
+ {
+ for (d = 0; d < pbc->ndim_ePBC; d++)
+ {
+ inv_bdiag[d] = 1.0/pbc->box[d][d];
+ }
+ }
+
+ pbc_simd->inv_bzz = gmx_set1_pr(inv_bdiag[ZZ]);
+ pbc_simd->inv_byy = gmx_set1_pr(inv_bdiag[YY]);
+ pbc_simd->inv_bxx = gmx_set1_pr(inv_bdiag[XX]);
+
+ if (pbc != NULL)
+ {
+ pbc_simd->bzx = gmx_set1_pr(pbc->box[ZZ][XX]);
+ pbc_simd->bzy = gmx_set1_pr(pbc->box[ZZ][YY]);
+ pbc_simd->bzz = gmx_set1_pr(pbc->box[ZZ][ZZ]);
+ pbc_simd->byx = gmx_set1_pr(pbc->box[YY][XX]);
+ pbc_simd->byy = gmx_set1_pr(pbc->box[YY][YY]);
+ pbc_simd->bxx = gmx_set1_pr(pbc->box[XX][XX]);
+ }
+ else
+ {
+ pbc_simd->bzx = gmx_setzero_pr();
+ pbc_simd->bzy = gmx_setzero_pr();
+ pbc_simd->bzz = gmx_setzero_pr();
+ pbc_simd->byx = gmx_setzero_pr();
+ pbc_simd->byy = gmx_setzero_pr();
+ pbc_simd->bxx = gmx_setzero_pr();
+ }
+}
+
+/* Correct distance vector *dx,*dy,*dz for PBC using SIMD.
+ * The nearest-image shift is applied along z, then y, then x, which
+ * assumes the GROMACS reduced (lower-triangular) box representation.
+ */
+static gmx_inline void
+pbc_dx_simd(gmx_mm_pr *dx, gmx_mm_pr *dy, gmx_mm_pr *dz,
+ const pbc_simd_t *pbc)
+{
+ gmx_mm_pr sh;
+
+ sh = gmx_round_pr(gmx_mul_pr(*dz, pbc->inv_bzz));
+ *dx = gmx_nmsub_pr(sh, pbc->bzx, *dx);
+ *dy = gmx_nmsub_pr(sh, pbc->bzy, *dy);
+ *dz = gmx_nmsub_pr(sh, pbc->bzz, *dz);
+
+ sh = gmx_round_pr(gmx_mul_pr(*dy, pbc->inv_byy));
+ *dx = gmx_nmsub_pr(sh, pbc->byx, *dx);
+ *dy = gmx_nmsub_pr(sh, pbc->byy, *dy);
+
+ sh = gmx_round_pr(gmx_mul_pr(*dx, pbc->inv_bxx));
+ *dx = gmx_nmsub_pr(sh, pbc->bxx, *dx);
+}
+
+#endif /* SIMD_BONDEDS */
+
+/*
+ * Morse potential bond by Frank Everdij
+ *
+ * Three parameters needed:
+ *
+ * b0 = equilibrium distance in nm
+ * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
+ * cb = well depth in kJ/mol
+ *
+ * Note: the potential is referenced to be +cb at infinite separation
+ * and zero at the equilibrium distance!
+ */
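+/* For reference, the functional form implemented below, reconstructed
+ * from the code (fbond holds -(1/r) dV/dr, so fij = fbond*dx gives the
+ * Cartesian force components):
+ *
+ *   V(r)   = cb * (1 - exp(-be*(r - b0)))^2
+ *   -dV/dr = -2*be*cb * exp(-be*(r - b0)) * (1 - exp(-be*(r - b0)))
+ */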
+
+real morse_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const real one = 1.0;
+ const real two = 2.0;
+ real dr, dr2, temp, omtemp, cbomtemp, fbond, vbond, fij, vtot;
+ real b0, be, cb, b0A, beA, cbA, b0B, beB, cbB, L1;
+ rvec dx;
+ int i, m, ki, type, ai, aj;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ b0A = forceparams[type].morse.b0A;
+ beA = forceparams[type].morse.betaA;
+ cbA = forceparams[type].morse.cbA;
+
+ b0B = forceparams[type].morse.b0B;
+ beB = forceparams[type].morse.betaB;
+ cbB = forceparams[type].morse.cbB;
+
+ L1 = one-lambda; /* 1 */
+ b0 = L1*b0A + lambda*b0B; /* 3 */
+ be = L1*beA + lambda*beB; /* 3 */
+ cb = L1*cbA + lambda*cbB; /* 3 */
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+ temp = exp(-be*(dr-b0)); /* 12 */
+
+ if (temp == one)
+ {
+            /* bonds are constrained. This may _not_ include bond constraints if they are lambda dependent */
+ *dvdlambda += cbB-cbA;
+ continue;
+ }
+
+ omtemp = one-temp; /* 1 */
+ cbomtemp = cb*omtemp; /* 1 */
+ vbond = cbomtemp*omtemp; /* 1 */
+ fbond = -two*be*temp*cbomtemp*gmx_invsqrt(dr2); /* 9 */
+ vtot += vbond; /* 1 */
+
+ *dvdlambda += (cbB - cbA) * omtemp * omtemp - (2-2*omtemp)*omtemp * cb * ((b0B-b0A)*be - (beB-beA)*(dr-b0)); /* 15 */
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 83 TOTAL */
+ return vtot;
+}
+
+real cubic_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const real three = 3.0;
+ const real two = 2.0;
+ real kb, b0, kcub;
+ real dr, dr2, dist, kdist, kdist2, fbond, vbond, fij, vtot;
+ rvec dx;
+ int i, m, ki, type, ai, aj;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ b0 = forceparams[type].cubic.b0;
+ kb = forceparams[type].cubic.kb;
+ kcub = forceparams[type].cubic.kcub;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+ dist = dr-b0;
+ kdist = kb*dist;
+ kdist2 = kdist*dist;
+
+ vbond = kdist2 + kcub*kdist2*dist;
+ fbond = -(two*kdist + three*kdist2*kcub)/dr;
+
+ vtot += vbond; /* 21 */
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 54 TOTAL */
+ return vtot;
+}
+
+real FENE_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int *global_atom_index)
+{
+ const real half = 0.5;
+ const real one = 1.0;
+ real bm, kb;
+ real dr, dr2, bm2, omdr2obm2, fbond, vbond, fij, vtot;
+ rvec dx;
+ int i, m, ki, type, ai, aj;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ bm = forceparams[type].fene.bm;
+ kb = forceparams[type].fene.kb;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ bm2 = bm*bm;
+
+ if (dr2 >= bm2)
+ {
+ gmx_fatal(FARGS,
+ "r^2 (%f) >= bm^2 (%f) in FENE bond between atoms %d and %d",
+ dr2, bm2,
+ glatnr(global_atom_index, ai),
+ glatnr(global_atom_index, aj));
+ }
+
+ omdr2obm2 = one - dr2/bm2;
+
+ vbond = -half*kb*bm2*log(omdr2obm2);
+ fbond = -kb/omdr2obm2;
+
+ vtot += vbond; /* 35 */
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 58 TOTAL */
+ return vtot;
+}
+
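+/* Harmonic potential with both the force constant and the reference
+ * value interpolated linearly in lambda (a summary of the code below):
+ *   k  = (1-lambda)*kA + lambda*kB,  x0 = (1-lambda)*xA + lambda*xB
+ *   V  = 0.5*k*(x - x0)^2
+ *   dV/dlambda = 0.5*(kB - kA)*(x - x0)^2 + (xA - xB)*k*(x - x0),
+ * since d(x - x0)/dlambda = -(xB - xA). *F returns -dV/dx = -k*(x - x0).
+ */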
+real harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ real *V, real *F)
+{
+ const real half = 0.5;
+ real L1, kk, x0, dx, dx2;
+ real v, f, dvdlambda;
+
+ L1 = 1.0-lambda;
+ kk = L1*kA+lambda*kB;
+ x0 = L1*xA+lambda*xB;
+
+ dx = x-x0;
+ dx2 = dx*dx;
+
+ f = -kk*dx;
+ v = half*kk*dx2;
+ dvdlambda = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
+
+ *F = f;
+ *V = v;
+
+ return dvdlambda;
+
+ /* That was 19 flops */
+}
+
+
+real bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, fij, vtot;
+ rvec dx;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA,
+ forceparams[type].harmonic.rB,
+ dr, lambda, &vbond, &fbond); /* 19 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx_invsqrt(dr2); /* 6 */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "BONDS: dr = %10g vbond = %10g fbond = %10g\n",
+ dr, vbond, fbond);
+ }
+#endif
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 59 TOTAL */
+ return vtot;
+}
+
+real restraint_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, fij, vtot;
+ real L1;
+ real low, dlow, up1, dup1, up2, dup2, k, dk;
+ real drh, drh2;
+ rvec dx;
+ ivec dt;
+
+ L1 = 1.0 - lambda;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+
+ low = L1*forceparams[type].restraint.lowA + lambda*forceparams[type].restraint.lowB;
+ dlow = -forceparams[type].restraint.lowA + forceparams[type].restraint.lowB;
+ up1 = L1*forceparams[type].restraint.up1A + lambda*forceparams[type].restraint.up1B;
+ dup1 = -forceparams[type].restraint.up1A + forceparams[type].restraint.up1B;
+ up2 = L1*forceparams[type].restraint.up2A + lambda*forceparams[type].restraint.up2B;
+ dup2 = -forceparams[type].restraint.up2A + forceparams[type].restraint.up2B;
+ k = L1*forceparams[type].restraint.kA + lambda*forceparams[type].restraint.kB;
+ dk = -forceparams[type].restraint.kA + forceparams[type].restraint.kB;
+ /* 24 */
+
+ if (dr < low)
+ {
+ drh = dr - low;
+ drh2 = drh*drh;
+ vbond = 0.5*k*drh2;
+ fbond = -k*drh;
+ *dvdlambda += 0.5*dk*drh2 - k*dlow*drh;
+ } /* 11 */
+ else if (dr <= up1)
+ {
+ vbond = 0;
+ fbond = 0;
+ }
+ else if (dr <= up2)
+ {
+ drh = dr - up1;
+ drh2 = drh*drh;
+ vbond = 0.5*k*drh2;
+ fbond = -k*drh;
+ *dvdlambda += 0.5*dk*drh2 - k*dup1*drh;
+ } /* 11 */
+ else
+ {
+ drh = dr - up2;
+ vbond = k*(up2 - up1)*(0.5*(up2 - up1) + drh);
+ fbond = -k*(up2 - up1);
+ *dvdlambda += dk*(up2 - up1)*(0.5*(up2 - up1) + drh)
+ + k*(dup2 - dup1)*(up2 - up1 + drh)
+ - k*(up2 - up1)*dup2;
+ }
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx_invsqrt(dr2); /* 6 */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "BONDS: dr = %10g vbond = %10g fbond = %10g\n",
+ dr, vbond, fbond);
+ }
+#endif
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 59 TOTAL */
+
+ return vtot;
+}
+
+real polarize(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, fij, vtot, ksh;
+ rvec dx;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ksh = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].polarize.alpha;
+ if (debug)
+ {
+ fprintf(debug, "POL: local ai = %d aj = %d ksh = %.3f\n", ai, aj, ksh);
+ }
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /* 19 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx_invsqrt(dr2); /* 6 */
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 59 TOTAL */
+ return vtot;
+}
+
+real anharm_polarize(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, fij, vtot, ksh, khyp, drcut, ddr, ddr3;
+ rvec dx;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ksh = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].anharm_polarize.alpha; /* 7*/
+ khyp = forceparams[type].anharm_polarize.khyp;
+ drcut = forceparams[type].anharm_polarize.drcut;
+ if (debug)
+ {
+ fprintf(debug, "POL: local ai = %d aj = %d ksh = %.3f\n", ai, aj, ksh);
+ }
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /* 19 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ if (dr > drcut)
+ {
+ ddr = dr-drcut;
+ ddr3 = ddr*ddr*ddr;
+ vbond += khyp*ddr*ddr3;
+ fbond -= 4*khyp*ddr3;
+ }
+ fbond *= gmx_invsqrt(dr2); /* 6 */
+ vtot += vbond; /* 1*/
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 72 TOTAL */
+ return vtot;
+}
+
+real water_pol(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec gmx_unused fshift[],
+ const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+    /* This routine implements anisotropic polarizability for water, through
+     * a shell connected to a dummy with spring constants that differ in the
+     * three spatial dimensions in the molecular frame.
+     */
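+    /* Reading off the code below, the energy is a harmonic spring per
+     * molecular axis: V = 0.5*(kx*dx_x^2 + ky*dx_y^2 + kz*dx_z^2) with
+     * k_m = qS^2/(4 pi eps0 alpha_m) and dx the shell displacement in
+     * the molecular frame; hence the 0.5*vtot returned at the end.
+     */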
+ int i, m, aO, aH1, aH2, aD, aS, type, type0;
+ rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj;
+#ifdef DEBUG
+ rvec df;
+#endif
+ real vtot, fij, r_HH, r_OD, r_nW, tx, ty, tz, qS;
+
+ vtot = 0.0;
+ if (nbonds > 0)
+ {
+ type0 = forceatoms[0];
+ aS = forceatoms[5];
+ qS = md->chargeA[aS];
+ kk[XX] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_x;
+ kk[YY] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
+ kk[ZZ] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
+ r_HH = 1.0/forceparams[type0].wpol.rHH;
+ r_OD = 1.0/forceparams[type0].wpol.rOD;
+ if (debug)
+ {
+ fprintf(debug, "WPOL: qS = %10.5f aS = %5d\n", qS, aS);
+ fprintf(debug, "WPOL: kk = %10.3f %10.3f %10.3f\n",
+ kk[XX], kk[YY], kk[ZZ]);
+ fprintf(debug, "WPOL: rOH = %10.3f rHH = %10.3f rOD = %10.3f\n",
+ forceparams[type0].wpol.rOH,
+ forceparams[type0].wpol.rHH,
+ forceparams[type0].wpol.rOD);
+ }
+ for (i = 0; (i < nbonds); i += 6)
+ {
+ type = forceatoms[i];
+ if (type != type0)
+ {
+ gmx_fatal(FARGS, "Sorry, type = %d, type0 = %d, file = %s, line = %d",
+ type, type0, __FILE__, __LINE__);
+ }
+ aO = forceatoms[i+1];
+ aH1 = forceatoms[i+2];
+ aH2 = forceatoms[i+3];
+ aD = forceatoms[i+4];
+ aS = forceatoms[i+5];
+
+ /* Compute vectors describing the water frame */
+ rvec_sub(x[aH1], x[aO], dOH1);
+ rvec_sub(x[aH2], x[aO], dOH2);
+ rvec_sub(x[aH2], x[aH1], dHH);
+ rvec_sub(x[aD], x[aO], dOD);
+ rvec_sub(x[aS], x[aD], dDS);
+ cprod(dOH1, dOH2, nW);
+
+ /* Compute inverse length of normal vector
+ * (this one could be precomputed, but I'm too lazy now)
+ */
+ r_nW = gmx_invsqrt(iprod(nW, nW));
+ /* This is for precision, but does not make a big difference,
+ * it can go later.
+ */
+ r_OD = gmx_invsqrt(iprod(dOD, dOD));
+
+ /* Normalize the vectors in the water frame */
+ svmul(r_nW, nW, nW);
+ svmul(r_HH, dHH, dHH);
+ svmul(r_OD, dOD, dOD);
+
+ /* Compute displacement of shell along components of the vector */
+ dx[ZZ] = iprod(dDS, dOD);
+ /* Compute projection on the XY plane: dDS - dx[ZZ]*dOD */
+ for (m = 0; (m < DIM); m++)
+ {
+ proj[m] = dDS[m]-dx[ZZ]*dOD[m];
+ }
+
+ /*dx[XX] = iprod(dDS,nW);
+ dx[YY] = iprod(dDS,dHH);*/
+ dx[XX] = iprod(proj, nW);
+ for (m = 0; (m < DIM); m++)
+ {
+ proj[m] -= dx[XX]*nW[m];
+ }
+ dx[YY] = iprod(proj, dHH);
+ /*#define DEBUG*/
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "WPOL: dx2=%10g dy2=%10g dz2=%10g sum=%10g dDS^2=%10g\n",
+ sqr(dx[XX]), sqr(dx[YY]), sqr(dx[ZZ]), iprod(dx, dx), iprod(dDS, dDS));
+ fprintf(debug, "WPOL: dHH=(%10g,%10g,%10g)\n", dHH[XX], dHH[YY], dHH[ZZ]);
+ fprintf(debug, "WPOL: dOD=(%10g,%10g,%10g), 1/r_OD = %10g\n",
+ dOD[XX], dOD[YY], dOD[ZZ], 1/r_OD);
+ fprintf(debug, "WPOL: nW =(%10g,%10g,%10g), 1/r_nW = %10g\n",
+ nW[XX], nW[YY], nW[ZZ], 1/r_nW);
+ fprintf(debug, "WPOL: dx =%10g, dy =%10g, dz =%10g\n",
+ dx[XX], dx[YY], dx[ZZ]);
+ fprintf(debug, "WPOL: dDSx=%10g, dDSy=%10g, dDSz=%10g\n",
+ dDS[XX], dDS[YY], dDS[ZZ]);
+ }
+#endif
+ /* Now compute the forces and energy */
+ kdx[XX] = kk[XX]*dx[XX];
+ kdx[YY] = kk[YY]*dx[YY];
+ kdx[ZZ] = kk[ZZ]*dx[ZZ];
+ vtot += iprod(dx, kdx);
+ for (m = 0; (m < DIM); m++)
+ {
+ /* This is a tensor operation but written out for speed */
+ tx = nW[m]*kdx[XX];
+ ty = dHH[m]*kdx[YY];
+ tz = dOD[m]*kdx[ZZ];
+ fij = -tx-ty-tz;
+#ifdef DEBUG
+ df[m] = fij;
+#endif
+ f[aS][m] += fij;
+ f[aD][m] -= fij;
+ }
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "WPOL: vwpol=%g\n", 0.5*iprod(dx, kdx));
+ fprintf(debug, "WPOL: df = (%10g, %10g, %10g)\n", df[XX], df[YY], df[ZZ]);
+ }
+#endif
+ }
+ }
+ return 0.5*vtot;
+}
+
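+/* One Thole-style damped Coulomb pair. In the expressions below, with
+ * rbar = afac*r12, the energy is
+ *   V = (qq/(4 pi eps0 r12)) * (1 - (1 + rbar/2)*exp(-rbar))
+ * and fscal is -(1/r12) dV/dr12, as used for the force components.
+ */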
+static real do_1_thole(const rvec xi, const rvec xj, rvec fi, rvec fj,
+ const t_pbc *pbc, real qq,
+ rvec fshift[], real afac)
+{
+ rvec r12;
+ real r12sq, r12_1, r12n, r12bar, v0, v1, fscal, ebar, fff;
+ int m, t;
+
+ t = pbc_rvec_sub(pbc, xi, xj, r12); /* 3 */
+
+ r12sq = iprod(r12, r12); /* 5 */
+ r12_1 = gmx_invsqrt(r12sq); /* 5 */
+ r12bar = afac/r12_1; /* 5 */
+ v0 = qq*ONE_4PI_EPS0*r12_1; /* 2 */
+ ebar = exp(-r12bar); /* 5 */
+ v1 = (1-(1+0.5*r12bar)*ebar); /* 4 */
+ fscal = ((v0*r12_1)*v1 - v0*0.5*afac*ebar*(r12bar+1))*r12_1; /* 9 */
+ if (debug)
+ {
+ fprintf(debug, "THOLE: v0 = %.3f v1 = %.3f r12= % .3f r12bar = %.3f fscal = %.3f ebar = %.3f\n", v0, v1, 1/r12_1, r12bar, fscal, ebar);
+ }
+
+ for (m = 0; (m < DIM); m++)
+ {
+ fff = fscal*r12[m];
+ fi[m] += fff;
+ fj[m] -= fff;
+ fshift[t][m] += fff;
+ fshift[CENTRAL][m] -= fff;
+ } /* 15 */
+
+ return v0*v1; /* 1 */
+ /* 54 */
+}
+
+real thole_pol(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ /* Interaction between two pairs of particles with opposite charge */
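+    /* Each entry couples the atom/shell pair (a1,da1) to (a2,da2); the
+     * four do_1_thole() calls below are the atom-atom, shell-atom,
+     * atom-shell and shell-shell terms, with charge products
+     * qq, -qq, -qq and qq respectively.
+     */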
+ int i, type, a1, da1, a2, da2;
+ real q1, q2, qq, a, al1, al2, afac;
+ real V = 0;
+
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ a1 = forceatoms[i++];
+ da1 = forceatoms[i++];
+ a2 = forceatoms[i++];
+ da2 = forceatoms[i++];
+ q1 = md->chargeA[da1];
+ q2 = md->chargeA[da2];
+ a = forceparams[type].thole.a;
+ al1 = forceparams[type].thole.alpha1;
+ al2 = forceparams[type].thole.alpha2;
+ qq = q1*q2;
+ afac = a*pow(al1*al2, -1.0/6.0);
+ V += do_1_thole(x[a1], x[a2], f[a1], f[a2], pbc, qq, fshift, afac);
+ V += do_1_thole(x[da1], x[a2], f[da1], f[a2], pbc, -qq, fshift, afac);
+ V += do_1_thole(x[a1], x[da2], f[a1], f[da2], pbc, -qq, fshift, afac);
+ V += do_1_thole(x[da1], x[da2], f[da1], f[da2], pbc, qq, fshift, afac);
+ }
+ /* 290 flops */
+ return V;
+}
+
+real bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ rvec r_ij, rvec r_kj, real *costh,
+ int *t1, int *t2)
+/* Return value is the angle between the bonds i-j and j-k */
+{
+ /* 41 FLOPS */
+ real th;
+
+ *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */
+ *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */
+
+ *costh = cos_angle(r_ij, r_kj); /* 25 */
+ th = acos(*costh); /* 10 */
+ /* 41 TOTAL */
+ return th;
+}
+
+real angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ai, aj, ak, t1, t2, type;
+ rvec r_ij, r_kj;
+ real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; i < nbonds; )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA*DEG2RAD,
+ forceparams[type].harmonic.rB*DEG2RAD,
+ theta, lambda, &va, &dVdt); /* 21 */
+ vtot += va;
+
+ cos_theta2 = sqr(cos_theta);
+ if (cos_theta2 < 1)
+ {
+ int m;
+ real st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ real nrkj_1, nrij_1;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n",
+ theta*RAD2DEG, va, dVdt);
+ }
+#endif
+ nrij2 = iprod(r_ij, r_ij); /* 5 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+
+ nrij_1 = gmx_invsqrt(nrij2); /* 10 */
+ nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */
+
+ cik = st*nrij_1*nrkj_1; /* 2 */
+ cii = sth*nrij_1*nrij_1; /* 2 */
+ ckk = sth*nrkj_1*nrkj_1; /* 2 */
+
+ for (m = 0; m < DIM; m++)
+ { /* 39 */
+ f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]);
+ f_j[m] = -f_i[m] - f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ if (g != NULL)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ } /* 161 TOTAL */
+ }
+
+ return vtot;
+}
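+
+/* Force derivation for the loop above (a sketch, assuming harmonic()
+ * returns dVdt = -dV/dtheta): with c = cos(theta) and
+ * dtheta/dr_i = -(1/sin(theta)) * dc/dr_i, the force on atom i is
+ * F_i = -dV/dtheta * dtheta/dr_i = -(cik*r_kj - cii*r_ij),
+ * where st = dVdt/sin(theta) and sth = st*c. F_k follows by the i<->k
+ * symmetry and F_j = -F_i - F_k from translational invariance.
+ */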
+
+#ifdef SIMD_BONDEDS
+
+/* As angles, but using SIMD to calculate many angles at once.
+ * This routine does not calculate energies or shift forces.
+ */
+static gmx_inline void
+angles_noener_simd(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+#define UNROLL GMX_SIMD_WIDTH_HERE
+ const int nfa1 = 4;
+ int i, iu, s, m;
+ int type, ai[UNROLL], aj[UNROLL], ak[UNROLL];
+ real coeff_array[2*UNROLL+UNROLL], *coeff;
+ real dr_array[2*DIM*UNROLL+UNROLL], *dr;
+ real f_buf_array[6*UNROLL+UNROLL], *f_buf;
+ gmx_mm_pr k_S, theta0_S;
+ gmx_mm_pr rijx_S, rijy_S, rijz_S;
+ gmx_mm_pr rkjx_S, rkjy_S, rkjz_S;
+ gmx_mm_pr one_S;
+ gmx_mm_pr rij_rkj_S;
+ gmx_mm_pr nrij2_S, nrij_1_S;
+ gmx_mm_pr nrkj2_S, nrkj_1_S;
+ gmx_mm_pr cos_S, sin_S;
+ gmx_mm_pr theta_S;
+ gmx_mm_pr st_S, sth_S;
+ gmx_mm_pr cik_S, cii_S, ckk_S;
+ gmx_mm_pr f_ix_S, f_iy_S, f_iz_S;
+ gmx_mm_pr f_kx_S, f_ky_S, f_kz_S;
+ pbc_simd_t pbc_simd;
+
+ /* Ensure register memory alignment */
+ coeff = gmx_simd_align_real(coeff_array);
+ dr = gmx_simd_align_real(dr_array);
+ f_buf = gmx_simd_align_real(f_buf_array);
+
+ set_pbc_simd(pbc, &pbc_simd);
+
+ one_S = gmx_set1_pr(1.0);
+
+ /* nbonds is the number of angles times nfa1, here we step UNROLL angles */
+ for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+ {
+ /* Collect atoms for UNROLL angles.
+ * iu indexes into forceatoms; we must not let iu go beyond nbonds.
+ */
+ iu = i;
+ for (s = 0; s < UNROLL; s++)
+ {
+ type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+
+ coeff[s] = forceparams[type].harmonic.krA;
+ coeff[UNROLL+s] = forceparams[type].harmonic.rA*DEG2RAD;
+
+ /* If you can't use pbc_dx_simd below for PBC, e.g. because
+ * you can't round in SIMD, use pbc_rvec_sub here.
+ */
+ /* Store the non PBC corrected distances packed and aligned */
+ for (m = 0; m < DIM; m++)
+ {
+ dr[s + m *UNROLL] = x[ai[s]][m] - x[aj[s]][m];
+ dr[s + (DIM+m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m];
+ }
+
+ /* At the end fill the arrays with identical entries */
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+
+ k_S = gmx_load_pr(coeff);
+ theta0_S = gmx_load_pr(coeff+UNROLL);
+
+ rijx_S = gmx_load_pr(dr + 0*UNROLL);
+ rijy_S = gmx_load_pr(dr + 1*UNROLL);
+ rijz_S = gmx_load_pr(dr + 2*UNROLL);
+ rkjx_S = gmx_load_pr(dr + 3*UNROLL);
+ rkjy_S = gmx_load_pr(dr + 4*UNROLL);
+ rkjz_S = gmx_load_pr(dr + 5*UNROLL);
+
+ pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
+ pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
+
+ rij_rkj_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
+ rkjx_S, rkjy_S, rkjz_S);
+
+ nrij2_S = gmx_norm2_pr(rijx_S, rijy_S, rijz_S);
+ nrkj2_S = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
+
+ nrij_1_S = gmx_invsqrt_pr(nrij2_S);
+ nrkj_1_S = gmx_invsqrt_pr(nrkj2_S);
+
+ cos_S = gmx_mul_pr(rij_rkj_S, gmx_mul_pr(nrij_1_S, nrkj_1_S));
+
+ theta_S = gmx_acos_pr(cos_S);
+
+ sin_S = gmx_invsqrt_pr(gmx_max_pr(gmx_sub_pr(one_S, gmx_mul_pr(cos_S, cos_S)),
+ gmx_setzero_pr()));
+ st_S = gmx_mul_pr(gmx_mul_pr(k_S, gmx_sub_pr(theta0_S, theta_S)),
+ sin_S);
+ sth_S = gmx_mul_pr(st_S, cos_S);
+
+ cik_S = gmx_mul_pr(st_S, gmx_mul_pr(nrij_1_S, nrkj_1_S));
+ cii_S = gmx_mul_pr(sth_S, gmx_mul_pr(nrij_1_S, nrij_1_S));
+ ckk_S = gmx_mul_pr(sth_S, gmx_mul_pr(nrkj_1_S, nrkj_1_S));
+
+ f_ix_S = gmx_mul_pr(cii_S, rijx_S);
+ f_ix_S = gmx_nmsub_pr(cik_S, rkjx_S, f_ix_S);
+ f_iy_S = gmx_mul_pr(cii_S, rijy_S);
+ f_iy_S = gmx_nmsub_pr(cik_S, rkjy_S, f_iy_S);
+ f_iz_S = gmx_mul_pr(cii_S, rijz_S);
+ f_iz_S = gmx_nmsub_pr(cik_S, rkjz_S, f_iz_S);
+ f_kx_S = gmx_mul_pr(ckk_S, rkjx_S);
+ f_kx_S = gmx_nmsub_pr(cik_S, rijx_S, f_kx_S);
+ f_ky_S = gmx_mul_pr(ckk_S, rkjy_S);
+ f_ky_S = gmx_nmsub_pr(cik_S, rijy_S, f_ky_S);
+ f_kz_S = gmx_mul_pr(ckk_S, rkjz_S);
+ f_kz_S = gmx_nmsub_pr(cik_S, rijz_S, f_kz_S);
+
+ gmx_store_pr(f_buf + 0*UNROLL, f_ix_S);
+ gmx_store_pr(f_buf + 1*UNROLL, f_iy_S);
+ gmx_store_pr(f_buf + 2*UNROLL, f_iz_S);
+ gmx_store_pr(f_buf + 3*UNROLL, f_kx_S);
+ gmx_store_pr(f_buf + 4*UNROLL, f_ky_S);
+ gmx_store_pr(f_buf + 5*UNROLL, f_kz_S);
+
+ iu = i;
+ s = 0;
+ do
+ {
+ for (m = 0; m < DIM; m++)
+ {
+ f[ai[s]][m] += f_buf[s + m*UNROLL];
+ f[aj[s]][m] -= f_buf[s + m*UNROLL] + f_buf[s + (DIM+m)*UNROLL];
+ f[ak[s]][m] += f_buf[s + (DIM+m)*UNROLL];
+ }
+ s++;
+ iu += nfa1;
+ }
+ while (s < UNROLL && iu < nbonds);
+ }
+#undef UNROLL
+}
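+
+/* Note on the tail handling above: when fewer than UNROLL angles remain,
+ * the surplus SIMD lanes are filled with copies of the last valid angle
+ * (iu only advances while iu + nfa1 < nbonds). These duplicate lanes are
+ * computed but never scattered back, because the write-back loop stops
+ * once iu reaches nbonds, so no force is counted twice.
+ */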
+
+#endif /* SIMD_BONDEDS */
+
+real linear_angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ai, aj, ak, t1, t2, type;
+ rvec f_i, f_j, f_k;
+ real L1, kA, kB, aA, aB, dr, dr2, va, vtot, a, b, klin;
+ ivec jt, dt_ij, dt_kj;
+ rvec r_ij, r_kj, r_ik, dx;
+
+ L1 = 1-lambda;
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ kA = forceparams[type].linangle.klinA;
+ kB = forceparams[type].linangle.klinB;
+ klin = L1*kA + lambda*kB;
+
+ aA = forceparams[type].linangle.aA;
+ aB = forceparams[type].linangle.aB;
+ a = L1*aA+lambda*aB;
+ b = 1-a;
+
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ rvec_sub(r_ij, r_kj, r_ik);
+
+ dr2 = 0;
+ for (m = 0; (m < DIM); m++)
+ {
+ dr = -a * r_ij[m] - b * r_kj[m];
+ dr2 += dr*dr;
+ dx[m] = dr;
+ f_i[m] = a*klin*dr;
+ f_k[m] = b*klin*dr;
+ f_j[m] = -(f_i[m]+f_k[m]);
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ va = 0.5*klin*dr2;
+ *dvdlambda += 0.5*(kB-kA)*dr2 + klin*(aB-aA)*iprod(dx, r_ik);
+
+ vtot += va;
+
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ } /* 57 TOTAL */
+ return vtot;
+}
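+
+/* Worked form of the potential above: since a + b = 1, the displacement
+ * dx = -a*r_ij - b*r_kj equals x_j - (a*x_i + b*x_k) up to periodic
+ * shifts, i.e. the deviation of the central atom from the point
+ * interpolated between its neighbours, and V = 0.5*klin*|dx|^2 restrains
+ * the three atoms towards a straight line.
+ */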
+
+real urey_bradley(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ai, aj, ak, t1, t2, type, ki;
+ rvec r_ij, r_kj, r_ik;
+ real cos_theta, cos_theta2, theta;
+ real dVdt, va, vtot, dr, dr2, vbond, fbond, fik;
+ real kthA, th0A, kUBA, r13A, kthB, th0B, kUBB, r13B;
+ ivec jt, dt_ij, dt_kj, dt_ik;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ th0A = forceparams[type].u_b.thetaA*DEG2RAD;
+ kthA = forceparams[type].u_b.kthetaA;
+ r13A = forceparams[type].u_b.r13A;
+ kUBA = forceparams[type].u_b.kUBA;
+ th0B = forceparams[type].u_b.thetaB*DEG2RAD;
+ kthB = forceparams[type].u_b.kthetaB;
+ r13B = forceparams[type].u_b.r13B;
+ kUBB = forceparams[type].u_b.kUBB;
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ *dvdlambda += harmonic(kthA, kthB, th0A, th0B, theta, lambda, &va, &dVdt); /* 21 */
+ vtot += va;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik); /* 3 */
+ dr2 = iprod(r_ik, r_ik); /* 5 */
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(kUBA, kUBB, r13A, r13B, dr, lambda, &vbond, &fbond); /* 19 */
+
+ cos_theta2 = sqr(cos_theta); /* 1 */
+ if (cos_theta2 < 1)
+ {
+ real st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n",
+ theta*RAD2DEG, va, dVdt);
+ }
+#endif
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ nrij2 = iprod(r_ij, r_ij);
+
+ cik = st*gmx_invsqrt(nrkj2*nrij2); /* 12 */
+ cii = sth/nrij2; /* 10 */
+ ckk = sth/nrkj2; /* 10 */
+
+ for (m = 0; (m < DIM); m++) /* 39 */
+ {
+ f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ } /* 161 TOTAL */
+ /* Time for the bond calculations */
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx_invsqrt(dr2); /* 6 */
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), dt_ik);
+ ki = IVEC2IS(dt_ik);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fik = fbond*r_ik[m];
+ f[ai][m] += fik;
+ f[ak][m] -= fik;
+ fshift[ki][m] += fik;
+ fshift[CENTRAL][m] -= fik;
+ }
+ }
+ return vtot;
+}
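+
+/* Worked form: the Urey-Bradley term above is the sum of a harmonic
+ * angle potential 0.5*ktheta*(theta - theta0)^2 over i-j-k and a
+ * harmonic 1-3 distance term 0.5*kUB*(r_ik - r13)^2 between the outer
+ * atoms, both interpolated between the A and B parameters by harmonic().
+ */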
+
+real quartic_angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, j, ai, aj, ak, t1, t2, type;
+ rvec r_ij, r_kj;
+ real cos_theta, cos_theta2, theta, dt, dVdt, va, dtp, c, vtot;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ dt = theta - forceparams[type].qangle.theta*DEG2RAD; /* 2 */
+
+ dVdt = 0;
+ va = forceparams[type].qangle.c[0];
+ dtp = 1.0;
+ for (j = 1; j <= 4; j++)
+ {
+ c = forceparams[type].qangle.c[j];
+ dVdt -= j*c*dtp;
+ dtp *= dt;
+ va += c*dtp;
+ }
+ /* 20 */
+
+ vtot += va;
+
+ cos_theta2 = sqr(cos_theta); /* 1 */
+ if (cos_theta2 < 1)
+ {
+ int m;
+ real st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n",
+ theta*RAD2DEG, va, dVdt);
+ }
+#endif
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ nrij2 = iprod(r_ij, r_ij);
+
+ cik = st*gmx_invsqrt(nrkj2*nrij2); /* 12 */
+ cii = sth/nrij2; /* 10 */
+ ckk = sth/nrkj2; /* 10 */
+
+ for (m = 0; (m < DIM); m++) /* 39 */
+ {
+ f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ } /* 153 TOTAL */
+ }
+ return vtot;
+}
+
+real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl,
+ const t_pbc *pbc,
+ rvec r_ij, rvec r_kj, rvec r_kl, rvec m, rvec n,
+ real *sign, int *t1, int *t2, int *t3)
+{
+ real ipr, phi;
+
+ *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */
+ *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */
+ *t3 = pbc_rvec_sub(pbc, xk, xl, r_kl); /* 3 */
+
+ cprod(r_ij, r_kj, m); /* 9 */
+ cprod(r_kj, r_kl, n); /* 9 */
+ phi = gmx_angle(m, n); /* 49 (assuming 25 for atan2) */
+ ipr = iprod(r_ij, n); /* 5 */
+ (*sign) = (ipr < 0.0) ? -1.0 : 1.0;
+ phi = (*sign)*phi; /* 1 */
+ /* 82 TOTAL */
+ return phi;
+}
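+
+/* Sign convention used above: with the plane normals m = r_ij x r_kj and
+ * n = r_kj x r_kl, gmx_angle(m, n) gives the magnitude of the dihedral
+ * in [0,pi], and the sign of r_ij . n selects the rotation sense, so phi
+ * ends up in (-pi,pi] with phi = 0 for the cis conformation (the
+ * IUPAC/IUB convention used by GROMACS).
+ */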
+
+
+#ifdef SIMD_BONDEDS
+
+/* As dih_angle above, but calculates GMX_SIMD_WIDTH_HERE dihedral angles
+ * at once using SIMD; it also calculates the pre-factors required for
+ * the dihedral force update.
+ * Note that dr, p and q should be aligned to the SIMD register width.
+ */
+static gmx_inline void
+dih_angle_simd(const rvec *x,
+ const int *ai, const int *aj, const int *ak, const int *al,
+ const pbc_simd_t *pbc,
+ real *dr,
+ gmx_mm_pr *phi_S,
+ gmx_mm_pr *mx_S, gmx_mm_pr *my_S, gmx_mm_pr *mz_S,
+ gmx_mm_pr *nx_S, gmx_mm_pr *ny_S, gmx_mm_pr *nz_S,
+ gmx_mm_pr *nrkj_m2_S,
+ gmx_mm_pr *nrkj_n2_S,
+ real *p,
+ real *q)
+{
+#define UNROLL GMX_SIMD_WIDTH_HERE
+ int s, m;
+ gmx_mm_pr rijx_S, rijy_S, rijz_S;
+ gmx_mm_pr rkjx_S, rkjy_S, rkjz_S;
+ gmx_mm_pr rklx_S, rkly_S, rklz_S;
+ gmx_mm_pr cx_S, cy_S, cz_S;
+ gmx_mm_pr cn_S;
+ gmx_mm_pr s_S;
+ gmx_mm_pr ipr_S;
+ gmx_mm_pr iprm_S, iprn_S;
+ gmx_mm_pr nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
+ gmx_mm_pr p_S, q_S;
+ gmx_mm_pr fmin_S = gmx_set1_pr(GMX_FLOAT_MIN);
+
+ for (s = 0; s < UNROLL; s++)
+ {
+ /* If you can't use pbc_dx_simd below for PBC, e.g. because
+ * you can't round in SIMD, use pbc_rvec_sub here.
+ */
+ for (m = 0; m < DIM; m++)
+ {
+ dr[s + (0*DIM + m)*UNROLL] = x[ai[s]][m] - x[aj[s]][m];
+ dr[s + (1*DIM + m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m];
+ dr[s + (2*DIM + m)*UNROLL] = x[ak[s]][m] - x[al[s]][m];
+ }
+ }
+
+ rijx_S = gmx_load_pr(dr + 0*UNROLL);
+ rijy_S = gmx_load_pr(dr + 1*UNROLL);
+ rijz_S = gmx_load_pr(dr + 2*UNROLL);
+ rkjx_S = gmx_load_pr(dr + 3*UNROLL);
+ rkjy_S = gmx_load_pr(dr + 4*UNROLL);
+ rkjz_S = gmx_load_pr(dr + 5*UNROLL);
+ rklx_S = gmx_load_pr(dr + 6*UNROLL);
+ rkly_S = gmx_load_pr(dr + 7*UNROLL);
+ rklz_S = gmx_load_pr(dr + 8*UNROLL);
+
+ pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
+ pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
+ pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc);
+
+ gmx_cprod_pr(rijx_S, rijy_S, rijz_S,
+ rkjx_S, rkjy_S, rkjz_S,
+ mx_S, my_S, mz_S);
+
+ gmx_cprod_pr(rkjx_S, rkjy_S, rkjz_S,
+ rklx_S, rkly_S, rklz_S,
+ nx_S, ny_S, nz_S);
+
+ gmx_cprod_pr(*mx_S, *my_S, *mz_S,
+ *nx_S, *ny_S, *nz_S,
+ &cx_S, &cy_S, &cz_S);
+
+ cn_S = gmx_sqrt_pr(gmx_norm2_pr(cx_S, cy_S, cz_S));
+
+ s_S = gmx_iprod_pr(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
+
+ /* Determine the dihedral angle, the sign might need correction */
+ *phi_S = gmx_atan2_pr(cn_S, s_S);
+
+ ipr_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
+ *nx_S, *ny_S, *nz_S);
+
+ iprm_S = gmx_norm2_pr(*mx_S, *my_S, *mz_S);
+ iprn_S = gmx_norm2_pr(*nx_S, *ny_S, *nz_S);
+
+ nrkj2_S = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
+
+ /* Avoid division by zero. When zero, the result is multiplied by 0
+ * anyhow, so the 3 max below do not affect the final result.
+ */
+ nrkj2_S = gmx_max_pr(nrkj2_S, fmin_S);
+ nrkj_1_S = gmx_invsqrt_pr(nrkj2_S);
+ nrkj_2_S = gmx_mul_pr(nrkj_1_S, nrkj_1_S);
+ nrkj_S = gmx_mul_pr(nrkj2_S, nrkj_1_S);
+
+ iprm_S = gmx_max_pr(iprm_S, fmin_S);
+ iprn_S = gmx_max_pr(iprn_S, fmin_S);
+ *nrkj_m2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprm_S));
+ *nrkj_n2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprn_S));
+
+ /* Set the sign of phi_S to that of ipr_S; phi_S is currently positive */
+ *phi_S = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S);
+
+ p_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
+ rkjx_S, rkjy_S, rkjz_S);
+ p_S = gmx_mul_pr(p_S, nrkj_2_S);
+
+ q_S = gmx_iprod_pr(rklx_S, rkly_S, rklz_S,
+ rkjx_S, rkjy_S, rkjz_S);
+ q_S = gmx_mul_pr(q_S, nrkj_2_S);
+
+ gmx_store_pr(p, p_S);
+ gmx_store_pr(q, q_S);
+#undef UNROLL
+}
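+
+/* Note on the outputs above: p = (r_ij . r_kj)/|r_kj|^2 and
+ * q = (r_kl . r_kj)/|r_kj|^2 are the projection factors used by
+ * do_dih_fup_noshiftf_precalc() to construct the inner-atom forces,
+ * while nrkj_m2 = |r_kj|/|m|^2 and nrkj_n2 = |r_kj|/|n|^2 scale the
+ * forces on the outer atoms along the plane normals m and n.
+ */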
+
+#endif /* SIMD_BONDEDS */
+
+
+void do_dih_fup(int i, int j, int k, int l, real ddphi,
+ rvec r_ij, rvec r_kj, rvec r_kl,
+ rvec m, rvec n, rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ const rvec x[], int t1, int t2, int t3)
+{
+ /* 143 FLOPS */
+ rvec f_i, f_j, f_k, f_l;
+ rvec uvec, vvec, svec, dx_jl;
+ real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
+ real a, b, p, q, toler;
+ ivec jt, dt_ij, dt_kj, dt_lj;
+
+ iprm = iprod(m, m); /* 5 */
+ iprn = iprod(n, n); /* 5 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ toler = nrkj2*GMX_REAL_EPS;
+ if ((iprm > toler) && (iprn > toler))
+ {
+ nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */
+ nrkj_2 = nrkj_1*nrkj_1; /* 1 */
+ nrkj = nrkj2*nrkj_1; /* 1 */
+ a = -ddphi*nrkj/iprm; /* 11 */
+ svmul(a, m, f_i); /* 3 */
+ b = ddphi*nrkj/iprn; /* 11 */
+ svmul(b, n, f_l); /* 3 */
+ p = iprod(r_ij, r_kj); /* 5 */
+ p *= nrkj_2; /* 1 */
+ q = iprod(r_kl, r_kj); /* 5 */
+ q *= nrkj_2; /* 1 */
+ svmul(p, f_i, uvec); /* 3 */
+ svmul(q, f_l, vvec); /* 3 */
+ rvec_sub(uvec, vvec, svec); /* 3 */
+ rvec_sub(f_i, svec, f_j); /* 3 */
+ rvec_add(f_l, svec, f_k); /* 3 */
+ rvec_inc(f[i], f_i); /* 3 */
+ rvec_dec(f[j], f_j); /* 3 */
+ rvec_dec(f[k], f_k); /* 3 */
+ rvec_inc(f[l], f_l); /* 3 */
+
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, j), jt);
+ ivec_sub(SHIFT_IVEC(g, i), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, k), jt, dt_kj);
+ ivec_sub(SHIFT_IVEC(g, l), jt, dt_lj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ t3 = IVEC2IS(dt_lj);
+ }
+ else if (pbc)
+ {
+ t3 = pbc_rvec_sub(pbc, x[l], x[j], dx_jl);
+ }
+ else
+ {
+ t3 = CENTRAL;
+ }
+
+ rvec_inc(fshift[t1], f_i);
+ rvec_dec(fshift[CENTRAL], f_j);
+ rvec_dec(fshift[t2], f_k);
+ rvec_inc(fshift[t3], f_l);
+ }
+ /* 112 TOTAL */
+}
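+
+/* Derivation sketch for do_dih_fup above: ddphi is dV/dphi (the callers
+ * pass the phi-derivative of their potential). The outer forces act
+ * along the plane normals, f_i = -ddphi*(|r_kj|/|m|^2)*m and
+ * f_l = ddphi*(|r_kj|/|n|^2)*n, which rotate phi without stretching the
+ * bonds; svec = p*f_i - q*f_l then redistributes these onto atoms j and
+ * k so that the total force and torque vanish (cf. the Blondel-Karplus
+ * formulation of torsion-angle derivatives, J. Comput. Chem. 17 (1996)).
+ */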
+
+/* As do_dih_fup above, but without shift forces */
+static void
+do_dih_fup_noshiftf(int i, int j, int k, int l, real ddphi,
+ rvec r_ij, rvec r_kj, rvec r_kl,
+ rvec m, rvec n, rvec f[])
+{
+ rvec f_i, f_j, f_k, f_l;
+ rvec uvec, vvec, svec, dx_jl;
+ real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
+ real a, b, p, q, toler;
+ ivec jt, dt_ij, dt_kj, dt_lj;
+
+ iprm = iprod(m, m); /* 5 */
+ iprn = iprod(n, n); /* 5 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ toler = nrkj2*GMX_REAL_EPS;
+ if ((iprm > toler) && (iprn > toler))
+ {
+ nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */
+ nrkj_2 = nrkj_1*nrkj_1; /* 1 */
+ nrkj = nrkj2*nrkj_1; /* 1 */
+ a = -ddphi*nrkj/iprm; /* 11 */
+ svmul(a, m, f_i); /* 3 */
+ b = ddphi*nrkj/iprn; /* 11 */
+ svmul(b, n, f_l); /* 3 */
+ p = iprod(r_ij, r_kj); /* 5 */
+ p *= nrkj_2; /* 1 */
+ q = iprod(r_kl, r_kj); /* 5 */
+ q *= nrkj_2; /* 1 */
+ svmul(p, f_i, uvec); /* 3 */
+ svmul(q, f_l, vvec); /* 3 */
+ rvec_sub(uvec, vvec, svec); /* 3 */
+ rvec_sub(f_i, svec, f_j); /* 3 */
+ rvec_add(f_l, svec, f_k); /* 3 */
+ rvec_inc(f[i], f_i); /* 3 */
+ rvec_dec(f[j], f_j); /* 3 */
+ rvec_dec(f[k], f_k); /* 3 */
+ rvec_inc(f[l], f_l); /* 3 */
+ }
+}
+
+/* As do_dih_fup_noshiftf above, but with pre-calculated pre-factors */
+static gmx_inline void
+do_dih_fup_noshiftf_precalc(int i, int j, int k, int l,
+ real p, real q,
+ real f_i_x, real f_i_y, real f_i_z,
+ real mf_l_x, real mf_l_y, real mf_l_z,
+ rvec f[])
+{
+ rvec f_i, f_j, f_k, f_l;
+ rvec uvec, vvec, svec;
+
+ f_i[XX] = f_i_x;
+ f_i[YY] = f_i_y;
+ f_i[ZZ] = f_i_z;
+ f_l[XX] = -mf_l_x;
+ f_l[YY] = -mf_l_y;
+ f_l[ZZ] = -mf_l_z;
+ svmul(p, f_i, uvec);
+ svmul(q, f_l, vvec);
+ rvec_sub(uvec, vvec, svec);
+ rvec_sub(f_i, svec, f_j);
+ rvec_add(f_l, svec, f_k);
+ rvec_inc(f[i], f_i);
+ rvec_dec(f[j], f_j);
+ rvec_dec(f[k], f_k);
+ rvec_inc(f[l], f_l);
+}
+
+
+real dopdihs(real cpA, real cpB, real phiA, real phiB, int mult,
+ real phi, real lambda, real *V, real *F)
+{
+ real v, dvdlambda, mdphi, v1, sdphi, ddphi;
+ real L1 = 1.0 - lambda;
+ real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD;
+ real dph0 = (phiB - phiA)*DEG2RAD;
+ real cp = L1*cpA + lambda*cpB;
+
+ mdphi = mult*phi - ph0;
+ sdphi = sin(mdphi);
+ ddphi = -cp*mult*sdphi;
+ v1 = 1.0 + cos(mdphi);
+ v = cp*v1;
+
+ dvdlambda = (cpB - cpA)*v1 + cp*dph0*sdphi;
+
+ *V = v;
+ *F = ddphi;
+
+ return dvdlambda;
+
+ /* That was 40 flops */
+}
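+
+/* Worked example for dopdihs above: a proper dihedral
+ * V(phi) = cp*(1 + cos(mult*phi - ph0))
+ * gives dV/dphi = -cp*mult*sin(mult*phi - ph0) = ddphi, which is what *F
+ * returns and do_dih_fup() expects. The returned dvdlambda combines the
+ * linear interpolation of cp with the lambda dependence of ph0.
+ */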
+
+static void
+dopdihs_noener(real cpA, real cpB, real phiA, real phiB, int mult,
+ real phi, real lambda, real *F)
+{
+ real mdphi, sdphi, ddphi;
+ real L1 = 1.0 - lambda;
+ real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD;
+ real cp = L1*cpA + lambda*cpB;
+
+ mdphi = mult*phi - ph0;
+ sdphi = sin(mdphi);
+ ddphi = -cp*mult*sdphi;
+
+ *F = ddphi;
+
+ /* That was 20 flops */
+}
+
+static void
+dopdihs_mdphi(real cpA, real cpB, real phiA, real phiB, int mult,
+ real phi, real lambda, real *cp, real *mdphi)
+{
+ real L1 = 1.0 - lambda;
+ real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD;
+
+ *cp = L1*cpA + lambda*cpB;
+
+ *mdphi = mult*phi - ph0;
+}
+
+static real dopdihs_min(real cpA, real cpB, real phiA, real phiB, int mult,
+ real phi, real lambda, real *V, real *F)
+/* Similar to dopdihs, except for a minus sign
+ * and a different treatment of mult/phi0
+ */
+{
+ real v, dvdlambda, mdphi, v1, sdphi, ddphi;
+ real L1 = 1.0 - lambda;
+ real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD;
+ real dph0 = (phiB - phiA)*DEG2RAD;
+ real cp = L1*cpA + lambda*cpB;
+
+ mdphi = mult*(phi-ph0);
+ sdphi = sin(mdphi);
+ ddphi = cp*mult*sdphi;
+ v1 = 1.0-cos(mdphi);
+ v = cp*v1;
+
+ dvdlambda = (cpB-cpA)*v1 + cp*dph0*sdphi;
+
+ *V = v;
+ *F = ddphi;
+
+ return dvdlambda;
+
+ /* That was 40 flops */
+}
+
+real pdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, type, ai, aj, ak, al;
+ int t1, t2, t3;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real phi, sign, ddphi, vpd, vtot;
+
+ vtot = 0.0;
+
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &sign, &t1, &t2, &t3); /* 84 */
+ *dvdlambda += dopdihs(forceparams[type].pdihs.cpA,
+ forceparams[type].pdihs.cpB,
+ forceparams[type].pdihs.phiA,
+ forceparams[type].pdihs.phiB,
+ forceparams[type].pdihs.mult,
+ phi, lambda, &vpd, &ddphi);
+
+ vtot += vpd;
+ do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+
+#ifdef DEBUG
+ fprintf(debug, "pdih: (%d,%d,%d,%d) phi=%g\n",
+ ai, aj, ak, al, phi);
+#endif
+ } /* 223 TOTAL */
+
+ return vtot;
+}
+
+void make_dp_periodic(real *dp) /* 1 flop? */
+{
+ /* dp cannot be outside (-pi,pi) */
+ if (*dp >= M_PI)
+ {
+ *dp -= 2*M_PI;
+ }
+ else if (*dp < -M_PI)
+ {
+ *dp += 2*M_PI;
+ }
+ return;
+}
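+
+/* Example: dp = 1.5*pi wraps to -0.5*pi and dp = -1.2*pi wraps to
+ * 0.8*pi, so the harmonic restraints below always act over the shortest
+ * angular distance.
+ */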
+
+/* As pdihs above, but without calculating energies and shift forces */
+static void
+pdihs_noener(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[],
+ const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ real lambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, type, ai, aj, ak, al;
+ int t1, t2, t3;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real phi, sign, ddphi_tot, ddphi;
+
+ for (i = 0; (i < nbonds); )
+ {
+ ai = forceatoms[i+1];
+ aj = forceatoms[i+2];
+ ak = forceatoms[i+3];
+ al = forceatoms[i+4];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &sign, &t1, &t2, &t3);
+
+ ddphi_tot = 0;
+
+ /* Loop over dihedrals working on the same atoms,
+ * so we avoid recalculating angles and force distributions.
+ */
+ do
+ {
+ type = forceatoms[i];
+ dopdihs_noener(forceparams[type].pdihs.cpA,
+ forceparams[type].pdihs.cpB,
+ forceparams[type].pdihs.phiA,
+ forceparams[type].pdihs.phiB,
+ forceparams[type].pdihs.mult,
+ phi, lambda, &ddphi);
+ ddphi_tot += ddphi;
+
+ i += 5;
+ }
+ while (i < nbonds &&
+ forceatoms[i+1] == ai &&
+ forceatoms[i+2] == aj &&
+ forceatoms[i+3] == ak &&
+ forceatoms[i+4] == al);
+
+ do_dih_fup_noshiftf(ai, aj, ak, al, ddphi_tot, r_ij, r_kj, r_kl, m, n, f);
+ }
+}
+
+
+#ifdef SIMD_BONDEDS
+
+/* As pdihs_noener above, but using SIMD to calculate many dihedrals at once */
+static void
+pdihs_noener_simd(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+#define UNROLL GMX_SIMD_WIDTH_HERE
+ const int nfa1 = 5;
+ int i, iu, s;
+ int type, ai[UNROLL], aj[UNROLL], ak[UNROLL], al[UNROLL];
+ int t1[UNROLL], t2[UNROLL], t3[UNROLL];
+ real ddphi;
+ real dr_array[3*DIM*UNROLL+UNROLL], *dr;
+ real buf_array[7*UNROLL+UNROLL], *buf;
+ real *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
+ gmx_mm_pr phi0_S, phi_S;
+ gmx_mm_pr mx_S, my_S, mz_S;
+ gmx_mm_pr nx_S, ny_S, nz_S;
+ gmx_mm_pr nrkj_m2_S, nrkj_n2_S;
+ gmx_mm_pr cp_S, mdphi_S, mult_S;
+ gmx_mm_pr sin_S, cos_S;
+ gmx_mm_pr mddphi_S;
+ gmx_mm_pr sf_i_S, msf_l_S;
+ pbc_simd_t pbc_simd;
+
+ /* Ensure SIMD register alignment */
+ dr = gmx_simd_align_real(dr_array);
+ buf = gmx_simd_align_real(buf_array);
+
+ /* Extract aligned pointers for parameters and variables */
+ cp = buf + 0*UNROLL;
+ phi0 = buf + 1*UNROLL;
+ mult = buf + 2*UNROLL;
+ p = buf + 3*UNROLL;
+ q = buf + 4*UNROLL;
+ sf_i = buf + 5*UNROLL;
+ msf_l = buf + 6*UNROLL;
+
+ set_pbc_simd(pbc, &pbc_simd);
+
+ /* nbonds is the number of dihedrals times nfa1, here we step UNROLL dihs */
+ for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+ {
+ /* Collect atom quadruplets for UNROLL dihedrals.
+ * iu indexes into forceatoms; we must not let iu go beyond nbonds.
+ */
+ iu = i;
+ for (s = 0; s < UNROLL; s++)
+ {
+ type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+ al[s] = forceatoms[iu+4];
+
+ cp[s] = forceparams[type].pdihs.cpA;
+ phi0[s] = forceparams[type].pdihs.phiA*DEG2RAD;
+ mult[s] = forceparams[type].pdihs.mult;
+
+ /* At the end fill the arrays with identical entries */
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+
+ /* Calculate UNROLL dihedral angles at once */
+ dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+ dr,
+ &phi_S,
+ &mx_S, &my_S, &mz_S,
+ &nx_S, &ny_S, &nz_S,
+ &nrkj_m2_S,
+ &nrkj_n2_S,
+ p, q);
+
+ cp_S = gmx_load_pr(cp);
+ phi0_S = gmx_load_pr(phi0);
+ mult_S = gmx_load_pr(mult);
+
+ mdphi_S = gmx_sub_pr(gmx_mul_pr(mult_S, phi_S), phi0_S);
+
+ /* Calculate UNROLL sines at once */
+ gmx_sincos_pr(mdphi_S, &sin_S, &cos_S);
+ mddphi_S = gmx_mul_pr(gmx_mul_pr(cp_S, mult_S), sin_S);
+ sf_i_S = gmx_mul_pr(mddphi_S, nrkj_m2_S);
+ msf_l_S = gmx_mul_pr(mddphi_S, nrkj_n2_S);
+
+ /* After this m?_S will contain f[i] */
+ mx_S = gmx_mul_pr(sf_i_S, mx_S);
+ my_S = gmx_mul_pr(sf_i_S, my_S);
+ mz_S = gmx_mul_pr(sf_i_S, mz_S);
+
+ /* After this m?_S will contain -f[l] */
+ nx_S = gmx_mul_pr(msf_l_S, nx_S);
+ ny_S = gmx_mul_pr(msf_l_S, ny_S);
+ nz_S = gmx_mul_pr(msf_l_S, nz_S);
+
+ gmx_store_pr(dr + 0*UNROLL, mx_S);
+ gmx_store_pr(dr + 1*UNROLL, my_S);
+ gmx_store_pr(dr + 2*UNROLL, mz_S);
+ gmx_store_pr(dr + 3*UNROLL, nx_S);
+ gmx_store_pr(dr + 4*UNROLL, ny_S);
+ gmx_store_pr(dr + 5*UNROLL, nz_S);
+
+ iu = i;
+ s = 0;
+ do
+ {
+ do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+ p[s], q[s],
+ dr[ XX *UNROLL+s],
+ dr[ YY *UNROLL+s],
+ dr[ ZZ *UNROLL+s],
+ dr[(DIM+XX)*UNROLL+s],
+ dr[(DIM+YY)*UNROLL+s],
+ dr[(DIM+ZZ)*UNROLL+s],
+ f);
+ s++;
+ iu += nfa1;
+ }
+ while (s < UNROLL && iu < nbonds);
+ }
+#undef UNROLL
+}
+
+#endif /* SIMD_BONDEDS */
+
+
+real idihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, type, ai, aj, ak, al;
+ int t1, t2, t3;
+ real phi, phi0, dphi0, ddphi, sign, vtot;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real L1, kk, dp, dp2, kA, kB, pA, pB, dvdl_term;
+
+ L1 = 1.0-lambda;
+ dvdl_term = 0;
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &sign, &t1, &t2, &t3); /* 84 */
+
+ /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
+ * force changes if we just apply a normal harmonic.
+ * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
+ * This means we will never have the periodicity problem, unless
+ * the dihedral is Pi away from phi0, which is very unlikely due to
+ * the potential.
+ */
+ kA = forceparams[type].harmonic.krA;
+ kB = forceparams[type].harmonic.krB;
+ pA = forceparams[type].harmonic.rA;
+ pB = forceparams[type].harmonic.rB;
+
+ kk = L1*kA + lambda*kB;
+ phi0 = (L1*pA + lambda*pB)*DEG2RAD;
+ dphi0 = (pB - pA)*DEG2RAD;
+
+ dp = phi-phi0;
+
+ make_dp_periodic(&dp);
+
+ dp2 = dp*dp;
+
+ vtot += 0.5*kk*dp2;
+ ddphi = -kk*dp;
+
+ dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;
+
+ do_dih_fup(ai, aj, ak, al, (real)(-ddphi), r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+ /* 218 TOTAL */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "idih: (%d,%d,%d,%d) phi=%g\n",
+ ai, aj, ak, al, phi);
+ }
+#endif
+ }
+
+ *dvdlambda += dvdl_term;
+ return vtot;
+}
+
+
+/*! \brief returns dx, rdist, and dpdl for functions posres() and fbposres()
+ */
+static void posres_dx(const rvec x, const rvec pos0A, const rvec pos0B,
+ const rvec comA_sc, const rvec comB_sc,
+ real lambda,
+ t_pbc *pbc, int refcoord_scaling, int npbcdim,
+ rvec dx, rvec rdist, rvec dpdl)
+{
+ int m, d;
+ real posA, posB, L1, ref = 0.;
+ rvec pos;
+
+ L1 = 1.0-lambda;
+
+ for (m = 0; m < DIM; m++)
+ {
+ posA = pos0A[m];
+ posB = pos0B[m];
+ if (m < npbcdim)
+ {
+ switch (refcoord_scaling)
+ {
+ case erscNO:
+ ref = 0;
+ rdist[m] = L1*posA + lambda*posB;
+ dpdl[m] = posB - posA;
+ break;
+ case erscALL:
+ /* Box relative coordinates are stored for dimensions with pbc */
+ posA *= pbc->box[m][m];
+ posB *= pbc->box[m][m];
+ for (d = m+1; d < npbcdim; d++)
+ {
+ posA += pos0A[d]*pbc->box[d][m];
+ posB += pos0B[d]*pbc->box[d][m];
+ }
+ ref = L1*posA + lambda*posB;
+ rdist[m] = 0;
+ dpdl[m] = posB - posA;
+ break;
+ case erscCOM:
+ ref = L1*comA_sc[m] + lambda*comB_sc[m];
+ rdist[m] = L1*posA + lambda*posB;
+ dpdl[m] = comB_sc[m] - comA_sc[m] + posB - posA;
+ break;
+ default:
+ gmx_fatal(FARGS, "No such scaling method implemented");
+ }
+ }
+ else
+ {
+ ref = L1*posA + lambda*posB;
+ rdist[m] = 0;
+ dpdl[m] = posB - posA;
+ }
+
+ /* We do pbc_dx with ref+rdist,
+ * since with only ref we can be up to half a box vector wrong.
+ */
+ pos[m] = ref + rdist[m];
+ }
+
+ if (pbc)
+ {
+ pbc_dx(pbc, x, pos, dx);
+ }
+ else
+ {
+ rvec_sub(x, pos, dx);
+ }
+}
+
+/*! \brief Adds forces of flat-bottomed position restraints to f[]
+ * and fixes vir_diag. Returns the flat-bottomed potential. */
+real fbposres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec vir_diag,
+ t_pbc *pbc,
+ int refcoord_scaling, int ePBC, rvec com)
+/* compute flat-bottomed position restraints */
+{
+ int i, ai, m, d, type, npbcdim = 0, fbdim;
+ const t_iparams *pr;
+ real vtot, kk, v;
+ real ref = 0, dr, dr2, rpot, rfb, rfb2, fact, invdr;
+ rvec com_sc, rdist, pos, dx, dpdl, fm;
+ gmx_bool bInvert;
+
+ npbcdim = ePBC2npbcdim(ePBC);
+
+ if (refcoord_scaling == erscCOM)
+ {
+ clear_rvec(com_sc);
+ for (m = 0; m < npbcdim; m++)
+ {
+ for (d = m; d < npbcdim; d++)
+ {
+ com_sc[m] += com[d]*pbc->box[d][m];
+ }
+ }
+ }
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ pr = &forceparams[type];
+
+ /* same calculation as for normal posres, but with identical A and B states, and lambda==0 */
+ posres_dx(x[ai], forceparams[type].fbposres.pos0, forceparams[type].fbposres.pos0,
+ com_sc, com_sc, 0.0,
+ pbc, refcoord_scaling, npbcdim,
+ dx, rdist, dpdl);
+
+ clear_rvec(fm);
+ v = 0.0;
+
+ kk = pr->fbposres.k;
+ rfb = pr->fbposres.r;
+ rfb2 = sqr(rfb);
+
+ /* with rfb<0, push particle out of the sphere/cylinder/layer */
+ bInvert = FALSE;
+ if (rfb < 0.)
+ {
+ bInvert = TRUE;
+ rfb = -rfb;
+ }
+
+ switch (pr->fbposres.geom)
+ {
+ case efbposresSPHERE:
+ /* spherical flat-bottom posres */
+ dr2 = norm2(dx);
+ if (dr2 > 0.0 &&
+ ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) )
+ )
+ {
+ dr = sqrt(dr2);
+ v = 0.5*kk*sqr(dr - rfb);
+ fact = -kk*(dr-rfb)/dr; /* Force pointing to the center pos0 */
+ svmul(fact, dx, fm);
+ }
+ break;
+ case efbposresCYLINDER:
+ /* cylindrical flat-bottom posres in the x-y plane; fm[ZZ] = 0. */
+ dr2 = sqr(dx[XX])+sqr(dx[YY]);
+ if (dr2 > 0.0 &&
+ ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) )
+ )
+ {
+ dr = sqrt(dr2);
+ invdr = 1./dr;
+ v = 0.5*kk*sqr(dr - rfb);
+ fm[XX] = -kk*(dr-rfb)*dx[XX]*invdr; /* Force pointing to the center */
+ fm[YY] = -kk*(dr-rfb)*dx[YY]*invdr;
+ }
+ break;
+ case efbposresX: /* fbdim=XX */
+ case efbposresY: /* fbdim=YY */
+ case efbposresZ: /* fbdim=ZZ */
+ /* 1D flat-bottom potential */
+ fbdim = pr->fbposres.geom - efbposresX;
+ dr = dx[fbdim];
+ if ( ( dr > rfb && bInvert == FALSE ) || ( 0 < dr && dr < rfb && bInvert == TRUE ) )
+ {
+ v = 0.5*kk*sqr(dr - rfb);
+ fm[fbdim] = -kk*(dr - rfb);
+ }
+ else if ( (dr < (-rfb) && bInvert == FALSE ) || ( (-rfb) < dr && dr < 0 && bInvert == TRUE ))
+ {
+ v = 0.5*kk*sqr(dr + rfb);
+ fm[fbdim] = -kk*(dr + rfb);
+ }
+ break;
+ }
+
+ vtot += v;
+
+ for (m = 0; (m < DIM); m++)
+ {
+ f[ai][m] += fm[m];
+ /* Here we correct for the pbc_dx which included rdist */
+ vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm[m];
+ }
+ }
+
+ return vtot;
+}
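+
+/* Worked form: for the sphere geometry the potential above is
+ * V(d) = 0.5*k*(d - rfb)^2 for d = |dx| > rfb and zero inside the
+ * sphere; with rfb < 0 the test is inverted and the particle is pushed
+ * out of the region instead. The cylinder and X/Y/Z cases apply the same
+ * profile to the x-y distance and to a single coordinate, respectively.
+ */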
+
+
+real posres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec vir_diag,
+ t_pbc *pbc,
+ real lambda, real *dvdlambda,
+ int refcoord_scaling, int ePBC, rvec comA, rvec comB)
+{
+ int i, ai, m, d, type, ki, npbcdim = 0;
+ const t_iparams *pr;
+ real L1;
+ real vtot, kk, fm;
+ real posA, posB, ref = 0;
+ rvec comA_sc, comB_sc, rdist, dpdl, pos, dx;
+ gmx_bool bForceValid = TRUE;
+
+ if ((f == NULL) || (vir_diag == NULL)) /* should both be null together! */
+ {
+ bForceValid = FALSE;
+ }
+
+ npbcdim = ePBC2npbcdim(ePBC);
+
+ if (refcoord_scaling == erscCOM)
+ {
+ clear_rvec(comA_sc);
+ clear_rvec(comB_sc);
+ for (m = 0; m < npbcdim; m++)
+ {
+ for (d = m; d < npbcdim; d++)
+ {
+ comA_sc[m] += comA[d]*pbc->box[d][m];
+ comB_sc[m] += comB[d]*pbc->box[d][m];
+ }
+ }
+ }
+
+ L1 = 1.0 - lambda;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ pr = &forceparams[type];
+
+ /* return dx, rdist, and dpdl */
+ posres_dx(x[ai], forceparams[type].posres.pos0A, forceparams[type].posres.pos0B,
+ comA_sc, comB_sc, lambda,
+ pbc, refcoord_scaling, npbcdim,
+ dx, rdist, dpdl);
+
+ for (m = 0; (m < DIM); m++)
+ {
+ kk = L1*pr->posres.fcA[m] + lambda*pr->posres.fcB[m];
+ fm = -kk*dx[m];
+ vtot += 0.5*kk*dx[m]*dx[m];
+ *dvdlambda +=
+ 0.5*(pr->posres.fcB[m] - pr->posres.fcA[m])*dx[m]*dx[m]
+ -fm*dpdl[m];
+
+ /* Here we correct for the pbc_dx which included rdist */
+ if (bForceValid)
+ {
+ f[ai][m] += fm;
+ vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm;
+ }
+ }
+ }
+
+ return vtot;
+}
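+
+/* Note: position restraints are harmonic per Cartesian component,
+ * V = 0.5 * sum_m k_m(lambda) * dx[m]^2 with independent force constants
+ * fcA/fcB per dimension; the vir_diag update uses dx[m] + rdist[m] to
+ * undo the reference-position shift that posres_dx() folded into dx.
+ */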
+
+static real low_angres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ gmx_bool bZAxis)
+{
+ int i, m, type, ai, aj, ak, al;
+ int t1, t2;
+ real phi, cos_phi, cos_phi2, vid, vtot, dVdphi;
+ rvec r_ij, r_kl, f_i, f_k = {0, 0, 0};
+ real st, sth, nrij2, nrkl2, c, cij, ckl;
+
+ ivec dt;
+ t2 = 0; /* avoid warning with gcc-3.3. It is never used uninitialized */
+
+ vtot = 0.0;
+ ak = al = 0; /* to avoid warnings */
+ for (i = 0; i < nbonds; )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ t1 = pbc_rvec_sub(pbc, x[aj], x[ai], r_ij); /* 3 */
+ if (!bZAxis)
+ {
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+ t2 = pbc_rvec_sub(pbc, x[al], x[ak], r_kl); /* 3 */
+ }
+ else
+ {
+ r_kl[XX] = 0;
+ r_kl[YY] = 0;
+ r_kl[ZZ] = 1;
+ }
+
+ cos_phi = cos_angle(r_ij, r_kl); /* 25 */
+ phi = acos(cos_phi); /* 10 */
+
+ *dvdlambda += dopdihs_min(forceparams[type].pdihs.cpA,
+ forceparams[type].pdihs.cpB,
+ forceparams[type].pdihs.phiA,
+ forceparams[type].pdihs.phiB,
+ forceparams[type].pdihs.mult,
+ phi, lambda, &vid, &dVdphi); /* 40 */
+
+ vtot += vid;
+
+ cos_phi2 = sqr(cos_phi); /* 1 */
+ if (cos_phi2 < 1)
+ {
+ st = -dVdphi*gmx_invsqrt(1 - cos_phi2); /* 12 */
+ sth = st*cos_phi; /* 1 */
+ nrij2 = iprod(r_ij, r_ij); /* 5 */
+ nrkl2 = iprod(r_kl, r_kl); /* 5 */
+
+ c = st*gmx_invsqrt(nrij2*nrkl2); /* 11 */
+ cij = sth/nrij2; /* 10 */
+ ckl = sth/nrkl2; /* 10 */
+
+ for (m = 0; m < DIM; m++) /* 18+18 */
+ {
+ f_i[m] = (c*r_kl[m]-cij*r_ij[m]);
+ f[ai][m] += f_i[m];
+ f[aj][m] -= f_i[m];
+ if (!bZAxis)
+ {
+ f_k[m] = (c*r_ij[m]-ckl*r_kl[m]);
+ f[ak][m] += f_k[m];
+ f[al][m] -= f_k[m];
+ }
+ }
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ t1 = IVEC2IS(dt);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_dec(fshift[CENTRAL], f_i);
+ if (!bZAxis)
+ {
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, al), dt);
+ t2 = IVEC2IS(dt);
+ }
+ rvec_inc(fshift[t2], f_k);
+ rvec_dec(fshift[CENTRAL], f_k);
+ }
+ }
+ }
+
+ return vtot; /* 184 / 157 (bZAxis) total */
+}
+
+real angres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
+ lambda, dvdlambda, FALSE);
+}
+
+real angresz(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
+ lambda, dvdlambda, TRUE);
+}
+
+real dihres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ real vtot = 0;
+ int ai, aj, ak, al, i, k, type, t1, t2, t3;
+ real phi0A, phi0B, dphiA, dphiB, kfacA, kfacB, phi0, dphi, kfac;
+ real phi, ddphi, ddp, ddp2, dp, sign, d2r, fc, L1;
+ rvec r_ij, r_kj, r_kl, m, n;
+
+ L1 = 1.0-lambda;
+
+ d2r = DEG2RAD;
+ k = 0;
+
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi0A = forceparams[type].dihres.phiA*d2r;
+ dphiA = forceparams[type].dihres.dphiA*d2r;
+ kfacA = forceparams[type].dihres.kfacA;
+
+ phi0B = forceparams[type].dihres.phiB*d2r;
+ dphiB = forceparams[type].dihres.dphiB*d2r;
+ kfacB = forceparams[type].dihres.kfacB;
+
+ phi0 = L1*phi0A + lambda*phi0B;
+ dphi = L1*dphiA + lambda*dphiB;
+ kfac = L1*kfacA + lambda*kfacB;
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &sign, &t1, &t2, &t3);
+ /* 84 flops */
+
+ if (debug)
+ {
+ fprintf(debug, "dihres[%d]: %d %d %d %d : phi=%f, dphi=%f, kfac=%f\n",
+ k++, ai, aj, ak, al, phi0, dphi, kfac);
+ }
+ /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
+ * force changes if we just apply a normal harmonic.
+ * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
+ * This means we will never have the periodicity problem, unless
+ * the dihedral is Pi away from phi0, which is very unlikely due to
+ * the potential.
+ */
+ dp = phi-phi0;
+ make_dp_periodic(&dp);
+
+ if (dp > dphi)
+ {
+ ddp = dp-dphi;
+ }
+ else if (dp < -dphi)
+ {
+ ddp = dp+dphi;
+ }
+ else
+ {
+ ddp = 0;
+ }
+
+ if (ddp != 0.0)
+ {
+ ddp2 = ddp*ddp;
+ vtot += 0.5*kfac*ddp2;
+ ddphi = kfac*ddp;
+
+ *dvdlambda += 0.5*(kfacB - kfacA)*ddp2;
+ /* lambda dependence from changing restraint distances */
+ if (ddp > 0)
+ {
+ *dvdlambda -= kfac*ddp*((dphiB - dphiA)+(phi0B - phi0A));
+ }
+ else if (ddp < 0)
+ {
+ *dvdlambda += kfac*ddp*((dphiB - dphiA)-(phi0B - phi0A));
+ }
+ do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+ }
+ }
+ return vtot;
+}
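+
+/* Worked form: the restraint above is flat within +/- dphi of phi0 and
+ * harmonic outside, i.e. V = 0.5*kfac*(|dp| - dphi)^2 for |dp| > dphi
+ * and V = 0 otherwise, with dp = phi - phi0 wrapped into (-pi,pi].
+ */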
+
+
+real unimplemented(int gmx_unused nbonds,
+ const t_iatom gmx_unused forceatoms[], const t_iparams gmx_unused forceparams[],
+ const rvec gmx_unused x[], rvec gmx_unused f[], rvec gmx_unused fshift[],
+ const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ gmx_impl("*** you are using a not implemented function");
+
+ return 0.0; /* To make the compiler happy */
+}
+
+real rbdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const real c0 = 0.0, c1 = 1.0, c2 = 2.0, c3 = 3.0, c4 = 4.0, c5 = 5.0;
+ int type, ai, aj, ak, al, i, j;
+ int t1, t2, t3;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real parmA[NR_RBDIHS];
+ real parmB[NR_RBDIHS];
+ real parm[NR_RBDIHS];
+ real cos_phi, phi, rbp, rbpBA;
+ real v, sign, ddphi, sin_phi;
+ real cosfac, vtot;
+ real L1 = 1.0-lambda;
+ real dvdl_term = 0;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &sign, &t1, &t2, &t3); /* 84 */
+
+ /* Change to polymer convention */
+ if (phi < c0)
+ {
+ phi += M_PI;
+ }
+ else
+ {
+ phi -= M_PI; /* 1 */
+ }
+ cos_phi = cos(phi);
+ /* Beware of accuracy loss, cannot use sqrt(1-cos^2) here! */
+ sin_phi = sin(phi);
+
+ for (j = 0; (j < NR_RBDIHS); j++)
+ {
+ parmA[j] = forceparams[type].rbdihs.rbcA[j];
+ parmB[j] = forceparams[type].rbdihs.rbcB[j];
+ parm[j] = L1*parmA[j]+lambda*parmB[j];
+ }
+ /* Calculate cosine powers */
+ /* Calculate the energy */
+ /* Calculate the derivative */
+
+ v = parm[0];
+ dvdl_term += (parmB[0]-parmA[0]);
+ ddphi = c0;
+ cosfac = c1;
+
+ rbp = parm[1];
+ rbpBA = parmB[1]-parmA[1];
+ ddphi += rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[2];
+ rbpBA = parmB[2]-parmA[2];
+ ddphi += c2*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[3];
+ rbpBA = parmB[3]-parmA[3];
+ ddphi += c3*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[4];
+ rbpBA = parmB[4]-parmA[4];
+ ddphi += c4*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[5];
+ rbpBA = parmB[5]-parmA[5];
+ ddphi += c5*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+
+ ddphi = -ddphi*sin_phi; /* 11 */
+
+ do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+ vtot += v;
+ }
+ *dvdlambda += dvdl_term;
+
+ return vtot;
+}
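+
+/* Worked form: the unrolled block above evaluates the Ryckaert-Bellemans
+ * series V(psi) = sum_{n=0..5} C_n(lambda)*cos^n(psi), with
+ * psi = phi - pi (polymer convention). ddphi accumulates
+ * dV/dcos(psi) = sum n*C_n*cos^(n-1)(psi), and the final line applies
+ * dV/dphi = -sin(psi)*dV/dcos(psi).
+ */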
+
+int cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
+{
+ int im1, ip1, ip2;
+
+ if (ip < 0)
+ {
+ ip = ip + grid_spacing - 1;
+ }
+ else if (ip > grid_spacing)
+ {
+ ip = ip - grid_spacing - 1;
+ }
+
+ im1 = ip - 1;
+ ip1 = ip + 1;
+ ip2 = ip + 2;
+
+ if (ip == 0)
+ {
+ im1 = grid_spacing - 1;
+ }
+ else if (ip == grid_spacing-2)
+ {
+ ip2 = 0;
+ }
+ else if (ip == grid_spacing-1)
+ {
+ ip1 = 0;
+ ip2 = 1;
+ }
+
+ *ipm1 = im1;
+ *ipp1 = ip1;
+ *ipp2 = ip2;
+
+ return ip;
+
+}
+
+real cmap_dihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const gmx_cmap_t *cmap_grid,
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, j, k, n, idx;
+ int ai, aj, ak, al, am;
+ int a1i, a1j, a1k, a1l, a2i, a2j, a2k, a2l;
+ int type, cmapA;
+ int t11, t21, t31, t12, t22, t32;
+ int iphi1, ip1m1, ip1p1, ip1p2;
+ int iphi2, ip2m1, ip2p1, ip2p2;
+ int l1, l2, l3, l4;
+ int pos1, pos2, pos3, pos4, tmp;
+
+ real ty[4], ty1[4], ty2[4], ty12[4], tc[16], tx[16];
+ real phi1, psi1, cos_phi1, sin_phi1, sign1, xphi1;
+ real phi2, psi2, cos_phi2, sin_phi2, sign2, xphi2;
+ real dx, xx, tt, tu, e, df1, df2, ddf1, ddf2, ddf12, vtot;
+ real ra21, rb21, rg21, rg1, rgr1, ra2r1, rb2r1, rabr1;
+ real ra22, rb22, rg22, rg2, rgr2, ra2r2, rb2r2, rabr2;
+ real fg1, hg1, fga1, hgb1, gaa1, gbb1;
+ real fg2, hg2, fga2, hgb2, gaa2, gbb2;
+ real fac;
+
+ rvec r1_ij, r1_kj, r1_kl, m1, n1;
+ rvec r2_ij, r2_kj, r2_kl, m2, n2;
+ rvec f1_i, f1_j, f1_k, f1_l;
+ rvec f2_i, f2_j, f2_k, f2_l;
+ rvec a1, b1, a2, b2;
+ rvec f1, g1, h1, f2, g2, h2;
+ rvec dtf1, dtg1, dth1, dtf2, dtg2, dth2;
+ ivec jt1, dt1_ij, dt1_kj, dt1_lj;
+ ivec jt2, dt2_ij, dt2_kj, dt2_lj;
+
+ const real *cmapd;
+
+ int loop_index[4][4] = {
+ {0, 4, 8, 12},
+ {1, 5, 9, 13},
+ {2, 6, 10, 14},
+ {3, 7, 11, 15}
+ };
+
+ /* Total CMAP energy */
+ vtot = 0;
+
+ for (n = 0; n < nbonds; )
+ {
+ /* Five atoms are involved in the two torsions */
+ type = forceatoms[n++];
+ ai = forceatoms[n++];
+ aj = forceatoms[n++];
+ ak = forceatoms[n++];
+ al = forceatoms[n++];
+ am = forceatoms[n++];
+
+ /* Which CMAP type is this */
+ cmapA = forceparams[type].cmap.cmapA;
+ cmapd = cmap_grid->cmapdata[cmapA].cmap;
+
+ /* First torsion */
+ a1i = ai;
+ a1j = aj;
+ a1k = ak;
+ a1l = al;
+
+ phi1 = dih_angle(x[a1i], x[a1j], x[a1k], x[a1l], pbc, r1_ij, r1_kj, r1_kl, m1, n1,
+ &sign1, &t11, &t21, &t31); /* 84 */
+
+ cos_phi1 = cos(phi1);
+
+ a1[0] = r1_ij[1]*r1_kj[2]-r1_ij[2]*r1_kj[1];
+ a1[1] = r1_ij[2]*r1_kj[0]-r1_ij[0]*r1_kj[2];
+ a1[2] = r1_ij[0]*r1_kj[1]-r1_ij[1]*r1_kj[0]; /* 9 */
+
+ b1[0] = r1_kl[1]*r1_kj[2]-r1_kl[2]*r1_kj[1];
+ b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
+ b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
+
+ tmp = pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
+
+ ra21 = iprod(a1, a1); /* 5 */
+ rb21 = iprod(b1, b1); /* 5 */
+ rg21 = iprod(r1_kj, r1_kj); /* 5 */
+ rg1 = sqrt(rg21);
+
+ rgr1 = 1.0/rg1;
+ ra2r1 = 1.0/ra21;
+ rb2r1 = 1.0/rb21;
+ rabr1 = sqrt(ra2r1*rb2r1);
+
+ sin_phi1 = rg1 * rabr1 * iprod(a1, h1) * (-1);
+
+ if (cos_phi1 < -0.5 || cos_phi1 > 0.5)
+ {
+ phi1 = asin(sin_phi1);
+
+ if (cos_phi1 < 0)
+ {
+ if (phi1 > 0)
+ {
+ phi1 = M_PI - phi1;
+ }
+ else
+ {
+ phi1 = -M_PI - phi1;
+ }
+ }
+ }
+ else
+ {
+ phi1 = acos(cos_phi1);
+
+ if (sin_phi1 < 0)
+ {
+ phi1 = -phi1;
+ }
+ }
+
+ xphi1 = phi1 + M_PI; /* 1 */
+
+ /* Second torsion */
+ a2i = aj;
+ a2j = ak;
+ a2k = al;
+ a2l = am;
+
+ phi2 = dih_angle(x[a2i], x[a2j], x[a2k], x[a2l], pbc, r2_ij, r2_kj, r2_kl, m2, n2,
+ &sign2, &t12, &t22, &t32); /* 84 */
+
+ cos_phi2 = cos(phi2);
+
+ a2[0] = r2_ij[1]*r2_kj[2]-r2_ij[2]*r2_kj[1];
+ a2[1] = r2_ij[2]*r2_kj[0]-r2_ij[0]*r2_kj[2];
+ a2[2] = r2_ij[0]*r2_kj[1]-r2_ij[1]*r2_kj[0]; /* 9 */
+
+ b2[0] = r2_kl[1]*r2_kj[2]-r2_kl[2]*r2_kj[1];
+ b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
+ b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
+
+ tmp = pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
+
+ ra22 = iprod(a2, a2); /* 5 */
+ rb22 = iprod(b2, b2); /* 5 */
+ rg22 = iprod(r2_kj, r2_kj); /* 5 */
+ rg2 = sqrt(rg22);
+
+ rgr2 = 1.0/rg2;
+ ra2r2 = 1.0/ra22;
+ rb2r2 = 1.0/rb22;
+ rabr2 = sqrt(ra2r2*rb2r2);
+
+ sin_phi2 = rg2 * rabr2 * iprod(a2, h2) * (-1);
+
+ if (cos_phi2 < -0.5 || cos_phi2 > 0.5)
+ {
+ phi2 = asin(sin_phi2);
+
+ if (cos_phi2 < 0)
+ {
+ if (phi2 > 0)
+ {
+ phi2 = M_PI - phi2;
+ }
+ else
+ {
+ phi2 = -M_PI - phi2;
+ }
+ }
+ }
+ else
+ {
+ phi2 = acos(cos_phi2);
+
+ if (sin_phi2 < 0)
+ {
+ phi2 = -phi2;
+ }
+ }
+
+ xphi2 = phi2 + M_PI; /* 1 */
+
+ /* Range mangling: wrap both angles into [0, 2*pi) */
+ if (xphi1 < 0)
+ {
+ xphi1 = xphi1 + 2*M_PI;
+ }
+ else if (xphi1 >= 2*M_PI)
+ {
+ xphi1 = xphi1 - 2*M_PI;
+ }
+
+ if (xphi2 < 0)
+ {
+ xphi2 = xphi2 + 2*M_PI;
+ }
+ else if (xphi2 >= 2*M_PI)
+ {
+ xphi2 = xphi2 - 2*M_PI;
+ }
+
+ /* Number of grid points */
+ dx = 2*M_PI / cmap_grid->grid_spacing;
+
+ /* Where on the grid are we */
+ iphi1 = (int)(xphi1/dx);
+ iphi2 = (int)(xphi2/dx);
+
+ iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1, &ip1p1, &ip1p2);
+ iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1, &ip2p1, &ip2p2);
+
+ pos1 = iphi1*cmap_grid->grid_spacing+iphi2;
+ pos2 = ip1p1*cmap_grid->grid_spacing+iphi2;
+ pos3 = ip1p1*cmap_grid->grid_spacing+ip2p1;
+ pos4 = iphi1*cmap_grid->grid_spacing+ip2p1;
+
+ ty[0] = cmapd[pos1*4];
+ ty[1] = cmapd[pos2*4];
+ ty[2] = cmapd[pos3*4];
+ ty[3] = cmapd[pos4*4];
+
+ ty1[0] = cmapd[pos1*4+1];
+ ty1[1] = cmapd[pos2*4+1];
+ ty1[2] = cmapd[pos3*4+1];
+ ty1[3] = cmapd[pos4*4+1];
+
+ ty2[0] = cmapd[pos1*4+2];
+ ty2[1] = cmapd[pos2*4+2];
+ ty2[2] = cmapd[pos3*4+2];
+ ty2[3] = cmapd[pos4*4+2];
+
+ ty12[0] = cmapd[pos1*4+3];
+ ty12[1] = cmapd[pos2*4+3];
+ ty12[2] = cmapd[pos3*4+3];
+ ty12[3] = cmapd[pos4*4+3];
+
+ /* Switch to degrees */
+ dx = 360.0 / cmap_grid->grid_spacing;
+ xphi1 = xphi1 * RAD2DEG;
+ xphi2 = xphi2 * RAD2DEG;
+
+ for (i = 0; i < 4; i++) /* 16 */
+ {
+ tx[i] = ty[i];
+ tx[i+4] = ty1[i]*dx;
+ tx[i+8] = ty2[i]*dx;
+ tx[i+12] = ty12[i]*dx*dx;
+ }
+
+ idx = 0;
+ for (i = 0; i < 4; i++) /* 1056 */
+ {
+ for (j = 0; j < 4; j++)
+ {
+ xx = 0;
+ for (k = 0; k < 16; k++)
+ {
+ xx = xx + cmap_coeff_matrix[k*16+idx]*tx[k];
+ }
+
+ idx++;
+ tc[i*4+j] = xx;
+ }
+ }
+
+ tt = (xphi1-iphi1*dx)/dx;
+ tu = (xphi2-iphi2*dx)/dx;
+
+ e = 0;
+ df1 = 0;
+ df2 = 0;
+ ddf1 = 0;
+ ddf2 = 0;
+ ddf12 = 0;
+
+ for (i = 3; i >= 0; i--)
+ {
+ l1 = loop_index[i][3];
+ l2 = loop_index[i][2];
+ l3 = loop_index[i][1];
+
+ e = tt * e + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
+ df1 = tu * df1 + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
+ df2 = tt * df2 + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
+ ddf1 = tu * ddf1 + 2.0*3.0*tc[l1]*tt+2.0*tc[l2];
+ ddf2 = tt * ddf2 + 2.0*3.0*tc[4*i+3]*tu+2.0*tc[4*i+2];
+ }
+
+ ddf12 = tc[5] + 2.0*tc[9]*tt + 3.0*tc[13]*tt*tt + 2.0*tu*(tc[6]+2.0*tc[10]*tt+3.0*tc[14]*tt*tt) +
+ 3.0*tu*tu*(tc[7]+2.0*tc[11]*tt+3.0*tc[15]*tt*tt);
+
+ fac = RAD2DEG/dx;
+ df1 = df1 * fac;
+ df2 = df2 * fac;
+ ddf1 = ddf1 * fac * fac;
+ ddf2 = ddf2 * fac * fac;
+ ddf12 = ddf12 * fac * fac;
+
+ /* CMAP energy */
+ vtot += e;
+
+ /* Do forces - first torsion */
+ fg1 = iprod(r1_ij, r1_kj);
+ hg1 = iprod(r1_kl, r1_kj);
+ fga1 = fg1*ra2r1*rgr1;
+ hgb1 = hg1*rb2r1*rgr1;
+ gaa1 = -ra2r1*rg1;
+ gbb1 = rb2r1*rg1;
+
+ for (i = 0; i < DIM; i++)
+ {
+ dtf1[i] = gaa1 * a1[i];
+ dtg1[i] = fga1 * a1[i] - hgb1 * b1[i];
+ dth1[i] = gbb1 * b1[i];
+
+ f1[i] = df1 * dtf1[i];
+ g1[i] = df1 * dtg1[i];
+ h1[i] = df1 * dth1[i];
+
+ f1_i[i] = f1[i];
+ f1_j[i] = -f1[i] - g1[i];
+ f1_k[i] = h1[i] + g1[i];
+ f1_l[i] = -h1[i];
+
+ f[a1i][i] = f[a1i][i] + f1_i[i];
+ f[a1j][i] = f[a1j][i] + f1_j[i]; /* - f1[i] - g1[i] */
+ f[a1k][i] = f[a1k][i] + f1_k[i]; /* h1[i] + g1[i] */
+ f[a1l][i] = f[a1l][i] + f1_l[i]; /* h1[i] */
+ }
+
+ /* Do forces - second torsion */
+ fg2 = iprod(r2_ij, r2_kj);
+ hg2 = iprod(r2_kl, r2_kj);
+ fga2 = fg2*ra2r2*rgr2;
+ hgb2 = hg2*rb2r2*rgr2;
+ gaa2 = -ra2r2*rg2;
+ gbb2 = rb2r2*rg2;
+
+ for (i = 0; i < DIM; i++)
+ {
+ dtf2[i] = gaa2 * a2[i];
+ dtg2[i] = fga2 * a2[i] - hgb2 * b2[i];
+ dth2[i] = gbb2 * b2[i];
+
+ f2[i] = df2 * dtf2[i];
+ g2[i] = df2 * dtg2[i];
+ h2[i] = df2 * dth2[i];
+
+ f2_i[i] = f2[i];
+ f2_j[i] = -f2[i] - g2[i];
+ f2_k[i] = h2[i] + g2[i];
+ f2_l[i] = -h2[i];
+
+ f[a2i][i] = f[a2i][i] + f2_i[i]; /* f2[i] */
+ f[a2j][i] = f[a2j][i] + f2_j[i]; /* - f2[i] - g2[i] */
+ f[a2k][i] = f[a2k][i] + f2_k[i]; /* h2[i] + g2[i] */
+ f[a2l][i] = f[a2l][i] + f2_l[i]; /* - h2[i] */
+ }
+
+ /* Shift forces */
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, a1j), jt1);
+ ivec_sub(SHIFT_IVEC(g, a1i), jt1, dt1_ij);
+ ivec_sub(SHIFT_IVEC(g, a1k), jt1, dt1_kj);
+ ivec_sub(SHIFT_IVEC(g, a1l), jt1, dt1_lj);
+ t11 = IVEC2IS(dt1_ij);
+ t21 = IVEC2IS(dt1_kj);
+ t31 = IVEC2IS(dt1_lj);
+
+ copy_ivec(SHIFT_IVEC(g, a2j), jt2);
+ ivec_sub(SHIFT_IVEC(g, a2i), jt2, dt2_ij);
+ ivec_sub(SHIFT_IVEC(g, a2k), jt2, dt2_kj);
+ ivec_sub(SHIFT_IVEC(g, a2l), jt2, dt2_lj);
+ t12 = IVEC2IS(dt2_ij);
+ t22 = IVEC2IS(dt2_kj);
+ t32 = IVEC2IS(dt2_lj);
+ }
+ else if (pbc)
+ {
+ t31 = pbc_rvec_sub(pbc, x[a1l], x[a1j], h1);
+ t32 = pbc_rvec_sub(pbc, x[a2l], x[a2j], h2);
+ }
+ else
+ {
+ t31 = CENTRAL;
+ t32 = CENTRAL;
+ }
+
+ rvec_inc(fshift[t11], f1_i);
+ rvec_inc(fshift[CENTRAL], f1_j);
+ rvec_inc(fshift[t21], f1_k);
+ rvec_inc(fshift[t31], f1_l);
+
+ rvec_inc(fshift[t21], f2_i);
+ rvec_inc(fshift[CENTRAL], f2_j);
+ rvec_inc(fshift[t22], f2_k);
+ rvec_inc(fshift[t32], f2_l);
+ }
+ return vtot;
+}
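+
+/* Note on the interpolation above: tc holds the 4x4 bicubic coefficients
+ * fitted from the grid values and derivatives (ty, ty1, ty2, ty12), so
+ * within one grid cell E(t,u) = sum_{i,j} tc[4*i+j] * t^i * u^j.
+ * The i-loop evaluates E and its first derivatives df1, df2 by Horner's
+ * scheme, and fac = RAD2DEG/dx converts the derivatives from grid-cell
+ * units back to radians before the forces are applied.
+ */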
+
+
+
+/***********************************************************
+ *
+ * G R O M O S 9 6 F U N C T I O N S
+ *
+ ***********************************************************/
+real g96harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ real *V, real *F)
+{
+ const real half = 0.5;
+ real L1, kk, x0, dx, dx2;
+ real v, f, dvdlambda;
+
+ L1 = 1.0-lambda;
+ kk = L1*kA+lambda*kB;
+ x0 = L1*xA+lambda*xB;
+
+ dx = x-x0;
+ dx2 = dx*dx;
+
+ f = -kk*dx;
+ v = half*kk*dx2;
+ dvdlambda = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
+
+ *F = f;
+ *V = v;
+
+ return dvdlambda;
+
+ /* That was 21 flops */
+}
+
+real g96bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ki, ai, aj, type;
+ real dr2, fbond, vbond, fij, vtot;
+ rvec dx;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+
+ *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA,
+ forceparams[type].harmonic.rB,
+ dr2, lambda, &vbond, &fbond);
+
+ vtot += 0.5*vbond; /* 1*/
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "G96-BONDS: dr = %10g vbond = %10g fbond = %10g\n",
+ sqrt(dr2), vbond, fbond);
+ }
+#endif
+
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 44 TOTAL */
+ return vtot;
+}
+
+real g96bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ rvec r_ij, rvec r_kj,
+ int *t1, int *t2)
+/* Return value is the cosine of the angle between the bonds i-j and j-k */
+{
+ real costh;
+
+ *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */
+ *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */
+
+ costh = cos_angle(r_ij, r_kj); /* 25 */
+ /* 41 TOTAL */
+ return costh;
+}
+
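+/* Note: the G96 angle potential is harmonic in cos(theta) rather than in
+ * theta itself, so g96harmonic() is reused directly on cos_theta below and
+ * the chain rule converts dV/dcos(theta) into Cartesian forces without
+ * evaluating acos() or a sine.
+ */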
+real g96angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ai, aj, ak, type, m, t1, t2;
+ rvec r_ij, r_kj;
+ real cos_theta, dVdt, va, vtot;
+ real rij_1, rij_2, rkj_1, rkj_2, rijrkj_1;
+ rvec f_i, f_j, f_k;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ cos_theta = g96bond_angle(x[ai], x[aj], x[ak], pbc, r_ij, r_kj, &t1, &t2);
+
+ *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA,
+ forceparams[type].harmonic.rB,
+ cos_theta, lambda, &va, &dVdt);
+ vtot += va;
+
+ rij_1 = gmx_invsqrt(iprod(r_ij, r_ij));
+ rkj_1 = gmx_invsqrt(iprod(r_kj, r_kj));
+ rij_2 = rij_1*rij_1;
+ rkj_2 = rkj_1*rkj_1;
+ rijrkj_1 = rij_1*rkj_1; /* 23 */
+
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "G96ANGLES: costheta = %10g vth = %10g dV/dct = %10g\n",
+ cos_theta, va, dVdt);
+ }
+#endif
+ for (m = 0; (m < DIM); m++) /* 42 */
+ {
+ f_i[m] = dVdt*(r_kj[m]*rijrkj_1 - r_ij[m]*rij_2*cos_theta);
+ f_k[m] = dVdt*(r_ij[m]*rijrkj_1 - r_kj[m]*rkj_2*cos_theta);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k); /* 9 */
+ /* 163 TOTAL */
+ }
+ return vtot;
+}
+
+real cross_bond_bond(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ /* Potential from Lawrence and Skinner, Chem. Phys. Lett. 372 (2003)
+ * pp. 842-847
+ */
+ int i, ai, aj, ak, type, m, t1, t2;
+ rvec r_ij, r_kj;
+ real vtot, vrr, s1, s2, r1, r2, r1e, r2e, krr;
+ rvec f_i, f_j, f_k;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ r1e = forceparams[type].cross_bb.r1e;
+ r2e = forceparams[type].cross_bb.r2e;
+ krr = forceparams[type].cross_bb.krr;
+
+ /* Compute distance vectors ... */
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+
+ /* ... and their lengths */
+ r1 = norm(r_ij);
+ r2 = norm(r_kj);
+
+ /* Deviations from ideality */
+ s1 = r1-r1e;
+ s2 = r2-r2e;
+
+ /* Energy (can be negative!) */
+ vrr = krr*s1*s2;
+ vtot += vrr;
+
+ /* Forces */
+ svmul(-krr*s2/r1, r_ij, f_i);
+ svmul(-krr*s1/r2, r_kj, f_k);
+
+ for (m = 0; (m < DIM); m++) /* 12 */
+ {
+ f_j[m] = -f_i[m] - f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ /* Virial stuff */
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k); /* 9 */
+ /* 163 TOTAL */
+ }
+ return vtot;
+}
+
+real cross_bond_angle(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ /* Potential from Lawrence and Skinner, Chem. Phys. Lett. 372 (2003)
+ * pp. 842-847
+ */
+ int i, ai, aj, ak, type, m, t1, t2, t3;
+ rvec r_ij, r_kj, r_ik;
+ real vtot, vrt, s1, s2, s3, r1, r2, r3, r1e, r2e, r3e, krt, k1, k2, k3;
+ rvec f_i, f_j, f_k;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ r1e = forceparams[type].cross_ba.r1e;
+ r2e = forceparams[type].cross_ba.r2e;
+ r3e = forceparams[type].cross_ba.r3e;
+ krt = forceparams[type].cross_ba.krt;
+
+ /* Compute distance vectors ... */
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ t3 = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);
+
+ /* ... and their lengths */
+ r1 = norm(r_ij);
+ r2 = norm(r_kj);
+ r3 = norm(r_ik);
+
+ /* Deviations from ideality */
+ s1 = r1-r1e;
+ s2 = r2-r2e;
+ s3 = r3-r3e;
+
+ /* Energy (can be negative!) */
+ vrt = krt*s3*(s1+s2);
+ vtot += vrt;
+
+ /* Forces */
+ k1 = -krt*(s3/r1);
+ k2 = -krt*(s3/r2);
+ k3 = -krt*(s1+s2)/r3;
+ for (m = 0; (m < DIM); m++)
+ {
+ f_i[m] = k1*r_ij[m] + k3*r_ik[m];
+ f_k[m] = k2*r_kj[m] - k3*r_ik[m];
+ f_j[m] = -f_i[m] - f_k[m];
+ }
+
+ for (m = 0; (m < DIM); m++) /* 12 */
+ {
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ /* Virial stuff */
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k); /* 9 */
+ /* 163 TOTAL */
+ }
+ return vtot;
+}
+
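+/* The cubic spline tables used below store four values per grid point n:
+ * Y (potential), F (derivative fit) and spline coefficients G and H, so
+ * that with eps the fractional position in the interval:
+ *   V(eps)   = Y + eps*(F + eps*(G + eps*H))
+ *   dV/deps  = F + 2*G*eps + 3*H*eps^2
+ * which is exactly what bonded_tab() evaluates via Fp, VV and FF.
+ */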
+static real bonded_tab(const char *type, int table_nr,
+ const bondedtable_t *table, real kA, real kB, real r,
+ real lambda, real *V, real *F)
+{
+ real k, tabscale, *VFtab, rt, eps, eps2, Yt, Ft, Geps, Heps2, Fp, VV, FF;
+ int n0, nnn;
+ real v, f, dvdlambda;
+
+ k = (1.0 - lambda)*kA + lambda*kB;
+
+ tabscale = table->scale;
+ VFtab = table->data;
+
+ rt = r*tabscale;
+ n0 = rt;
+ if (n0 >= table->n)
+ {
+ gmx_fatal(FARGS, "A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
+ type, table_nr, r, n0, n0+1, table->n);
+ }
+ eps = rt - n0;
+ eps2 = eps*eps;
+ nnn = 4*n0;
+ Yt = VFtab[nnn];
+ Ft = VFtab[nnn+1];
+ Geps = VFtab[nnn+2]*eps;
+ Heps2 = VFtab[nnn+3]*eps2;
+ Fp = Ft + Geps + Heps2;
+ VV = Yt + Fp*eps;
+ FF = Fp + Geps + 2.0*Heps2;
+
+ *F = -k*FF*tabscale;
+ *V = k*VV;
+ dvdlambda = (kB - kA)*VV;
+
+ return dvdlambda;
+
+ /* That was 22 flops */
+}
+
+real tab_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ki, ai, aj, type, table;
+ real dr, dr2, fbond, vbond, fij, vtot;
+ rvec dx;
+ ivec dt;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx_invsqrt(dr2); /* 10 */
+
+ table = forceparams[type].tab.table;
+
+ *dvdlambda += bonded_tab("bond", table,
+ &fcd->bondtab[table],
+ forceparams[type].tab.kA,
+ forceparams[type].tab.kB,
+ dr, lambda, &vbond, &fbond); /* 22 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx_invsqrt(dr2); /* 6 */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "TABBONDS: dr = %10g vbond = %10g fbond = %10g\n",
+ dr, vbond, fbond);
+ }
+#endif
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ ki = IVEC2IS(dt);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fij = fbond*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ } /* 62 TOTAL */
+ return vtot;
+}
+
+real tab_angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ai, aj, ak, t1, t2, type, table;
+ rvec r_ij, r_kj;
+ real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ table = forceparams[type].tab.table;
+
+ *dvdlambda += bonded_tab("angle", table,
+ &fcd->angletab[table],
+ forceparams[type].tab.kA,
+ forceparams[type].tab.kB,
+ theta, lambda, &va, &dVdt); /* 22 */
+ vtot += va;
+
+ cos_theta2 = sqr(cos_theta); /* 1 */
+ if (cos_theta2 < 1)
+ {
+ int m;
+ real snt, st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+#ifdef DEBUG
+ if (debug)
+ {
+ fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n",
+ theta*RAD2DEG, va, dVdt);
+ }
+#endif
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ nrij2 = iprod(r_ij, r_ij);
+
+ cik = st*gmx_invsqrt(nrkj2*nrij2); /* 12 */
+ cii = sth/nrij2; /* 10 */
+ ckk = sth/nrkj2; /* 10 */
+
+ for (m = 0; (m < DIM); m++) /* 39 */
+ {
+ f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ } /* 169 TOTAL */
+ }
+ return vtot;
+}
+
+real tab_dihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, type, ai, aj, ak, al, table;
+ int t1, t2, t3;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real phi, sign, ddphi, vpd, vtot;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &sign, &t1, &t2, &t3); /* 84 */
+
+ table = forceparams[type].tab.table;
+
+ /* Hopefully phi+M_PI never results in values < 0 */
+ *dvdlambda += bonded_tab("dihedral", table,
+ &fcd->dihtab[table],
+ forceparams[type].tab.kA,
+ forceparams[type].tab.kB,
+ phi+M_PI, lambda, &vpd, &ddphi);
+
+ vtot += vpd;
+ do_dih_fup(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+
+#ifdef DEBUG
+ fprintf(debug, "pdih: (%d,%d,%d,%d) phi=%g\n",
+ ai, aj, ak, al, phi);
+#endif
+ } /* 227 TOTAL */
+
+ return vtot;
+}
+
+static unsigned
+calc_bonded_reduction_mask(const t_idef *idef,
+ int shift,
+ int t, int nt)
+{
+ unsigned mask;
+ int ftype, nb, nat1, nb0, nb1, i, a;
+
+ mask = 0;
+
+ for (ftype = 0; ftype < F_NRE; ftype++)
+ {
+ if (interaction_function[ftype].flags & IF_BOND &&
+ !(ftype == F_CONNBONDS || ftype == F_POSRES) &&
+ (ftype < F_GB12 || ftype > F_GB14))
+ {
+ nb = idef->il[ftype].nr;
+ if (nb > 0)
+ {
+ nat1 = interaction_function[ftype].nratoms + 1;
+
+ /* Divide this interaction equally over the threads.
+ * This is not stored: should match division in calc_bonds.
+ */
+ nb0 = (((nb/nat1)* t )/nt)*nat1;
+ nb1 = (((nb/nat1)*(t+1))/nt)*nat1;
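+ /* Example (hypothetical numbers): nb/nat1 = 100 dihedrals on nt = 8
+ * threads gives each thread 12 or 13 interactions; rounding through
+ * nb/nat1 ensures the boundaries always fall on whole interactions.
+ */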
+
+ for (i = nb0; i < nb1; i += nat1)
+ {
+ for (a = 1; a < nat1; a++)
+ {
+ mask |= (1U << (idef->il[ftype].iatoms[i+a]>>shift));
+ }
+ }
+ }
+ }
+ }
+
+ return mask;
+}
+
+void init_bonded_thread_force_reduction(t_forcerec *fr,
+ const t_idef *idef)
+{
+#define MAX_BLOCK_BITS 32
+ int t;
+ int ctot, c, b;
+
+ if (fr->nthreads <= 1)
+ {
+ fr->red_nblock = 0;
+
+ return;
+ }
+
+ /* We divide the force array in a maximum of 32 blocks.
+ * Minimum force block reduction size is 2^6=64.
+ */
+ fr->red_ashift = 6;
+ while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<<fr->red_ashift)))
+ {
+ fr->red_ashift++;
+ }
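+ /* Example (hypothetical numbers): with natoms_force = 100000 the block
+ * size grows from 2^6 = 64 until 32 blocks cover all atoms, ending at
+ * red_ashift = 12 (32*4096 = 131072 >= 100000).
+ */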
+ if (debug)
+ {
+ fprintf(debug, "bonded force buffer block atom shift %d bits\n",
+ fr->red_ashift);
+ }
+
+ /* Determine to which blocks each thread's bonded force calculation
+ * contributes. Store this as a mask for each thread.
+ */
+#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
+ for (t = 1; t < fr->nthreads; t++)
+ {
+ fr->f_t[t].red_mask =
+ calc_bonded_reduction_mask(idef, fr->red_ashift, t, fr->nthreads);
+ }
+
+ /* Determine the maximum number of blocks we need to reduce over */
+ fr->red_nblock = 0;
+ ctot = 0;
+ for (t = 0; t < fr->nthreads; t++)
+ {
+ c = 0;
+ for (b = 0; b < MAX_BLOCK_BITS; b++)
+ {
+ if (fr->f_t[t].red_mask & (1U<<b))
+ {
+ fr->red_nblock = max(fr->red_nblock, b+1);
+ c++;
+ }
+ }
+ if (debug)
+ {
+ fprintf(debug, "thread %d flags %x count %d\n",
+ t, fr->f_t[t].red_mask, c);
+ }
+ ctot += c;
+ }
+ if (debug)
+ {
+ fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
+ fr->red_nblock, 1<<fr->red_ashift);
+ fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
+ ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
+ ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
+ }
+}
+
+static void zero_thread_forces(f_thread_t *f_t, int n,
+ int nblock, int blocksize)
+{
+ int b, a0, a1, a, i, j;
+
+ if (n > f_t->f_nalloc)
+ {
+ f_t->f_nalloc = over_alloc_large(n);
+ srenew(f_t->f, f_t->f_nalloc);
+ }
+
+ if (f_t->red_mask != 0)
+ {
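+ /* Only clear the force blocks this thread actually touches, as
+ * recorded in the reduction mask; clearing the whole force array
+ * on every step would be much more expensive.
+ */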
+ for (b = 0; b < nblock; b++)
+ {
+ if (f_t->red_mask & (1U<<b))
+ {
+ a0 = b*blocksize;
+ a1 = min((b+1)*blocksize, n);
+ for (a = a0; a < a1; a++)
+ {
+ clear_rvec(f_t->f[a]);
+ }
+ }
+ }
+ }
+ for (i = 0; i < SHIFTS; i++)
+ {
+ clear_rvec(f_t->fshift[i]);
+ }
+ for (i = 0; i < F_NRE; i++)
+ {
+ f_t->ener[i] = 0;
+ }
+ for (i = 0; i < egNR; i++)
+ {
+ for (j = 0; j < f_t->grpp.nener; j++)
+ {
+ f_t->grpp.ener[i][j] = 0;
+ }
+ }
+ for (i = 0; i < efptNR; i++)
+ {
+ f_t->dvdl[i] = 0;
+ }
+}
+
+static void reduce_thread_force_buffer(int n, rvec *f,
+ int nthreads, f_thread_t *f_t,
+ int nblock, int block_size)
+{
+ /* The maximum thread count is fixed to avoid dynamic memory management.
+ * The value is generous: using anywhere near this many threads for the
+ * bonded force reduction is unlikely to ever pay off performance wise.
+ */
+#define MAX_BONDED_THREADS 256
+ int b;
+
+ if (nthreads > MAX_BONDED_THREADS)
+ {
+ gmx_fatal(FARGS, "Can not reduce bonded forces on more than %d threads",
+ MAX_BONDED_THREADS);
+ }
+
+ /* This reduction can run on any number of threads,
+ * independently of nthreads.
+ */
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+ for (b = 0; b < nblock; b++)
+ {
+ rvec *fp[MAX_BONDED_THREADS];
+ int nfb, ft, fb;
+ int a0, a1, a;
+
+ /* Determine which threads contribute to this block */
+ nfb = 0;
+ for (ft = 1; ft < nthreads; ft++)
+ {
+ if (f_t[ft].red_mask & (1U<<b))
+ {
+ fp[nfb++] = f_t[ft].f;
+ }
+ }
+ if (nfb > 0)
+ {
+ /* Reduce force buffers for threads that contribute */
+ a0 = b *block_size;
+ a1 = (b+1)*block_size;
+ a1 = min(a1, n);
+ for (a = a0; a < a1; a++)
+ {
+ for (fb = 0; fb < nfb; fb++)
+ {
+ rvec_inc(f[a], fp[fb][a]);
+ }
+ }
+ }
+ }
+}
+
+static void reduce_thread_forces(int n, rvec *f, rvec *fshift,
+ real *ener, gmx_grppairener_t *grpp, real *dvdl,
+ int nthreads, f_thread_t *f_t,
+ int nblock, int block_size,
+ gmx_bool bCalcEnerVir,
+ gmx_bool bDHDL)
+{
+ if (nblock > 0)
+ {
+ /* Reduce the bonded force buffer */
+ reduce_thread_force_buffer(n, f, nthreads, f_t, nblock, block_size);
+ }
+
+ /* When necessary, reduce energy and virial using one thread only */
+ if (bCalcEnerVir)
+ {
+ int t, i, j;
+
+ for (i = 0; i < SHIFTS; i++)
+ {
+ for (t = 1; t < nthreads; t++)
+ {
+ rvec_inc(fshift[i], f_t[t].fshift[i]);
+ }
+ }
+ for (i = 0; i < F_NRE; i++)
+ {
+ for (t = 1; t < nthreads; t++)
+ {
+ ener[i] += f_t[t].ener[i];
+ }
+ }
+ for (i = 0; i < egNR; i++)
+ {
+ for (j = 0; j < f_t[1].grpp.nener; j++)
+ {
+ for (t = 1; t < nthreads; t++)
+ {
+
+ grpp->ener[i][j] += f_t[t].grpp.ener[i][j];
+ }
+ }
+ }
+ if (bDHDL)
+ {
+ for (i = 0; i < efptNR; i++)
+ {
+
+ for (t = 1; t < nthreads; t++)
+ {
+ dvdl[i] += f_t[t].dvdl[i];
+ }
+ }
+ }
+ }
+}
+
+static real calc_one_bond(FILE *fplog, int thread,
+ int ftype, const t_idef *idef,
+ rvec x[], rvec f[], rvec fshift[],
+ t_forcerec *fr,
+ const t_pbc *pbc, const t_graph *g,
+ gmx_enerdata_t gmx_unused *enerd, gmx_grppairener_t *grpp,
+ t_nrnb *nrnb,
+ real *lambda, real *dvdl,
+ const t_mdatoms *md, t_fcdata *fcd,
+ gmx_bool bCalcEnerVir,
+ int *global_atom_index, gmx_bool bPrintSepPot)
+{
+ int ind, nat1, nbonds, efptFTYPE;
+ real v = 0;
+ t_iatom *iatoms;
+ int nb0, nbn;
+
+ if (IS_RESTRAINT_TYPE(ftype))
+ {
+ efptFTYPE = efptRESTRAINT;
+ }
+ else
+ {
+ efptFTYPE = efptBONDED;
+ }
+
+ if (interaction_function[ftype].flags & IF_BOND &&
+ !(ftype == F_CONNBONDS || ftype == F_POSRES))
+ {
+ ind = interaction_function[ftype].nrnb_ind;
+ nat1 = interaction_function[ftype].nratoms + 1;
+ nbonds = idef->il[ftype].nr/nat1;
+ iatoms = idef->il[ftype].iatoms;
+
+ nb0 = ((nbonds* thread )/(fr->nthreads))*nat1;
+ nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0;
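+ /* This division must produce exactly the same per-thread ranges as
+ * calc_bonded_reduction_mask(), otherwise the reduction masks will
+ * not match the forces actually written by each thread.
+ */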
+
+ if (!IS_LISTED_LJ_C(ftype))
+ {
+ if (ftype == F_CMAP)
+ {
+ v = cmap_dihs(nbn, iatoms+nb0,
+ idef->iparams, &idef->cmap_grid,
+ (const rvec*)x, f, fshift,
+ pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+ md, fcd, global_atom_index);
+ }
+#ifdef SIMD_BONDEDS
+ else if (ftype == F_ANGLES &&
+ !bCalcEnerVir && fr->efep == efepNO)
+ {
+ /* No energies, no shift forces, no dvdl */
+ angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+ idef->iparams,
+ (const rvec*)x, f,
+ pbc, g, lambda[efptFTYPE], md, fcd,
+ global_atom_index);
+ v = 0;
+ }
+#endif
+ else if (ftype == F_PDIHS &&
+ !bCalcEnerVir && fr->efep == efepNO)
+ {
+ /* No energies, no shift forces, no dvdl */
+#ifndef SIMD_BONDEDS
+ pdihs_noener
+#else
+ pdihs_noener_simd
+#endif
+ (nbn, idef->il[ftype].iatoms+nb0,
+ idef->iparams,
+ (const rvec*)x, f,
+ pbc, g, lambda[efptFTYPE], md, fcd,
+ global_atom_index);
+ v = 0;
+ }
+ else
+ {
+ v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
+ idef->iparams,
+ (const rvec*)x, f, fshift,
+ pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+ md, fcd, global_atom_index);
+ }
+ if (bPrintSepPot)
+ {
+ fprintf(fplog, " %-23s #%4d V %12.5e dVdl %12.5e\n",
+ interaction_function[ftype].longname,
+ nbonds/nat1, v, lambda[efptFTYPE]);
+ }
+ }
+ else
+ {
+ v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
+ pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
+
+ if (bPrintSepPot)
+ {
+ fprintf(fplog, " %-5s + %-15s #%4d dVdl %12.5e\n",
+ interaction_function[ftype].longname,
+ interaction_function[F_LJ14].longname, nbonds/nat1, dvdl[efptVDW]);
+ fprintf(fplog, " %-5s + %-15s #%4d dVdl %12.5e\n",
+ interaction_function[ftype].longname,
+ interaction_function[F_COUL14].longname, nbonds/nat1, dvdl[efptCOUL]);
+ }
+ }
+ if (ind != -1 && thread == 0)
+ {
+ inc_nrnb(nrnb, ind, nbonds);
+ }
+ }
+
+ return v;
+}
+
+/* WARNING! THIS FUNCTION MUST EXACTLY TRACK THE calc_one_bond
+ function above, or horrible things will happen when doing free energy
+ calculations! In a good coding world, this would not be a
+ different function, but for speed reasons, it needs to be made a
+ separate function. TODO for 5.0 - figure out a way to reorganize
+ to reduce duplication.
+ */
+
+static real calc_one_bond_foreign(FILE gmx_unused *fplog, int ftype, const t_idef *idef,
+ rvec x[], rvec f[], t_forcerec *fr,
+ const t_pbc *pbc, const t_graph *g,
+ gmx_grppairener_t *grpp, t_nrnb *nrnb,
+ real *lambda, real *dvdl,
+ const t_mdatoms *md, t_fcdata *fcd,
+ int *global_atom_index, gmx_bool gmx_unused bPrintSepPot)
+{
+ int ind, nat1, nbonds, efptFTYPE, nbonds_np;
+ real v = 0;
+ t_iatom *iatoms;
+
+ if (IS_RESTRAINT_TYPE(ftype))
+ {
+ efptFTYPE = efptRESTRAINT;
+ }
+ else
+ {
+ efptFTYPE = efptBONDED;
+ }
+
+ if (ftype < F_GB12 || ftype > F_GB14)
+ {
+ if (interaction_function[ftype].flags & IF_BOND &&
+ !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES))
+ {
+ ind = interaction_function[ftype].nrnb_ind;
+ nat1 = interaction_function[ftype].nratoms+1;
+ nbonds_np = idef->il[ftype].nr_nonperturbed;
+ nbonds = idef->il[ftype].nr - nbonds_np;
+ iatoms = idef->il[ftype].iatoms + nbonds_np;
+ if (nbonds > 0)
+ {
+ if (!IS_LISTED_LJ_C(ftype))
+ {
+ if (ftype == F_CMAP)
+ {
+ v = cmap_dihs(nbonds, iatoms,
+ idef->iparams, &idef->cmap_grid,
+ (const rvec*)x, f, fr->fshift,
+ pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), md, fcd,
+ global_atom_index);
+ }
+ else
+ {
+ v = interaction_function[ftype].ifunc(nbonds, iatoms,
+ idef->iparams,
+ (const rvec*)x, f, fr->fshift,
+ pbc, g, lambda[efptFTYPE], &dvdl[efptFTYPE],
+ md, fcd, global_atom_index);
+ }
+ }
+ else
+ {
+ v = do_nonbonded_listed(ftype, nbonds, iatoms,
+ idef->iparams,
+ (const rvec*)x, f, fr->fshift,
+ pbc, g, lambda, dvdl,
+ md, fr, grpp, global_atom_index);
+ }
+ if (ind != -1)
+ {
+ inc_nrnb(nrnb, ind, nbonds/nat1);
+ }
+ }
+ }
+ }
+ return v;
+}
+
+void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
+ const t_idef *idef,
+ rvec x[], history_t *hist,
+ rvec f[], t_forcerec *fr,
+ const t_pbc *pbc, const t_graph *g,
+ gmx_enerdata_t *enerd, t_nrnb *nrnb,
+ real *lambda,
+ const t_mdatoms *md,
+ t_fcdata *fcd, int *global_atom_index,
+ t_atomtypes gmx_unused *atype, gmx_genborn_t gmx_unused *born,
+ int force_flags,
+ gmx_bool bPrintSepPot, gmx_large_int_t step)
+{
+ gmx_bool bCalcEnerVir;
+ int i;
+ real v, dvdl[efptNR], dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
+ of lambda, which will be thrown away in the end*/
+ const t_pbc *pbc_null;
+ char buf[22];
+ int thread;
+
+ bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
+
+ for (i = 0; i < efptNR; i++)
+ {
+ dvdl[i] = 0.0;
+ }
+ if (fr->bMolPBC)
+ {
+ pbc_null = pbc;
+ }
+ else
+ {
+ pbc_null = NULL;
+ }
+ if (bPrintSepPot)
+ {
+ fprintf(fplog, "Step %s: bonded V and dVdl for this node\n",
+ gmx_step_str(step, buf));
+ }
+
+#ifdef DEBUG
+ if (g && debug)
+ {
+ p_graph(debug, "Bondage is fun", g);
+ }
+#endif
+
+ /* Do pre force calculation stuff which might require communication */
+ if (idef->il[F_ORIRES].nr)
+ {
+ enerd->term[F_ORIRESDEV] =
+ calc_orires_dev(ms, idef->il[F_ORIRES].nr,
+ idef->il[F_ORIRES].iatoms,
+ idef->iparams, md, (const rvec*)x,
+ pbc_null, fcd, hist);
+ }
+ if (idef->il[F_DISRES].nr)
+ {
+ calc_disres_R_6(ms, idef->il[F_DISRES].nr,
+ idef->il[F_DISRES].iatoms,
+ idef->iparams, (const rvec*)x, pbc_null,
+ fcd, hist);
+ }
+
+#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
+ for (thread = 0; thread < fr->nthreads; thread++)
+ {
+ int ftype, nbonds, ind, nat1;
+ real *epot, v;
+ /* thread stuff */
+ rvec *ft, *fshift;
+ real *dvdlt;
+ gmx_grppairener_t *grpp;
+ int nb0, nbn;
+
+ if (thread == 0)
+ {
+ ft = f;
+ fshift = fr->fshift;
+ epot = enerd->term;
+ grpp = &enerd->grpp;
+ dvdlt = dvdl;
+ }
+ else
+ {
+ zero_thread_forces(&fr->f_t[thread], fr->natoms_force,
+ fr->red_nblock, 1<<fr->red_ashift);
+
+ ft = fr->f_t[thread].f;
+ fshift = fr->f_t[thread].fshift;
+ epot = fr->f_t[thread].ener;
+ grpp = &fr->f_t[thread].grpp;
+ dvdlt = fr->f_t[thread].dvdl;
+ }
+ /* Loop over all bonded force types to calculate the bonded forces */
+ for (ftype = 0; (ftype < F_NRE); ftype++)
+ {
+ if (idef->il[ftype].nr > 0 &&
+ (interaction_function[ftype].flags & IF_BOND) &&
+ (ftype < F_GB12 || ftype > F_GB14) &&
+ !(ftype == F_CONNBONDS || ftype == F_POSRES))
+ {
+ v = calc_one_bond(fplog, thread, ftype, idef, x,
+ ft, fshift, fr, pbc_null, g, enerd, grpp,
+ nrnb, lambda, dvdlt,
+ md, fcd, bCalcEnerVir,
+ global_atom_index, bPrintSepPot);
+ epot[ftype] += v;
+ }
+ }
+ }
+ if (fr->nthreads > 1)
+ {
+ reduce_thread_forces(fr->natoms_force, f, fr->fshift,
+ enerd->term, &enerd->grpp, dvdl,
+ fr->nthreads, fr->f_t,
+ fr->red_nblock, 1<<fr->red_ashift,
+ bCalcEnerVir,
+ force_flags & GMX_FORCE_DHDL);
+ }
+ if (force_flags & GMX_FORCE_DHDL)
+ {
+ for (i = 0; i < efptNR; i++)
+ {
+ enerd->dvdl_nonlin[i] += dvdl[i];
+ }
+ }
+
+ /* Copy the sum of violations for the distance restraints from fcd */
+ if (fcd)
+ {
+ enerd->term[F_DISRESVIOL] = fcd->disres.sumviol;
+
+ }
+}
+
+void calc_bonds_lambda(FILE *fplog,
+ const t_idef *idef,
+ rvec x[],
+ t_forcerec *fr,
+ const t_pbc *pbc, const t_graph *g,
+ gmx_grppairener_t *grpp, real *epot, t_nrnb *nrnb,
+ real *lambda,
+ const t_mdatoms *md,
+ t_fcdata *fcd,
+ int *global_atom_index)
+{
+ int i, ftype, nbonds_np, nbonds, ind, nat;
+ real v, dr, dr2;
+ real dvdl_dum[efptNR];
+ rvec *f, *fshift_orig;
+ const t_pbc *pbc_null;
+ t_iatom *iatom_fe;
+
+ if (fr->bMolPBC)
+ {
+ pbc_null = pbc;
+ }
+ else
+ {
+ pbc_null = NULL;
+ }
+
+ snew(f, fr->natoms_force);
+ /* We want to preserve the fshift array in forcerec */
+ fshift_orig = fr->fshift;
+ snew(fr->fshift, SHIFTS);
+
+ /* Loop over all bonded force types to calculate the bonded forces */
+ for (ftype = 0; (ftype < F_NRE); ftype++)
+ {
+ v = calc_one_bond_foreign(fplog, ftype, idef, x,
+ f, fr, pbc_null, g, grpp, nrnb, lambda, dvdl_dum,
+ md, fcd, global_atom_index, FALSE);
+ epot[ftype] += v;
+ }
+
+ sfree(fr->fshift);
+ fr->fshift = fshift_orig;
+ sfree(f);
+}
--- /dev/null
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.03
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+
+#include <sys/types.h>
+#include <math.h>
+#include "typedefs.h"
+#include "physics.h"
+#include "smalloc.h"
+#include "gmx_fatal.h"
+#include "macros.h"
+#include "vec.h"
+#include "coulomb.h"
+#include "calc_verletbuf.h"
+#include "../mdlib/nbnxn_consts.h"
+
+#ifdef GMX_NBNXN_SIMD
+/* The include below sets the SIMD instruction type (precision+width)
+ * for all nbnxn SIMD search and non-bonded kernel code.
+ */
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
+#include "gmx_simd_macros.h"
+#endif
+
+/* Struct for unique atom type for calculating the energy drift.
+ * The atom displacement depends on mass and constraints.
+ * The energy jump for a given distance depends on LJ type and q.
+ */
+typedef struct
+{
+ real mass; /* mass */
+ int type; /* type (used for LJ parameters) */
+ real q; /* charge */
+ int con; /* 1 if constrained (then #DOF=2 instead of 3), 0 otherwise */
+ int n; /* total #atoms of this type in the system */
+} verletbuf_atomtype_t;
+
+
+void verletbuf_get_list_setup(gmx_bool bGPU,
+ verletbuf_list_setup_t *list_setup)
+{
+ list_setup->cluster_size_i = NBNXN_CPU_CLUSTER_I_SIZE;
+
+ if (bGPU)
+ {
+ list_setup->cluster_size_j = NBNXN_GPU_CLUSTER_SIZE;
+ }
+ else
+ {
+#ifndef GMX_NBNXN_SIMD
+ list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
+#else
+ list_setup->cluster_size_j = GMX_SIMD_WIDTH_HERE;
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We assume the smallest cluster size to be on the safe side */
+ list_setup->cluster_size_j /= 2;
+#endif
+#endif
+ }
+}
+
+static void add_at(verletbuf_atomtype_t **att_p, int *natt_p,
+ real mass, int type, real q, int con, int nmol)
+{
+ verletbuf_atomtype_t *att;
+ int natt, i;
+
+ if (mass == 0)
+ {
+ /* Ignore massless particles */
+ return;
+ }
+
+ att = *att_p;
+ natt = *natt_p;
+
+ i = 0;
+ while (i < natt &&
+ !(mass == att[i].mass &&
+ type == att[i].type &&
+ q == att[i].q &&
+ con == att[i].con))
+ {
+ i++;
+ }
+
+ if (i < natt)
+ {
+ att[i].n += nmol;
+ }
+ else
+ {
+ (*natt_p)++;
+ srenew(*att_p, *natt_p);
+ (*att_p)[i].mass = mass;
+ (*att_p)[i].type = type;
+ (*att_p)[i].q = q;
+ (*att_p)[i].con = con;
+ (*att_p)[i].n = nmol;
+ }
+}
+
+static void get_verlet_buffer_atomtypes(const gmx_mtop_t *mtop,
+ verletbuf_atomtype_t **att_p,
+ int *natt_p,
+ int *n_nonlin_vsite)
+{
+ verletbuf_atomtype_t *att;
+ int natt;
+ int mb, nmol, ft, i, j, a1, a2, a3, a;
+ const t_atoms *atoms;
+ const t_ilist *il;
+ const t_atom *at;
+ const t_iparams *ip;
+ real *con_m, *vsite_m, cam[5];
+
+ att = NULL;
+ natt = 0;
+
+ if (n_nonlin_vsite != NULL)
+ {
+ *n_nonlin_vsite = 0;
+ }
+
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ nmol = mtop->molblock[mb].nmol;
+
+ atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+
+ /* Check for constraints, as they affect the kinetic energy */
+ snew(con_m, atoms->nr);
+ snew(vsite_m, atoms->nr);
+
+ for (ft = F_CONSTR; ft <= F_CONSTRNC; ft++)
+ {
+ il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
+
+ for (i = 0; i < il->nr; i += 1+NRAL(ft))
+ {
+ a1 = il->iatoms[i+1];
+ a2 = il->iatoms[i+2];
+ con_m[a1] += atoms->atom[a2].m;
+ con_m[a2] += atoms->atom[a1].m;
+ }
+ }
+
+ il = &mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE];
+
+ for (i = 0; i < il->nr; i += 1+NRAL(F_SETTLE))
+ {
+ a1 = il->iatoms[i+1];
+ a2 = il->iatoms[i+2];
+ a3 = il->iatoms[i+3];
+ con_m[a1] += atoms->atom[a2].m + atoms->atom[a3].m;
+ con_m[a2] += atoms->atom[a1].m + atoms->atom[a3].m;
+ con_m[a3] += atoms->atom[a1].m + atoms->atom[a2].m;
+ }
+
+ /* Check for virtual sites, determine mass from constructing atoms */
+ for (ft = 0; ft < F_NRE; ft++)
+ {
+ if (IS_VSITE(ft))
+ {
+ il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
+
+ for (i = 0; i < il->nr; i += 1+NRAL(ft))
+ {
+ ip = &mtop->ffparams.iparams[il->iatoms[i]];
+
+ a1 = il->iatoms[i+1];
+
+ for (j = 1; j < NRAL(ft); j++)
+ {
+ cam[j] = atoms->atom[il->iatoms[i+1+j]].m;
+ if (cam[j] == 0)
+ {
+ cam[j] = vsite_m[il->iatoms[i+1+j]];
+ }
+ if (cam[j] == 0)
+ {
+ gmx_fatal(FARGS, "In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or high complexity. This is not supported.",
+ *mtop->moltype[mtop->molblock[mb].type].name,
+ interaction_function[ft].longname,
+ il->iatoms[i+1+j]+1);
+ }
+ }
+
+ switch (ft)
+ {
+ case F_VSITE2:
+ /* Exact except for ignoring constraints */
+ vsite_m[a1] = (cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a))/(cam[1]*cam[2]);
+ break;
+ case F_VSITE3:
+ /* Exact except for ignoring constraints */
+ vsite_m[a1] = (cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b))/(cam[1]*cam[2]*cam[3]);
+ break;
+ default:
+ /* Use the mass of the lightest constructing atom.
+ * This is an approximation.
+ * If the distance of the virtual site to the
+ * constructing atom is less than all distances
+ * between constructing atoms, this is a safe
+ * over-estimate of the displacement of the vsite.
+ * This condition holds for all H mass replacement
+ * vsite constructions, except for SP2/3 groups.
+ * In SP3 groups one H will have a F_VSITE3
+ * construction, so even there the total drift
+ * estimation shouldn't be far off.
+ */
+ assert(j >= 1);
+ vsite_m[a1] = cam[1];
+ for (j = 2; j < NRAL(ft); j++)
+ {
+ vsite_m[a1] = min(vsite_m[a1], cam[j]);
+ }
+ if (n_nonlin_vsite != NULL)
+ {
+ *n_nonlin_vsite += nmol;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ for (a = 0; a < atoms->nr; a++)
+ {
+ at = &atoms->atom[a];
+ /* We consider an atom constrained, #DOF=2, when it is
+ * connected with constraints to one or more atoms with
+ * total mass larger than 1.5 times that of the atom itself.
+ */
+ add_at(&att, &natt,
+ at->m, at->type, at->q, con_m[a] > 1.5*at->m, nmol);
+ }
+
+ sfree(vsite_m);
+ sfree(con_m);
+ }
+
+ if (gmx_debug_at)
+ {
+ for (a = 0; a < natt; a++)
+ {
+ fprintf(debug, "type %d: m %5.2f t %d q %6.3f con %d n %d\n",
+ a, att[a].mass, att[a].type, att[a].q, att[a].con, att[a].n);
+ }
+ }
+
+ *att_p = att;
+ *natt_p = natt;
+}
+
+static void approx_2dof(real s2, real x,
+ real *shift, real *scale)
+{
+ /* A particle with 1 DOF constrained has 2 DOFs instead of 3.
+ * This code is also used for particles with multiple constraints,
+ * in which case we overestimate the displacement.
+ * The 2DOF distribution is sqrt(pi/2)*erfc(r/(sqrt(2)*s))/(2*s).
+ * We approximate this with scale*Gaussian(s,r+shift),
+ * by matching the distribution value and derivative at x.
+ * This is a tight overestimate for all r>=0 at any s and x.
+ */
+ real ex, er;
+
+ ex = exp(-x*x/(2*s2));
+ er = gmx_erfc(x/sqrt(2*s2));
+
+ *shift = -x + sqrt(2*s2/M_PI)*ex/er;
+ *scale = 0.5*M_PI*exp(ex*ex/(M_PI*er*er))*er;
+}
+
+static real ener_drift(const verletbuf_atomtype_t *att, int natt,
+ const gmx_ffparams_t *ffp,
+ real kT_fac,
+ real md_ljd, real md_ljr, real md_el, real dd_el,
+ real r_buffer,
+ real rlist, real boxvol)
+{
+ double drift_tot, pot1, pot2, pot;
+ int i, j;
+ real s2i, s2j, s2, s;
+ int ti, tj;
+ real md, dd;
+ real sc_fac, rsh;
+ double c_exp, c_erfc;
+
+ drift_tot = 0;
+
+ /* Loop over the different atom type pairs */
+ for (i = 0; i < natt; i++)
+ {
+ s2i = kT_fac/att[i].mass;
+ ti = att[i].type;
+
+ for (j = i; j < natt; j++)
+ {
+ s2j = kT_fac/att[j].mass;
+ tj = att[j].type;
+
+ /* Note that attractive and repulsive potentials for individual
+ * pairs will partially cancel.
+ */
+ /* -dV/dr at the cut-off for LJ + Coulomb */
+ md =
+ md_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 +
+ md_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 +
+ md_el*att[i].q*att[j].q;
+
+ /* d2V/dr2 at the cut-off for Coulomb, we neglect LJ */
+ dd = dd_el*att[i].q*att[j].q;
+
+ s2 = s2i + s2j;
+
+ rsh = r_buffer;
+ sc_fac = 1.0;
+ /* For constraints: adapt r and scaling for the Gaussian */
+ if (att[i].con)
+ {
+ real sh, sc;
+ approx_2dof(s2i, r_buffer*s2i/s2, &sh, &sc);
+ rsh += sh;
+ sc_fac *= sc;
+ }
+ if (att[j].con)
+ {
+ real sh, sc;
+ approx_2dof(s2j, r_buffer*s2j/s2, &sh, &sc);
+ rsh += sh;
+ sc_fac *= sc;
+ }
+
+ /* Exact contribution of an atom pair with Gaussian displacement
+ * with sigma s to the energy drift for a potential with
+ * derivative -md and second derivative dd at the cut-off.
+ * The only catch is that for potentials that change sign
+ * near the cut-off there could be an unlucky compensation
+ * of positive and negative energy drift.
+ * Such potentials are extremely rare though.
+ *
+ * Note that pot has unit energy*length, as the linear
+ * atom density still needs to be put in.
+ */
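+ /* Sketch of the resulting closed form (c_exp and c_erfc below absorb
+ * the 1/sqrt(2*pi) and 1/2 prefactors):
+ * pot1 = sc_fac*md/2*[(rsh^2 + s2)*erfc(rsh/(s*sqrt(2)))/2
+ * - rsh*s*exp(-rsh^2/(2*s2))/sqrt(2*pi)]
+ * pot2 adds the analogous second-derivative (curvature) term.
+ */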
+ c_exp = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI);
+ c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2)));
+ s = sqrt(s2);
+
+ pot1 = sc_fac*
+ md/2*((rsh*rsh + s2)*c_erfc - rsh*s*c_exp);
+ pot2 = sc_fac*
+ dd/6*(s*(rsh*rsh + 2*s2)*c_exp - rsh*(rsh*rsh + 3*s2)*c_erfc);
+ pot = pot1 + pot2;
+
+ if (gmx_debug_at)
+ {
+ fprintf(debug, "n %d %d d s %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n",
+ att[i].n, att[j].n, sqrt(s2i), sqrt(s2j),
+ att[i].con+att[j].con,
+ md, dd, pot1, pot2, pot);
+ }
+
+ /* Multiply by the number of atom pairs */
+ if (j == i)
+ {
+ pot *= (double)att[i].n*(att[i].n - 1)/2;
+ }
+ else
+ {
+ pot *= (double)att[i].n*att[j].n;
+ }
+ /* We need the line density to get the energy drift of the system.
+ * The effective average r^2 is close to (rlist+sigma)^2.
+ */
+ pot *= 4*M_PI*sqr(rlist + s)/boxvol;
+
+ /* Add the unsigned drift to avoid cancellation of errors */
+ drift_tot += fabs(pot);
+ }
+ }
+
+ return drift_tot;
+}
+
+static real surface_frac(int cluster_size, real particle_distance, real rlist)
+{
+ real d, area_rel;
+
+ if (rlist < 0.5*particle_distance)
+ {
+ /* We have non overlapping spheres */
+ return 1.0;
+ }
+
+ /* Half the inter-particle distance relative to rlist */
+ d = 0.5*particle_distance/rlist;
+
+ /* Determine the area of the surface at distance rlist to the closest
+ * particle, relative to surface of a sphere of radius rlist.
+ * The formulas below assume close to cubic cells for the pair search grid,
+ * which the pair search code tries to achieve.
+ * Note that in practice particle distances will not be delta distributed,
+ * but have some spread, often involving shorter distances,
+ * as e.g. O-H bonds in a water molecule. Thus the estimates below will
+ * usually be slightly too high and thus conservative.
+ */
+ switch (cluster_size)
+ {
+ case 1:
+ /* One particle: trivial */
+ area_rel = 1.0;
+ break;
+ case 2:
+ /* Two particles: two spheres at fractional distance 2*d */
+ area_rel = 1.0 + d;
+ break;
+ case 4:
+ /* We assume a perfect, symmetric tetrahedron geometry.
+ * The surface around a tetrahedron is too complex for a full
+ * analytical solution, so we use a Taylor expansion.
+ */
+ area_rel = (1.0 + 1/M_PI*(6*acos(1/sqrt(3))*d +
+ sqrt(3)*d*d*(1.0 +
+ 5.0/18.0*d*d +
+ 7.0/45.0*d*d*d*d +
+ 83.0/756.0*d*d*d*d*d*d)));
+ break;
+ default:
+ gmx_incons("surface_frac called with unsupported cluster_size");
+ area_rel = 1.0;
+ }
+
+ return area_rel/cluster_size;
+}
+
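+/* Determine the Verlet buffer size by bisection: starting from an upper
+ * bound of 5 times the maximum displacement sigma, the buffer rb is
+ * narrowed down until the estimated energy drift per atom per unit time
+ * from ener_drift(), corrected for the cluster pair-list setup, crosses
+ * drift_target within the requested resolution.
+ */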
+void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
+ const t_inputrec *ir, real drift_target,
+ const verletbuf_list_setup_t *list_setup,
+ int *n_nonlin_vsite,
+ real *rlist)
+{
+ double resolution;
+ char *env;
+
+ real particle_distance;
+ real nb_clust_frac_pairs_not_in_list_at_cutoff;
+
+ verletbuf_atomtype_t *att = NULL;
+ int natt = -1, i;
+ double reppow;
+ real md_ljd, md_ljr, md_el, dd_el;
+ real elfac;
+ real kT_fac, mass_min;
+ int ib0, ib1, ib;
+ real rb, rl;
+ real drift;
+
+ /* Resolution of the buffer size */
+ resolution = 0.001;
+
+ env = getenv("GMX_VERLET_BUFFER_RES");
+ if (env != NULL)
+ {
+ sscanf(env, "%lf", &resolution);
+ }
+
+ /* In an atom wise pair-list there would be no pairs in the list
+ * beyond the pair-list cut-off.
+ * However, we use a pair-list of groups vs groups of atoms.
+ * For groups of 4 atoms, matching the parallelism of SSE instructions,
+ * only 10% of the atom pairs are not in the list just beyond the cut-off.
+ * As this percentage increases slowly compared to the decrease of the
+ * Gaussian displacement distribution over this range, we can simply
+ * reduce the drift by this fraction.
+ * For larger groups, e.g. of 8 atoms, this fraction will be lower,
+ * so the buffer size will then be on the conservative (large) side.
+ *
+ * Note that the formulas used here do not take into account
+ * cancellation of errors which could occur by missing both
+ * attractive and repulsive interactions.
+ *
+ * The only major assumption is homogeneous particle distribution.
+ * For an inhomogeneous system, such as a liquid-vapor system,
+ * the buffer will be underestimated. The actual energy drift
+ * will be higher by the factor: local/homogeneous particle density.
+ *
+ * The results of this estimate have been checked against simulations.
+ * In most cases the real drift differs by less than a factor of 2.
+ */
+
+ /* Worst case assumption: HCP packing of particles gives largest distance */
+ particle_distance = pow(boxvol*sqrt(2)/mtop->natoms, 1.0/3.0);
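+ /* For example (hypothetical numbers): boxvol = 1000 nm^3 with
+ * 100000 atoms gives particle_distance ~ 0.24 nm.
+ */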
+
+ get_verlet_buffer_atomtypes(mtop, &att, &natt, n_nonlin_vsite);
+ assert(att != NULL && natt >= 0);
+
+ if (debug)
+ {
+ fprintf(debug, "particle distance assuming HCP packing: %f nm\n",
+ particle_distance);
+ fprintf(debug, "energy drift atom types: %d\n", natt);
+ }
+
+ reppow = mtop->ffparams.reppow;
+ md_ljd = 0;
+ md_ljr = 0;
+ if (ir->vdwtype == evdwCUT)
+ {
+ /* -dV/dr of -r^-6 and r^-reppow */
+ md_ljd = -6*pow(ir->rvdw, -7.0);
+ md_ljr = reppow*pow(ir->rvdw, -(reppow+1));
+ /* The contribution of the second derivative is negligible */
+ }
+ else
+ {
+ gmx_fatal(FARGS, "Energy drift calculation is only implemented for plain cut-off Lennard-Jones interactions");
+ }
+
+ elfac = ONE_4PI_EPS0/ir->epsilon_r;
+
+ /* Determine md=-dV/dr and dd=d^2V/dr^2 */
+ md_el = 0;
+ dd_el = 0;
+ if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype))
+ {
+ real eps_rf, k_rf;
+
+ if (ir->coulombtype == eelCUT)
+ {
+ eps_rf = 1;
+ k_rf = 0;
+ }
+ else
+ {
+ eps_rf = ir->epsilon_rf/ir->epsilon_r;
+ if (eps_rf != 0)
+ {
+ k_rf = pow(ir->rcoulomb, -3.0)*(eps_rf - ir->epsilon_r)/(2*eps_rf + ir->epsilon_r);
+ }
+ else
+ {
+ /* epsilon_rf = infinity */
+ k_rf = 0.5*pow(ir->rcoulomb, -3.0);
+ }
+ }
+
+ if (eps_rf > 0)
+ {
+ md_el = elfac*(pow(ir->rcoulomb, -2.0) - 2*k_rf*ir->rcoulomb);
+ }
+ dd_el = elfac*(2*pow(ir->rcoulomb, -3.0) + 2*k_rf);
+ }
+ else if (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD)
+ {
+ real b, rc, br;
+
+ b = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ rc = ir->rcoulomb;
+ br = b*rc;
+ md_el = elfac*(b*exp(-br*br)*M_2_SQRTPI/rc + gmx_erfc(br)/(rc*rc));
+ dd_el = elfac/(rc*rc)*(2*b*(1 + br*br)*exp(-br*br)*M_2_SQRTPI + 2*gmx_erfc(br)/rc);
+ }
+ else
+ {
+ gmx_fatal(FARGS, "Energy drift calculation is only implemented for Reaction-Field and Ewald electrostatics");
+ }
+
+ /* Determine the variance of the atomic displacement
+ * over nstlist-1 steps: kT_fac
+ * For inertial dynamics (not Brownian dynamics) the mass factor
+ * is not included in kT_fac, it is added later.
+ */
+ if (ir->eI == eiBD)
+ {
+ /* Get the displacement distribution from the random component only.
+ * With accurate integration the systematic (force) displacement
+ * should be negligible (unless nstlist is extremely large, which
+ * you wouldn't do anyhow).
+ */
+ kT_fac = 2*BOLTZ*ir->opts.ref_t[0]*(ir->nstlist-1)*ir->delta_t;
+ if (ir->bd_fric > 0)
+ {
+ /* This is directly sigma^2 of the displacement */
+ kT_fac /= ir->bd_fric;
+
+ /* Set the masses to 1 as kT_fac is the full sigma^2,
+ * but we divide by m in ener_drift().
+ */
+ for (i = 0; i < natt; i++)
+ {
+ att[i].mass = 1;
+ }
+ }
+ else
+ {
+ real tau_t;
+
+ /* Per group tau_t is not implemented yet, use the maximum */
+ tau_t = ir->opts.tau_t[0];
+ for (i = 1; i < ir->opts.ngtc; i++)
+ {
+ tau_t = max(tau_t, ir->opts.tau_t[i]);
+ }
+
+ kT_fac *= tau_t;
+ /* This kT_fac needs to be divided by the mass to get sigma^2 */
+ }
+ }
+ else
+ {
+ kT_fac = BOLTZ*ir->opts.ref_t[0]*sqr((ir->nstlist-1)*ir->delta_t);
+ }
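+ /* As a rough example (hypothetical input values): at ref_t = 300 K
+ * with nstlist = 10 and delta_t = 0.002 ps, kT_fac ~ 8.1e-4 kJ/mol*ps^2,
+ * giving sigma = sqrt(kT_fac/m) ~ 0.028 nm for a 1 amu hydrogen.
+ */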
+
+ mass_min = att[0].mass;
+ for (i = 1; i < natt; i++)
+ {
+ mass_min = min(mass_min, att[i].mass);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "md_ljd %e md_ljr %e\n", md_ljd, md_ljr);
+ fprintf(debug, "md_el %e dd_el %e\n", md_el, dd_el);
+ fprintf(debug, "sqrt(kT_fac) %f\n", sqrt(kT_fac));
+ fprintf(debug, "mass_min %f\n", mass_min);
+ }
+
+ /* Search using bisection */
+ ib0 = -1;
+ /* The drift will be negligible at 5 times the max sigma */
+ ib1 = (int)(5*2*sqrt(kT_fac/mass_min)/resolution) + 1;
+ while (ib1 - ib0 > 1)
+ {
+ ib = (ib0 + ib1)/2;
+ rb = ib*resolution;
+ rl = max(ir->rvdw, ir->rcoulomb) + rb;
+
+ /* Calculate the average energy drift at the last step
+ * of the nstlist steps at which the pair-list is used.
+ */
+ drift = ener_drift(att, natt, &mtop->ffparams,
+ kT_fac,
+ md_ljd, md_ljr, md_el, dd_el, rb,
+ rl, boxvol);
+
+ /* Correct for the fact that we are using a Ni x Nj particle pair list
+ * and not a 1 x 1 particle pair list. This reduces the drift.
+ */
+ /* We don't have a formula for 8 (yet), use 4 which is conservative */
+ nb_clust_frac_pairs_not_in_list_at_cutoff =
+ surface_frac(min(list_setup->cluster_size_i, 4),
+ particle_distance, rl)*
+ surface_frac(min(list_setup->cluster_size_j, 4),
+ particle_distance, rl);
+ drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
+
+ /* Convert the drift to drift per unit time per atom */
+ drift /= ir->nstlist*ir->delta_t*mtop->natoms;
+
+ if (debug)
+ {
+ fprintf(debug, "ib %3d %3d %3d rb %.3f %dx%d fac %.3f drift %f\n",
+ ib0, ib, ib1, rb,
+ list_setup->cluster_size_i, list_setup->cluster_size_j,
+ nb_clust_frac_pairs_not_in_list_at_cutoff,
+ drift);
+ }
+
+ if (fabs(drift) > drift_target)
+ {
+ ib0 = ib;
+ }
+ else
+ {
+ ib1 = ib;
+ }
+ }
+
+ sfree(att);
+
+ *rlist = max(ir->rvdw, ir->rcoulomb) + ib1*resolution;
+}
--- /dev/null
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "string2.h"
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include "gen_vsite.h"
+#include "smalloc.h"
+#include "resall.h"
+#include "add_par.h"
+#include "vec.h"
+#include "toputil.h"
+#include "physics.h"
+#include "index.h"
+#include "names.h"
+#include "futil.h"
+#include "gpp_atomtype.h"
+#include "fflibutil.h"
+#include "macros.h"
+
+#define MAXNAME 32
+#define OPENDIR '[' /* starting sign for directive */
+#define CLOSEDIR ']' /* ending sign for directive */
+
+typedef struct {
+ char atomtype[MAXNAME]; /* Type for the XH3/XH2 atom */
+ gmx_bool isplanar; /* If true, the atomtype above and the three connected
+ * ones are in a planar geometry. The two next entries
+ * are undefined in that case
+ */
+ int nhydrogens; /* number of connected hydrogens */
+ char nextheavytype[MAXNAME]; /* Type for the heavy atom bonded to XH2/XH3 */
+ char dummymass[MAXNAME]; /* The type of MNH* or MCH3* dummy mass to use */
+} t_vsiteconf;
+
+
+/* Structure to represent average bond and angles values in vsite aromatic
+ * residues. Note that these are NOT necessarily the bonds and angles from the
+ * forcefield; many forcefields (like Amber, OPLS) have some inherent strain in
+ * 5-rings (i.e. the sum of angles is !=540, but impropers keep it planar)
+ */
+typedef struct {
+ char resname[MAXNAME];
+ int nbonds;
+ int nangles;
+ struct vsitetop_bond {
+ char atom1[MAXNAME];
+ char atom2[MAXNAME];
+ float value;
+ } *bond; /* list of bonds */
+ struct vsitetop_angle {
+ char atom1[MAXNAME];
+ char atom2[MAXNAME];
+ char atom3[MAXNAME];
+ float value;
+ } *angle; /* list of angles */
+} t_vsitetop;
+
+
+enum {
+ DDB_CH3, DDB_NH3, DDB_NH2, DDB_PHE, DDB_TYR,
+ DDB_TRP, DDB_HISA, DDB_HISB, DDB_HISH, DDB_DIR_NR
+};
+
+typedef char t_dirname[STRLEN];
+
+static const t_dirname ddb_dirnames[DDB_DIR_NR] = {
+ "CH3",
+ "NH3",
+ "NH2",
+ "PHE",
+ "TYR",
+ "TRP",
+ "HISA",
+ "HISB",
+ "HISH"
+};
+
+static int ddb_name2dir(char *name)
+{
+ /* Translate a directive name to the number of the directive.
+ * HID/HIE/HIP names are translated to the ones we use in Gromacs.
+ */
+
+ int i, index;
+
+ index = -1;
+
+ for (i = 0; i < DDB_DIR_NR && index < 0; i++)
+ {
+ if (!gmx_strcasecmp(name, ddb_dirnames[i]))
+ {
+ index = i;
+ }
+ }
+
+ return index;
+}
+
+
+static void read_vsite_database(const char *ddbname,
+ t_vsiteconf **pvsiteconflist, int *nvsiteconf,
+ t_vsitetop **pvsitetoplist, int *nvsitetop)
+{
+ /* This routine is a quick hack to fix the problem with hardcoded atomtypes
+ * and aromatic vsite parameters by reading them from a ff???.vsd file.
+ *
+ * The file can contain sections [ NH3 ], [ CH3 ], [ NH2 ], and ring residue names.
+ * For the NH3 and CH3 section each line has three fields. The first is the atomtype
+ * (nb: not bonded type) of the N/C atom to be replaced, the second field is
+ * the type of the next heavy atom it is bonded to, and the third field the type
+ * of dummy mass that will be used for this group.
+ *
+ * If the NH2 group is planar (sp2 N), a different vsite construct is used, so in this
+ * case the second field should just be the word planar.
+ */
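+ /* A minimal sketch of what such a .vsd file might contain
+ * (all type names and values below are hypothetical):
+ * [ CH3 ]
+ * CT3 CT MCH3 ; atomtype, next heavy atom type, dummy mass type
+ * [ NH2 ]
+ * NH2 planar ; sp2 nitrogen: use the planar construct
+ * [ PHE ]
+ * CG CD1 0.1400 ; bond: atom1 atom2 length
+ * CG CD1 CE1 120.0 ; angle: atom1 atom2 atom3 value
+ */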
+
+ FILE *ddb;
+ char dirstr[STRLEN];
+ char pline[STRLEN];
+ int i, j, n, k, nvsite, ntop, curdir, prevdir;
+ t_vsiteconf *vsiteconflist;
+ t_vsitetop *vsitetoplist;
+ char *ch;
+ char s1[MAXNAME], s2[MAXNAME], s3[MAXNAME], s4[MAXNAME];
+
+ ddb = libopen(ddbname);
+
+ nvsite = *nvsiteconf;
+ vsiteconflist = *pvsiteconflist;
+ ntop = *nvsitetop;
+ vsitetoplist = *pvsitetoplist;
+
+ curdir = -1;
+
+ snew(vsiteconflist, 1);
+ snew(vsitetoplist, 1);
+
+ while (fgets2(pline, STRLEN-2, ddb) != NULL)
+ {
+ strip_comment(pline);
+ trim(pline);
+ if (strlen(pline) > 0)
+ {
+ if (pline[0] == OPENDIR)
+ {
+ strncpy(dirstr, pline+1, STRLEN-2);
+ if ((ch = strchr (dirstr, CLOSEDIR)) != NULL)
+ {
+ (*ch) = 0;
+ }
+ trim (dirstr);
+
+ if (!gmx_strcasecmp(dirstr, "HID") ||
+ !gmx_strcasecmp(dirstr, "HISD"))
+ {
+ sprintf(dirstr, "HISA");
+ }
+ else if (!gmx_strcasecmp(dirstr, "HIE") ||
+ !gmx_strcasecmp(dirstr, "HISE"))
+ {
+ sprintf(dirstr, "HISB");
+ }
+ else if (!gmx_strcasecmp(dirstr, "HIP"))
+ {
+ sprintf(dirstr, "HISH");
+ }
+
+ curdir = ddb_name2dir(dirstr);
+ if (curdir < 0)
+ {
+ gmx_fatal(FARGS, "Invalid directive %s in vsite database %s",
+ dirstr, ddbname);
+ }
+ }
+ else
+ {
+ switch (curdir)
+ {
+ case -1:
+ gmx_fatal(FARGS, "First entry in vsite database must be a directive.\n");
+ break;
+ case DDB_CH3:
+ case DDB_NH3:
+ case DDB_NH2:
+ n = sscanf(pline, "%s%s%s", s1, s2, s3);
+ if (n < 3 && !gmx_strcasecmp(s2, "planar"))
+ {
+ srenew(vsiteconflist, nvsite+1);
+ strncpy(vsiteconflist[nvsite].atomtype, s1, MAXNAME-1);
+ vsiteconflist[nvsite].isplanar = TRUE;
+ vsiteconflist[nvsite].nextheavytype[0] = 0;
+ vsiteconflist[nvsite].dummymass[0] = 0;
+ vsiteconflist[nvsite].nhydrogens = 2;
+ nvsite++;
+ }
+ else if (n == 3)
+ {
+ srenew(vsiteconflist, (nvsite+1));
+ strncpy(vsiteconflist[nvsite].atomtype, s1, MAXNAME-1);
+ vsiteconflist[nvsite].isplanar = FALSE;
+ strncpy(vsiteconflist[nvsite].nextheavytype, s2, MAXNAME-1);
+ strncpy(vsiteconflist[nvsite].dummymass, s3, MAXNAME-1);
+ if (curdir == DDB_NH2)
+ {
+ vsiteconflist[nvsite].nhydrogens = 2;
+ }
+ else
+ {
+ vsiteconflist[nvsite].nhydrogens = 3;
+ }
+ nvsite++;
+ }
+ else
+ {
+ gmx_fatal(FARGS, "Not enough directives in vsite database line: %s\n", pline);
+ }
+ break;
+ case DDB_PHE:
+ case DDB_TYR:
+ case DDB_TRP:
+ case DDB_HISA:
+ case DDB_HISB:
+ case DDB_HISH:
+ i = 0;
+ while ((i < ntop) && gmx_strcasecmp(dirstr, vsitetoplist[i].resname))
+ {
+ i++;
+ }
+ /* Allocate a new topology entry if this is a new residue */
+ if (i == ntop)
+ {
+ srenew(vsitetoplist, ntop+1);
+ ntop++; /* i still points to current vsite topology entry */
+ strncpy(vsitetoplist[i].resname, dirstr, MAXNAME-1);
+ vsitetoplist[i].nbonds = vsitetoplist[i].nangles = 0;
+ snew(vsitetoplist[i].bond, 1);
+ snew(vsitetoplist[i].angle, 1);
+ }
+ n = sscanf(pline, "%s%s%s%s", s1, s2, s3, s4);
+ if (n == 3)
+ {
+ /* bond */
+ k = vsitetoplist[i].nbonds++;
+ srenew(vsitetoplist[i].bond, k+1);
+ strncpy(vsitetoplist[i].bond[k].atom1, s1, MAXNAME-1);
+ strncpy(vsitetoplist[i].bond[k].atom2, s2, MAXNAME-1);
+ vsitetoplist[i].bond[k].value = strtod(s3, NULL);
+ }
+ else if (n == 4)
+ {
+ /* angle */
+ k = vsitetoplist[i].nangles++;
+ srenew(vsitetoplist[i].angle, k+1);
+ strncpy(vsitetoplist[i].angle[k].atom1, s1, MAXNAME-1);
+ strncpy(vsitetoplist[i].angle[k].atom2, s2, MAXNAME-1);
+ strncpy(vsitetoplist[i].angle[k].atom3, s3, MAXNAME-1);
+ vsitetoplist[i].angle[k].value = strtod(s4, NULL);
+ }
+ else
+ {
+ gmx_fatal(FARGS, "Need 3 or 4 values to specify bond/angle values in %s: %s\n", ddbname, pline);
+ }
+ break;
+ default:
+ gmx_fatal(FARGS, "Didnt find a case for directive %s in read_vsite_database\n", dirstr);
+ }
+ }
+ }
+ }
+
+ *pvsiteconflist = vsiteconflist;
+ *pvsitetoplist = vsitetoplist;
+ *nvsiteconf = nvsite;
+ *nvsitetop = ntop;
+
+ ffclose(ddb);
+}
+
+static int nitrogen_is_planar(t_vsiteconf vsiteconflist[], int nvsiteconf, char atomtype[])
+{
+ /* Return 1 if atomtype exists in database list and is planar, 0 if not,
+ * and -1 if not found.
+ */
+ int i, res;
+ gmx_bool found = FALSE;
+ for (i = 0; i < nvsiteconf && !found; i++)
+ {
+ found = (!gmx_strcasecmp(vsiteconflist[i].atomtype, atomtype) && (vsiteconflist[i].nhydrogens == 2));
+ }
+ if (found)
+ {
+ res = (vsiteconflist[i-1].isplanar == TRUE);
+ }
+ else
+ {
+ res = -1;
+ }
+
+ return res;
+}
+
+static char *get_dummymass_name(t_vsiteconf vsiteconflist[], int nvsiteconf, char atom[], char nextheavy[])
+{
+ /* Return the dummy mass name if found, or NULL if not set in ddb database */
+ int i;
+ gmx_bool found = FALSE;
+ for (i = 0; i < nvsiteconf && !found; i++)
+ {
+ found = (!gmx_strcasecmp(vsiteconflist[i].atomtype, atom) &&
+ !gmx_strcasecmp(vsiteconflist[i].nextheavytype, nextheavy));
+ }
+ if (found)
+ {
+ return vsiteconflist[i-1].dummymass;
+ }
+ else
+ {
+ return NULL;
+ }
+}
+
+
+
+static real get_ddb_bond(t_vsitetop *vsitetop, int nvsitetop,
+ const char res[],
+ const char atom1[], const char atom2[])
+{
+ int i, j;
+
+ i = 0;
+ while (i < nvsitetop && gmx_strcasecmp(res, vsitetop[i].resname))
+ {
+ i++;
+ }
+ if (i == nvsitetop)
+ {
+ gmx_fatal(FARGS, "No vsite information for residue %s found in vsite database.\n", res);
+ }
+ j = 0;
+ while (j < vsitetop[i].nbonds &&
+ ( strcmp(atom1, vsitetop[i].bond[j].atom1) || strcmp(atom2, vsitetop[i].bond[j].atom2)) &&
+ ( strcmp(atom2, vsitetop[i].bond[j].atom1) || strcmp(atom1, vsitetop[i].bond[j].atom2)))
+ {
+ j++;
+ }
+ if (j == vsitetop[i].nbonds)
+ {
+ gmx_fatal(FARGS, "Couldnt find bond %s-%s for residue %s in vsite database.\n", atom1, atom2, res);
+ }
+
+ return vsitetop[i].bond[j].value;
+}
+
+
+static real get_ddb_angle(t_vsitetop *vsitetop, int nvsitetop,
+ const char res[], const char atom1[],
+ const char atom2[], const char atom3[])
+{
+ int i, j;
+
+ i = 0;
+ while (i < nvsitetop && gmx_strcasecmp(res, vsitetop[i].resname))
+ {
+ i++;
+ }
+ if (i == nvsitetop)
+ {
+ gmx_fatal(FARGS, "No vsite information for residue %s found in vsite database.\n", res);
+ }
+ j = 0;
+ while (j < vsitetop[i].nangles &&
+ ( strcmp(atom1, vsitetop[i].angle[j].atom1) ||
+ strcmp(atom2, vsitetop[i].angle[j].atom2) ||
+ strcmp(atom3, vsitetop[i].angle[j].atom3)) &&
+ ( strcmp(atom3, vsitetop[i].angle[j].atom1) ||
+ strcmp(atom2, vsitetop[i].angle[j].atom2) ||
+ strcmp(atom1, vsitetop[i].angle[j].atom3)))
+ {
+ j++;
+ }
+ if (j == vsitetop[i].nangles)
+ {
+ gmx_fatal(FARGS, "Couldnt find angle %s-%s-%s for residue %s in vsite database.\n", atom1, atom2, atom3, res);
+ }
+
+ return vsitetop[i].angle[j].value;
+}
+
+
+static void count_bonds(int atom, t_params *psb, char ***atomname,
+ int *nrbonds, int *nrHatoms, int Hatoms[], int *Heavy,
+ int *nrheavies, int heavies[])
+{
+ int i, heavy, other, nrb, nrH, nrhv;
+
+ /* find heavy atom bound to this hydrogen */
+ heavy = NOTSET;
+ for (i = 0; (i < psb->nr) && (heavy == NOTSET); i++)
+ {
+ if (psb->param[i].AI == atom)
+ {
+ heavy = psb->param[i].AJ;
+ }
+ else if (psb->param[i].AJ == atom)
+ {
+ heavy = psb->param[i].AI;
+ }
+ }
+ if (heavy == NOTSET)
+ {
+ gmx_fatal(FARGS, "unbound hydrogen atom %d", atom+1);
+ }
+ /* find all atoms bound to heavy atom */
+ other = NOTSET;
+ nrb = 0;
+ nrH = 0;
+ nrhv = 0;
+ for (i = 0; i < psb->nr; i++)
+ {
+ if (psb->param[i].AI == heavy)
+ {
+ other = psb->param[i].AJ;
+ }
+ else if (psb->param[i].AJ == heavy)
+ {
+ other = psb->param[i].AI;
+ }
+ if (other != NOTSET)
+ {
+ nrb++;
+ if (is_hydrogen(*(atomname[other])))
+ {
+ Hatoms[nrH] = other;
+ nrH++;
+ }
+ else
+ {
+ heavies[nrhv] = other;
+ nrhv++;
+ }
+ other = NOTSET;
+ }
+ }
+ *Heavy = heavy;
+ *nrbonds = nrb;
+ *nrHatoms = nrH;
+ *nrheavies = nrhv;
+}
+
+static void print_bonds(FILE *fp, int o2n[],
+ int nrHatoms, int Hatoms[], int Heavy,
+ int nrheavies, int heavies[])
+{
+ int i;
+
+ fprintf(fp, "Found: %d Hatoms: ", nrHatoms);
+ for (i = 0; i < nrHatoms; i++)
+ {
+ fprintf(fp, " %d", o2n[Hatoms[i]]+1);
+ }
+ fprintf(fp, "; %d Heavy atoms: %d", nrheavies+1, o2n[Heavy]+1);
+ for (i = 0; i < nrheavies; i++)
+ {
+ fprintf(fp, " %d", o2n[heavies[i]]+1);
+ }
+ fprintf(fp, "\n");
+}
+
+static int get_atype(int atom, t_atoms *at, int nrtp, t_restp rtp[],
+ gmx_residuetype_t rt)
+{
+ int type;
+ gmx_bool bNterm;
+ int j;
+ t_restp *rtpp;
+
+ if (at->atom[atom].m)
+ {
+ type = at->atom[atom].type;
+ }
+ else
+ {
+ /* get type from rtp */
+ rtpp = get_restp(*(at->resinfo[at->atom[atom].resind].name), nrtp, rtp);
+ bNterm = gmx_residuetype_is_protein(rt, *(at->resinfo[at->atom[atom].resind].name)) &&
+ (at->atom[atom].resind == 0);
+ j = search_jtype(rtpp, *(at->atomname[atom]), bNterm);
+ type = rtpp->atom[j].type;
+ }
+ return type;
+}
+
+static int vsite_nm2type(const char *name, gpp_atomtype_t atype)
+{
+ int tp;
+
+ tp = get_atomtype_type(name, atype);
+ if (tp == NOTSET)
+ {
+ gmx_fatal(FARGS, "Dummy mass type (%s) not found in atom type database",
+ name);
+ }
+
+ return tp;
+}
+
+static real get_amass(int atom, t_atoms *at, int nrtp, t_restp rtp[],
+ gmx_residuetype_t rt)
+{
+ real mass;
+ gmx_bool bNterm;
+ int j;
+ t_restp *rtpp;
+
+ if (at->atom[atom].m)
+ {
+ mass = at->atom[atom].m;
+ }
+ else
+ {
+ /* get mass from rtp */
+ rtpp = get_restp(*(at->resinfo[at->atom[atom].resind].name), nrtp, rtp);
+ bNterm = gmx_residuetype_is_protein(rt, *(at->resinfo[at->atom[atom].resind].name)) &&
+ (at->atom[atom].resind == 0);
+ j = search_jtype(rtpp, *(at->atomname[atom]), bNterm);
+ mass = rtpp->atom[j].m;
+ }
+ return mass;
+}
+
+static void my_add_param(t_params *plist, int ai, int aj, real b)
+{
+ static real c[MAXFORCEPARAM] =
+ { NOTSET, NOTSET, NOTSET, NOTSET, NOTSET, NOTSET };
+
+ c[0] = b;
+ add_param(plist, ai, aj, c, NULL);
+}
+
+static void add_vsites(t_params plist[], int vsite_type[],
+ int Heavy, int nrHatoms, int Hatoms[],
+ int nrheavies, int heavies[])
+{
+ int i, j, ftype, other, moreheavy, bb;
+ gmx_bool bSwapParity;
+
+ for (i = 0; i < nrHatoms; i++)
+ {
+ ftype = vsite_type[Hatoms[i]];
+ /* Errors in setting the vsite_type should really be caught earlier,
+ * because here it's not possible to print any useful error message.
+ * But it's still better to print a message than to segfault.
+ */
+ if (ftype == NOTSET)
+ {
+ gmx_incons("Undetected error in setting up virtual sites");
+ }
+ bSwapParity = (ftype < 0);
+ vsite_type[Hatoms[i]] = ftype = abs(ftype);
+ if (ftype == F_BONDS)
+ {
+ if ( (nrheavies != 1) && (nrHatoms != 1) )
+ {
+ gmx_fatal(FARGS, "cannot make constraint in add_vsites for %d heavy "
+ "atoms and %d hydrogen atoms", nrheavies, nrHatoms);
+ }
+ my_add_param(&(plist[F_CONSTRNC]), Hatoms[i], heavies[0], NOTSET);
+ }
+ else
+ {
+ switch (ftype)
+ {
+ case F_VSITE3:
+ case F_VSITE3FD:
+ case F_VSITE3OUT:
+ if (nrheavies < 2)
+ {
+ gmx_fatal(FARGS, "Not enough heavy atoms (%d) for %s (min 3)",
+ nrheavies+1,
+ interaction_function[vsite_type[Hatoms[i]]].name);
+ }
+ add_vsite3_atoms(&plist[ftype], Hatoms[i], Heavy, heavies[0], heavies[1],
+ bSwapParity);
+ break;
+ case F_VSITE3FAD:
+ {
+ if (nrheavies > 1)
+ {
+ moreheavy = heavies[1];
+ }
+ else
+ {
+ /* find more heavy atoms */
+ other = moreheavy = NOTSET;
+ for (j = 0; (j < plist[F_BONDS].nr) && (moreheavy == NOTSET); j++)
+ {
+ if (plist[F_BONDS].param[j].AI == heavies[0])
+ {
+ other = plist[F_BONDS].param[j].AJ;
+ }
+ else if (plist[F_BONDS].param[j].AJ == heavies[0])
+ {
+ other = plist[F_BONDS].param[j].AI;
+ }
+ if ( (other != NOTSET) && (other != Heavy) )
+ {
+ moreheavy = other;
+ }
+ }
+ if (moreheavy == NOTSET)
+ {
+ gmx_fatal(FARGS, "Unbound molecule part %d-%d", Heavy+1, Hatoms[0]+1);
+ }
+ }
+ add_vsite3_atoms(&plist[ftype], Hatoms[i], Heavy, heavies[0], moreheavy,
+ bSwapParity);
+ break;
+ }
+ case F_VSITE4FD:
+ case F_VSITE4FDN:
+ if (nrheavies < 3)
+ {
+ gmx_fatal(FARGS, "Not enough heavy atoms (%d) for %s (min 4)",
+ nrheavies+1,
+ interaction_function[vsite_type[Hatoms[i]]].name);
+ }
+ add_vsite4_atoms(&plist[ftype],
+ Hatoms[0], Heavy, heavies[0], heavies[1], heavies[2]);
+ break;
+
+ default:
+ gmx_fatal(FARGS, "can't use add_vsites for interaction function %s",
+ interaction_function[vsite_type[Hatoms[i]]].name);
+ } /* switch ftype */
+ } /* else */
+ } /* for i */
+}
+
+#define ANGLE_6RING (DEG2RAD*120)
+
+/* cosine rule: a^2 = b^2 + c^2 - 2 b c cos(alpha) */
+/* get a^2 when b, c and alpha are given: */
+#define cosrule(b, c, alpha) ( sqr(b) + sqr(c) - 2*b*c*cos(alpha) )
+/* get cos(alpha) when a, b and c are given: */
+#define acosrule(a, b, c) ( (sqr(b)+sqr(c)-sqr(a))/(2*b*c) )
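+/* Example: two ring bonds of length b meeting at the 120 degree ring angle
+ * span a third side of length sqrt(cosrule(b, b, ANGLE_6RING)) = b*sqrt(3);
+ * this is how dCGCE is obtained in gen_vsites_6ring() below.
+ */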
+
+static int gen_vsites_6ring(t_atoms *at, int *vsite_type[], t_params plist[],
+ int nrfound, int *ats, real bond_cc, real bond_ch,
+ real xcom, gmx_bool bDoZ)
+{
+ /* these MUST correspond to the atnms array in do_vsites()! */
+ enum {
+ atCG, atCD1, atHD1, atCD2, atHD2, atCE1, atHE1, atCE2, atHE2,
+ atCZ, atHZ, atNR
+ };
+
+ int i, nvsite;
+ real a, b, dCGCE, tmp1, tmp2, mtot, mG, mrest;
+ real xCG, yCG, xCE1, yCE1, xCE2, yCE2;
+ /* CG, CE1 and CE2 stay and each get a part of the total mass,
+ * so the c-o-m stays the same.
+ */
+
+ if (bDoZ)
+ {
+ if (atNR != nrfound)
+ {
+ gmx_incons("Generating vsites on 6-rings");
+ }
+ }
+
+ /* constraints between CG, CE1 and CE2: */
+ dCGCE = sqrt( cosrule(bond_cc, bond_cc, ANGLE_6RING) );
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atCE1], dCGCE);
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atCE2], dCGCE);
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCE1], ats[atCE2], dCGCE);
+
+ /* rest will be vsite3 */
+ mtot = 0;
+ nvsite = 0;
+ for (i = 0; i < (bDoZ ? atNR : atHZ); i++)
+ {
+ mtot += at->atom[ats[i]].m;
+ if (i != atCG && i != atCE1 && i != atCE2 && (bDoZ || (i != atHZ && i != atCZ) ) )
+ {
+ at->atom[ats[i]].m = at->atom[ats[i]].mB = 0;
+ (*vsite_type)[ats[i]] = F_VSITE3;
+ nvsite++;
+ }
+ }
+ /* Distribute mass so center-of-mass stays the same.
+ * The center-of-mass in the call is defined with x=0 at
+ * the CE1-CE2 bond and y=0 at the line from CG to the middle of CE1-CE2 bond.
+ */
+ xCG = -bond_cc+bond_cc*cos(ANGLE_6RING);
+ yCG = 0;
+ xCE1 = 0;
+ yCE1 = bond_cc*sin(0.5*ANGLE_6RING);
+ xCE2 = 0;
+ yCE2 = -bond_cc*sin(0.5*ANGLE_6RING);
+
+ mG = at->atom[ats[atCG]].m = at->atom[ats[atCG]].mB = xcom*mtot/xCG;
+ mrest = mtot-mG;
+ at->atom[ats[atCE1]].m = at->atom[ats[atCE1]].mB =
+ at->atom[ats[atCE2]].m = at->atom[ats[atCE2]].mB = mrest / 2;
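+ /* In this frame xCE1 = xCE2 = 0, so requiring mG*xCG = mtot*xcom fixes
+ * mG; CE1 and CE2 share the remainder equally, keeping ycom = 0 by
+ * symmetry. */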
+
+ /* vsite3 construction: r_d = r_i + a r_ij + b r_ik */
+ tmp1 = dCGCE*sin(ANGLE_6RING*0.5);
+ tmp2 = bond_cc*cos(0.5*ANGLE_6RING) + tmp1;
+ tmp1 *= 2;
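+ /* The construction displaces the site from atom i along the bisector
+ * r_ij + r_ik, whose length here is tmp1 = 3*bond_cc; the a = b values
+ * below are thus signed distances along that bisector divided by tmp1. */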
+ a = b = -bond_ch / tmp1;
+ /* HE1 and HE2: */
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHE1], ats[atCE1], ats[atCE2], ats[atCG], a, b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHE2], ats[atCE2], ats[atCE1], ats[atCG], a, b);
+ /* CD1, CD2 and CZ: */
+ a = b = tmp2 / tmp1;
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atCD1], ats[atCE2], ats[atCE1], ats[atCG], a, b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atCD2], ats[atCE1], ats[atCE2], ats[atCG], a, b);
+ if (bDoZ)
+ {
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atCZ], ats[atCG], ats[atCE1], ats[atCE2], a, b);
+ }
+ /* HD1, HD2 and HZ: */
+ a = b = ( bond_ch + tmp2 ) / tmp1;
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHD1], ats[atCE2], ats[atCE1], ats[atCG], a, b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHD2], ats[atCE1], ats[atCE2], ats[atCG], a, b);
+ if (bDoZ)
+ {
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHZ], ats[atCG], ats[atCE1], ats[atCE2], a, b);
+ }
+
+ return nvsite;
+}
+
+static int gen_vsites_phe(t_atoms *at, int *vsite_type[], t_params plist[],
+ int nrfound, int *ats, t_vsitetop *vsitetop, int nvsitetop)
+{
+ real bond_cc, bond_ch;
+ real xcom, mtot;
+ int i;
+ /* these MUST correspond to the atnms array in do_vsites()! */
+ enum {
+ atCG, atCD1, atHD1, atCD2, atHD2, atCE1, atHE1, atCE2, atHE2,
+ atCZ, atHZ, atNR
+ };
+ real x[atNR], y[atNR];
+ /* Aromatic rings have 6-fold symmetry, so we only need one bond length.
+ * (angle is always 120 degrees).
+ */
+ bond_cc = get_ddb_bond(vsitetop, nvsitetop, "PHE", "CD1", "CE1");
+ bond_ch = get_ddb_bond(vsitetop, nvsitetop, "PHE", "CD1", "HD1");
+
+ x[atCG] = -bond_cc+bond_cc*cos(ANGLE_6RING);
+ y[atCG] = 0;
+ x[atCD1] = -bond_cc;
+ y[atCD1] = bond_cc*sin(0.5*ANGLE_6RING);
+ x[atHD1] = x[atCD1]+bond_ch*cos(ANGLE_6RING);
+ y[atHD1] = y[atCD1]+bond_ch*sin(ANGLE_6RING);
+ x[atCE1] = 0;
+ y[atCE1] = y[atCD1];
+ x[atHE1] = x[atCE1]-bond_ch*cos(ANGLE_6RING);
+ y[atHE1] = y[atCE1]+bond_ch*sin(ANGLE_6RING);
+ x[atCD2] = x[atCD1];
+ y[atCD2] = -y[atCD1];
+ x[atHD2] = x[atHD1];
+ y[atHD2] = -y[atHD1];
+ x[atCE2] = x[atCE1];
+ y[atCE2] = -y[atCE1];
+ x[atHE2] = x[atHE1];
+ y[atHE2] = -y[atHE1];
+ x[atCZ] = bond_cc*cos(0.5*ANGLE_6RING);
+ y[atCZ] = 0;
+ x[atHZ] = x[atCZ]+bond_ch;
+ y[atHZ] = 0;
+
+ xcom = mtot = 0;
+ for (i = 0; i < atNR; i++)
+ {
+ xcom += x[i]*at->atom[ats[i]].m;
+ mtot += at->atom[ats[i]].m;
+ }
+ xcom /= mtot;
+
+ return gen_vsites_6ring(at, vsite_type, plist, nrfound, ats, bond_cc, bond_ch, xcom, TRUE);
+}
+
+static void calc_vsite3_param(real xd, real yd, real xi, real yi, real xj, real yj,
+ real xk, real yk, real *a, real *b)
+{
+ /* determine parameters by solving the equation system, since we know the
+ * virtual site coordinates here.
+ */
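+ /* Writing r_d = r_i + a r_ij + b r_ik out per component gives
+ * xd - xi = a*dx_ij + b*dx_ik
+ * yd - yi = a*dy_ij + b*dy_ik;
+ * a follows from Cramer's rule, b by back-substitution into the
+ * y-equation.
+ */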
+ real dx_ij, dx_ik, dy_ij, dy_ik;
+ real b_ij, b_ik;
+
+ dx_ij = xj-xi;
+ dy_ij = yj-yi;
+ dx_ik = xk-xi;
+ dy_ik = yk-yi;
+ b_ij = sqrt(dx_ij*dx_ij+dy_ij*dy_ij);
+ b_ik = sqrt(dx_ik*dx_ik+dy_ik*dy_ik);
+
+ *a = ( (xd-xi)*dy_ik - dx_ik*(yd-yi) ) / (dx_ij*dy_ik - dx_ik*dy_ij);
+ *b = ( yd - yi - (*a)*dy_ij ) / dy_ik;
+}
+
+
+static int gen_vsites_trp(gpp_atomtype_t atype, rvec *newx[],
+ t_atom *newatom[], char ***newatomname[],
+ int *o2n[], int *newvsite_type[], int *newcgnr[],
+ t_symtab *symtab, int *nadd, rvec x[], int *cgnr[],
+ t_atoms *at, int *vsite_type[], t_params plist[],
+ int nrfound, int *ats, int add_shift,
+ t_vsitetop *vsitetop, int nvsitetop)
+{
+#define NMASS 2
+ /* these MUST correspond to the atnms array in do_vsites()! */
+ enum {
+ atCB, atCG, atCD1, atHD1, atCD2, atNE1, atHE1, atCE2, atCE3, atHE3,
+ atCZ2, atHZ2, atCZ3, atHZ3, atCH2, atHH2, atNR
+ };
+ /* weights for determining the COM's of both rings (M1 and M2): */
+ real mw[NMASS][atNR] = {
+ { 0, 1, 1, 1, 0.5, 1, 1, 0.5, 0, 0,
+ 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0.5, 0, 0, 0.5, 1, 1,
+ 1, 1, 1, 1, 1, 1 }
+ };
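+ /* CD2 and CE2 lie on the bond shared by the five- and six-membered
+ * rings, so each contributes half of its mass to both ring COMs */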
+
+ real xi[atNR], yi[atNR];
+ real xcom[NMASS], ycom[NMASS], I, alpha;
+ real lineA, lineB, dist;
+ real b_CD2_CE2, b_NE1_CE2, b_CG_CD2, b_CH2_HH2, b_CE2_CZ2;
+ real b_NE1_HE1, b_CD2_CE3, b_CE3_CZ3, b_CB_CG;
+ real b_CZ2_CH2, b_CZ2_HZ2, b_CD1_HD1, b_CE3_HE3;
+ real b_CG_CD1, b_CZ3_HZ3;
+ real a_NE1_CE2_CD2, a_CE2_CD2_CG, a_CB_CG_CD2, a_CE2_CD2_CE3;
+ real a_CB_CG_CD1, a_CD2_CG_CD1, a_CE2_CZ2_HZ2, a_CZ2_CH2_HH2;
+ real a_CD2_CE2_CZ2, a_CD2_CE3_CZ3, a_CE3_CZ3_HZ3, a_CG_CD1_HD1;
+ real a_CE2_CZ2_CH2, a_HE1_NE1_CE2, a_CD2_CE3_HE3;
+ real xM[NMASS];
+ int atM[NMASS], tpM, i, i0, j, nvsite;
+ real mwtot, mtot, mM[NMASS], dCBM1, dCBM2, dM1M2;
+ real a, b, c[MAXFORCEPARAM];
+ rvec r_ij, r_ik, t1, t2;
+ char name[10];
+
+ if (atNR != nrfound)
+ {
+ gmx_incons("atom types in gen_vsites_trp");
+ }
+ /* Get geometry from database */
+ b_CD2_CE2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CD2", "CE2");
+ b_NE1_CE2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "NE1", "CE2");
+ b_CG_CD1 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CG", "CD1");
+ b_CG_CD2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CG", "CD2");
+ b_CB_CG = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CB", "CG");
+ b_CE2_CZ2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CE2", "CZ2");
+ b_CD2_CE3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CD2", "CE3");
+ b_CE3_CZ3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CE3", "CZ3");
+ b_CZ2_CH2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CZ2", "CH2");
+
+ b_CD1_HD1 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CD1", "HD1");
+ b_CZ2_HZ2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CZ2", "HZ2");
+ b_NE1_HE1 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "NE1", "HE1");
+ b_CH2_HH2 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CH2", "HH2");
+ b_CE3_HE3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CE3", "HE3");
+ b_CZ3_HZ3 = get_ddb_bond(vsitetop, nvsitetop, "TRP", "CZ3", "HZ3");
+
+ a_NE1_CE2_CD2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "NE1", "CE2", "CD2");
+ a_CE2_CD2_CG = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CD2", "CG");
+ a_CB_CG_CD2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CB", "CG", "CD2");
+ a_CD2_CG_CD1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CG", "CD1");
+ a_CB_CG_CD1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CB", "CG", "CD1");
+
+ a_CE2_CD2_CE3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CD2", "CE3");
+ a_CD2_CE2_CZ2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CE2", "CZ2");
+ a_CD2_CE3_CZ3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CE3", "CZ3");
+ a_CE3_CZ3_HZ3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE3", "CZ3", "HZ3");
+ a_CZ2_CH2_HH2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CZ2", "CH2", "HH2");
+ a_CE2_CZ2_HZ2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CZ2", "HZ2");
+ a_CE2_CZ2_CH2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CE2", "CZ2", "CH2");
+ a_CG_CD1_HD1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CG", "CD1", "HD1");
+ a_HE1_NE1_CE2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "HE1", "NE1", "CE2");
+ a_CD2_CE3_HE3 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TRP", "CD2", "CE3", "HE3");
+
+ /* Calculate local coordinates.
+ * y-axis (x=0) is the bond CD2-CE2.
+ * x-axis (y=0) is perpendicular to the bond CD2-CE2 and
+ * intersects the middle of the bond.
+ */
+ xi[atCD2] = 0;
+ yi[atCD2] = -0.5*b_CD2_CE2;
+
+ xi[atCE2] = 0;
+ yi[atCE2] = 0.5*b_CD2_CE2;
+
+ xi[atNE1] = -b_NE1_CE2*sin(a_NE1_CE2_CD2);
+ yi[atNE1] = yi[atCE2]-b_NE1_CE2*cos(a_NE1_CE2_CD2);
+
+ xi[atCG] = -b_CG_CD2*sin(a_CE2_CD2_CG);
+ yi[atCG] = yi[atCD2]+b_CG_CD2*cos(a_CE2_CD2_CG);
+
+ alpha = a_CE2_CD2_CG + M_PI - a_CB_CG_CD2;
+ xi[atCB] = xi[atCG]-b_CB_CG*sin(alpha);
+ yi[atCB] = yi[atCG]+b_CB_CG*cos(alpha);
+
+ alpha = a_CE2_CD2_CG + a_CD2_CG_CD1 - M_PI;
+ xi[atCD1] = xi[atCG]-b_CG_CD1*sin(alpha);
+ yi[atCD1] = yi[atCG]+b_CG_CD1*cos(alpha);
+
+ xi[atCE3] = b_CD2_CE3*sin(a_CE2_CD2_CE3);
+ yi[atCE3] = yi[atCD2]+b_CD2_CE3*cos(a_CE2_CD2_CE3);
+
+ xi[atCZ2] = b_CE2_CZ2*sin(a_CD2_CE2_CZ2);
+ yi[atCZ2] = yi[atCE2]-b_CE2_CZ2*cos(a_CD2_CE2_CZ2);
+
+ alpha = a_CE2_CD2_CE3 + a_CD2_CE3_CZ3 - M_PI;
+ xi[atCZ3] = xi[atCE3]+b_CE3_CZ3*sin(alpha);
+ yi[atCZ3] = yi[atCE3]+b_CE3_CZ3*cos(alpha);
+
+ alpha = a_CD2_CE2_CZ2 + a_CE2_CZ2_CH2 - M_PI;
+ xi[atCH2] = xi[atCZ2]+b_CZ2_CH2*sin(alpha);
+ yi[atCH2] = yi[atCZ2]-b_CZ2_CH2*cos(alpha);
+
+ /* hydrogens */
+ alpha = a_CE2_CD2_CG + a_CD2_CG_CD1 - a_CG_CD1_HD1;
+ xi[atHD1] = xi[atCD1]-b_CD1_HD1*sin(alpha);
+ yi[atHD1] = yi[atCD1]+b_CD1_HD1*cos(alpha);
+
+ alpha = a_NE1_CE2_CD2 + M_PI - a_HE1_NE1_CE2;
+ xi[atHE1] = xi[atNE1]-b_NE1_HE1*sin(alpha);
+ yi[atHE1] = yi[atNE1]-b_NE1_HE1*cos(alpha);
+
+ alpha = a_CE2_CD2_CE3 + M_PI - a_CD2_CE3_HE3;
+ xi[atHE3] = xi[atCE3]+b_CE3_HE3*sin(alpha);
+ yi[atHE3] = yi[atCE3]+b_CE3_HE3*cos(alpha);
+
+ alpha = a_CD2_CE2_CZ2 + M_PI - a_CE2_CZ2_HZ2;
+ xi[atHZ2] = xi[atCZ2]+b_CZ2_HZ2*sin(alpha);
+ yi[atHZ2] = yi[atCZ2]-b_CZ2_HZ2*cos(alpha);
+
+ alpha = a_CD2_CE2_CZ2 + a_CE2_CZ2_CH2 - a_CZ2_CH2_HH2;
+ xi[atHZ3] = xi[atCZ3]+b_CZ3_HZ3*sin(alpha);
+ yi[atHZ3] = yi[atCZ3]+b_CZ3_HZ3*cos(alpha);
+
+ alpha = a_CE2_CD2_CE3 + a_CD2_CE3_CZ3 - a_CE3_CZ3_HZ3;
+ xi[atHH2] = xi[atCH2]+b_CH2_HH2*sin(alpha);
+ yi[atHH2] = yi[atCH2]-b_CH2_HH2*cos(alpha);
+
+ /* Determine coeff. for the line CB-CG */
+ lineA = (yi[atCB]-yi[atCG])/(xi[atCB]-xi[atCG]);
+ lineB = yi[atCG]-lineA*xi[atCG];
+
+ /* Calculate masses for each ring and put it on the dummy masses */
+ for (j = 0; j < NMASS; j++)
+ {
+ mM[j] = xcom[j] = ycom[j] = 0;
+ }
+ for (i = 0; i < atNR; i++)
+ {
+ if (i != atCB)
+ {
+ for (j = 0; j < NMASS; j++)
+ {
+ mM[j] += mw[j][i] * at->atom[ats[i]].m;
+ xcom[j] += xi[i] * mw[j][i] * at->atom[ats[i]].m;
+ ycom[j] += yi[i] * mw[j][i] * at->atom[ats[i]].m;
+ }
+ }
+ }
+ for (j = 0; j < NMASS; j++)
+ {
+ xcom[j] /= mM[j];
+ ycom[j] /= mM[j];
+ }
+
+ /* get dummy mass type */
+ tpM = vsite_nm2type("MW", atype);
+ /* make space for 2 masses: shift all atoms starting with CB */
+ i0 = ats[atCB];
+ for (j = 0; j < NMASS; j++)
+ {
+ atM[j] = i0+*nadd+j;
+ }
+ if (debug)
+ {
+ fprintf(stderr, "Inserting %d dummy masses at %d\n", NMASS, (*o2n)[i0]+1);
+ }
+ *nadd += NMASS;
+ for (j = i0; j < at->nr; j++)
+ {
+ (*o2n)[j] = j+*nadd;
+ }
+ srenew(*newx, at->nr+*nadd);
+ srenew(*newatom, at->nr+*nadd);
+ srenew(*newatomname, at->nr+*nadd);
+ srenew(*newvsite_type, at->nr+*nadd);
+ srenew(*newcgnr, at->nr+*nadd);
+ for (j = 0; j < NMASS; j++)
+ {
+ (*newatomname)[at->nr+*nadd-1-j] = NULL;
+ }
+
+ /* Dummy masses will be placed at the center-of-mass in each ring. */
+
+ /* calc initial position for dummy masses in real (non-local) coordinates.
+ * Cheat by using the routine to calculate virtual site parameters. It is
+ * much easier when we have the coordinates expressed in terms of
+ * CB, CG, CD2.
+ */
+ rvec_sub(x[ats[atCB]], x[ats[atCG]], r_ij);
+ rvec_sub(x[ats[atCD2]], x[ats[atCG]], r_ik);
+ calc_vsite3_param(xcom[0], ycom[0], xi[atCG], yi[atCG], xi[atCB], yi[atCB],
+ xi[atCD2], yi[atCD2], &a, &b);
+ svmul(a, r_ij, t1);
+ svmul(b, r_ik, t2);
+ rvec_add(t1, t2, t1);
+ rvec_add(t1, x[ats[atCG]], (*newx)[atM[0]]);
+
+ calc_vsite3_param(xcom[1], ycom[1], xi[atCG], yi[atCG], xi[atCB], yi[atCB],
+ xi[atCD2], yi[atCD2], &a, &b);
+ svmul(a, r_ij, t1);
+ svmul(b, r_ik, t2);
+ rvec_add(t1, t2, t1);
+ rvec_add(t1, x[ats[atCG]], (*newx)[atM[1]]);
+
+ /* set parameters for the masses */
+ for (j = 0; j < NMASS; j++)
+ {
+ sprintf(name, "MW%d", j+1);
+ (*newatomname) [atM[j]] = put_symtab(symtab, name);
+ (*newatom) [atM[j]].m = (*newatom)[atM[j]].mB = mM[j];
+ (*newatom) [atM[j]].q = (*newatom)[atM[j]].qB = 0.0;
+ (*newatom) [atM[j]].type = (*newatom)[atM[j]].typeB = tpM;
+ (*newatom) [atM[j]].ptype = eptAtom;
+ (*newatom) [atM[j]].resind = at->atom[i0].resind;
++ (*newatom) [atM[j]].elem[0] = 'M';
++ (*newatom) [atM[j]].elem[1] = '\0';
+ (*newvsite_type)[atM[j]] = NOTSET;
+ (*newcgnr) [atM[j]] = (*cgnr)[i0];
+ }
+ /* renumber cgnr: */
+ for (i = i0; i < at->nr; i++)
+ {
+ (*cgnr)[i]++;
+ }
+
+ /* constraints between CB, M1 and M2 */
+ /* 'add_shift' says which atoms won't be renumbered afterwards */
+ dCBM1 = sqrt( sqr(xcom[0]-xi[atCB]) + sqr(ycom[0]-yi[atCB]) );
+ dM1M2 = sqrt( sqr(xcom[0]-xcom[1]) + sqr(ycom[0]-ycom[1]) );
+ dCBM2 = sqrt( sqr(xcom[1]-xi[atCB]) + sqr(ycom[1]-yi[atCB]) );
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCB], add_shift+atM[0], dCBM1);
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCB], add_shift+atM[1], dCBM2);
+ my_add_param(&(plist[F_CONSTRNC]), add_shift+atM[0], add_shift+atM[1], dM1M2);
+
+ /* rest will be vsite3 */
+ nvsite = 0;
+ for (i = 0; i < atNR; i++)
+ {
+ if (i != atCB)
+ {
+ at->atom[ats[i]].m = at->atom[ats[i]].mB = 0;
+ (*vsite_type)[ats[i]] = F_VSITE3;
+ nvsite++;
+ }
+ }
+
+ /* now define all vsites from M1, M2, CB, ie:
+ r_d = r_M1 + a r_M1_M2 + b r_M1_CB */
+ for (i = 0; i < atNR; i++)
+ {
+ if ( (*vsite_type)[ats[i]] == F_VSITE3)
+ {
+ calc_vsite3_param(xi[i], yi[i], xcom[0], ycom[0], xcom[1], ycom[1], xi[atCB], yi[atCB], &a, &b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[i], add_shift+atM[0], add_shift+atM[1], ats[atCB], a, b);
+ }
+ }
+ return nvsite;
+#undef NMASS
+}
+
+
+static int gen_vsites_tyr(gpp_atomtype_t atype, rvec *newx[],
+ t_atom *newatom[], char ***newatomname[],
+ int *o2n[], int *newvsite_type[], int *newcgnr[],
+ t_symtab *symtab, int *nadd, rvec x[], int *cgnr[],
+ t_atoms *at, int *vsite_type[], t_params plist[],
+ int nrfound, int *ats, int add_shift,
+ t_vsitetop *vsitetop, int nvsitetop)
+{
+ int nvsite, i, i0, j, atM, tpM;
+ real dCGCE, dCEOH, dCGM, tmp1, a, b;
+ real bond_cc, bond_ch, bond_co, bond_oh, angle_coh;
+ real xcom, mtot;
+ real vmass, vdist, mM;
+ rvec r1;
+ char name[10];
+
+ /* these MUST correspond to the atnms array in do_vsites()! */
+ enum {
+ atCG, atCD1, atHD1, atCD2, atHD2, atCE1, atHE1, atCE2, atHE2,
+ atCZ, atOH, atHH, atNR
+ };
+ real xi[atNR], yi[atNR];
+ /* CG, CE1, CE2 (as in general 6-ring) and OH and HH stay,
+ rest gets virtualized.
+ Now we have two linked triangles with one improper keeping them flat */
+ if (atNR != nrfound)
+ {
+ gmx_incons("Number of atom types in gen_vsites_tyr");
+ }
+
+ /* Aromatic rings have 6-fold symmetry, so we only need one bond length
+ * for the ring part (angle is always 120 degrees).
+ */
+ bond_cc = get_ddb_bond(vsitetop, nvsitetop, "TYR", "CD1", "CE1");
+ bond_ch = get_ddb_bond(vsitetop, nvsitetop, "TYR", "CD1", "HD1");
+ bond_co = get_ddb_bond(vsitetop, nvsitetop, "TYR", "CZ", "OH");
+ bond_oh = get_ddb_bond(vsitetop, nvsitetop, "TYR", "OH", "HH");
+ angle_coh = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, "TYR", "CZ", "OH", "HH");
+
+ xi[atCG] = -bond_cc+bond_cc*cos(ANGLE_6RING);
+ yi[atCG] = 0;
+ xi[atCD1] = -bond_cc;
+ yi[atCD1] = bond_cc*sin(0.5*ANGLE_6RING);
+ xi[atHD1] = xi[atCD1]+bond_ch*cos(ANGLE_6RING);
+ yi[atHD1] = yi[atCD1]+bond_ch*sin(ANGLE_6RING);
+ xi[atCE1] = 0;
+ yi[atCE1] = yi[atCD1];
+ xi[atHE1] = xi[atCE1]-bond_ch*cos(ANGLE_6RING);
+ yi[atHE1] = yi[atCE1]+bond_ch*sin(ANGLE_6RING);
+ xi[atCD2] = xi[atCD1];
+ yi[atCD2] = -yi[atCD1];
+ xi[atHD2] = xi[atHD1];
+ yi[atHD2] = -yi[atHD1];
+ xi[atCE2] = xi[atCE1];
+ yi[atCE2] = -yi[atCE1];
+ xi[atHE2] = xi[atHE1];
+ yi[atHE2] = -yi[atHE1];
+ xi[atCZ] = bond_cc*cos(0.5*ANGLE_6RING);
+ yi[atCZ] = 0;
+ xi[atOH] = xi[atCZ]+bond_co;
+ yi[atOH] = 0;
+
+ xcom = mtot = 0;
+ for (i = 0; i < atOH; i++)
+ {
+ xcom += xi[i]*at->atom[ats[i]].m;
+ mtot += at->atom[ats[i]].m;
+ }
+ xcom /= mtot;
+
+ /* first do 6 ring as default,
+ except CZ (we'll do that different) and HZ (we don't have that): */
+ nvsite = gen_vsites_6ring(at, vsite_type, plist, nrfound, ats, bond_cc, bond_ch, xcom, FALSE);
+
+ /* then construct CZ from the 2nd triangle */
+ /* vsite3 construction: r_d = r_i + a r_ij + b r_ik */
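+ /* With a = b and xCE1 = xCE2 = 0 in the local frame, the x-component of
+ * the construction reads xCZ = (1 - 2a)*xOH, giving
+ * a = (xOH - xCZ)/(2*xOH) = 0.5*bond_co/(bond_co - bond_cc*cos(ANGLE_6RING)) */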
+ a = b = 0.5 * bond_co / ( bond_co - bond_cc*cos(ANGLE_6RING) );
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atCZ], ats[atOH], ats[atCE1], ats[atCE2], a, b);
+ at->atom[ats[atCZ]].m = at->atom[ats[atCZ]].mB = 0;
+
+ /* constraints between CE1, CE2 and OH */
+ dCGCE = sqrt( cosrule(bond_cc, bond_cc, ANGLE_6RING) );
+ dCEOH = sqrt( cosrule(bond_cc, bond_co, ANGLE_6RING) );
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCE1], ats[atOH], dCEOH);
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCE2], ats[atOH], dCEOH);
+
+ /* We also want to constrain the angle C-O-H, but since CZ is constructed
+ * we need to introduce a constraint to CG.
+ * CG is much further away, so that will lead to instabilities in LINCS
+ * when we constrain both CG-HH and OH-HH distances. Instead of requiring
+ * the use of lincs_order=8 we introduce a dummy mass twice as far from
+ * OH as HH (vdist = 2*bond_oh below). The dummy carries half of the
+ * hydrogen mass, with the other half moved to OH. This shouldn't cause any
+ * problems since the forces will apply to the constructed HH atom and not
+ * directly to the virtual mass.
+ */
+
+ vdist = 2.0*bond_oh;
+ mM = at->atom[ats[atHH]].m/2.0;
+ at->atom[ats[atOH]].m += mM; /* add 1/2 of original H mass */
+ at->atom[ats[atOH]].mB += mM; /* add 1/2 of original H mass */
+ at->atom[ats[atHH]].m = at->atom[ats[atHH]].mB = 0;
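+ /* HH is reconstructed below as a vsite2 halfway between OH and the dummy
+ * mass (a = 1/2 in add_vsite2_param), so the OH-M constraint length must
+ * be vdist = 2*bond_oh for HH to end up at bond_oh from OH. */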
+
+ /* get dummy mass type */
+ tpM = vsite_nm2type("MW", atype);
+ /* make space for 1 mass: shift HH only */
+ i0 = ats[atHH];
+ atM = i0+*nadd;
+ if (debug)
+ {
+ fprintf(stderr, "Inserting 1 dummy mass at %d\n", (*o2n)[i0]+1);
+ }
+ (*nadd)++;
+ for (j = i0; j < at->nr; j++)
+ {
+ (*o2n)[j] = j+*nadd;
+ }
+ srenew(*newx, at->nr+*nadd);
+ srenew(*newatom, at->nr+*nadd);
+ srenew(*newatomname, at->nr+*nadd);
+ srenew(*newvsite_type, at->nr+*nadd);
+ srenew(*newcgnr, at->nr+*nadd);
+ (*newatomname)[at->nr+*nadd-1] = NULL;
+
+ /* Calc the dummy mass initial position */
+ rvec_sub(x[ats[atHH]], x[ats[atOH]], r1);
+ svmul(2.0, r1, r1);
+ rvec_add(r1, x[ats[atHH]], (*newx)[atM]);
+
+ strcpy(name, "MW1");
+ (*newatomname) [atM] = put_symtab(symtab, name);
+ (*newatom) [atM].m = (*newatom)[atM].mB = mM;
+ (*newatom) [atM].q = (*newatom)[atM].qB = 0.0;
+ (*newatom) [atM].type = (*newatom)[atM].typeB = tpM;
+ (*newatom) [atM].ptype = eptAtom;
+ (*newatom) [atM].resind = at->atom[i0].resind;
++ (*newatom) [atM].elem[0] = 'M';
++ (*newatom) [atM].elem[1] = '\0';
+ (*newvsite_type)[atM] = NOTSET;
+ (*newcgnr) [atM] = (*cgnr)[i0];
+ /* renumber cgnr: */
+ for (i = i0; i < at->nr; i++)
+ {
+ (*cgnr)[i]++;
+ }
+
+ (*vsite_type)[ats[atHH]] = F_VSITE2;
+ nvsite++;
+ /* assume we also want the COH angle constrained: */
+ tmp1 = bond_cc*cos(0.5*ANGLE_6RING) + dCGCE*sin(ANGLE_6RING*0.5) + bond_co;
+ dCGM = sqrt( cosrule(tmp1, vdist, angle_coh) );
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCG], add_shift+atM, dCGM);
+ my_add_param(&(plist[F_CONSTRNC]), ats[atOH], add_shift+atM, vdist);
+
+ add_vsite2_param(&plist[F_VSITE2],
+ ats[atHH], ats[atOH], add_shift+atM, 1.0/2.0);
+ return nvsite;
+}
+
+static int gen_vsites_his(t_atoms *at, int *vsite_type[], t_params plist[],
+ int nrfound, int *ats, t_vsitetop *vsitetop, int nvsitetop)
+{
+ int nvsite, i;
+ real a, b, alpha, dCGCE1, dCGNE2;
+ real sinalpha, cosalpha;
+ real xcom, ycom, mtot;
+ real mG, mrest, mCE1, mNE2;
+ real b_CG_ND1, b_ND1_CE1, b_CE1_NE2, b_CG_CD2, b_CD2_NE2;
+ real b_ND1_HD1, b_NE2_HE2, b_CE1_HE1, b_CD2_HD2;
+ real a_CG_ND1_CE1, a_CG_CD2_NE2, a_ND1_CE1_NE2, a_CE1_NE2_CD2;
+ real a_NE2_CE1_HE1, a_NE2_CD2_HD2, a_CE1_ND1_HD1, a_CE1_NE2_HE2;
+ char resname[10];
+
+ /* these MUST correspond to the atnms array in do_vsites()! */
+ enum {
+ atCG, atND1, atHD1, atCD2, atHD2, atCE1, atHE1, atNE2, atHE2, atNR
+ };
+ real x[atNR], y[atNR];
+
+ /* CG, CE1 and NE2 stay, each gets part of the total mass,
+ rest gets virtualized */
+ /* check the number of atoms; up to 3 hydrogens may be missing,
+ * so atNR-3 <= nrfound <= atNR must hold: */
+ if ((nrfound < atNR-3) || (nrfound > atNR))
+ {
+ gmx_incons("Generating vsites for HIS");
+ }
+
+ /* avoid warnings about uninitialized variables */
+ b_ND1_HD1 = b_NE2_HE2 = b_CE1_HE1 = b_CD2_HD2 = a_NE2_CE1_HE1 =
+ a_NE2_CD2_HD2 = a_CE1_ND1_HD1 = a_CE1_NE2_HE2 = 0;
+
+ if (ats[atHD1] != NOTSET)
+ {
+ if (ats[atHE2] != NOTSET)
+ {
+ sprintf(resname, "HISH");
+ }
+ else
+ {
+ sprintf(resname, "HISA");
+ }
+ }
+ else
+ {
+ sprintf(resname, "HISB");
+ }
+
+ /* Get geometry from database */
+ b_CG_ND1 = get_ddb_bond(vsitetop, nvsitetop, resname, "CG", "ND1");
+ b_ND1_CE1 = get_ddb_bond(vsitetop, nvsitetop, resname, "ND1", "CE1");
+ b_CE1_NE2 = get_ddb_bond(vsitetop, nvsitetop, resname, "CE1", "NE2");
+ b_CG_CD2 = get_ddb_bond(vsitetop, nvsitetop, resname, "CG", "CD2");
+ b_CD2_NE2 = get_ddb_bond(vsitetop, nvsitetop, resname, "CD2", "NE2");
+ a_CG_ND1_CE1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CG", "ND1", "CE1");
+ a_CG_CD2_NE2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CG", "CD2", "NE2");
+ a_ND1_CE1_NE2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "ND1", "CE1", "NE2");
+ a_CE1_NE2_CD2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CE1", "NE2", "CD2");
+
+ if (ats[atHD1] != NOTSET)
+ {
+ b_ND1_HD1 = get_ddb_bond(vsitetop, nvsitetop, resname, "ND1", "HD1");
+ a_CE1_ND1_HD1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CE1", "ND1", "HD1");
+ }
+ if (ats[atHE2] != NOTSET)
+ {
+ b_NE2_HE2 = get_ddb_bond(vsitetop, nvsitetop, resname, "NE2", "HE2");
+ a_CE1_NE2_HE2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "CE1", "NE2", "HE2");
+ }
+ if (ats[atHD2] != NOTSET)
+ {
+ b_CD2_HD2 = get_ddb_bond(vsitetop, nvsitetop, resname, "CD2", "HD2");
+ a_NE2_CD2_HD2 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "NE2", "CD2", "HD2");
+ }
+ if (ats[atHE1] != NOTSET)
+ {
+ b_CE1_HE1 = get_ddb_bond(vsitetop, nvsitetop, resname, "CE1", "HE1");
+ a_NE2_CE1_HE1 = DEG2RAD*get_ddb_angle(vsitetop, nvsitetop, resname, "NE2", "CE1", "HE1");
+ }
+
+ /* constraints between CG, CE1 and NE2 */
+ dCGCE1 = sqrt( cosrule(b_CG_ND1, b_ND1_CE1, a_CG_ND1_CE1) );
+ dCGNE2 = sqrt( cosrule(b_CG_CD2, b_CD2_NE2, a_CG_CD2_NE2) );
+
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atCE1], dCGCE1);
+ my_add_param(&(plist[F_CONSTRNC]), ats[atCG], ats[atNE2], dCGNE2);
+ /* we already have a constraint CE1-NE2, so we don't add it again */
+
+ /* calculate the positions in a local frame of reference.
+ * The x-axis is the line from CG that makes a right angle
+ * with the bond CE1-NE2, and the y-axis the bond CE1-NE2.
+ */
+ /* First calculate the x-axis intersection with y-axis (=yCE1).
+ * Get cos(angle CG-CE1-NE2) :
+ */
+ cosalpha = acosrule(dCGNE2, dCGCE1, b_CE1_NE2);
+ x[atCE1] = 0;
+ y[atCE1] = cosalpha*dCGCE1;
+ x[atNE2] = 0;
+ y[atNE2] = y[atCE1]-b_CE1_NE2;
+ sinalpha = sqrt(1-cosalpha*cosalpha);
+ x[atCG] = -sinalpha*dCGCE1;
+ y[atCG] = 0;
+ x[atHE1] = x[atHE2] = x[atHD1] = x[atHD2] = 0;
+ y[atHE1] = y[atHE2] = y[atHD1] = y[atHD2] = 0;
+
+ /* calculate ND1 and CD2 positions from CE1 and NE2 */
+
+ x[atND1] = -b_ND1_CE1*sin(a_ND1_CE1_NE2);
+ y[atND1] = y[atCE1]-b_ND1_CE1*cos(a_ND1_CE1_NE2);
+
+ x[atCD2] = -b_CD2_NE2*sin(a_CE1_NE2_CD2);
+ y[atCD2] = y[atNE2]+b_CD2_NE2*cos(a_CE1_NE2_CD2);
+
+ /* And finally the hydrogen positions */
+ if (ats[atHE1] != NOTSET)
+ {
+ x[atHE1] = x[atCE1] + b_CE1_HE1*sin(a_NE2_CE1_HE1);
+ y[atHE1] = y[atCE1] - b_CE1_HE1*cos(a_NE2_CE1_HE1);
+ }
+ /* HD2 - first get (ccw) angle from (positive) y-axis */
+ if (ats[atHD2] != NOTSET)
+ {
+ alpha = a_CE1_NE2_CD2 + M_PI - a_NE2_CD2_HD2;
+ x[atHD2] = x[atCD2] - b_CD2_HD2*sin(alpha);
+ y[atHD2] = y[atCD2] + b_CD2_HD2*cos(alpha);
+ }
+ if (ats[atHD1] != NOTSET)
+ {
+ /* HD1 - first get (cw) angle from (positive) y-axis */
+ alpha = a_ND1_CE1_NE2 + M_PI - a_CE1_ND1_HD1;
+ x[atHD1] = x[atND1] - b_ND1_HD1*sin(alpha);
+ y[atHD1] = y[atND1] - b_ND1_HD1*cos(alpha);
+ }
+ if (ats[atHE2] != NOTSET)
+ {
+ x[atHE2] = x[atNE2] + b_NE2_HE2*sin(a_CE1_NE2_HE2);
+ y[atHE2] = y[atNE2] + b_NE2_HE2*cos(a_CE1_NE2_HE2);
+ }
+ /* Have all coordinates now */
+
+ /* calc center-of-mass; keep atoms CG, CE1, NE2 and
+ * set the rest to vsite3
+ */
+ mtot = xcom = ycom = 0;
+ nvsite = 0;
+ for (i = 0; i < atNR; i++)
+ {
+ if (ats[i] != NOTSET)
+ {
+ mtot += at->atom[ats[i]].m;
+ xcom += x[i]*at->atom[ats[i]].m;
+ ycom += y[i]*at->atom[ats[i]].m;
+ if (i != atCG && i != atCE1 && i != atNE2)
+ {
+ at->atom[ats[i]].m = at->atom[ats[i]].mB = 0;
+ (*vsite_type)[ats[i]] = F_VSITE3;
+ nvsite++;
+ }
+ }
+ }
+ if (nvsite+3 != nrfound)
+ {
+ gmx_incons("Generating vsites for HIS");
+ }
+
+ xcom /= mtot;
+ ycom /= mtot;
+
+ /* distribute mass so that com stays the same */
+ mG = xcom*mtot/x[atCG];
+ mrest = mtot-mG;
+ mCE1 = (ycom-y[atNE2])*mrest/(y[atCE1]-y[atNE2]);
+ mNE2 = mrest-mCE1;
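+ /* lever rule: mrest is split between CE1 and NE2 such that their
+ * combined center of mass along y lies at ycom (CG sits at y = 0) */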
+
+ at->atom[ats[atCG]].m = at->atom[ats[atCG]].mB = mG;
+ at->atom[ats[atCE1]].m = at->atom[ats[atCE1]].mB = mCE1;
+ at->atom[ats[atNE2]].m = at->atom[ats[atNE2]].mB = mNE2;
+
+ /* HE1 */
+ if (ats[atHE1] != NOTSET)
+ {
+ calc_vsite3_param(x[atHE1], y[atHE1], x[atCE1], y[atCE1], x[atNE2], y[atNE2],
+ x[atCG], y[atCG], &a, &b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHE1], ats[atCE1], ats[atNE2], ats[atCG], a, b);
+ }
+ /* HE2 */
+ if (ats[atHE2] != NOTSET)
+ {
+ calc_vsite3_param(x[atHE2], y[atHE2], x[atNE2], y[atNE2], x[atCE1], y[atCE1],
+ x[atCG], y[atCG], &a, &b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHE2], ats[atNE2], ats[atCE1], ats[atCG], a, b);
+ }
+
+ /* ND1 */
+ calc_vsite3_param(x[atND1], y[atND1], x[atNE2], y[atNE2], x[atCE1], y[atCE1],
+ x[atCG], y[atCG], &a, &b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atND1], ats[atNE2], ats[atCE1], ats[atCG], a, b);
+
+ /* CD2 */
+ calc_vsite3_param(x[atCD2], y[atCD2], x[atCE1], y[atCE1], x[atNE2], y[atNE2],
+ x[atCG], y[atCG], &a, &b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atCD2], ats[atCE1], ats[atNE2], ats[atCG], a, b);
+
+ /* HD1 */
+ if (ats[atHD1] != NOTSET)
+ {
+ calc_vsite3_param(x[atHD1], y[atHD1], x[atNE2], y[atNE2], x[atCE1], y[atCE1],
+ x[atCG], y[atCG], &a, &b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHD1], ats[atNE2], ats[atCE1], ats[atCG], a, b);
+ }
+ /* HD2 */
+ if (ats[atHD2] != NOTSET)
+ {
+ calc_vsite3_param(x[atHD2], y[atHD2], x[atCE1], y[atCE1], x[atNE2], y[atNE2],
+ x[atCG], y[atCG], &a, &b);
+ add_vsite3_param(&plist[F_VSITE3],
+ ats[atHD2], ats[atCE1], ats[atNE2], ats[atCG], a, b);
+ }
+ return nvsite;
+}
+
+static gmx_bool is_vsite(int vsite_type)
+{
+ if (vsite_type == NOTSET)
+ {
+ return FALSE;
+ }
+ switch (abs(vsite_type) )
+ {
+ case F_VSITE3:
+ case F_VSITE3FD:
+ case F_VSITE3OUT:
+ case F_VSITE3FAD:
+ case F_VSITE4FD:
+ case F_VSITE4FDN:
+ return TRUE;
+ default:
+ return FALSE;
+ }
+}
+
+static char atomnamesuffix[] = "1234";
+
+void do_vsites(int nrtp, t_restp rtp[], gpp_atomtype_t atype,
+ t_atoms *at, t_symtab *symtab, rvec *x[],
+ t_params plist[], int *vsite_type[], int *cgnr[],
+ real mHmult, gmx_bool bVsiteAromatics,
+ const char *ffdir)
+{
+#define MAXATOMSPERRESIDUE 16
+ int i, j, k, m, i0, ni0, whatres, resind, add_shift, ftype, nvsite, nadd;
+ int ai, aj, ak, al;
+ int nrfound = 0, needed, nrbonds, nrHatoms, Heavy, nrheavies, tpM, tpHeavy;
+ int Hatoms[4], heavies[4], bb;
+ gmx_bool bWARNING, bAddVsiteParam, bFirstWater;
+ matrix tmpmat;
+ gmx_bool *bResProcessed;
+ real mHtot, mtot, fact, fact2;
+ rvec rpar, rperp, temp;
+ char name[10], tpname[32], nexttpname[32], *ch;
+ rvec *newx;
+ int *o2n, *newvsite_type, *newcgnr, ats[MAXATOMSPERRESIDUE];
+ t_atom *newatom;
+ t_params *params;
+ char ***newatomname;
+ char *resnm = NULL;
+ int ndb, f;
+ char **db;
+ int nvsiteconf, nvsitetop, cmplength;
+ gmx_bool isN, planarN, bFound;
+ gmx_residuetype_t rt;
+
+ t_vsiteconf *vsiteconflist;
+ /* pointer to a list of CH3/NH3/NH2 configuration entries.
+ * See comments in read_vsite_database. It isn't beautiful,
+ * but it had to be fixed, and I don't even want to try to
+ * maintain this part of the code...
+ */
+ t_vsitetop *vsitetop;
+ /* Pointer to a list of geometry (bond/angle) entries for
+ * residues like PHE, TRP, TYR, HIS, etc., where we need
+ * to know the geometry to construct vsite aromatics.
+ * Note that equilibrium geometry isn't necessarily the same
+ * as the individual bond and angle values given in the
+ * force field (rings can be strained).
+ */
+
+ /* if bVsiteAromatics=TRUE do_vsites will specifically convert atoms in
+ PHE, TRP, TYR and HIS to a construction of virtual sites */
+ enum {
+ resPHE, resTRP, resTYR, resHIS, resNR
+ };
+ const char *resnms[resNR] = { "PHE", "TRP", "TYR", "HIS" };
+ /* Amber03 alternative names for termini */
+ const char *resnmsN[resNR] = { "NPHE", "NTRP", "NTYR", "NHIS" };
+ const char *resnmsC[resNR] = { "CPHE", "CTRP", "CTYR", "CHIS" };
+ /* HIS can be known as HISH, HIS1, HISA, HID, HIE, HIP, etc. too */
+ gmx_bool bPartial[resNR] = { FALSE, FALSE, FALSE, TRUE };
+ /* the atnms for every residue MUST correspond to the enums in the
+ gen_vsites_* (one for each residue) routines! */
+ /* also the atom names in atnms MUST be in the same order as in the .rtp! */
+ const char *atnms[resNR][MAXATOMSPERRESIDUE+1] = {
+ { "CG", /* PHE */
+ "CD1", "HD1", "CD2", "HD2",
+ "CE1", "HE1", "CE2", "HE2",
+ "CZ", "HZ", NULL },
+ { "CB", /* TRP */
+ "CG",
+ "CD1", "HD1", "CD2",
+ "NE1", "HE1", "CE2", "CE3", "HE3",
+ "CZ2", "HZ2", "CZ3", "HZ3",
+ "CH2", "HH2", NULL },
+ { "CG", /* TYR */
+ "CD1", "HD1", "CD2", "HD2",
+ "CE1", "HE1", "CE2", "HE2",
+ "CZ", "OH", "HH", NULL },
+ { "CG", /* HIS */
+ "ND1", "HD1", "CD2", "HD2",
+ "CE1", "HE1", "NE2", "HE2", NULL }
+ };
+
+ if (debug)
+ {
+ printf("Searching for atoms to make virtual sites ...\n");
+ fprintf(debug, "# # # VSITES # # #\n");
+ }
+
+ ndb = fflib_search_file_end(ffdir, ".vsd", FALSE, &db);
+ nvsiteconf = 0;
+ vsiteconflist = NULL;
+ nvsitetop = 0;
+ vsitetop = NULL;
+ for (f = 0; f < ndb; f++)
+ {
+ read_vsite_database(db[f], &vsiteconflist, &nvsiteconf, &vsitetop, &nvsitetop);
+ sfree(db[f]);
+ }
+ sfree(db);
+
+ bFirstWater = TRUE;
+ nvsite = 0;
+ nadd = 0;
+ /* we need a marker for which atoms should *not* be renumbered afterwards */
+ add_shift = 10*at->nr;
+ /* make arrays where masses can be inserted into */
+ snew(newx, at->nr);
+ snew(newatom, at->nr);
+ snew(newatomname, at->nr);
+ snew(newvsite_type, at->nr);
+ snew(newcgnr, at->nr);
+ /* make index array to tell where the atoms go to when masses are inserted */
+ snew(o2n, at->nr);
+ for (i = 0; i < at->nr; i++)
+ {
+ o2n[i] = i;
+ }
+ /* make index to tell which residues were already processed */
+ snew(bResProcessed, at->nres);
+
+ gmx_residuetype_init(&rt);
+
+ /* generate vsite constructions */
+ /* loop over all atoms */
+ resind = -1;
+ for (i = 0; (i < at->nr); i++)
+ {
+ if (at->atom[i].resind != resind)
+ {
+ resind = at->atom[i].resind;
+ resnm = *(at->resinfo[resind].name);
+ }
+ /* first check for aromatics to virtualize */
+ /* don't waste our effort on DNA, water etc. */
+ /* Only do the vsite aromatic stuff when we reach the
+ * CA atom, since there might be an X2/X3 group on the
+ * N-terminus that must be treated first.
+ */
+ if (bVsiteAromatics &&
+ !strcmp(*(at->atomname[i]), "CA") &&
+ !bResProcessed[resind] &&
+ gmx_residuetype_is_protein(rt, *(at->resinfo[resind].name)) )
+ {
+ /* mark this residue */
+ bResProcessed[resind] = TRUE;
+ /* find out if this residue needs converting */
+ whatres = NOTSET;
+ for (j = 0; j < resNR && whatres == NOTSET; j++)
+ {
+
+ cmplength = bPartial[j] ? strlen(resnm)-1 : strlen(resnm);
+
+ bFound = ((gmx_strncasecmp(resnm, resnms[j], cmplength) == 0) ||
+ (gmx_strncasecmp(resnm, resnmsN[j], cmplength) == 0) ||
+ (gmx_strncasecmp(resnm, resnmsC[j], cmplength) == 0));
+
+ if (bFound)
+ {
+ whatres = j;
+ /* get atoms we will be needing for the conversion */
+ nrfound = 0;
+ for (k = 0; atnms[j][k]; k++)
+ {
+ ats[k] = NOTSET;
+ for (m = i; m < at->nr && at->atom[m].resind == resind && ats[k] == NOTSET; m++)
+ {
+ if (gmx_strcasecmp(*(at->atomname[m]), atnms[j][k]) == 0)
+ {
+ ats[k] = m;
+ nrfound++;
+ }
+ }
+ }
+
+ /* now k is number of atom names in atnms[j] */
+ if (j == resHIS)
+ {
+ needed = k-3;
+ }
+ else
+ {
+ needed = k;
+ }
+ if (nrfound < needed)
+ {
+ gmx_fatal(FARGS, "not enough atoms found (%d, need %d) in "
+ "residue %s %d while\n "
+ "generating aromatics virtual site construction",
+ nrfound, needed, resnm, at->resinfo[resind].nr);
+ }
+ /* Advance overall atom counter */
+ i++;
+ }
+ }
+ /* the enums for every residue MUST correspond to atnms[residue] */
+ switch (whatres)
+ {
+ case resPHE:
+ if (debug)
+ {
+ fprintf(stderr, "PHE at %d\n", o2n[ats[0]]+1);
+ }
+ nvsite += gen_vsites_phe(at, vsite_type, plist, nrfound, ats, vsitetop, nvsitetop);
+ break;
+ case resTRP:
+ if (debug)
+ {
+ fprintf(stderr, "TRP at %d\n", o2n[ats[0]]+1);
+ }
+ nvsite += gen_vsites_trp(atype, &newx, &newatom, &newatomname, &o2n,
+ &newvsite_type, &newcgnr, symtab, &nadd, *x, cgnr,
+ at, vsite_type, plist, nrfound, ats, add_shift, vsitetop, nvsitetop);
+ break;
+ case resTYR:
+ if (debug)
+ {
+ fprintf(stderr, "TYR at %d\n", o2n[ats[0]]+1);
+ }
+ nvsite += gen_vsites_tyr(atype, &newx, &newatom, &newatomname, &o2n,
+ &newvsite_type, &newcgnr, symtab, &nadd, *x, cgnr,
+ at, vsite_type, plist, nrfound, ats, add_shift, vsitetop, nvsitetop);
+ break;
+ case resHIS:
+ if (debug)
+ {
+ fprintf(stderr, "HIS at %d\n", o2n[ats[0]]+1);
+ }
+ nvsite += gen_vsites_his(at, vsite_type, plist, nrfound, ats, vsitetop, nvsitetop);
+ break;
+ case NOTSET:
+ /* this means this residue won't be processed */
+ break;
+ default:
+ gmx_fatal(FARGS, "DEATH HORROR in do_vsites (%s:%d)",
+ __FILE__, __LINE__);
+ } /* switch whatres */
+ /* skip back to beginning of residue */
+ while (i > 0 && at->atom[i-1].resind == resind)
+ {
+ i--;
+ }
+ } /* if bVsiteAromatics & is protein */
+
+ /* now process the rest of the hydrogens */
+ /* only process hydrogen atoms which are not already set */
+ if ( ((*vsite_type)[i] == NOTSET) && is_hydrogen(*(at->atomname[i])))
+ {
+ /* find heavy atom, count #bonds from it and #H atoms bound to it
+ and return H atom numbers (Hatoms) and heavy atom numbers (heavies) */
+ count_bonds(i, &plist[F_BONDS], at->atomname,
+ &nrbonds, &nrHatoms, Hatoms, &Heavy, &nrheavies, heavies);
+ /* get Heavy atom type */
+ tpHeavy = get_atype(Heavy, at, nrtp, rtp, rt);
+ strcpy(tpname, get_atomtype_name(tpHeavy, atype));
+
+ bWARNING = FALSE;
+ bAddVsiteParam = TRUE;
+ /* nested if's which check nrHatoms, nrbonds and atomname */
+ if (nrHatoms == 1)
+ {
+ switch (nrbonds)
+ {
+ case 2: /* -O-H */
+ (*vsite_type)[i] = F_BONDS;
+ break;
+ case 3: /* =CH-, -NH- or =NH+- */
+ (*vsite_type)[i] = F_VSITE3FD;
+ break;
+ case 4: /* --CH- (tert) */
+ /* The old type 4FD had stability issues, so
+ * all new constructs should use 4FDN
+ */
+ (*vsite_type)[i] = F_VSITE4FDN;
+
+ /* Check parity of heavy atoms from coordinates */
+ ai = Heavy;
+ aj = heavies[0];
+ ak = heavies[1];
+ al = heavies[2];
+ rvec_sub((*x)[aj], (*x)[ai], tmpmat[0]);
+ rvec_sub((*x)[ak], (*x)[ai], tmpmat[1]);
+ rvec_sub((*x)[al], (*x)[ai], tmpmat[2]);
+
+ if (det(tmpmat) > 0)
+ {
+ /* swap parity */
+ heavies[1] = aj;
+ heavies[0] = ak;
+ }
+
+ break;
+ default: /* nrbonds != 2, 3 or 4 */
+ bWARNING = TRUE;
+ }
+
+ }
+ else if ( /*(nrHatoms == 2) && (nrbonds == 2) && REMOVED this test
+ DvdS 19-01-04 */
+ (gmx_strncasecmp(*at->atomname[Heavy], "OW", 2) == 0) )
+ {
+ bAddVsiteParam = FALSE; /* this is water: skip these hydrogens */
+ if (bFirstWater)
+ {
+ bFirstWater = FALSE;
+ if (debug)
+ {
+ fprintf(debug,
+ "Not converting hydrogens in water to virtual sites\n");
+ }
+ }
+ }
+ else if ( (nrHatoms == 2) && (nrbonds == 4) )
+ {
+ /* -CH2- , -NH2+- */
+ (*vsite_type)[Hatoms[0]] = F_VSITE3OUT;
+ (*vsite_type)[Hatoms[1]] = -F_VSITE3OUT;
+ }
+ else
+ {
+ /* 2 or 3 hydrogen atoms, with 3 or 4 bonds in total to the heavy atom.
+ * If it is a nitrogen, first check if it is planar.
+ */
+ isN = planarN = FALSE;
+ if ((nrHatoms == 2) && ((*at->atomname[Heavy])[0] == 'N'))
+ {
+ isN = TRUE;
+ j = nitrogen_is_planar(vsiteconflist, nvsiteconf, tpname);
+ if (j < 0)
+ {
+ gmx_fatal(FARGS, "No vsite database NH2 entry for type %s\n", tpname);
+ }
+ planarN = (j == 1);
+ }
+ if ( (nrHatoms == 2) && (nrbonds == 3) && ( !isN || planarN ) )
+ {
+ /* =CH2 or, if it is a nitrogen NH2, it is a planar one */
+ (*vsite_type)[Hatoms[0]] = F_VSITE3FAD;
+ (*vsite_type)[Hatoms[1]] = -F_VSITE3FAD;
+ }
+ else if ( ( (nrHatoms == 2) && (nrbonds == 3) &&
+ ( isN && !planarN ) ) ||
+ ( (nrHatoms == 3) && (nrbonds == 4) ) )
+ {
+ /* CH3, NH3 or non-planar NH2 group */
+ int Hat_vsite_type[3] = { F_VSITE3, F_VSITE3OUT, F_VSITE3OUT };
+ gmx_bool Hat_SwapParity[3] = { FALSE, TRUE, FALSE };
+
+ if (debug)
+ {
+ fprintf(stderr, "-XH3 or nonplanar NH2 group at %d\n", i+1);
+ }
+ bAddVsiteParam = FALSE; /* we'll do this ourselves! */
+ /* -NH2 (umbrella), -NH3+ or -CH3 */
+ (*vsite_type)[Heavy] = F_VSITE3;
+ for (j = 0; j < nrHatoms; j++)
+ {
+ (*vsite_type)[Hatoms[j]] = Hat_vsite_type[j];
+ }
+ /* get dummy mass type from first char of heavy atom type (N or C) */
+
+ strcpy(nexttpname, get_atomtype_name(get_atype(heavies[0], at, nrtp, rtp, rt), atype));
+ ch = get_dummymass_name(vsiteconflist, nvsiteconf, tpname, nexttpname);
+
+ if (ch == NULL)
+ {
+ if (ndb > 0)
+ {
+ gmx_fatal(FARGS, "Can't find dummy mass for type %s bonded to type %s in the virtual site database (.vsd files). Add it to the database!\n", tpname, nexttpname);
+ }
+ else
+ {
+ gmx_fatal(FARGS, "A dummy mass for type %s bonded to type %s is required, but no virtual site database (.vsd) files where found.\n", tpname, nexttpname);
+ }
+ }
+ else
+ {
+ strcpy(name, ch);
+ }
+
+ tpM = vsite_nm2type(name, atype);
+ /* make space for 2 masses: shift all atoms starting with 'Heavy' */
+#define NMASS 2
+ i0 = Heavy;
+ ni0 = i0+nadd;
+ if (debug)
+ {
+ fprintf(stderr, "Inserting %d dummy masses at %d\n", NMASS, o2n[i0]+1);
+ }
+ nadd += NMASS;
+ for (j = i0; j < at->nr; j++)
+ {
+ o2n[j] = j+nadd;
+ }
+
+ srenew(newx, at->nr+nadd);
+ srenew(newatom, at->nr+nadd);
+ srenew(newatomname, at->nr+nadd);
+ srenew(newvsite_type, at->nr+nadd);
+ srenew(newcgnr, at->nr+nadd);
+
+ for (j = 0; j < NMASS; j++)
+ {
+ newatomname[at->nr+nadd-1-j] = NULL;
+ }
+
+ /* calculate starting position for the masses */
+ mHtot = 0;
+ /* get atom masses, and set Heavy and Hatoms mass to zero */
+ for (j = 0; j < nrHatoms; j++)
+ {
+ mHtot += get_amass(Hatoms[j], at, nrtp, rtp, rt);
+ at->atom[Hatoms[j]].m = at->atom[Hatoms[j]].mB = 0;
+ }
+ mtot = mHtot + get_amass(Heavy, at, nrtp, rtp, rt);
+ at->atom[Heavy].m = at->atom[Heavy].mB = 0;
+ if (mHmult != 1.0)
+ {
+ mHtot *= mHmult;
+ }
+ fact2 = mHtot/mtot;
+ fact = sqrt(fact2);
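+ /* Two masses of mtot/2 at xN + fact2*rpar +/- fact*rperp preserve
+ * the center of mass, and for a symmetric XHn group also the moment of
+ * inertia about the rotation axis, since
+ * 2*(mtot/2)*(fact*|rperp|)^2 = mHtot*|rperp|^2. */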
+ /* generate vectors parallel and perpendicular to rotational axis:
+ * rpar = Heavy -> Hcom
+ * rperp = Hcom -> H1 */
+ clear_rvec(rpar);
+ for (j = 0; j < nrHatoms; j++)
+ {
+ rvec_inc(rpar, (*x)[Hatoms[j]]);
+ }
+ svmul(1.0/nrHatoms, rpar, rpar); /* rpar = ( H1+H2+H3 ) / 3 */
+ rvec_dec(rpar, (*x)[Heavy]); /* - Heavy */
+ rvec_sub((*x)[Hatoms[0]], (*x)[Heavy], rperp);
+ rvec_dec(rperp, rpar); /* rperp = H1 - Heavy - rpar */
+ /* calc mass positions */
+ svmul(fact2, rpar, temp);
+ for (j = 0; (j < NMASS); j++) /* xM = xN + fact2 * rpar +/- fact * rperp */
+ {
+ rvec_add((*x)[Heavy], temp, newx[ni0+j]);
+ }
+ svmul(fact, rperp, temp);
+ rvec_inc(newx[ni0 ], temp);
+ rvec_dec(newx[ni0+1], temp);
+ /* set atom parameters for the masses */
+ for (j = 0; (j < NMASS); j++)
+ {
+ /* make name: "M??#" or "M?#" (? is atomname, # is number) */
+ name[0] = 'M';
+ for (k = 0; (*at->atomname[Heavy])[k] && ( k < NMASS ); k++)
+ {
+ name[k+1] = (*at->atomname[Heavy])[k];
+ }
+ name[k+1] = atomnamesuffix[j];
+ name[k+2] = '\0';
+ newatomname[ni0+j] = put_symtab(symtab, name);
+ newatom[ni0+j].m = newatom[ni0+j].mB = mtot/NMASS;
+ newatom[ni0+j].q = newatom[ni0+j].qB = 0.0;
+ newatom[ni0+j].type = newatom[ni0+j].typeB = tpM;
+ newatom[ni0+j].ptype = eptAtom;
+ newatom[ni0+j].resind = at->atom[i0].resind;
++ newatom[ni0+j].elem[0] = 'M';
++ newatom[ni0+j].elem[1] = '\0';
+ newvsite_type[ni0+j] = NOTSET;
+ newcgnr[ni0+j] = (*cgnr)[i0];
+ }
+ /* add constraints between dummy masses and to heavies[0] */
+ /* 'add_shift' says which atoms won't be renumbered afterwards */
+ my_add_param(&(plist[F_CONSTRNC]), heavies[0], add_shift+ni0, NOTSET);
+ my_add_param(&(plist[F_CONSTRNC]), heavies[0], add_shift+ni0+1, NOTSET);
+ my_add_param(&(plist[F_CONSTRNC]), add_shift+ni0, add_shift+ni0+1, NOTSET);
+
+ /* generate Heavy, H1, H2 and H3 from M1, M2 and heavies[0] */
+ /* note that vsite_type cannot be NOTSET, because we just set it */
+ add_vsite3_atoms (&plist[(*vsite_type)[Heavy]],
+ Heavy, heavies[0], add_shift+ni0, add_shift+ni0+1,
+ FALSE);
+ for (j = 0; j < nrHatoms; j++)
+ {
+ add_vsite3_atoms(&plist[(*vsite_type)[Hatoms[j]]],
+ Hatoms[j], heavies[0], add_shift+ni0, add_shift+ni0+1,
+ Hat_SwapParity[j]);
+ }
+#undef NMASS
+ }
+ else
+ {
+ bWARNING = TRUE;
+ }
+
+ }
+ if (bWARNING)
+ {
+ fprintf(stderr,
+ "Warning: cannot convert atom %d %s (bound to a heavy atom "
+ "%s with \n"
+ " %d bonds and %d bound hydrogens atoms) to virtual site\n",
+ i+1, *(at->atomname[i]), tpname, nrbonds, nrHatoms);
+ }
+ if (bAddVsiteParam)
+ {
+ /* add vsite parameters to topology,
+ also get rid of negative vsite_types */
+ add_vsites(plist, (*vsite_type), Heavy, nrHatoms, Hatoms,
+ nrheavies, heavies);
+ /* transfer mass of virtual site to Heavy atom */
+ for (j = 0; j < nrHatoms; j++)
+ {
+ if (is_vsite((*vsite_type)[Hatoms[j]]))
+ {
+ at->atom[Heavy].m += at->atom[Hatoms[j]].m;
+ at->atom[Heavy].mB = at->atom[Heavy].m;
+ at->atom[Hatoms[j]].m = at->atom[Hatoms[j]].mB = 0;
+ }
+ }
+ }
+ nvsite += nrHatoms;
+ if (debug)
+ {
+ fprintf(debug, "atom %d: ", o2n[i]+1);
+ print_bonds(debug, o2n, nrHatoms, Hatoms, Heavy, nrheavies, heavies);
+ }
+ } /* if vsite NOTSET & is hydrogen */
+
+ } /* for i < at->nr */
+
+ gmx_residuetype_destroy(rt);
+
+ if (debug)
+ {
+ fprintf(debug, "Before inserting new atoms:\n");
+ for (i = 0; i < at->nr; i++)
+ {
+ fprintf(debug, "%4d %4d %4s %4d %4s %6d %-10s\n", i+1, o2n[i]+1,
+ at->atomname[i] ? *(at->atomname[i]) : "(NULL)",
+ at->resinfo[at->atom[i].resind].nr,
+ at->resinfo[at->atom[i].resind].name ?
+ *(at->resinfo[at->atom[i].resind].name) : "(NULL)",
+ (*cgnr)[i],
+ ((*vsite_type)[i] == NOTSET) ?
+ "NOTSET" : interaction_function[(*vsite_type)[i]].name);
+ }
+ fprintf(debug, "new atoms to be inserted:\n");
+ for (i = 0; i < at->nr+nadd; i++)
+ {
+ if (newatomname[i])
+ {
+ fprintf(debug, "%4d %4s %4d %6d %-10s\n", i+1,
+ newatomname[i] ? *(newatomname[i]) : "(NULL)",
+ newatom[i].resind, newcgnr[i],
+ (newvsite_type[i] == NOTSET) ?
+ "NOTSET" : interaction_function[newvsite_type[i]].name);
+ }
+ }
+ }
+
+ /* add all original atoms to the new arrays, using o2n index array */
+ for (i = 0; i < at->nr; i++)
+ {
+ newatomname [o2n[i]] = at->atomname [i];
+ newatom [o2n[i]] = at->atom [i];
+ newvsite_type[o2n[i]] = (*vsite_type)[i];
+ newcgnr [o2n[i]] = (*cgnr) [i];
+ copy_rvec((*x)[i], newx[o2n[i]]);
+ }
+ /* throw away old atoms */
+ sfree(at->atom);
+ sfree(at->atomname);
+ sfree(*vsite_type);
+ sfree(*cgnr);
+ sfree(*x);
+ /* put in the new ones */
+ at->nr += nadd;
+ at->atom = newatom;
+ at->atomname = newatomname;
+ *vsite_type = newvsite_type;
+ *cgnr = newcgnr;
+ *x = newx;
+ if (at->nr > add_shift)
+ {
+ gmx_fatal(FARGS, "Added impossible amount of dummy masses "
+ "(%d on a total of %d atoms)\n", nadd, at->nr-nadd);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "After inserting new atoms:\n");
+ for (i = 0; i < at->nr; i++)
+ {
+ fprintf(debug, "%4d %4s %4d %4s %6d %-10s\n", i+1,
+ at->atomname[i] ? *(at->atomname[i]) : "(NULL)",
+ at->resinfo[at->atom[i].resind].nr,
+ at->resinfo[at->atom[i].resind].name ?
+ *(at->resinfo[at->atom[i].resind].name) : "(NULL)",
+ (*cgnr)[i],
+ ((*vsite_type)[i] == NOTSET) ?
+ "NOTSET" : interaction_function[(*vsite_type)[i]].name);
+ }
+ }
+
+ /* now renumber all the interactions because of the added atoms */
+ for (ftype = 0; ftype < F_NRE; ftype++)
+ {
+ params = &(plist[ftype]);
+ if (debug)
+ {
+ fprintf(debug, "Renumbering %d %s\n", params->nr,
+ interaction_function[ftype].longname);
+ }
+ for (i = 0; i < params->nr; i++)
+ {
+ for (j = 0; j < NRAL(ftype); j++)
+ {
+ if (params->param[i].a[j] >= add_shift)
+ {
+ if (debug)
+ {
+ fprintf(debug, " [%u -> %u]", params->param[i].a[j],
+ params->param[i].a[j]-add_shift);
+ }
+ params->param[i].a[j] = params->param[i].a[j]-add_shift;
+ }
+ else
+ {
+ if (debug)
+ {
+ fprintf(debug, " [%u -> %d]", params->param[i].a[j],
+ o2n[params->param[i].a[j]]);
+ }
+ params->param[i].a[j] = o2n[params->param[i].a[j]];
+ }
+ }
+ if (debug)
+ {
+ fprintf(debug, "\n");
+ }
+ }
+ }
+ /* now check if atoms in the added constraints are in increasing order */
+ params = &(plist[F_CONSTRNC]);
+ for (i = 0; i < params->nr; i++)
+ {
+ if (params->param[i].AI > params->param[i].AJ)
+ {
+ j = params->param[i].AJ;
+ params->param[i].AJ = params->param[i].AI;
+ params->param[i].AI = j;
+ }
+ }
+
+ /* clean up */
+ sfree(o2n);
+
+ /* tell the user what we did */
+ fprintf(stderr, "Marked %d virtual sites\n", nvsite);
+ fprintf(stderr, "Added %d dummy masses\n", nadd);
+ fprintf(stderr, "Added %d new constraints\n", plist[F_CONSTRNC].nr);
+}
+
+void do_h_mass(t_params *psb, int vsite_type[], t_atoms *at, real mHmult,
+ gmx_bool bDeuterate)
+{
+ int i, j, a;
+
+ /* loop over all atoms */
+ for (i = 0; i < at->nr; i++)
+ {
+ /* adjust masses if i is hydrogen and not a virtual site */
+ if (!is_vsite(vsite_type[i]) && is_hydrogen(*(at->atomname[i])) )
+ {
+ /* find bonded heavy atom */
+ a = NOTSET;
+ for (j = 0; (j < psb->nr) && (a == NOTSET); j++)
+ {
+ /* if other atom is not a virtual site, it is the one we want */
+ if ( (psb->param[j].AI == i) &&
+ !is_vsite(vsite_type[psb->param[j].AJ]) )
+ {
+ a = psb->param[j].AJ;
+ }
+ else if ( (psb->param[j].AJ == i) &&
+ !is_vsite(vsite_type[psb->param[j].AI]) )
+ {
+ a = psb->param[j].AI;
+ }
+ }
+ if (a == NOTSET)
+ {
+ gmx_fatal(FARGS, "Unbound hydrogen atom (%d) found while adjusting mass",
+ i+1);
+ }
+
+ /* adjust mass of i (hydrogen) with mHmult
+ and correct mass of a (bonded atom) with same amount */
+ if (!bDeuterate)
+ {
+ at->atom[a].m -= (mHmult-1.0)*at->atom[i].m;
+ at->atom[a].mB -= (mHmult-1.0)*at->atom[i].m;
+ }
+ at->atom[i].m *= mHmult;
+ at->atom[i].mB *= mHmult;
+ }
+ }
+}
--- /dev/null
- /* Undefine all defines used below so we can include this file multiple times
- * with different settings from the same source file.
- */
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* The macros in this file are intended to be used for writing
+ * architecture-independent SIMD intrinsics code.
+ * To support a new architecture, adding macros here should be (nearly)
+ * all that is needed.
+ */
+
- #undef GMX_SIMD_WIDTH_HERE
++#ifdef _gmx_simd_macros_h_
++#error "gmx_simd_macros.h included twice"
++#else
++#define _gmx_simd_macros_h_
+
+/* NOTE: SSE2 acceleration does not include floor or blendv */
+
- /* float/double SIMD register type */
- #undef gmx_mm_pr
+
- /* integer SIMD register type, only used in the tabulated PME kernels */
- #undef gmx_epi32
++/* Uncomment the next line, with no other SIMD active, to test the plain-C version */
++/* #define GMX_SIMD_REFERENCE_PLAIN_C */
++#ifdef GMX_SIMD_REFERENCE_PLAIN_C
++/* Plain C SIMD reference implementation, also serves as documentation */
++#define GMX_HAVE_SIMD_MACROS
+
- #undef gmx_load_pr
- #undef gmx_load1_pr
- #undef gmx_set1_pr
- #undef gmx_setzero_pr
- #undef gmx_store_pr
++/* In general the reference SIMD supports any SIMD width, including 1.
++ * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
++ * The nbnxn 2xnn kernels are currently not supported.
++ */
++#define GMX_SIMD_REF_WIDTH 4
+
- #undef gmx_add_pr
- #undef gmx_sub_pr
- #undef gmx_mul_pr
++/* Include plain-C reference implementation, also serves as documentation */
++#include "gmx_simd_ref.h"
+
- /* d = gmx_madd_pr(a,b,c): d = a*b + c, could use FMA3 or FMA4 */
- #undef gmx_madd_pr
- /* d = gmx_nmsub_pr(a,b,c): d = -a*b + c, could use FMA3 or FMA4 */
- #undef gmx_nmsub_pr
- #undef gmx_max_pr
- #undef gmx_cmplt_pr
- /* gmx_blendzero_pr(real a, boolean b) does: (b ? a : 0) */
- #undef gmx_blendzero_pr
- /* Logical operations on SIMD booleans */
- #undef gmx_and_pr
- #undef gmx_or_pr
- #undef gmx_andnot_pr
++#define GMX_SIMD_WIDTH_HERE GMX_SIMD_REF_WIDTH
++
++/* float/double SIMD register type */
++#define gmx_mm_pr gmx_simd_ref_pr
++
++/* boolean SIMD register type */
++#define gmx_mm_pb gmx_simd_ref_pb
++
++/* integer SIMD register type, only for table indexing and exclusion masks */
++#define gmx_epi32 gmx_simd_ref_epi32
++#define GMX_SIMD_EPI32_WIDTH GMX_SIMD_REF_EPI32_WIDTH
++
++/* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
++#define gmx_load_pr gmx_simd_ref_load_pr
++/* Set all SIMD register elements to *r */
++#define gmx_load1_pr gmx_simd_ref_load1_pr
++#define gmx_set1_pr gmx_simd_ref_set1_pr
++#define gmx_setzero_pr gmx_simd_ref_setzero_pr
++#define gmx_store_pr gmx_simd_ref_store_pr
++
++#define gmx_add_pr gmx_simd_ref_add_pr
++#define gmx_sub_pr gmx_simd_ref_sub_pr
++#define gmx_mul_pr gmx_simd_ref_mul_pr
+/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
- /* Only used for PBC in bonded interactions, can be avoided */
- #undef gmx_round_pr
++#define gmx_madd_pr gmx_simd_ref_madd_pr
++#define gmx_nmsub_pr gmx_simd_ref_nmsub_pr
++
++#define gmx_max_pr gmx_simd_ref_max_pr
++#define gmx_blendzero_pr gmx_simd_ref_blendzero_pr
++
++#define gmx_round_pr gmx_simd_ref_round_pr
+
- #undef GMX_HAVE_SIMD_FLOOR
- #undef gmx_floor_pr
+/* Not required, only used to speed up the nbnxn tabulated PME kernels */
- #undef GMX_HAVE_SIMD_BLENDV
- #undef gmx_blendv_pr
- /* Not required, gmx_anytrue(x) returns if any of the boolean is x is True.
++#define GMX_SIMD_HAVE_FLOOR
++#ifdef GMX_SIMD_HAVE_FLOOR
++#define gmx_floor_pr gmx_simd_ref_floor_pr
++#endif
+
+/* Not required, only used when blendv is faster than comparison */
- #undef GMX_HAVE_SIMD_ANYTRUE
- #undef gmx_anytrue_pr
++#define GMX_SIMD_HAVE_BLENDV
++#ifdef GMX_SIMD_HAVE_BLENDV
++#define gmx_blendv_pr gmx_simd_ref_blendv_pr
++#endif
++
++/* Copy the sign of a to b, assumes b >= 0 for efficiency */
++#define gmx_cpsgn_nonneg_pr gmx_simd_ref_cpsgn_nonneg_pr
++
++/* Very specific operation required in the non-bonded kernels */
++#define gmx_masknot_add_pr gmx_simd_ref_masknot_add_pr
++
++/* Comparison */
++#define gmx_cmplt_pr gmx_simd_ref_cmplt_pr
++
++/* Logical operations on SIMD booleans */
++#define gmx_and_pb gmx_simd_ref_and_pb
++#define gmx_or_pb gmx_simd_ref_or_pb
++
++/* Not required, gmx_anytrue_pb(x) returns whether any of the booleans in x is True.
+ * If this is not present, define GMX_SIMD_IS_TRUE(real x),
+ * which should return x==True, with True as defined by the SIMD architecture.
+ */
- /* Integer set and cast are only used for nbnxn exclusion masks */
- #undef gmx_set1_epi32
- #undef gmx_castsi_pr
++#define GMX_SIMD_HAVE_ANYTRUE
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb gmx_simd_ref_anytrue_pb
++#else
++/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
++#define gmx_store_pb gmx_simd_ref_store_pb
++#endif
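As a usage illustration (a sketch, not part of this header): a typical cut-off check builds a per-lane boolean with gmx_cmplt_pr and uses gmx_anytrue_pb to decide whether any lane needs computing at all.

/* Sketch: returns non-zero when any SIMD lane has rsq < rc^2,
 * so a cluster pair with no lane in range can be skipped early.
 */
static gmx_inline int
gmx_any_within_cutoff_sketch(gmx_mm_pr rsq_S, gmx_mm_pr rc2_S)
{
    gmx_mm_pb wr_S = gmx_cmplt_pr(rsq_S, rc2_S);

    return gmx_anytrue_pb(wr_S);
}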
+
- #undef gmx_load_si
- /* If the same bit is set in both input masks, return all bits 1, otherwise 0 */
- #undef gmx_checkbitmask_epi32
+/* For topology exclusion pair checking we need: ((a & b) ? True : False)
+ * when we do a bit-wise and between a and b.
+ * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b).
+ * Otherwise we do all operations, except for the set1, in reals.
+ */
- #undef gmx_checkbitmask_pr
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#define gmx_set1_epi32 gmx_simd_ref_set1_epi32
++#define gmx_load_si gmx_simd_ref_load_si
++#define gmx_checkbitmask_epi32 gmx_simd_ref_checkbitmask_epi32
++#endif
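The per-lane semantics are easiest to see in scalar form; the helper below is a hypothetical illustration of the comment above, not part of the SIMD API.

/* Scalar sketch of one lane of the exclusion check: with a single-bit
 * filter in b, "any common bit set" means the pair interacts.
 */
static gmx_inline int gmx_checkbitmask_scalar_sketch(unsigned a, unsigned b)
{
    return ((a & b) != 0);
}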
++
++/* #define GMX_SIMD_HAVE_CHECKBITMASK_PR */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
++#define gmx_castsi_pr gmx_simd_ref_castsi_pr
+/* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
+ * identical 32-bit masks are set in one double and one or both can be used.
+ */
- #undef gmx_cvttpr_epi32
- #undef gmx_cvtepi32_pr
-
- #undef gmx_invsqrt_pr
- /* sqrt+inv+sin+cos+acos+atan2 are only used for bonded potentials */
- #undef gmx_sqrt_pr
- #undef gmx_inv_pr
- #undef gmx_sincos_pr
- #undef gmx_acos_pr
- #undef gmx_atan_pr
-
- #undef gmx_calc_rsq_pr
- #undef gmx_sum4_pr
-
- /* Only required for nbnxn analytical PME kernels */
- #undef gmx_pmecorrF_pr
- #undef gmx_pmecorrV_pr
-
++#define gmx_checkbitmask_pr gmx_simd_ref_checkbitmask_pr
++#endif
+
+/* Conversions only used for PME table lookup */
- /* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
- #undef gmx_mm_hpr
-
- #undef gmx_load_hpr
- #undef gmx_load1_hpr
- #undef gmx_store_hpr
- #undef gmx_add_hpr
- #undef gmx_sub_hpr
++#define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32
++#define gmx_cvtepi32_pr gmx_simd_ref_cvtepi32_pr
+
- #undef gmx_sum4_hpr
++/* These two functions only need to be approximate; Newton-Raphson iteration
++ * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
++ */
++#define gmx_rsqrt_pr gmx_simd_ref_rsqrt_pr
++#define gmx_rcp_pr gmx_simd_ref_rcp_pr
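For reference, one Newton-Raphson refinement step for the reciprocal square root, written with the macros above; this sketches the kind of iteration the comment refers to, not necessarily the exact form used inside gmx_invsqrt_pr.

/* One Newton-Raphson step for y ~= 1/sqrt(r): y' = 0.5*y*(3 - r*y*y),
 * which roughly doubles the number of correct bits (sketch only).
 */
static gmx_inline gmx_mm_pr
gmx_invsqrt_nr_step_sketch(gmx_mm_pr r, gmx_mm_pr y)
{
    gmx_mm_pr half_S  = gmx_set1_pr(0.5);
    gmx_mm_pr three_S = gmx_set1_pr(3.0);

    return gmx_mul_pr(gmx_mul_pr(half_S, y),
                      gmx_sub_pr(three_S, gmx_mul_pr(r, gmx_mul_pr(y, y))));
}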
+
- #undef gmx_2hpr_to_pr
++/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
++#define GMX_SIMD_HAVE_EXP
++#ifdef GMX_SIMD_HAVE_EXP
++#define gmx_exp_pr gmx_simd_ref_exp_pr
++#endif
++#define GMX_SIMD_HAVE_TRIGONOMETRIC
++#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
++#define gmx_sqrt_pr gmx_simd_ref_sqrt_pr
++#define gmx_sincos_pr gmx_simd_ref_sincos_pr
++#define gmx_acos_pr gmx_simd_ref_acos_pr
++#define gmx_atan2_pr gmx_simd_ref_atan2_pr
++#endif
+
- /* Generic macros for obtaining a SIMD aligned pointer from pointer x */
- #undef gmx_simd_align_real
- #undef gmx_simd_align_int
-
-
++#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
+
+
+/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
+ * to instructions for) different SIMD width and float precision.
+ *
+ * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
+ * The _pr suffix is replaced by _ps or _pd (for single or double precision).
+ * Compiler settings will decide if 128-bit intrinsics will
+ * be translated into SSE or AVX instructions.
+ */
+
+
- #include "gmx_x86_simd_single.h"
-
+#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
+#if defined GMX_X86_AVX_256
+/* We have half SIMD width support, continue */
+#else
+#error "half SIMD width intrinsics are not supported"
+#endif
+#endif
+
+
+#ifdef GMX_X86_SSE2
++/* This is for general x86 SIMD instruction sets that also support SSE2 */
++#define GMX_HAVE_SIMD_MACROS
++
++/* Include the highest supported x86 SIMD intrinsics + math functions */
++#ifdef GMX_X86_AVX_256
++#include "gmx_x86_avx_256.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_avx_256_double.h"
++#else
++#include "gmx_math_x86_avx_256_single.h"
++#endif
++#else
++#ifdef GMX_X86_AVX_128_FMA
++#include "gmx_x86_avx_128_fma.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_avx_128_fma_double.h"
++#else
++#include "gmx_math_x86_avx_128_fma_single.h"
++#endif
++#else
++#ifdef GMX_X86_SSE4_1
++#include "gmx_x86_sse4_1.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_sse4_1_double.h"
++#else
++#include "gmx_math_x86_sse4_1_single.h"
++#endif
++#else
++#ifdef GMX_X86_SSE2
++#include "gmx_x86_sse2.h"
++#ifdef GMX_DOUBLE
++#include "gmx_math_x86_sse2_double.h"
++#else
++#include "gmx_math_x86_sse2_single.h"
++#endif
++#else
++#error No x86 acceleration defined
++#endif
++#endif
++#endif
++#endif
++/* exp and trigonometric functions are included above */
++#define GMX_SIMD_HAVE_EXP
++#define GMX_SIMD_HAVE_TRIGONOMETRIC
+
+#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
+
+#ifndef GMX_DOUBLE
+
- #define gmx_cmplt_pr _mm_cmplt_ps
+#define GMX_SIMD_WIDTH_HERE 4
+
+#define gmx_mm_pr __m128
+
++#define gmx_mm_pb __m128
++
+#define gmx_epi32 __m128i
++#define GMX_SIMD_EPI32_WIDTH 4
+
+#define gmx_load_pr _mm_load_ps
+#define gmx_load1_pr _mm_load1_ps
+#define gmx_set1_pr _mm_set1_ps
+#define gmx_setzero_pr _mm_setzero_ps
+#define gmx_store_pr _mm_store_ps
+
+#define gmx_add_pr _mm_add_ps
+#define gmx_sub_pr _mm_sub_ps
+#define gmx_mul_pr _mm_mul_ps
+#ifdef GMX_X86_AVX_128_FMA
+#define gmx_madd_pr(a, b, c) _mm_macc_ps(a, b, c)
+#define gmx_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c)
+#else
+#define gmx_madd_pr(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b))
+#define gmx_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
+#endif
+#define gmx_max_pr _mm_max_ps
- #define gmx_and_pr _mm_and_ps
- #define gmx_or_pr _mm_or_ps
- #define gmx_andnot_pr _mm_andnot_ps
+#define gmx_blendzero_pr _mm_and_ps
- #define GMX_HAVE_SIMD_FLOOR
++
++#define gmx_cmplt_pr _mm_cmplt_ps
++#define gmx_and_pb _mm_and_ps
++#define gmx_or_pb _mm_or_ps
+
+#ifdef GMX_X86_SSE4_1
+#define gmx_round_pr(x) _mm_round_ps(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+#define gmx_floor_pr _mm_floor_ps
+#else
+#define gmx_round_pr(x) _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
+#endif
+
+#ifdef GMX_X86_SSE4_1
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr _mm_movemask_ps
++#define GMX_SIMD_HAVE_BLENDV
+#define gmx_blendv_pr _mm_blendv_ps
+#endif
+
- #define gmx_castsi_pr gmx_mm_castsi128_ps
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++ /* The value -0.0 has only the sign-bit set */
++ gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
++ return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
++}
+
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }
++
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb _mm_movemask_ps
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+#define gmx_set1_epi32 _mm_set1_epi32
- #define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
+#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
- #define gmx_invsqrt_pr gmx_mm_invsqrt_ps
++#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
+
+#define gmx_cvttpr_epi32 _mm_cvttps_epi32
+#define gmx_cvtepi32_pr _mm_cvtepi32_ps
+
- #define gmx_inv_pr gmx_mm_inv_ps
++#define gmx_rsqrt_pr _mm_rsqrt_ps
++#define gmx_rcp_pr _mm_rcp_ps
++
++#define gmx_exp_pr gmx_mm_exp_ps
+#define gmx_sqrt_pr gmx_mm_sqrt_ps
- #define gmx_calc_rsq_pr gmx_mm_calc_rsq_ps
- #define gmx_sum4_pr gmx_mm_sum4_ps
-
- #define gmx_pmecorrF_pr gmx_mm_pmecorrF_ps
- #define gmx_pmecorrV_pr gmx_mm_pmecorrV_ps
-
+#define gmx_sincos_pr gmx_mm_sincos_ps
+#define gmx_acos_pr gmx_mm_acos_ps
+#define gmx_atan2_pr gmx_mm_atan2_ps
+
- #include "gmx_x86_simd_double.h"
-
+#else /* ifndef GMX_DOUBLE */
+
- #define gmx_cmplt_pr _mm_cmplt_pd
+#define GMX_SIMD_WIDTH_HERE 2
+
+#define gmx_mm_pr __m128d
+
++#define gmx_mm_pb __m128d
++
+#define gmx_epi32 __m128i
++#define GMX_SIMD_EPI32_WIDTH 4
+
+#define gmx_load_pr _mm_load_pd
+#define gmx_load1_pr _mm_load1_pd
+#define gmx_set1_pr _mm_set1_pd
+#define gmx_setzero_pr _mm_setzero_pd
+#define gmx_store_pr _mm_store_pd
+
+#define gmx_add_pr _mm_add_pd
+#define gmx_sub_pr _mm_sub_pd
+#define gmx_mul_pr _mm_mul_pd
+#ifdef GMX_X86_AVX_128_FMA
+#define gmx_madd_pr(a, b, c) _mm_macc_pd(a, b, c)
+#define gmx_nmsub_pr(a, b, c) _mm_nmacc_pd(a, b, c)
+#else
+#define gmx_madd_pr(a, b, c) _mm_add_pd(c, _mm_mul_pd(a, b))
+#define gmx_nmsub_pr(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b))
+#endif
+#define gmx_max_pr _mm_max_pd
- #define gmx_and_pr _mm_and_pd
- #define gmx_or_pr _mm_or_pd
- #define gmx_andnot_pr _mm_andnot_pd
+#define gmx_blendzero_pr _mm_and_pd
- #define GMX_HAVE_SIMD_FLOOR
+
+#ifdef GMX_X86_SSE4_1
+#define gmx_round_pr(x) _mm_round_pd(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+#define gmx_floor_pr _mm_floor_pd
+#else
+#define gmx_round_pr(x) _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
+/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
+#endif
+
+#ifdef GMX_X86_SSE4_1
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr _mm_movemask_pd
++#define GMX_SIMD_HAVE_BLENDV
+#define gmx_blendv_pr _mm_blendv_pd
+#endif
+
- #define gmx_castsi_pr gmx_mm_castsi128_pd
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++ gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
++ return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
++}
++
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); }
+
++#define gmx_cmplt_pr _mm_cmplt_pd
++
++#define gmx_and_pb _mm_and_pd
++#define gmx_or_pb _mm_or_pd
++
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb _mm_movemask_pd
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+#define gmx_set1_epi32 _mm_set1_epi32
- #define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
+#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
- #define gmx_invsqrt_pr gmx_mm_invsqrt_pd
++#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
+
+#define gmx_cvttpr_epi32 _mm_cvttpd_epi32
+#define gmx_cvtepi32_pr _mm_cvtepi32_pd
+
- #define gmx_inv_pr gmx_mm_inv_pd
++#define gmx_rsqrt_pr(r) _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
++#define gmx_rcp_pr(r) _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
++
++#define gmx_exp_pr gmx_mm_exp_pd
+#define gmx_sqrt_pr gmx_mm_sqrt_pd
- #define gmx_calc_rsq_pr gmx_mm_calc_rsq_pd
- #define gmx_sum4_pr gmx_mm_sum4_pd
-
- #define gmx_pmecorrF_pr gmx_mm_pmecorrF_pd
- #define gmx_pmecorrV_pr gmx_mm_pmecorrV_pd
-
+#define gmx_sincos_pr gmx_mm_sincos_pd
+#define gmx_acos_pr gmx_mm_acos_pd
+#define gmx_atan2_pr gmx_mm_atan2_pd
+
- #include "gmx_x86_simd_single.h"
-
+#endif /* ifndef GMX_DOUBLE */
+
+#else
+/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
+ * so we use 256-bit SIMD.
+ */
+
+#ifndef GMX_DOUBLE
+
- /* Less-than (we use ordered, non-signaling, but that's not required) */
- #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
+#define GMX_SIMD_WIDTH_HERE 8
+
+#define gmx_mm_pr __m256
+
++#define gmx_mm_pb __m256
++
+#define gmx_epi32 __m256i
++#define GMX_SIMD_EPI32_WIDTH 8
+
+#define gmx_load_pr _mm256_load_ps
+#define gmx_load1_pr(x) _mm256_set1_ps((x)[0])
+#define gmx_set1_pr _mm256_set1_ps
+#define gmx_setzero_pr _mm256_setzero_ps
+#define gmx_store_pr _mm256_store_ps
+
+#define gmx_add_pr _mm256_add_ps
+#define gmx_sub_pr _mm256_sub_ps
+#define gmx_mul_pr _mm256_mul_ps
+#define gmx_madd_pr(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
+#define gmx_nmsub_pr(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b))
+#define gmx_max_pr _mm256_max_ps
- #define gmx_and_pr _mm256_and_ps
- #define gmx_or_pr _mm256_or_ps
- #define gmx_andnot_pr _mm256_andnot_ps
+#define gmx_blendzero_pr _mm256_and_ps
- #define GMX_HAVE_SIMD_FLOOR
+
+#define gmx_round_pr(x) _mm256_round_ps(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+#define gmx_floor_pr _mm256_floor_ps
+
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr _mm256_movemask_ps
++#define GMX_SIMD_HAVE_BLENDV
+#define gmx_blendv_pr _mm256_blendv_ps
+
- #define gmx_invsqrt_pr gmx_mm256_invsqrt_ps
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++ gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
++ return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
++}
++
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); }
++
++/* Less-than (we use ordered, non-signaling, but that's not required) */
++#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
++#define gmx_and_pb _mm256_and_ps
++#define gmx_or_pb _mm256_or_ps
+
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb _mm256_movemask_ps
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_PR
+#define gmx_set1_epi32 _mm256_set1_epi32
+#define gmx_castsi_pr _mm256_castsi256_ps
+/* With <= 16 bits used the cast and conversion should not be required,
+ * since only mantissa bits are set and that would give a non-zero float,
+ * but with the Intel compiler this does not work correctly.
+ */
+#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c)
+
+#define gmx_cvttpr_epi32 _mm256_cvttps_epi32
+
- #define gmx_inv_pr gmx_mm256_inv_ps
++#define gmx_rsqrt_pr _mm256_rsqrt_ps
++#define gmx_rcp_pr _mm256_rcp_ps
++
++#define gmx_exp_pr gmx_mm256_exp_ps
+#define gmx_sqrt_pr gmx_mm256_sqrt_ps
- #define gmx_calc_rsq_pr gmx_mm256_calc_rsq_ps
- #define gmx_sum4_pr gmx_mm256_sum4_ps
-
- #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps
- #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps
-
+#define gmx_sincos_pr gmx_mm256_sincos_ps
+#define gmx_acos_pr gmx_mm256_acos_ps
+#define gmx_atan2_pr gmx_mm256_atan2_ps
+
- #include "gmx_x86_simd_double.h"
-
+#else
+
- /* Less-than (we use ordered, non-signaling, but that's not required) */
- #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
+#define GMX_SIMD_WIDTH_HERE 4
+
+#define gmx_mm_pr __m256d
+
++#define gmx_mm_pb __m256d
++
+/* We use 128-bit integer registers because of missing 256-bit operations */
+#define gmx_epi32 __m128i
++#define GMX_SIMD_EPI32_WIDTH 4
+
+#define gmx_load_pr _mm256_load_pd
+#define gmx_load1_pr(x) _mm256_set1_pd((x)[0])
+#define gmx_set1_pr _mm256_set1_pd
+#define gmx_setzero_pr _mm256_setzero_pd
+#define gmx_store_pr _mm256_store_pd
+
+#define gmx_add_pr _mm256_add_pd
+#define gmx_sub_pr _mm256_sub_pd
+#define gmx_mul_pr _mm256_mul_pd
+#define gmx_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b))
+#define gmx_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b))
+#define gmx_max_pr _mm256_max_pd
- #define gmx_and_pr _mm256_and_pd
- #define gmx_or_pr _mm256_or_pd
- #define gmx_andnot_pr _mm256_andnot_pd
+#define gmx_blendzero_pr _mm256_and_pd
- #define GMX_HAVE_SIMD_FLOOR
+
+#define gmx_round_pr(x) _mm256_round_pd(x, 0x0)
- #define GMX_HAVE_SIMD_BLENDV
++#define GMX_SIMD_HAVE_FLOOR
+#define gmx_floor_pr _mm256_floor_pd
+
- #define GMX_HAVE_SIMD_ANYTRUE
- #define gmx_anytrue_pr _mm256_movemask_pd
++#define GMX_SIMD_HAVE_BLENDV
+#define gmx_blendv_pr _mm256_blendv_pd
+
- #define gmx_invsqrt_pr gmx_mm256_invsqrt_pd
++static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
++{
++ gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
++ return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
++}
++
++static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); }
++
++/* Less-than (we use ordered, non-signaling, but that's not required) */
++#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
++
++#define gmx_and_pb _mm256_and_pd
++#define gmx_or_pb _mm256_or_pd
+
++#define GMX_SIMD_HAVE_ANYTRUE
++#define gmx_anytrue_pb _mm256_movemask_pd
++
++#define GMX_SIMD_HAVE_CHECKBITMASK_PR
+#define gmx_set1_epi32 _mm256_set1_epi32
+#define gmx_castsi_pr _mm256_castsi256_pd
+/* With <= 16 bits used the cast and conversion should not be required,
+ * since only mantissa bits are set and that would give a non-zero float,
+ * but with the Intel compiler this does not work correctly.
+ * Because AVX does not have int->double conversion, we convert via float.
+ */
+#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c)
+
+#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32
+
- #define gmx_inv_pr gmx_mm256_inv_pd
++#define gmx_rsqrt_pr(r) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
++#define gmx_rcp_pr(r) _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
++
++#define gmx_exp_pr gmx_mm256_exp_pd
+#define gmx_sqrt_pr gmx_mm256_sqrt_pd
- #define gmx_calc_rsq_pr gmx_mm256_calc_rsq_pd
- #define gmx_sum4_pr gmx_mm256_sum4_pd
-
- #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_pd
- #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_pd
-
+#define gmx_sincos_pr gmx_mm256_sincos_pd
+#define gmx_acos_pr gmx_mm256_acos_pd
+#define gmx_atan2_pr gmx_mm256_atan2_pd
+
- /* Generic macros to extract a SIMD aligned pointer from a pointer x.
+#endif /* GMX_DOUBLE */
+
+#endif /* 128- or 256-bit x86 SIMD */
+
+#endif /* GMX_X86_SSE2 */
+
+
- #define gmx_simd_align_real(x) (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))))
++#ifdef GMX_HAVE_SIMD_MACROS
++/* Generic functions to extract a SIMD-aligned pointer from a pointer x.
+ * The buffer at x should have at least GMX_SIMD_WIDTH_HERE extra elements
+ * beyond those you intend to use, to avoid indexing outside the allocation.
+ */
+
- #define gmx_simd_align_int(x) (int *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))))
++static gmx_inline real *
++gmx_simd_align_real(const real *x)
++{
++ return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
++}
++
++static gmx_inline int *
++gmx_simd_align_int(const int *x)
++{
++ return (int *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
++}
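A usage sketch (the names and the plain malloc are illustrative, not GROMACS API): over-allocate by GMX_SIMD_WIDTH_HERE elements, keep the raw pointer for freeing, and compute on the aligned pointer.

#include <stdlib.h>

/* Sketch: allocate n usable, SIMD-aligned reals. The raw pointer is
 * returned through *raw and is the one that must eventually be freed.
 */
static real *simd_alloc_real_sketch(int n, real **raw)
{
    *raw = (real *)malloc((n + GMX_SIMD_WIDTH_HERE)*sizeof(real));

    return (*raw != NULL) ? gmx_simd_align_real(*raw) : NULL;
}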
++
++
++/* Include the math functions which only need the above macros,
++ * generally these are the ones that don't need masking operations.
++ */
++#ifdef GMX_DOUBLE
++#include "gmx_simd_math_double.h"
++#else
++#include "gmx_simd_math_single.h"
++#endif
++
++#endif /* GMX_HAVE_SIMD_MACROS */
+
++#endif /* _gmx_simd_macros_h_ */
--- /dev/null
- #if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_NBNXN_SIMD_BITWIDTH 256
- #else
- #define GMX_NBNXN_SIMD_BITWIDTH 128
- #endif
-
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef NB_VERLET_H
+#define NB_VERLET_H
+
+#include "nbnxn_pairlist.h"
+#include "nbnxn_cuda_types_ext.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
++
++/* For testing the reference plain-C SIMD kernels, uncomment the next lines,
++ * as well as the GMX_SIMD_REFERENCE_PLAIN_C define in gmx_simd_macros.h
++ * The actual SIMD width is set in gmx_simd_macros.h
++ * The 4xN reference kernels support 2-, 4- and 8-way SIMD.
++ * The 2x(N+N) reference kernels support 8- and 16-way SIMD.
++ */
++/* #define GMX_NBNXN_SIMD */
++/* #define GMX_NBNXN_SIMD_4XN */
++/* #define GMX_NBNXN_SIMD_2XNN */
++
++
+#ifdef GMX_X86_SSE2
+/* Use SIMD accelerated nbnxn search and kernels */
+#define GMX_NBNXN_SIMD
+
+/* Uncomment the next line to use (slower) 128-bit SIMD with AVX-256 */
+/* #define GMX_NBNXN_HALF_WIDTH_SIMD */
+
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
+/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ * Currently the 2xNN SIMD kernels only make sense with:
+ * 8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
+ */
+#define GMX_NBNXN_SIMD_4XN
++#if defined GMX_X86_AVX_256 && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD)
+#define GMX_NBNXN_SIMD_2XNN
+#endif
+
+#endif
+
+
+/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
+typedef enum
+{
+ nbnxnkNotSet = 0,
+ nbnxnk4x4_PlainC,
+ nbnxnk4xN_SIMD_4xN,
+ nbnxnk4xN_SIMD_2xNN,
+ nbnxnk8x8x8_CUDA,
+ nbnxnk8x8x8_PlainC,
+ nbnxnkNR
+} nbnxn_kernel_type;
+
+/*! Return a string identifying the kernel type */
+const char *lookup_nbnxn_kernel_name(int kernel_type);
+
+enum {
+ ewaldexclTable, ewaldexclAnalytical
+};
+
+/* Atom locality indicator: local, non-local, all, used for calls to:
+ gridding, pair-search, force calculation, x/f buffer operations */
+enum {
+ eatLocal = 0, eatNonlocal = 1, eatAll
+};
+
+#define LOCAL_A(x) ((x) == eatLocal)
+#define NONLOCAL_A(x) ((x) == eatNonlocal)
+#define LOCAL_OR_NONLOCAL_A(x) (LOCAL_A(x) || NONLOCAL_A(x))
+
+/* Interaction locality indicator (used in pair-list search/calculations):
+ - local interactions require local atom data and affect local output only;
+ - non-local interactions require both local and non-local atom data and
+ affect both local- and non-local output. */
+enum {
+ eintLocal = 0, eintNonlocal = 1
+};
+
+#define LOCAL_I(x) ((x) == eintLocal)
+#define NONLOCAL_I(x) ((x) == eintNonlocal)
+
+enum {
+ enbvClearFNo, enbvClearFYes
+};
+
+typedef struct {
+ nbnxn_pairlist_set_t nbl_lists; /* pair list(s) */
+ nbnxn_atomdata_t *nbat; /* atom data */
+ int kernel_type; /* non-bonded kernel - see enum above */
+ int ewald_excl; /* Ewald exclusion - see enum above */
+} nonbonded_verlet_group_t;
+
+/* non-bonded data structure with Verlet-type cut-off */
+typedef struct {
+ nbnxn_search_t nbs; /* n vs n atom pair searching data */
+ int ngrp; /* number of interaction groups */
+ nonbonded_verlet_group_t grp[2]; /* local and non-local interaction group */
+
+ gmx_bool bUseGPU; /* TRUE when GPU acceleration is used */
+ nbnxn_cuda_ptr_t cu_nbv; /* pointer to CUDA nb verlet data */
+ int min_ci_balanced; /* pair list balancing parameter
+ used for the 8x8x8 CUDA kernels */
+} nonbonded_verlet_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NB_VERLET_H */
--- /dev/null
- int cj; /* The j-cluster */
- unsigned excl; /* The exclusion (interaction) bits */
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef _nbnxn_pairlist_h
+#define _nbnxn_pairlist_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* A buffer data structure of 64 bytes
+ * to be placed at the beginning and end of structs
+ * to avoid cache invalidation of the real contents
+ * of the struct by writes to neighboring memory.
+ */
+typedef struct {
+ int dummy[16];
+} gmx_cache_protect_t;
+
+/* Abstract type for pair searching data */
+typedef struct nbnxn_search * nbnxn_search_t;
+
+/* Function that should return a pointer *ptr to memory
+ * of size nbytes.
+ * Error handling should be done within this function.
+ */
+typedef void nbnxn_alloc_t (void **ptr, size_t nbytes);
+
+/* Function that should free the memory pointed to by *ptr.
+ * NULL should not be passed to this function.
+ */
+typedef void nbnxn_free_t (void *ptr);
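A minimal pair of functions matching these typedefs might look as follows (a sketch only; real callers can instead supply, e.g., CUDA pinned-memory allocators):

#include <stdio.h>
#include <stdlib.h>

/* Sketch of an nbnxn_alloc_t: the contract requires error handling here */
static void nbnxn_alloc_sketch(void **ptr, size_t nbytes)
{
    *ptr = malloc(nbytes);
    if (*ptr == NULL && nbytes > 0)
    {
        fprintf(stderr, "allocation of %lu bytes failed\n",
                (unsigned long)nbytes);
        exit(1);
    }
}

/* Sketch of an nbnxn_free_t; NULL is never passed, per the comment above */
static void nbnxn_free_sketch(void *ptr)
{
    free(ptr);
}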
+
++/* This is the actual cluster-pair list j-entry.
++ * cj is the j-cluster.
++ * The interaction bits in excl are indexed i-major, j-minor.
++ * The cj entries are sorted such that ones with exclusions come first.
++ * This means that once a full mask (=NBNXN_INTERACTION_MASK_ALL)
++ * is found, all subsequent j-entries in the i-entry also have full masks.
++ */
+typedef struct {
- unsigned pair[32]; /* Exclusion bits for one warp, *
- * each unsigned has bit for 4*8 i clusters */
++ int cj; /* The j-cluster */
++ unsigned excl; /* The topology exclusion (interaction) bits */
+} nbnxn_cj_t;
+
+/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ * The upper bits contain information for non-bonded kernel optimization.
+ * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ * But three flags can be used to skip interactions, currently only for subc=0
+ * !(shift & NBNXN_CI_DO_LJ(subc)) => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc) => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
+#define NBNXN_CI_SHIFT 127
+#define NBNXN_CI_DO_LJ(subc) (1<<(7+3*(subc)))
+#define NBNXN_CI_HALF_LJ(subc) (1<<(8+3*(subc)))
+#define NBNXN_CI_DO_COUL(subc) (1<<(9+3*(subc)))
+
+/* Simple pair-list i-unit */
+typedef struct {
+ int ci; /* i-cluster */
+ int shift; /* Shift vector index plus possible flags, see above */
+ int cj_ind_start; /* Start index into cj */
+ int cj_ind_end; /* End index into cj */
+} nbnxn_ci_t;
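To make the packing concrete, a small sketch of how the shift field of an nbnxn_ci_t is meant to be unpacked (the local names are illustrative):

/* Sketch: unpack the shift index and the subc=0 kernel-skip flags */
static void nbnxn_ci_decode_sketch(const nbnxn_ci_t *ci_entry,
                                   int *shift, gmx_bool *do_lj,
                                   gmx_bool *half_lj, gmx_bool *do_coul)
{
    *shift   = ci_entry->shift & NBNXN_CI_SHIFT;           /* low 7 bits */
    *do_lj   = ((ci_entry->shift & NBNXN_CI_DO_LJ(0))   != 0);
    *half_lj = ((ci_entry->shift & NBNXN_CI_HALF_LJ(0)) != 0);
    *do_coul = ((ci_entry->shift & NBNXN_CI_DO_COUL(0)) != 0);
}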
+
+/* Grouped pair-list i-unit */
+typedef struct {
+ int sci; /* i-super-cluster */
+ int shift; /* Shift vector index plus possible flags */
+ int cj4_ind_start; /* Start index into cj4 */
+ int cj4_ind_end; /* End index into cj4 */
+} nbnxn_sci_t;
+
+typedef struct {
+ unsigned imask; /* The i-cluster interactions mask for 1 warp */
+ int excl_ind; /* Index into the exclusion array for 1 warp */
+} nbnxn_im_ei_t;
+
+typedef struct {
+ int cj[4]; /* The 4 j-clusters */
+ nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */
+} nbnxn_cj4_t;
+
+typedef struct {
- real *simd_4xn_diag; /* indices to set the SIMD 4xN diagonal masks */
- real *simd_2xnn_diag; /* indices to set the SIMD 2x(N+N)diagonal masks */
- unsigned *simd_excl_mask; /* exclusion masks for SIMD topology exclusions */
++ unsigned pair[32]; /* Topology exclusion interaction bits for one warp,
++ * each unsigned holds one bit for each of 4*8 i-clusters
++ */
+} nbnxn_excl_t;
+
+typedef struct {
+ gmx_cache_protect_t cp0;
+
+ nbnxn_alloc_t *alloc;
+ nbnxn_free_t *free;
+
+ gmx_bool bSimple; /* A simple list has na_sc=na_s and uses cj; *
+ * a complex list uses cj4 */
+
+ int na_ci; /* The number of atoms per i-cluster */
+ int na_cj; /* The number of atoms per j-cluster */
+ int na_sc; /* The number of atoms per super cluster */
+ real rlist; /* The radius for constructing the list */
+ int nci; /* The number of i-clusters in the list */
+ nbnxn_ci_t *ci; /* The i-cluster list, size nci */
+ int ci_nalloc; /* The allocation size of ci */
+ int nsci; /* The number of i-super-clusters in the list */
+ nbnxn_sci_t *sci; /* The i-super-cluster list */
+ int sci_nalloc; /* The allocation size of sci */
+
+ int ncj; /* The number of j-clusters in the list */
+ nbnxn_cj_t *cj; /* The j-cluster list, size ncj */
+ int cj_nalloc; /* The allocation size of cj */
+
+ int ncj4; /* The total number of 4*j clusters */
+ nbnxn_cj4_t *cj4; /* The 4*j cluster list, size ncj4 */
+ int cj4_nalloc; /* The allocation size of cj4 */
+ int nexcl; /* The count for excl */
+ nbnxn_excl_t *excl; /* Atom interaction bits (non-exclusions) */
+ int excl_nalloc; /* The allocation size for excl */
+ int nci_tot; /* The total number of i clusters */
+
+ struct nbnxn_list_work *work;
+
+ gmx_cache_protect_t cp1;
+} nbnxn_pairlist_t;
+
+typedef struct {
+ int nnbl; /* number of lists */
+ nbnxn_pairlist_t **nbl; /* lists */
+ gmx_bool bCombined; /* TRUE if lists get combined into one (the 1st) */
+ gmx_bool bSimple; /* TRUE if the list is of type "simple"
+ (na_sc=na_s, no super-clusters used) */
+ int natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */
+ int natpair_lj; /* Total number of atom pairs for LJ kernel */
+ int natpair_q; /* Total number of atom pairs for Q kernel */
+} nbnxn_pairlist_set_t;
+
+enum {
+ nbatXYZ, nbatXYZQ, nbatX4, nbatX8
+};
+
+typedef struct {
+ real *f; /* f, size natoms*fstride */
+ real *fshift; /* Shift force array, size SHIFTS*DIM */
+ int nV; /* The size of *Vvdw and *Vc */
+ real *Vvdw; /* Temporary Van der Waals group energy storage */
+ real *Vc; /* Temporary Coulomb group energy storage */
+ int nVS; /* The size of *VSvdw and *VSc */
+ real *VSvdw; /* Temporary SIMD Van der Waals group energy storage */
+ real *VSc; /* Temporary SIMD Coulomb group energy storage */
+} nbnxn_atomdata_output_t;
+
+/* Block size in atoms for the non-bonded thread force-buffer reduction,
+ * should be a multiple of all cell and x86 SIMD sizes (i.e. 2, 4 and 8).
+ * Should be small to reduce the reduction and zeroing cost,
+ * but too small will result in overhead.
+ * Currently the block size is NBNXN_BUFFERFLAG_SIZE*3*sizeof(real)=192 bytes
+ * in both precisions (16*3*4 in single, 8*3*8 in double).
+ */
+#ifdef GMX_DOUBLE
+#define NBNXN_BUFFERFLAG_SIZE 8
+#else
+#define NBNXN_BUFFERFLAG_SIZE 16
+#endif
+
+/* We currently store the reduction flags as bits in an unsigned int.
+ * In most cases this limits the number of flags to 32.
+ * The reduction will automatically disable the flagging and do a full
+ * reduction when the flags won't fit, but this will lead to very slow
+ * reduction. As we anyhow don't expect reasonable performance with
+ * more than 32 threads, we put in this hard limit.
+ * You can increase this number, but the reduction will be very slow.
+ */
+#define NBNXN_BUFFERFLAG_MAX_THREADS 32
+
+/* Flags for telling if threads write to force output buffers */
+typedef struct {
+ int nflag; /* The number of flag blocks */
+ unsigned *flag; /* Bit i is set when thread i writes to a cell-block */
+ int flag_nalloc; /* Allocation size of flag */
+} nbnxn_buffer_flags_t;
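As an illustration of the bit layout (a sketch; the actual reduction code lives elsewhere): thread i marks a block it writes to by setting bit i of that block's flag word.

/* Sketch: record that 'thread' (< NBNXN_BUFFERFLAG_MAX_THREADS) wrote
 * to force-buffer block 'block'.
 */
static void buffer_flag_set_sketch(nbnxn_buffer_flags_t *flags,
                                   int block, int thread)
{
    flags->flag[block] |= (1U << thread);
}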
+
+/* LJ combination rules: geometric, Lorentz-Berthelot, none */
+enum {
+ ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR
+};
+
+typedef struct {
+ nbnxn_alloc_t *alloc;
+ nbnxn_free_t *free;
+ int ntype; /* The number of different atom types */
+ real *nbfp; /* Lennard-Jones 6*C6 and 12*C12 params, size ntype^2*2 */
+ int comb_rule; /* Combination rule, see enum above */
+ real *nbfp_comb; /* LJ parameter per atom type, size ntype*2 */
+ real *nbfp_s4; /* As nbfp, but with stride 4, size ntype^2*4. This
+ * might suit 4-wide SIMD loads of two values (e.g.
+ * two floats in single precision on x86). */
+ int natoms; /* Number of atoms */
+ int natoms_local; /* Number of local atoms */
+ int *type; /* Atom types */
+ real *lj_comb; /* LJ parameters per atom for combining for pairs */
+ int XFormat; /* The format of x (and q), enum */
+ int FFormat; /* The format of f, enum */
+ real *q; /* Charges, can be NULL if incorporated in x */
+ int na_c; /* The number of atoms per cluster */
+ int nenergrp; /* The number of energy groups */
+ int neg_2log; /* Log2 of nenergrp */
+ int *energrp; /* The energy groups per cluster, can be NULL */
+ gmx_bool bDynamicBox; /* Do we need to update shift_vec every step? */
+ rvec *shift_vec; /* Shift vectors, copied from t_forcerec */
+ int xstride; /* stride for a coordinate in x (usually 3 or 4) */
+ int fstride; /* stride for a coordinate in f (usually 3 or 4) */
+ real *x; /* x and possibly q, size natoms*xstride */
++
++ /* j-atom minus i-atom index for generating self and Newton exclusions
++ * cluster-cluster pairs of the diagonal, for 4xn and 2xnn kernels.
++ */
++ real *simd_4xn_diagonal_j_minus_i;
++ real *simd_2xnn_diagonal_j_minus_i;
++ /* Filters for topology exclusion masks for the SIMD kernels.
++ * filter2 is the same as filter1, but with each element duplicated.
++ */
++ unsigned *simd_exclusion_filter1;
++ unsigned *simd_exclusion_filter2;
++
+ int nout; /* The number of force arrays */
+ nbnxn_atomdata_output_t *out; /* Output data structures */
+ int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */
+ gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */
+ nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. */
+} nbnxn_atomdata_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
- endif (NOT GMX_EXTERNAL_BLAS)
+file(GLOB LINEARALGEBRA_SOURCES *.c)
+
+if (NOT GMX_EXTERNAL_BLAS)
+ file(GLOB BLAS_SOURCES gmx_blas/*.c)
- endif (NOT GMX_EXTERNAL_LAPACK)
++endif()
+
+if (NOT GMX_EXTERNAL_LAPACK)
+ file(GLOB LAPACK_SOURCES gmx_lapack/*.c)
++endif()
+
+set(LINEARALGEBRA_SOURCES
+ ${LINEARALGEBRA_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES})
+
+set(LIBGROMACS_SOURCES
+ ${LIBGROMACS_SOURCES} ${LINEARALGEBRA_SOURCES} PARENT_SCOPE)
+
+set(LINEARALGEBRA_PUBLIC_HEADERS
+ eigensolver.h
+ matrix.h
+ mtxio.h
+ sparsematrix.h)
+install(FILES ${LINEARALGEBRA_PUBLIC_HEADERS}
+ DESTINATION ${INCL_INSTALL_DIR}/gromacs/linearalgebra
+ COMPONENT development)
--- /dev/null
- case nbnxnkNotSet: returnvalue = "not set"; break;
- case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
- #ifndef GMX_NBNXN_SIMD
- case nbnxnk4xN_SIMD_4xN: returnvalue = "not available"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
- #else
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROwing Monsters And Cloning Shrimps
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include "sysstuff.h"
+#include "typedefs.h"
+#include "vec.h"
+#include "maths.h"
+#include "macros.h"
+#include "smalloc.h"
+#include "macros.h"
+#include "gmx_fatal.h"
+#include "gmx_fatal_collective.h"
+#include "physics.h"
+#include "force.h"
+#include "tables.h"
+#include "nonbonded.h"
+#include "invblock.h"
+#include "names.h"
+#include "network.h"
+#include "pbc.h"
+#include "ns.h"
+#include "mshift.h"
+#include "txtdump.h"
+#include "coulomb.h"
+#include "md_support.h"
+#include "md_logging.h"
+#include "domdec.h"
+#include "partdec.h"
+#include "qmmm.h"
+#include "copyrite.h"
+#include "mtop_util.h"
+#include "nbnxn_search.h"
+#include "nbnxn_atomdata.h"
+#include "nbnxn_consts.h"
+#include "statutil.h"
+#include "gmx_omp_nthreads.h"
+#include "gmx_detect_hardware.h"
+
+#ifdef _MSC_VER
+/* MSVC definition for __cpuid() */
+#include <intrin.h>
+#endif
+
+#include "types/nbnxn_cuda_types_ext.h"
+#include "gpu_utils.h"
+#include "nbnxn_cuda_data_mgmt.h"
+#include "pmalloc_cuda.h"
+
+t_forcerec *mk_forcerec(void)
+{
+ t_forcerec *fr;
+
+ snew(fr, 1);
+
+ return fr;
+}
+
+#ifdef DEBUG
+static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
+{
+ int i, j;
+
+ for (i = 0; (i < atnr); i++)
+ {
+ for (j = 0; (j < atnr); j++)
+ {
+ fprintf(fp, "%2d - %2d", i, j);
+ if (bBHAM)
+ {
+ fprintf(fp, " a=%10g, b=%10g, c=%10g\n", BHAMA(nbfp, atnr, i, j),
+ BHAMB(nbfp, atnr, i, j), BHAMC(nbfp, atnr, i, j)/6.0);
+ }
+ else
+ {
+ fprintf(fp, " c6=%10g, c12=%10g\n", C6(nbfp, atnr, i, j)/6.0,
+ C12(nbfp, atnr, i, j)/12.0);
+ }
+ }
+ }
+}
+#endif
+
+static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
+{
+ real *nbfp;
+ int i, j, k, atnr;
+
+ atnr = idef->atnr;
+ if (bBHAM)
+ {
+ snew(nbfp, 3*atnr*atnr);
+ for (i = k = 0; (i < atnr); i++)
+ {
+ for (j = 0; (j < atnr); j++, k++)
+ {
+ BHAMA(nbfp, atnr, i, j) = idef->iparams[k].bham.a;
+ BHAMB(nbfp, atnr, i, j) = idef->iparams[k].bham.b;
+ /* nbfp now includes the 6.0 derivative prefactor */
+ BHAMC(nbfp, atnr, i, j) = idef->iparams[k].bham.c*6.0;
+ }
+ }
+ }
+ else
+ {
+ snew(nbfp, 2*atnr*atnr);
+ for (i = k = 0; (i < atnr); i++)
+ {
+ for (j = 0; (j < atnr); j++, k++)
+ {
+ /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ C6(nbfp, atnr, i, j) = idef->iparams[k].lj.c6*6.0;
+ C12(nbfp, atnr, i, j) = idef->iparams[k].lj.c12*12.0;
+ }
+ }
+ }
+
+ return nbfp;
+}
+
+/* This routine sets fr->solvent_opt to the most common solvent in the
+ * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
+ * the fr->solvent_type array with the correct type (or esolNO).
+ *
+ * Charge groups that fulfill the conditions but are not identical to the
+ * most common one will be marked as esolNO in the solvent_type array.
+ *
+ * TIP3P is identical to SPC for these purposes, so we call it
+ * SPC in the arrays (Apologies to Bill Jorgensen ;-)
+ *
+ * NOTE: A QM particle should not become an optimized solvent,
+ * not even if there is only one charge group in the QM region.
+ */
+
+typedef struct
+{
+ int model;
+ int count;
+ int vdwtype[4];
+ real charge[4];
+} solvent_parameters_t;
+
+static void
+check_solvent_cg(const gmx_moltype_t *molt,
+ int cg0,
+ int nmol,
+ const unsigned char *qm_grpnr,
+ const t_grps *qm_grps,
+ t_forcerec * fr,
+ int *n_solvent_parameters,
+ solvent_parameters_t **solvent_parameters_p,
+ int cginfo,
+ int *cg_sp)
+{
+ const t_blocka * excl;
+ t_atom *atom;
+ int j, k;
+ int j0, j1, nj;
+ gmx_bool perturbed;
+ gmx_bool has_vdw[4];
+ gmx_bool match;
+ real tmp_charge[4];
+ int tmp_vdwtype[4];
+ int tjA;
+ gmx_bool qm;
+ solvent_parameters_t *solvent_parameters;
+
+ /* We use a list with parameters for each solvent type.
+ * Every time we discover a new molecule that fulfills the basic
+ * conditions for a solvent we compare with the previous entries
+ * in these lists. If the parameters are the same we just increment
+ * the counter for that type, and otherwise we create a new type
+ * based on the current molecule.
+ *
+ * Once we've finished going through all molecules we check which
+ * solvent is most common, and mark all those molecules while we
+ * clear the flag on all others.
+ */
+
+ solvent_parameters = *solvent_parameters_p;
+
+ /* Mark the cg first as not optimized */
+ *cg_sp = -1;
+
+ /* Check that this cg has no exclusions with atoms in other charge groups
+ * and that all atom pairs inside the charge group are excluded.
+ * We only have 3 or 4 atom solvent loops.
+ */
+ if (GET_CGINFO_EXCL_INTER(cginfo) ||
+ !GET_CGINFO_EXCL_INTRA(cginfo))
+ {
+ return;
+ }
+
+ /* Get the start and end atom indices of this charge group */
+ j0 = molt->cgs.index[cg0];
+ j1 = molt->cgs.index[cg0+1];
+
+ /* Number of atoms in our molecule */
+ nj = j1 - j0;
+
+ if (debug)
+ {
+ fprintf(debug,
+ "Moltype '%s': there are %d atoms in this charge group\n",
+ *molt->name, nj);
+ }
+
+ /* Check if it could be an SPC (3 atoms) or TIP4P (4 atoms) water,
+ * otherwise skip it.
+ */
+ if (nj < 3 || nj > 4)
+ {
+ return;
+ }
+
+ /* Check if we are doing QM on this group */
+ qm = FALSE;
+ if (qm_grpnr != NULL)
+ {
+ for (j = j0; j < j1 && !qm; j++)
+ {
+ qm = (qm_grpnr[j] < qm_grps->nr - 1);
+ }
+ }
+ /* Cannot use solvent optimization with QM */
+ if (qm)
+ {
+ return;
+ }
+
+ atom = molt->atoms.atom;
+
+ /* Still looks like a solvent, time to check parameters */
+
+ /* If it is perturbed (free energy) we can't use the solvent loops,
+ * so then we just skip to the next molecule.
+ */
+ perturbed = FALSE;
+
+ for (j = j0; j < j1 && !perturbed; j++)
+ {
+ perturbed = PERTURBED(atom[j]);
+ }
+
+ if (perturbed)
+ {
+ return;
+ }
+
+ /* Now the only question is whether the VdW and charge parameters
+ * are OK. Before doing the check we compare and see if they are
+ * identical to a possible previous solvent type.
+ * First we assign the current types and charges.
+ */
+ for (j = 0; j < nj; j++)
+ {
+ tmp_vdwtype[j] = atom[j0+j].type;
+ tmp_charge[j] = atom[j0+j].q;
+ }
+
+ /* Does it match any previous solvent type? */
+ for (k = 0; k < *n_solvent_parameters; k++)
+ {
+ match = TRUE;
+
+
+ /* We can only match SPC with 3 atoms and TIP4P with 4 atoms */
+ if ( (solvent_parameters[k].model == esolSPC && nj != 3) ||
+ (solvent_parameters[k].model == esolTIP4P && nj != 4) )
+ {
+ match = FALSE;
+ }
+
+ /* Check that types & charges match for all atoms in molecule */
+ for (j = 0; j < nj && match == TRUE; j++)
+ {
+ if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
+ {
+ match = FALSE;
+ }
+ if (tmp_charge[j] != solvent_parameters[k].charge[j])
+ {
+ match = FALSE;
+ }
+ }
+ if (match == TRUE)
+ {
+ /* Congratulations! We have a matched solvent.
+ * Flag it with this type for later processing.
+ */
+ *cg_sp = k;
+ solvent_parameters[k].count += nmol;
+
+ /* We are done with this charge group */
+ return;
+ }
+ }
+
+ /* If we get here, we have a tentative new solvent type.
+ * Before we add it we must check that it fulfills the requirements
+ * of the solvent optimized loops. First determine which atoms have
+ * VdW interactions.
+ */
+ for (j = 0; j < nj; j++)
+ {
+ has_vdw[j] = FALSE;
+ tjA = tmp_vdwtype[j];
+
+ /* Go through all other types and see if any have non-zero
+ * VdW parameters when combined with this one.
+ */
+ for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
+ {
+ /* We already checked that the atoms weren't perturbed,
+ * so we only need to check state A now.
+ */
+ if (fr->bBHAM)
+ {
+ has_vdw[j] = (has_vdw[j] ||
+ (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ }
+ else
+ {
+ /* Standard LJ */
+ has_vdw[j] = (has_vdw[j] ||
+ (C6(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ }
+ }
+ }
+
+ /* Now we know all we need to make the final check and assignment. */
+ if (nj == 3)
+ {
+ /* So, is it an SPC?
+ * For this we require that all atoms have charge,
+ * that the charges on atoms 2 & 3 are the same, and that only
+ * atom 1 might have VdW.
+ */
+ if (has_vdw[1] == FALSE &&
+ has_vdw[2] == FALSE &&
+ tmp_charge[0] != 0 &&
+ tmp_charge[1] != 0 &&
+ tmp_charge[2] == tmp_charge[1])
+ {
+ srenew(solvent_parameters, *n_solvent_parameters+1);
+ solvent_parameters[*n_solvent_parameters].model = esolSPC;
+ solvent_parameters[*n_solvent_parameters].count = nmol;
+ for (k = 0; k < 3; k++)
+ {
+ solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ solvent_parameters[*n_solvent_parameters].charge[k] = tmp_charge[k];
+ }
+
+ *cg_sp = *n_solvent_parameters;
+ (*n_solvent_parameters)++;
+ }
+ }
+ else if (nj == 4)
+ {
+ /* Or could it be a TIP4P?
+ * For this we require that atoms 2,3,4 have charge, but not atom 1.
+ * Only atom 1 might have VdW.
+ */
+ if (has_vdw[1] == FALSE &&
+ has_vdw[2] == FALSE &&
+ has_vdw[3] == FALSE &&
+ tmp_charge[0] == 0 &&
+ tmp_charge[1] != 0 &&
+ tmp_charge[2] == tmp_charge[1] &&
+ tmp_charge[3] != 0)
+ {
+ srenew(solvent_parameters, *n_solvent_parameters+1);
+ solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
+ solvent_parameters[*n_solvent_parameters].count = nmol;
+ for (k = 0; k < 4; k++)
+ {
+ solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ solvent_parameters[*n_solvent_parameters].charge[k] = tmp_charge[k];
+ }
+
+ *cg_sp = *n_solvent_parameters;
+ (*n_solvent_parameters)++;
+ }
+ }
+
+ *solvent_parameters_p = solvent_parameters;
+}
+
+static void
+check_solvent(FILE * fp,
+ const gmx_mtop_t * mtop,
+ t_forcerec * fr,
+ cginfo_mb_t *cginfo_mb)
+{
+ const t_block * cgs;
+ const t_block * mols;
+ const gmx_moltype_t *molt;
+ int mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
+ int n_solvent_parameters;
+ solvent_parameters_t *solvent_parameters;
+ int **cg_sp;
+ int bestsp, bestsol;
+
+ if (debug)
+ {
+ fprintf(debug, "Going to determine what solvent types we have.\n");
+ }
+
+ mols = &mtop->mols;
+
+ n_solvent_parameters = 0;
+ solvent_parameters = NULL;
+ /* Allocate temporary array for solvent type */
+ snew(cg_sp, mtop->nmolblock);
+
+ cg_offset = 0;
+ at_offset = 0;
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ molt = &mtop->moltype[mtop->molblock[mb].type];
+ cgs = &molt->cgs;
+ /* Here we have to loop over all individual molecules
+ * because we need to check for QMMM particles.
+ */
+ snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
+ nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
+ nmol = mtop->molblock[mb].nmol/nmol_ch;
+ for (mol = 0; mol < nmol_ch; mol++)
+ {
+ cgm = mol*cgs->nr;
+ am = mol*cgs->index[cgs->nr];
+ for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
+ {
+ check_solvent_cg(molt, cg_mol, nmol,
+ mtop->groups.grpnr[egcQMMM] ?
+ mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
+ &mtop->groups.grps[egcQMMM],
+ fr,
+ &n_solvent_parameters, &solvent_parameters,
+ cginfo_mb[mb].cginfo[cgm+cg_mol],
+ &cg_sp[mb][cgm+cg_mol]);
+ }
+ }
+ cg_offset += cgs->nr;
+ at_offset += cgs->index[cgs->nr];
+ }
+
+ /* Phew! We finished going through all charge groups.
+ * Now find the most common solvent model.
+ */
+
+ /* Most common solvent so far */
+ bestsp = -2;
+ for (i = 0; i < n_solvent_parameters; i++)
+ {
+ if (bestsp == -2 ||
+ solvent_parameters[i].count > solvent_parameters[bestsp].count)
+ {
+ bestsp = i;
+ }
+ }
+
+ if (bestsp >= 0)
+ {
+ bestsol = solvent_parameters[bestsp].model;
+ }
+ else
+ {
+ bestsol = esolNO;
+ }
+
+#ifdef DISABLE_WATER_NLIST
+ bestsol = esolNO;
+#endif
+
+ fr->nWatMol = 0;
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ cgs = &mtop->moltype[mtop->molblock[mb].type].cgs;
+ nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
+ for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
+ {
+ if (cg_sp[mb][i] == bestsp)
+ {
+ SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
+ fr->nWatMol += nmol;
+ }
+ else
+ {
+ SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
+ }
+ }
+ sfree(cg_sp[mb]);
+ }
+ sfree(cg_sp);
+
+ if (bestsol != esolNO && fp != NULL)
+ {
+ fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
+ esol_names[bestsol],
+ solvent_parameters[bestsp].count);
+ }
+
+ sfree(solvent_parameters);
+ fr->solvent_opt = bestsol;
+}
+
+enum {
+ acNONE = 0, acCONSTRAINT, acSETTLE
+};
+
+static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
+ t_forcerec *fr, gmx_bool bNoSolvOpt,
+ gmx_bool *bExcl_IntraCGAll_InterCGNone)
+{
+ const t_block *cgs;
+ const t_blocka *excl;
+ const gmx_moltype_t *molt;
+ const gmx_molblock_t *molb;
+ cginfo_mb_t *cginfo_mb;
+ gmx_bool *type_VDW;
+ int *cginfo;
+ int cg_offset, a_offset, cgm, am;
+ int mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
+ int *a_con;
+ int ftype;
+ int ia;
+ gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
+
+ ncg_tot = ncg_mtop(mtop);
+ snew(cginfo_mb, mtop->nmolblock);
+
+ snew(type_VDW, fr->ntype);
+ for (ai = 0; ai < fr->ntype; ai++)
+ {
+ type_VDW[ai] = FALSE;
+ for (j = 0; j < fr->ntype; j++)
+ {
+ type_VDW[ai] = type_VDW[ai] ||
+ fr->bBHAM ||
+ C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
+ C12(fr->nbfp, fr->ntype, ai, j) != 0;
+ }
+ }
+
+ *bExcl_IntraCGAll_InterCGNone = TRUE;
+
+ excl_nalloc = 10;
+ snew(bExcl, excl_nalloc);
+ cg_offset = 0;
+ a_offset = 0;
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ molb = &mtop->molblock[mb];
+ molt = &mtop->moltype[molb->type];
+ cgs = &molt->cgs;
+ excl = &molt->excls;
+
+ /* Check if the cginfo is identical for all molecules in this block.
+ * If so, we only need an array of the size of one molecule.
+ * Otherwise we make an array of #mol times #cgs per molecule.
+ */
+ bId = TRUE;
+ am = 0;
+ for (m = 0; m < molb->nmol; m++)
+ {
+ am = m*cgs->index[cgs->nr];
+ for (cg = 0; cg < cgs->nr; cg++)
+ {
+ a0 = cgs->index[cg];
+ a1 = cgs->index[cg+1];
+ if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
+ ggrpnr(&mtop->groups, egcENER, a_offset +a0))
+ {
+ bId = FALSE;
+ }
+ if (mtop->groups.grpnr[egcQMMM] != NULL)
+ {
+ for (ai = a0; ai < a1; ai++)
+ {
+ if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
+ mtop->groups.grpnr[egcQMMM][a_offset +ai])
+ {
+ bId = FALSE;
+ }
+ }
+ }
+ }
+ }
+
+ cginfo_mb[mb].cg_start = cg_offset;
+ cginfo_mb[mb].cg_end = cg_offset + molb->nmol*cgs->nr;
+ cginfo_mb[mb].cg_mod = (bId ? 1 : molb->nmol)*cgs->nr;
+ snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
+ cginfo = cginfo_mb[mb].cginfo;
+
+ /* Set constraints flags for constrained atoms */
+ snew(a_con, molt->atoms.nr);
+ for (ftype = 0; ftype < F_NRE; ftype++)
+ {
+ if (interaction_function[ftype].flags & IF_CONSTRAINT)
+ {
+ int nral;
+
+ nral = NRAL(ftype);
+ for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
+ {
+ int a;
+
+ for (a = 0; a < nral; a++)
+ {
+ a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
+ (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
+ }
+ }
+ }
+ }
+
+ for (m = 0; m < (bId ? 1 : molb->nmol); m++)
+ {
+ cgm = m*cgs->nr;
+ am = m*cgs->index[cgs->nr];
+ for (cg = 0; cg < cgs->nr; cg++)
+ {
+ a0 = cgs->index[cg];
+ a1 = cgs->index[cg+1];
+
+ /* Store the energy group in cginfo */
+ gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
+ SET_CGINFO_GID(cginfo[cgm+cg], gid);
+
+ /* Check the intra/inter charge group exclusions */
+ if (a1-a0 > excl_nalloc)
+ {
+ excl_nalloc = a1 - a0;
+ srenew(bExcl, excl_nalloc);
+ }
+ /* bExclIntraAll: all intra cg interactions excluded
+ * bExclInter: any inter cg interactions excluded
+ */
+ bExclIntraAll = TRUE;
+ bExclInter = FALSE;
+ bHaveVDW = FALSE;
+ bHaveQ = FALSE;
+ for (ai = a0; ai < a1; ai++)
+ {
+ /* Check VDW and electrostatic interactions */
+ bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
+ type_VDW[molt->atoms.atom[ai].typeB]);
+ bHaveQ = bHaveQ || (molt->atoms.atom[ai].q != 0 ||
+ molt->atoms.atom[ai].qB != 0);
+
+ /* Clear the exclusion list for atom ai */
+ for (aj = a0; aj < a1; aj++)
+ {
+ bExcl[aj-a0] = FALSE;
+ }
+ /* Loop over all the exclusions of atom ai */
+ for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
+ {
+ aj = excl->a[j];
+ if (aj < a0 || aj >= a1)
+ {
+ bExclInter = TRUE;
+ }
+ else
+ {
+ bExcl[aj-a0] = TRUE;
+ }
+ }
+ /* Check if ai excludes a0 to a1 */
+ for (aj = a0; aj < a1; aj++)
+ {
+ if (!bExcl[aj-a0])
+ {
+ bExclIntraAll = FALSE;
+ }
+ }
+
+ switch (a_con[ai])
+ {
+ case acCONSTRAINT:
+ SET_CGINFO_CONSTR(cginfo[cgm+cg]);
+ break;
+ case acSETTLE:
+ SET_CGINFO_SETTLE(cginfo[cgm+cg]);
+ break;
+ default:
+ break;
+ }
+ }
+ if (bExclIntraAll)
+ {
+ SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
+ }
+ if (bExclInter)
+ {
+ SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
+ }
+ if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
+ {
+ /* The size in cginfo is currently only read with DD */
+ gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
+ }
+ if (bHaveVDW)
+ {
+ SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
+ }
+ if (bHaveQ)
+ {
+ SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
+ }
+ /* Store the charge group size */
+ SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
+
+ if (!bExclIntraAll || bExclInter)
+ {
+ *bExcl_IntraCGAll_InterCGNone = FALSE;
+ }
+ }
+ }
+
+ sfree(a_con);
+
+ cg_offset += molb->nmol*cgs->nr;
+ a_offset += molb->nmol*cgs->index[cgs->nr];
+ }
+ sfree(bExcl);
+
+ /* The solvent optimizer is called after the QM is initialized,
+ * because we don't want the QM subsystem to become an
+ * optimized solvent.
+ */
+
+ check_solvent(fplog, mtop, fr, cginfo_mb);
+
+ if (getenv("GMX_NO_SOLV_OPT"))
+ {
+ if (fplog)
+ {
+ fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
+ "Disabling all solvent optimization\n");
+ }
+ fr->solvent_opt = esolNO;
+ }
+ if (bNoSolvOpt)
+ {
+ fr->solvent_opt = esolNO;
+ }
+ if (!fr->solvent_opt)
+ {
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
+ {
+ SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
+ }
+ }
+ }
+
+ return cginfo_mb;
+}
+
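+/* Expand the compact per-molblock cginfo, which stores only one copy
+ * per molecule when all molecules in a block are identical, into a
+ * plain array with one entry per charge group in the whole system.
+ * The modulo with cg_mod maps a global charge-group index back onto
+ * the stored pattern.
+ */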
+static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
+{
+ int ncg, mb, cg;
+ int *cginfo;
+
+ ncg = cgi_mb[nmb-1].cg_end;
+ snew(cginfo, ncg);
+ mb = 0;
+ for (cg = 0; cg < ncg; cg++)
+ {
+ while (cg >= cgi_mb[mb].cg_end)
+ {
+ mb++;
+ }
+ cginfo[cg] =
+ cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
+ }
+
+ return cginfo;
+}
+
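+/* Sum the total charge qsum and the sum of squared charges q2sum over
+ * the whole system, for both A and B topology states; these are used
+ * e.g. for the net-charge correction with Ewald/PME electrostatics.
+ */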
+static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
+{
+ double qsum, q2sum, q;
+ int mb, nmol, i;
+ const t_atoms *atoms;
+
+ qsum = 0;
+ q2sum = 0;
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ nmol = mtop->molblock[mb].nmol;
+ atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ for (i = 0; i < atoms->nr; i++)
+ {
+ q = atoms->atom[i].q;
+ qsum += nmol*q;
+ q2sum += nmol*q*q;
+ }
+ }
+ fr->qsum[0] = qsum;
+ fr->q2sum[0] = q2sum;
+ if (fr->efep != efepNO)
+ {
+ qsum = 0;
+ q2sum = 0;
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ nmol = mtop->molblock[mb].nmol;
+ atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ for (i = 0; i < atoms->nr; i++)
+ {
+ q = atoms->atom[i].qB;
+ qsum += nmol*q;
+ q2sum += nmol*q*q;
+ }
+ fr->qsum[1] = qsum;
+ fr->q2sum[1] = q2sum;
+ }
+ }
+ else
+ {
+ fr->qsum[1] = fr->qsum[0];
+ fr->q2sum[1] = fr->q2sum[0];
+ }
+ if (log)
+ {
+ if (fr->efep == efepNO)
+ {
+ fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
+ }
+ else
+ {
+ fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
+ fr->qsum[0], fr->qsum[1]);
+ }
+ }
+}
+
+void update_forcerec(t_forcerec *fr, matrix box)
+{
+ if (fr->eeltype == eelGRF)
+ {
+ calc_rffac(NULL, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
+ fr->rcoulomb, fr->temp, fr->zsquare, box,
+ &fr->kappa, &fr->k_rf, &fr->c_rf);
+ }
+}
+
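+/* Compute the system-averaged C6 and C12 (or Buckingham dispersion)
+ * parameters over all unique, non-excluded atom pairs:
+ *   <C6> = (sum_{i<j} C6(t_i,t_j) - sum_excl C6) / (npair - nexcl)
+ * and similarly for C12. These averages enter the long-range
+ * dispersion correction to the energy and pressure.
+ */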
+void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
+{
+ const t_atoms *atoms, *atoms_tpi;
+ const t_blocka *excl;
+ int mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
+#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
+ long long int npair, npair_ij, tmpi, tmpj;
+#else
+ double npair, npair_ij, tmpi, tmpj;
+#endif
+ double csix, ctwelve;
+ int ntp, *typecount;
+ gmx_bool bBHAM;
+ real *nbfp;
+
+ ntp = fr->ntype;
+ bBHAM = fr->bBHAM;
+ nbfp = fr->nbfp;
+
+ for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
+ {
+ csix = 0;
+ ctwelve = 0;
+ npair = 0;
+ nexcl = 0;
+ if (!fr->n_tpi)
+ {
+ /* Count the types so we avoid natoms^2 operations */
+ snew(typecount, ntp);
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ nmol = mtop->molblock[mb].nmol;
+ atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ for (i = 0; i < atoms->nr; i++)
+ {
+ if (q == 0)
+ {
+ tpi = atoms->atom[i].type;
+ }
+ else
+ {
+ tpi = atoms->atom[i].typeB;
+ }
+ typecount[tpi] += nmol;
+ }
+ }
+ for (tpi = 0; tpi < ntp; tpi++)
+ {
+ for (tpj = tpi; tpj < ntp; tpj++)
+ {
+ tmpi = typecount[tpi];
+ tmpj = typecount[tpj];
+ if (tpi != tpj)
+ {
+ npair_ij = tmpi*tmpj;
+ }
+ else
+ {
+ npair_ij = tmpi*(tmpi - 1)/2;
+ }
+ if (bBHAM)
+ {
+ /* nbfp now includes the 6.0 derivative prefactor */
+ csix += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ }
+ else
+ {
+ /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ csix += npair_ij* C6(nbfp, ntp, tpi, tpj)/6.0;
+ ctwelve += npair_ij* C12(nbfp, ntp, tpi, tpj)/12.0;
+ }
+ npair += npair_ij;
+ }
+ }
+ sfree(typecount);
+ /* Subtract the excluded pairs.
+ * The main reason for subtracting exclusions is that in some cases
+ * some combinations might never occur and the parameters could have
+ * any value. These unused values should not influence the dispersion
+ * correction.
+ */
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ nmol = mtop->molblock[mb].nmol;
+ atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ excl = &mtop->moltype[mtop->molblock[mb].type].excls;
+ for (i = 0; (i < atoms->nr); i++)
+ {
+ if (q == 0)
+ {
+ tpi = atoms->atom[i].type;
+ }
+ else
+ {
+ tpi = atoms->atom[i].typeB;
+ }
+ j1 = excl->index[i];
+ j2 = excl->index[i+1];
+ for (j = j1; j < j2; j++)
+ {
+ k = excl->a[j];
+ if (k > i)
+ {
+ if (q == 0)
+ {
+ tpj = atoms->atom[k].type;
+ }
+ else
+ {
+ tpj = atoms->atom[k].typeB;
+ }
+ if (bBHAM)
+ {
+ /* nbfp now includes the 6.0 derivative prefactor */
+ csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ }
+ else
+ {
+ /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ csix -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
+ ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
+ }
+ nexcl += nmol;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ /* Only correct for the interaction of the test particle
+ * with the rest of the system.
+ */
+ atoms_tpi =
+ &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
+
+ npair = 0;
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ nmol = mtop->molblock[mb].nmol;
+ atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ for (j = 0; j < atoms->nr; j++)
+ {
+ nmolc = nmol;
+ /* Remove the interaction of the test charge group
+ * with itself.
+ */
+ if (mb == mtop->nmolblock-1)
+ {
+ nmolc--;
+
+ if (mb == 0 && nmol == 1)
+ {
+ gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
+ }
+ }
+ if (q == 0)
+ {
+ tpj = atoms->atom[j].type;
+ }
+ else
+ {
+ tpj = atoms->atom[j].typeB;
+ }
+ for (i = 0; i < fr->n_tpi; i++)
+ {
+ if (q == 0)
+ {
+ tpi = atoms_tpi->atom[i].type;
+ }
+ else
+ {
+ tpi = atoms_tpi->atom[i].typeB;
+ }
+ if (bBHAM)
+ {
+ /* nbfp now includes the 6.0 derivative prefactor */
+ csix += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ }
+ else
+ {
+ /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ csix += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
+ ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
+ }
+ npair += nmolc;
+ }
+ }
+ }
+ }
+ if (npair - nexcl <= 0 && fplog)
+ {
+ fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
+ csix = 0;
+ ctwelve = 0;
+ }
+ else
+ {
+ csix /= npair - nexcl;
+ ctwelve /= npair - nexcl;
+ }
+ if (debug)
+ {
+ fprintf(debug, "Counted %d exclusions\n", nexcl);
+ fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
+ fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
+ }
+ fr->avcsix[q] = csix;
+ fr->avctwelve[q] = ctwelve;
+ }
+ if (fplog != NULL)
+ {
+ if (fr->eDispCorr == edispcAllEner ||
+ fr->eDispCorr == edispcAllEnerPres)
+ {
+ fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+ fr->avcsix[0], fr->avctwelve[0]);
+ }
+ else
+ {
+ fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
+ }
+ }
+}
+
+
+static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
+ const gmx_mtop_t *mtop)
+{
+ const t_atoms *at1, *at2;
+ int mt1, mt2, i, j, tpi, tpj, ntypes;
+ real b, bmin;
+ real *nbfp;
+
+ if (fplog)
+ {
+ fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
+ }
+ nbfp = fr->nbfp;
+ ntypes = fr->ntype;
+
+ bmin = -1;
+ fr->bham_b_max = 0;
+ for (mt1 = 0; mt1 < mtop->nmoltype; mt1++)
+ {
+ at1 = &mtop->moltype[mt1].atoms;
+ for (i = 0; (i < at1->nr); i++)
+ {
+ tpi = at1->atom[i].type;
+ if (tpi >= ntypes)
+ {
+ gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", i, tpi, ntypes);
+ }
+
+ for (mt2 = mt1; mt2 < mtop->nmoltype; mt2++)
+ {
+ at2 = &mtop->moltype[mt2].atoms;
+ for (j = 0; (j < at2->nr); j++)
+ {
+ tpj = at2->atom[j].type;
+ if (tpj >= ntypes)
+ {
+ gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", j, tpj, ntypes);
+ }
+ b = BHAMB(nbfp, ntypes, tpi, tpj);
+ if (b > fr->bham_b_max)
+ {
+ fr->bham_b_max = b;
+ }
+ if ((b < bmin) || (bmin == -1))
+ {
+ bmin = b;
+ }
+ }
+ }
+ }
+ }
+ if (fplog)
+ {
+ fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
+ bmin, fr->bham_b_max);
+ }
+}
+
+static void make_nbf_tables(FILE *fp, const output_env_t oenv,
+ t_forcerec *fr, real rtab,
+ const t_commrec *cr,
+ const char *tabfn, char *eg1, char *eg2,
+ t_nblists *nbl)
+{
+ char buf[STRLEN];
+ int i, j;
+
+ if (tabfn == NULL)
+ {
+ if (debug)
+ {
+ fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
+ }
+ return;
+ }
+
+ sprintf(buf, "%s", tabfn);
+ if (eg1 && eg2)
+ {
+ /* Append the two energy group names */
+ sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
+ eg1, eg2, ftp2ext(efXVG));
+ }
+ nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
+ /* Copy the contents of the table to separate coulomb and LJ tables too,
+ * to improve cache performance.
+ */
+ /* For performance reasons we want the table data to be aligned
+ * to a 16-byte boundary. The pointers could be freed, but
+ * currently aren't.
+ */
+ nbl->table_elec.interaction = GMX_TABLE_INTERACTION_ELEC;
+ nbl->table_elec.format = nbl->table_elec_vdw.format;
+ nbl->table_elec.r = nbl->table_elec_vdw.r;
+ nbl->table_elec.n = nbl->table_elec_vdw.n;
+ nbl->table_elec.scale = nbl->table_elec_vdw.scale;
+ nbl->table_elec.scale_exp = nbl->table_elec_vdw.scale_exp;
+ nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
+ nbl->table_elec.ninteractions = 1;
+ nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
+ snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
+
+ nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
+ nbl->table_vdw.format = nbl->table_elec_vdw.format;
+ nbl->table_vdw.r = nbl->table_elec_vdw.r;
+ nbl->table_vdw.n = nbl->table_elec_vdw.n;
+ nbl->table_vdw.scale = nbl->table_elec_vdw.scale;
+ nbl->table_vdw.scale_exp = nbl->table_elec_vdw.scale_exp;
+ nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
+ nbl->table_vdw.ninteractions = 2;
+ nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
+ snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
+
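+ /* The combined elec+vdw table stores 12 reals per point: 4 table
+ * values for Coulomb followed by 4 for dispersion and 4 for
+ * repulsion. Copy the Coulomb values into the electrostatics-only
+ * table and the remaining 8 into the VdW-only table.
+ */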
+ for (i = 0; i <= nbl->table_elec_vdw.n; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
+ }
+ for (j = 0; j < 8; j++)
+ {
+ nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
+ }
+ }
+}
+
+static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
+ int *ncount, int **count)
+{
+ const gmx_moltype_t *molt;
+ const t_ilist *il;
+ int mt, ftype, stride, i, j, tabnr;
+
+ for (mt = 0; mt < mtop->nmoltype; mt++)
+ {
+ molt = &mtop->moltype[mt];
+ for (ftype = 0; ftype < F_NRE; ftype++)
+ {
+ if (ftype == ftype1 || ftype == ftype2)
+ {
+ il = &molt->ilist[ftype];
+ stride = 1 + NRAL(ftype);
+ for (i = 0; i < il->nr; i += stride)
+ {
+ tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
+ if (tabnr < 0)
+ {
+ gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", tabnr);
+ }
+ if (tabnr >= *ncount)
+ {
+ srenew(*count, tabnr+1);
+ for (j = *ncount; j < tabnr+1; j++)
+ {
+ (*count)[j] = 0;
+ }
+ *ncount = tabnr+1;
+ }
+ (*count)[tabnr]++;
+ }
+ }
+ }
+ }
+}
+
+static bondedtable_t *make_bonded_tables(FILE *fplog,
+ int ftype1, int ftype2,
+ const gmx_mtop_t *mtop,
+ const char *basefn, const char *tabext)
+{
+ int i, ncount, *count;
+ char tabfn[STRLEN];
+ bondedtable_t *tab;
+
+ tab = NULL;
+
+ ncount = 0;
+ count = NULL;
+ count_tables(ftype1, ftype2, mtop, &ncount, &count);
+
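+ /* For each table number that is actually used, the file name is
+ * constructed from basefn by replacing the .xvg extension with
+ * _<tabext><number>.xvg, e.g. table.xvg -> table_b0.xvg for the
+ * first bonded table.
+ */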
+ if (ncount > 0)
+ {
+ snew(tab, ncount);
+ for (i = 0; i < ncount; i++)
+ {
+ if (count[i] > 0)
+ {
+ sprintf(tabfn, "%s", basefn);
+ sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
+ tabext, i, ftp2ext(efXVG));
+ tab[i] = make_bonded_table(fplog, tabfn, NRAL(ftype1)-2);
+ }
+ }
+ sfree(count);
+ }
+
+ return tab;
+}
+
+void forcerec_set_ranges(t_forcerec *fr,
+ int ncg_home, int ncg_force,
+ int natoms_force,
+ int natoms_force_constr, int natoms_f_novirsum)
+{
+ fr->cg0 = 0;
+ fr->hcg = ncg_home;
+
+ /* fr->ncg_force is unused in the standard code,
+ * but it can be useful for modified code dealing with charge groups.
+ */
+ fr->ncg_force = ncg_force;
+ fr->natoms_force = natoms_force;
+ fr->natoms_force_constr = natoms_force_constr;
+
+ if (fr->natoms_force_constr > fr->nalloc_force)
+ {
+ fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
+
+ if (fr->bTwinRange)
+ {
+ srenew(fr->f_twin, fr->nalloc_force);
+ }
+ }
+
+ if (fr->bF_NoVirSum)
+ {
+ fr->f_novirsum_n = natoms_f_novirsum;
+ if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
+ {
+ fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
+ srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
+ }
+ }
+ else
+ {
+ fr->f_novirsum_n = 0;
+ }
+}
+
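+/* A cut-off of 0 in the input means "infinite"; replace it by the
+ * large value GMX_CUTOFF_INF so the rest of the code can treat all
+ * cut-offs uniformly.
+ */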
+static real cutoff_inf(real cutoff)
+{
+ if (cutoff == 0)
+ {
+ cutoff = GMX_CUTOFF_INF;
+ }
+
+ return cutoff;
+}
+
+static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
+ t_forcerec *fr, const t_inputrec *ir,
+ const char *tabfn, const gmx_mtop_t *mtop,
+ matrix box)
+{
+ char buf[STRLEN];
+ int i, j;
+
+ if (tabfn == NULL)
+ {
+ gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
+ return;
+ }
+
+ snew(fr->atf_tabs, ir->adress->n_tf_grps);
+
+ sprintf(buf, "%s", tabfn);
+ for (i = 0; i < ir->adress->n_tf_grps; i++)
+ {
+ j = ir->adress->tf_table_index[i]; /* get energy group index */
+ sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
+ *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]), ftp2ext(efXVG));
+ if (fp)
+ {
+ fprintf(fp, "loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[i], buf);
+ }
+ fr->atf_tabs[i] = make_atf_table(fp, oenv, fr, buf, box);
+ }
+
+}
+
+gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
+{
+ gmx_bool bAllvsAll;
+
+ bAllvsAll =
+ (
+ ir->rlist == 0 &&
+ ir->rcoulomb == 0 &&
+ ir->rvdw == 0 &&
+ ir->ePBC == epbcNONE &&
+ ir->vdwtype == evdwCUT &&
+ ir->coulombtype == eelCUT &&
+ ir->efep == efepNO &&
+ (ir->implicit_solvent == eisNO ||
+ (ir->implicit_solvent == eisGBSA && (ir->gb_algorithm == egbSTILL ||
+ ir->gb_algorithm == egbHCT ||
+ ir->gb_algorithm == egbOBC))) &&
+ getenv("GMX_NO_ALLVSALL") == NULL
+ );
+
+ if (bAllvsAll && ir->opts.ngener > 1)
+ {
+ const char *note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
+
+ if (bPrintNote)
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "\n%s\n", note);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp, "\n%s\n", note);
+ }
+ }
+ bAllvsAll = FALSE;
+ }
+
+ if (bAllvsAll && fp && MASTER(cr))
+ {
+ fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
+ }
+
+ return bAllvsAll;
+}
+
+
+static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
+{
+ int t, i;
+
+ /* These thread local data structures are used for bondeds only */
+ fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+
+ if (fr->nthreads > 1)
+ {
+ snew(fr->f_t, fr->nthreads);
+ /* Thread 0 uses the global force and energy arrays */
+ for (t = 1; t < fr->nthreads; t++)
+ {
+ fr->f_t[t].f = NULL;
+ fr->f_t[t].f_nalloc = 0;
+ snew(fr->f_t[t].fshift, SHIFTS);
+ fr->f_t[t].grpp.nener = nenergrp*nenergrp;
+ for (i = 0; i < egNR; i++)
+ {
+ snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
+ }
+ }
+ }
+}
+
+
+static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
+ int *kernel_type,
+ int *ewald_excl)
+{
+ *kernel_type = nbnxnk4x4_PlainC;
+ *ewald_excl = ewaldexclTable;
+
+#ifdef GMX_NBNXN_SIMD
+ {
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We expect the 2xNN kernels to be faster in most cases */
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#endif
+
+#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+ if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+ {
+ /* The raw pair rate of the 4x8 kernel is higher than that of 2x(4+4),
+ * by 10% with HT and 50% without HT, but extra zero interactions
+ * can compensate. As we currently don't detect the actual use
+ * of HT, switch to 4x8 to avoid a potential performance hit.
+ */
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+ }
+#endif
+ if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
+ {
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+#else
+ gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
+ }
+ if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
+ {
+#ifdef GMX_NBNXN_SIMD_2XNN
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#else
+ gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
+ }
+
+ /* Analytical Ewald exclusion correction is only an option in the
+ * x86 SIMD kernel. This is faster in single precision
+ * on Bulldozer and slightly faster on Sandy Bridge.
+ */
+#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+ *ewald_excl = ewaldexclAnalytical;
+#endif
+ if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
+ {
+ *ewald_excl = ewaldexclTable;
+ }
+ if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
+ {
+ *ewald_excl = ewaldexclAnalytical;
+ }
+
+ }
+#endif /* GMX_X86_SSE2 */
+}
+
+
+const char *lookup_nbnxn_kernel_name(int kernel_type)
+{
+ const char *returnvalue = NULL;
+ switch (kernel_type)
+ {
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
- * on compiler flags. As we use nearly identical intrinsics, using an AVX
- * compiler flag without an AVX macro effectively results in AVX kernels.
++ case nbnxnkNotSet:
++ returnvalue = "not set";
++ break;
++ case nbnxnk4x4_PlainC:
++ returnvalue = "plain C";
++ break;
++ case nbnxnk4xN_SIMD_4xN:
++ case nbnxnk4xN_SIMD_2xNN:
++#ifdef GMX_NBNXN_SIMD
+#ifdef GMX_X86_SSE2
- #if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
- #ifndef GMX_X86_SSE4_1
- case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE2"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
++ /* We have x86 SSE2 compatible SIMD */
++#ifdef GMX_X86_AVX_128_FMA
++ returnvalue = "AVX-128-FMA";
++#else
++#if defined GMX_X86_AVX_256 || defined __AVX__
++ /* x86 SIMD intrinsics can be converted to SSE or AVX depending
++ * on compiler flags. As we use nearly identical intrinsics,
++ * compiling for AVX without an AVX macro effectively results
++ * in AVX kernels.
+ * For gcc we check for __AVX__
+ * At least a check for icc should be added (if there is a macro)
+ */
- case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE4.1"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
++#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
++ returnvalue = "AVX-256";
+#else
- case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-128"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
- #endif
- #endif
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-256"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
++ returnvalue = "AVX-128";
+#endif
+#else
- #else /* not GMX_X86_SSE2 */
- case nbnxnk4xN_SIMD_4xN: returnvalue = "SIMD"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
++#ifdef GMX_X86_SSE4_1
++ returnvalue = "SSE4.1";
++#else
++ returnvalue = "SSE2";
+#endif
+#endif
+#endif
++#else /* GMX_X86_SSE2 */
++ /* not GMX_X86_SSE2, but other SIMD */
++ returnvalue = "SIMD";
++#endif /* GMX_X86_SSE2 */
++#else /* GMX_NBNXN_SIMD */
++ returnvalue = "not available";
++#endif /* GMX_NBNXN_SIMD */
++ break;
+ case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
+ case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
+
+ case nbnxnkNR:
+ default:
+ gmx_fatal(FARGS, "Illegal kernel type selected");
+ returnvalue = NULL;
+ break;
+ }
+ return returnvalue;
+}
+
+static void pick_nbnxn_kernel(FILE *fp,
+ const t_commrec *cr,
+ gmx_bool use_cpu_acceleration,
+ gmx_bool bUseGPU,
+ gmx_bool bEmulateGPU,
+ const t_inputrec *ir,
+ int *kernel_type,
+ int *ewald_excl,
+ gmx_bool bDoNonbonded)
+{
+ assert(kernel_type);
+
+ *kernel_type = nbnxnkNotSet;
+ *ewald_excl = ewaldexclTable;
+
+ if (bEmulateGPU)
+ {
+ *kernel_type = nbnxnk8x8x8_PlainC;
+
+ if (bDoNonbonded)
+ {
+ md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
+ }
+ }
+ else if (bUseGPU)
+ {
+ *kernel_type = nbnxnk8x8x8_CUDA;
+ }
+
+ if (*kernel_type == nbnxnkNotSet)
+ {
+ if (use_cpu_acceleration)
+ {
+ pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
+ }
+ else
+ {
+ *kernel_type = nbnxnk4x4_PlainC;
+ }
+ }
+
+ if (bDoNonbonded && fp != NULL)
+ {
+ fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
+ lookup_nbnxn_kernel_name(*kernel_type),
+ nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+ nbnxn_kernel_to_cj_size(*kernel_type));
+ }
+}
+
+static void pick_nbnxn_resources(const t_commrec *cr,
+ const gmx_hw_info_t *hwinfo,
+ gmx_bool bDoNonbonded,
+ gmx_bool *bUseGPU,
+ gmx_bool *bEmulateGPU)
+{
+ gmx_bool bEmulateGPUEnvVarSet;
+ char gpu_err_str[STRLEN];
+
+ *bUseGPU = FALSE;
+
+ bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
+
+ /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
+ * GPUs (currently) only handle non-bonded calculations, we will
+ * automatically switch to emulation if non-bonded calculations are
+ * turned off via GMX_NO_NONBONDED - this is the simple and elegant
+ * way to turn off GPU initialization, data movement, and cleanup.
+ *
+ * GPU emulation can be useful to assess the performance one can expect by
+ * adding GPU(s) to the machine. The conditional below allows this even
+ * if mdrun is compiled without GPU acceleration support.
+ * Note that you should freeze the system, as otherwise it will explode.
+ */
+ *bEmulateGPU = (bEmulateGPUEnvVarSet ||
+ (!bDoNonbonded && hwinfo->bCanUseGPU));
+
+ /* Enable GPU mode when GPUs are available and no GPU emulation is requested.
+ */
+ if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
+ {
+ /* Each PP node will use the intra-node id-th device from the
+ * list of detected/selected GPUs. */
+ if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
+ {
+ /* At this point the init should never fail as we made sure that
+ * we have all the GPUs we need. If it still does, we'll bail. */
+ gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
+ cr->nodeid,
+ get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
+ gpu_err_str);
+ }
+
+ /* Here we actually turn on hardware GPU acceleration */
+ *bUseGPU = TRUE;
+ }
+}
+
+gmx_bool uses_simple_tables(int cutoff_scheme,
+ nonbonded_verlet_t *nbv,
+ int group)
+{
+ gmx_bool bUsesSimpleTables = TRUE;
+ int grp_index;
+
+ switch (cutoff_scheme)
+ {
+ case ecutsGROUP:
+ bUsesSimpleTables = TRUE;
+ break;
+ case ecutsVERLET:
+ assert(NULL != nbv && NULL != nbv->grp);
+ grp_index = (group < 0) ? 0 : (nbv->ngrp - 1);
+ bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
+ break;
+ default:
+ gmx_incons("unimplemented");
+ }
+ return bUsesSimpleTables;
+}
+
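+/* Set up the tabulated Ewald long-range correction force and energy.
+ * With the simple (CPU pair-list) kernels the table size follows from
+ * the requested spacing and the cut-off; for the GPU kernels a fixed
+ * table size is used instead.
+ */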
+static void init_ewald_f_table(interaction_const_t *ic,
+ gmx_bool bUsesSimpleTables,
+ real rtab)
+{
+ real maxr;
+
+ if (bUsesSimpleTables)
+ {
+ /* With a spacing of 0.0005 we are at the force summation accuracy
+ * for the SSE kernels for "normal" atomistic simulations.
+ */
+ ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
+ ic->rcoulomb);
+
+ maxr = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
+ ic->tabq_size = (int)(maxr*ic->tabq_scale) + 2;
+ }
+ else
+ {
+ ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
+ /* Subtract 2 instead of 1 to avoid out-of-range access due to rounding */
+ ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+ }
+
+ sfree_aligned(ic->tabq_coul_FDV0);
+ sfree_aligned(ic->tabq_coul_F);
+ sfree_aligned(ic->tabq_coul_V);
+
+ /* Create the original table data in FDV0 */
+ snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
+ snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
+ snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
+ table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
+ ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
+}
+
+void init_interaction_const_tables(FILE *fp,
+ interaction_const_t *ic,
+ gmx_bool bUsesSimpleTables,
+ real rtab)
+{
+ real spacing;
+
+ if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
+ {
+ init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+
+ if (fp != NULL)
+ {
+ fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
+ 1/ic->tabq_scale, ic->tabq_size);
+ }
+ }
+}
+
+void init_interaction_const(FILE *fp,
+ interaction_const_t **interaction_const,
+ const t_forcerec *fr,
+ real rtab)
+{
+ interaction_const_t *ic;
+ gmx_bool bUsesSimpleTables = TRUE;
+
+ snew(ic, 1);
+
+ /* Just allocate something so we can free it */
+ snew_aligned(ic->tabq_coul_FDV0, 16, 32);
+ snew_aligned(ic->tabq_coul_F, 16, 32);
+ snew_aligned(ic->tabq_coul_V, 16, 32);
+
+ ic->rlist = fr->rlist;
+ ic->rlistlong = fr->rlistlong;
+
+ /* Lennard-Jones */
+ ic->rvdw = fr->rvdw;
+ if (fr->vdw_modifier == eintmodPOTSHIFT)
+ {
+ ic->sh_invrc6 = pow(ic->rvdw, -6.0);
+ }
+ else
+ {
+ ic->sh_invrc6 = 0;
+ }
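+ /* With potential-shift, sh_invrc6 = rvdw^-6 makes the LJ potential
+ * zero at the cut-off:
+ * V(r) = C12*(r^-12 - sh_invrc6^2) - C6*(r^-6 - sh_invrc6).
+ */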
+
+ /* Electrostatics */
+ ic->eeltype = fr->eeltype;
+ ic->rcoulomb = fr->rcoulomb;
+ ic->epsilon_r = fr->epsilon_r;
+ ic->epsfac = fr->epsfac;
+
+ /* Ewald */
+ ic->ewaldcoeff = fr->ewaldcoeff;
+ if (fr->coulomb_modifier == eintmodPOTSHIFT)
+ {
+ ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
+ }
+ else
+ {
+ ic->sh_ewald = 0;
+ }
+
+ /* Reaction-field */
+ if (EEL_RF(ic->eeltype))
+ {
+ ic->epsilon_rf = fr->epsilon_rf;
+ ic->k_rf = fr->k_rf;
+ ic->c_rf = fr->c_rf;
+ }
+ else
+ {
+ /* For plain cut-off we might use the reaction-field kernels */
+ ic->epsilon_rf = ic->epsilon_r;
+ ic->k_rf = 0;
+ if (fr->coulomb_modifier == eintmodPOTSHIFT)
+ {
+ ic->c_rf = 1/ic->rcoulomb;
+ }
+ else
+ {
+ ic->c_rf = 0;
+ }
+ }
+
+ if (fp != NULL)
+ {
+ fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
+ sqr(ic->sh_invrc6), ic->sh_invrc6);
+ if (ic->eeltype == eelCUT)
+ {
+ fprintf(fp, ", Coulomb %.3f", ic->c_rf);
+ }
+ else if (EEL_PME(ic->eeltype))
+ {
+ fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
+ }
+ fprintf(fp, "\n");
+ }
+
+ *interaction_const = ic;
+
+ if (fr->nbv != NULL && fr->nbv->bUseGPU)
+ {
+ nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+ }
+
+ bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
+ init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
+}
+
+static void init_nb_verlet(FILE *fp,
+ nonbonded_verlet_t **nb_verlet,
+ const t_inputrec *ir,
+ const t_forcerec *fr,
+ const t_commrec *cr,
+ const char *nbpu_opt)
+{
+ nonbonded_verlet_t *nbv;
+ int i;
+ char *env;
+ gmx_bool bEmulateGPU, bHybridGPURun = FALSE;
+
+ nbnxn_alloc_t *nb_alloc;
+ nbnxn_free_t *nb_free;
+
+ snew(nbv, 1);
+
+ pick_nbnxn_resources(cr, fr->hwinfo,
+ fr->bNonbonded,
+ &nbv->bUseGPU,
+ &bEmulateGPU);
+
+ nbv->nbs = NULL;
+
+ nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
+ for (i = 0; i < nbv->ngrp; i++)
+ {
+ nbv->grp[i].nbl_lists.nnbl = 0;
+ nbv->grp[i].nbat = NULL;
+ nbv->grp[i].kernel_type = nbnxnkNotSet;
+
+ if (i == 0) /* local */
+ {
+ pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ nbv->bUseGPU, bEmulateGPU, ir,
+ &nbv->grp[i].kernel_type,
+ &nbv->grp[i].ewald_excl,
+ fr->bNonbonded);
+ }
+ else /* non-local */
+ {
+ if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
+ {
+ /* Use GPU for local, select a CPU kernel for non-local */
+ pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ FALSE, FALSE, ir,
+ &nbv->grp[i].kernel_type,
+ &nbv->grp[i].ewald_excl,
+ fr->bNonbonded);
+
+ bHybridGPURun = TRUE;
+ }
+ else
+ {
+ /* Use the same kernel for local and non-local interactions */
+ nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
+ nbv->grp[i].ewald_excl = nbv->grp[0].ewald_excl;
+ }
+ }
+ }
+
+ if (nbv->bUseGPU)
+ {
+ /* init the NxN GPU data; the last argument tells whether we'll have
+ * both local and non-local NB calculation on GPU */
+ nbnxn_cuda_init(fp, &nbv->cu_nbv,
+ &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
+ (nbv->ngrp > 1) && !bHybridGPURun);
+
+ if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+ {
+ char *end;
+
+ nbv->min_ci_balanced = strtol(env, &end, 10);
+ if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+ {
+ gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+ else
+ {
+ nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+ }
+ else
+ {
+ nbv->min_ci_balanced = 0;
+ }
+
+ *nb_verlet = nbv;
+
+ nbnxn_init_search(&nbv->nbs,
+ DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
+ DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
+ gmx_omp_nthreads_get(emntNonbonded));
+
+ for (i = 0; i < nbv->ngrp; i++)
+ {
+ if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
+ {
+ nb_alloc = &pmalloc;
+ nb_free = &pfree;
+ }
+ else
+ {
+ nb_alloc = NULL;
+ nb_free = NULL;
+ }
+
+ nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
+ nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ /* 8x8x8 "non-simple" lists are currently always combined */
+ !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ nb_alloc, nb_free);
+
+ if (i == 0 ||
+ nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
+ {
+ snew(nbv->grp[i].nbat, 1);
+ nbnxn_atomdata_init(fp,
+ nbv->grp[i].nbat,
+ nbv->grp[i].kernel_type,
+ fr->ntype, fr->nbfp,
+ ir->opts.ngener,
+ nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
+ nb_alloc, nb_free);
+ }
+ else
+ {
+ nbv->grp[i].nbat = nbv->grp[0].nbat;
+ }
+ }
+}
+
+void init_forcerec(FILE *fp,
+ const output_env_t oenv,
+ t_forcerec *fr,
+ t_fcdata *fcd,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ matrix box,
+ const char *tabfn,
+ const char *tabafn,
+ const char *tabpfn,
+ const char *tabbfn,
+ const char *nbpu_opt,
+ gmx_bool bNoSolvOpt,
+ real print_force)
+{
+ int i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
+ real rtab;
+ char *env;
+ double dbl;
+ rvec box_size;
+ const t_block *cgs;
+ gmx_bool bGenericKernelOnly;
+ gmx_bool bTab, bSep14tab, bNormalnblists;
+ t_nblists *nbl;
+ int *nm_ind, egp_flags;
+
+ if (fr->hwinfo == NULL)
+ {
+ /* Detect hardware, gather information.
+ * In mdrun, hwinfo has already been set before calling init_forcerec.
+ * Here we ignore GPUs, as tools will not use them anyhow.
+ */
+ fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
+ }
+
+ /* By default we turn acceleration on, but it might be turned off further down... */
+ fr->use_cpu_acceleration = TRUE;
+
+ fr->bDomDec = DOMAINDECOMP(cr);
+
+ natoms = mtop->natoms;
+
+ if (check_box(ir->ePBC, box))
+ {
+ gmx_fatal(FARGS, check_box(ir->ePBC, box));
+ }
+
+ /* Test particle insertion ? */
+ if (EI_TPI(ir->eI))
+ {
+ /* Set to the size of the molecule to be inserted (the last one) */
+ /* Because of old style topologies, we have to use the last cg
+ * instead of the last molecule type.
+ */
+ cgs = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
+ fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
+ if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
+ {
+ gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
+ }
+ }
+ else
+ {
+ fr->n_tpi = 0;
+ }
+
+ /* Copy AdResS parameters */
+ if (ir->bAdress)
+ {
+ fr->adress_type = ir->adress->type;
+ fr->adress_const_wf = ir->adress->const_wf;
+ fr->adress_ex_width = ir->adress->ex_width;
+ fr->adress_hy_width = ir->adress->hy_width;
+ fr->adress_icor = ir->adress->icor;
+ fr->adress_site = ir->adress->site;
+ fr->adress_ex_forcecap = ir->adress->ex_forcecap;
+ fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
+
+
+ snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
+ for (i = 0; i < ir->adress->n_energy_grps; i++)
+ {
+ fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
+ }
+
+ fr->n_adress_tf_grps = ir->adress->n_tf_grps;
+ snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
+ for (i = 0; i < fr->n_adress_tf_grps; i++)
+ {
+ fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
+ }
+ copy_rvec(ir->adress->refs, fr->adress_refs);
+ }
+ else
+ {
+ fr->adress_type = eAdressOff;
+ fr->adress_do_hybridpairs = FALSE;
+ }
+
+ /* Copy the user determined parameters */
+ fr->userint1 = ir->userint1;
+ fr->userint2 = ir->userint2;
+ fr->userint3 = ir->userint3;
+ fr->userint4 = ir->userint4;
+ fr->userreal1 = ir->userreal1;
+ fr->userreal2 = ir->userreal2;
+ fr->userreal3 = ir->userreal3;
+ fr->userreal4 = ir->userreal4;
+
+ /* Shell stuff */
+ fr->fc_stepsize = ir->fc_stepsize;
+
+ /* Free energy */
+ fr->efep = ir->efep;
+ fr->sc_alphavdw = ir->fepvals->sc_alpha;
+ if (ir->fepvals->bScCoul)
+ {
+ fr->sc_alphacoul = ir->fepvals->sc_alpha;
+ fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
+ }
+ else
+ {
+ fr->sc_alphacoul = 0;
+ fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
+ }
+ fr->sc_power = ir->fepvals->sc_power;
+ fr->sc_r_power = ir->fepvals->sc_r_power;
+ fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
+
+ env = getenv("GMX_SCSIGMA_MIN");
+ if (env != NULL)
+ {
+ dbl = 0;
+ sscanf(env, "%lf", &dbl);
+ fr->sc_sigma6_min = pow(dbl, 6);
+ if (fp)
+ {
+ fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
+ }
+ }
+
+ fr->bNonbonded = TRUE;
+ if (getenv("GMX_NO_NONBONDED") != NULL)
+ {
+ /* turn off non-bonded calculations */
+ fr->bNonbonded = FALSE;
+ md_print_warn(cr, fp,
+ "Found environment variable GMX_NO_NONBONDED.\n"
+ "Disabling nonbonded calculations.\n");
+ }
+
+ bGenericKernelOnly = FALSE;
+
+ /* We now check in the NS code whether a particular combination of interactions
+ * can be used with water optimization, and disable it if that is not the case.
+ */
+
+ if (getenv("GMX_NB_GENERIC") != NULL)
+ {
+ if (fp != NULL)
+ {
+ fprintf(fp,
+ "Found environment variable GMX_NB_GENERIC.\n"
+ "Disabling all interaction-specific nonbonded kernels, will only\n"
+ "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
+ }
+ bGenericKernelOnly = TRUE;
+ }
+
+ if (bGenericKernelOnly == TRUE)
+ {
+ bNoSolvOpt = TRUE;
+ }
+
+ if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
+ {
+ fr->use_cpu_acceleration = FALSE;
+ if (fp != NULL)
+ {
+ fprintf(fp,
+ "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
+ "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
+ }
+ }
+
+ fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
+
+ /* Check if we can/should do all-vs-all kernels */
+ fr->bAllvsAll = can_use_allvsall(ir, FALSE, NULL, NULL);
+ fr->AllvsAll_work = NULL;
+ fr->AllvsAll_workgb = NULL;
+
+ /* All-vs-all kernels have not been implemented in 4.6, and
+ * the SIMD group kernels are also buggy in this case. Non-accelerated
+ * group kernels are OK. See Redmine #1249. */
+ if (fr->bAllvsAll)
+ {
+ fr->bAllvsAll = FALSE;
+ fr->use_cpu_acceleration = FALSE;
+ if (fp != NULL)
+ {
+ fprintf(fp,
+ "\nYour simulation settings would have triggered the efficient all-vs-all\n"
+ "kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
+ "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
+ "of an unfixed bug. The reference C kernels are correct, though, so\n"
+ "we are proceeding by disabling all CPU architecture-specific\n"
+ "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
+ "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
+ }
+ }
+
+ /* Neighbour searching stuff */
+ fr->cutoff_scheme = ir->cutoff_scheme;
+ fr->bGrid = (ir->ns_type == ensGRID);
+ fr->ePBC = ir->ePBC;
+
+ /* Determine if we will do PBC for distances in bonded interactions */
+ if (fr->ePBC == epbcNONE)
+ {
+ fr->bMolPBC = FALSE;
+ }
+ else
+ {
+ if (!DOMAINDECOMP(cr))
+ {
+ /* The group cut-off scheme and SHAKE assume charge groups
+ * are whole, but not using molpbc is faster in most cases.
+ */
+ if (fr->cutoff_scheme == ecutsGROUP ||
+ (ir->eConstrAlg == econtSHAKE &&
+ (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
+ {
+ fr->bMolPBC = ir->bPeriodicMols;
+ }
+ else
+ {
+ fr->bMolPBC = TRUE;
+ if (getenv("GMX_USE_GRAPH") != NULL)
+ {
+ fr->bMolPBC = FALSE;
+ if (fp)
+ {
+ fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
+ }
+ }
+ }
+ }
+ else
+ {
+ fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
+ }
+ }
+ fr->bGB = (ir->implicit_solvent == eisGBSA);
+
+ fr->rc_scaling = ir->refcoord_scaling;
+ copy_rvec(ir->posres_com, fr->posres_com);
+ copy_rvec(ir->posres_comB, fr->posres_comB);
+ fr->rlist = cutoff_inf(ir->rlist);
+ fr->rlistlong = cutoff_inf(ir->rlistlong);
+ fr->eeltype = ir->coulombtype;
+ fr->vdwtype = ir->vdwtype;
+
+ fr->coulomb_modifier = ir->coulomb_modifier;
+ fr->vdw_modifier = ir->vdw_modifier;
+
+ /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
+ switch (fr->eeltype)
+ {
+ case eelCUT:
+ fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
+ break;
+
+ case eelRF:
+ case eelGRF:
+ case eelRF_NEC:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ break;
+
+ case eelRF_ZERO:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ fr->coulomb_modifier = eintmodEXACTCUTOFF;
+ break;
+
+ case eelSWITCH:
+ case eelSHIFT:
+ case eelUSER:
+ case eelENCADSHIFT:
+ case eelPMESWITCH:
+ case eelPMEUSER:
+ case eelPMEUSERSWITCH:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ break;
+
+ case eelPME:
+ case eelEWALD:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
+ break;
+
+ default:
+ gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
+ break;
+ }
+
+ /* Vdw: Translate from mdp settings to kernel format */
+ switch (fr->vdwtype)
+ {
+ case evdwCUT:
+ if (fr->bBHAM)
+ {
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
+ }
+ else
+ {
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
+ }
+ break;
+
+ case evdwSWITCH:
+ case evdwSHIFT:
+ case evdwUSER:
+ case evdwENCADSHIFT:
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ break;
+
+ default:
+ gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
+ break;
+ }
+
+ /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
+ fr->nbkernel_elec_modifier = fr->coulomb_modifier;
+ fr->nbkernel_vdw_modifier = fr->vdw_modifier;
+
+ fr->bTwinRange = fr->rlistlong > fr->rlist;
+ fr->bEwald = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
+
+ fr->reppow = mtop->ffparams.reppow;
+
+ if (ir->cutoff_scheme == ecutsGROUP)
+ {
+ fr->bvdwtab = (fr->vdwtype != evdwCUT ||
+ !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
+ /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
+ fr->bcoultab = !(fr->eeltype == eelCUT ||
+ fr->eeltype == eelEWALD ||
+ fr->eeltype == eelPME ||
+ fr->eeltype == eelRF ||
+ fr->eeltype == eelRF_ZERO);
+
+ /* If the user absolutely wants different switch/shift settings for coul/vdw,
+ * it is likely going to be faster to tabulate the interaction than to call
+ * the generic kernel.
+ */
+ if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
+ {
+ if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
+ {
+ fr->bcoultab = TRUE;
+ }
+ }
+ else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
+ ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
+ fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
+ (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
+ {
+ if (fr->rcoulomb != fr->rvdw)
+ {
+ fr->bcoultab = TRUE;
+ }
+ }
+
+ if (getenv("GMX_REQUIRE_TABLES"))
+ {
+ fr->bvdwtab = TRUE;
+ fr->bcoultab = TRUE;
+ }
+
+ if (fp)
+ {
+ fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
+ fprintf(fp, "Table routines are used for vdw: %s\n", bool_names[fr->bvdwtab ]);
+ }
+
+ if (fr->bvdwtab == TRUE)
+ {
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ fr->nbkernel_vdw_modifier = eintmodNONE;
+ }
+ if (fr->bcoultab == TRUE)
+ {
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ fr->nbkernel_elec_modifier = eintmodNONE;
+ }
+ }
+
+ if (ir->cutoff_scheme == ecutsVERLET)
+ {
+ if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
+ {
+ gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
+ }
+ fr->bvdwtab = FALSE;
+ fr->bcoultab = FALSE;
+ }
+
+ /* Tables are used for direct ewald sum */
+ if (fr->bEwald)
+ {
+ if (EEL_PME(ir->coulombtype))
+ {
+ if (fp)
+ {
+ fprintf(fp, "Will do PME sum in reciprocal space.\n");
+ }
+ if (ir->coulombtype == eelP3M_AD)
+ {
+ please_cite(fp, "Hockney1988");
+ please_cite(fp, "Ballenegger2012");
+ }
+ else
+ {
+ please_cite(fp, "Essmann95a");
+ }
+
+ if (ir->ewald_geometry == eewg3DC)
+ {
+ if (fp)
+ {
+ fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
+ }
+ please_cite(fp, "In-Chul99a");
+ }
+ }
+ fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ init_ewald_tab(&(fr->ewald_table), ir, fp);
+ if (fp)
+ {
+ fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
+ 1/fr->ewaldcoeff);
+ }
+ }
+
+ /* Electrostatics */
+ fr->epsilon_r = ir->epsilon_r;
+ fr->epsilon_rf = ir->epsilon_rf;
+ fr->fudgeQQ = mtop->ffparams.fudgeQQ;
+ fr->rcoulomb_switch = ir->rcoulomb_switch;
+ fr->rcoulomb = cutoff_inf(ir->rcoulomb);
+
+ /* Parameters for generalized RF */
+ fr->zsquare = 0.0;
+ fr->temp = 0.0;
+
+ if (fr->eeltype == eelGRF)
+ {
+ init_generalized_rf(fp, mtop, ir, fr);
+ }
+ else if (fr->eeltype == eelSHIFT)
+ {
+ for (m = 0; (m < DIM); m++)
+ {
+ box_size[m] = box[m][m];
+ }
+
+ if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
+ {
+ set_shift_consts(fr->rcoulomb_switch, fr->rcoulomb, box_size);
+ }
+ }
+
+ fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
+ gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
+ IR_ELEC_FIELD(*ir) ||
+ (fr->adress_icor != eAdressICOff)
+ );
+
+ if (fr->cutoff_scheme == ecutsGROUP &&
+ ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
+ {
+ /* Count the total number of charge groups */
+ fr->cg_nalloc = ncg_mtop(mtop);
+ srenew(fr->cg_cm, fr->cg_nalloc);
+ }
+ if (fr->shift_vec == NULL)
+ {
+ snew(fr->shift_vec, SHIFTS);
+ }
+
+ if (fr->fshift == NULL)
+ {
+ snew(fr->fshift, SHIFTS);
+ }
+
+ if (fr->nbfp == NULL)
+ {
+ fr->ntype = mtop->ffparams.atnr;
+ fr->nbfp = mk_nbfp(&mtop->ffparams, fr->bBHAM);
+ }
+
+ /* Copy the energy group exclusions */
+ fr->egp_flags = ir->opts.egp_flags;
+
+ /* Van der Waals stuff */
+ fr->rvdw = cutoff_inf(ir->rvdw);
+ fr->rvdw_switch = ir->rvdw_switch;
+ if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
+ {
+ if (fr->rvdw_switch >= fr->rvdw)
+ {
+ gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
+ fr->rvdw_switch, fr->rvdw);
+ }
+ if (fp)
+ {
+ fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
+ (fr->eeltype == eelSWITCH) ? "switched" : "shifted",
+ fr->rvdw_switch, fr->rvdw);
+ }
+ }
+
+ if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
+ {
+ gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
+ }
+
+ if (fp)
+ {
+ fprintf(fp, "Cut-off's: NS: %g Coulomb: %g %s: %g\n",
+ fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
+ }
+
+ fr->eDispCorr = ir->eDispCorr;
+ if (ir->eDispCorr != edispcNO)
+ {
+ set_avcsixtwelve(fp, fr, mtop);
+ }
+
+ if (fr->bBHAM)
+ {
+ set_bham_b_max(fp, fr, mtop);
+ }
+
+ fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
+
+ /* Copy the GBSA data (radius, volume and surftens for each
+ * atomtype) from the topology atomtype section to forcerec.
+ */
+ snew(fr->atype_radius, fr->ntype);
+ snew(fr->atype_vol, fr->ntype);
+ snew(fr->atype_surftens, fr->ntype);
+ snew(fr->atype_gb_radius, fr->ntype);
+ snew(fr->atype_S_hct, fr->ntype);
+
+ if (mtop->atomtypes.nr > 0)
+ {
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_radius[i] = mtop->atomtypes.radius[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_vol[i] = mtop->atomtypes.vol[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
+ }
+ }
+
+ /* Generate the GB table if needed */
+ if (fr->bGB)
+ {
+#ifdef GMX_DOUBLE
+ fr->gbtabscale = 2000;
+#else
+ fr->gbtabscale = 500;
+#endif
+
+ fr->gbtabr = 100;
+ fr->gbtab = make_gb_table(oenv, fr);
+
+ init_gb(&fr->born, cr, fr, ir, mtop, ir->gb_algorithm);
+
+ /* Copy local gb data (for dd, this is done in dd_partition_system) */
+ if (!DOMAINDECOMP(cr))
+ {
+ make_local_gb(cr, fr->born, ir->gb_algorithm);
+ }
+ }
+
+ /* Set the charge scaling */
+ if (fr->epsilon_r != 0)
+ {
+ fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
+ }
+ else
+ {
+ /* eps = 0 is infinite dielectric: no coulomb interactions */
+ fr->epsfac = 0;
+ }
+
+ /* Reaction field constants */
+ if (EEL_RF(fr->eeltype))
+ {
+ calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
+ fr->rcoulomb, fr->temp, fr->zsquare, box,
+ &fr->kappa, &fr->k_rf, &fr->c_rf);
+ }
+
+ set_chargesum(fp, fr, mtop);
+
+ /* if we are using LR electrostatics, and they are tabulated,
+ * the tables will contain modified coulomb interactions.
+ * Since we want to use the non-shifted ones for 1-4
+ * coulombic interactions, we must have an extra set of tables.
+ */
+
+ /* Construct tables.
+ * A little unnecessary to make both vdw and coul tables sometimes,
+ * but what the heck... */
+
+ bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
+
+ bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
+ fr->bBHAM || fr->bEwald) &&
+ (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
+
+ negp_pp = ir->opts.ngener - ir->nwall;
+ negptable = 0;
+ if (!bTab)
+ {
+ bNormalnblists = TRUE;
+ fr->nnblists = 1;
+ }
+ else
+ {
+ bNormalnblists = (ir->eDispCorr != edispcNO);
+ for (egi = 0; egi < negp_pp; egi++)
+ {
+ for (egj = egi; egj < negp_pp; egj++)
+ {
+ egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ if (!(egp_flags & EGP_EXCL))
+ {
+ if (egp_flags & EGP_TABLE)
+ {
+ negptable++;
+ }
+ else
+ {
+ bNormalnblists = TRUE;
+ }
+ }
+ }
+ }
+ if (bNormalnblists)
+ {
+ fr->nnblists = negptable + 1;
+ }
+ else
+ {
+ fr->nnblists = negptable;
+ }
+ if (fr->nnblists > 1)
+ {
+ snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
+ }
+ }
+
+ if (ir->adress)
+ {
+ fr->nnblists *= 2;
+ }
+
+ snew(fr->nblists, fr->nnblists);
+
+ /* This code automatically gives a table length of tabext when there are
+ * no cut-offs; in that case grompp should already have checked that we
+ * do not need normal tables and we only generate tables for 1-4 interactions.
+ */
+ rtab = ir->rlistlong + ir->tabext;
+
+ if (bTab)
+ {
+ /* make tables for ordinary interactions */
+ if (bNormalnblists)
+ {
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
+ if (ir->adress)
+ {
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
+ }
+ if (!bSep14tab)
+ {
+ fr->tab14 = fr->nblists[0].table_elec_vdw;
+ }
+ m = 1;
+ }
+ else
+ {
+ m = 0;
+ }
+ if (negptable > 0)
+ {
+ /* Read the special tables for certain energy group pairs */
+ nm_ind = mtop->groups.grps[egcENER].nm_ind;
+ for (egi = 0; egi < negp_pp; egi++)
+ {
+ for (egj = egi; egj < negp_pp; egj++)
+ {
+ egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
+ {
+ nbl = &(fr->nblists[m]);
+ if (fr->nnblists > 1)
+ {
+ fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
+ }
+ /* Read the table file with the two energy groups names appended */
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ *mtop->groups.grpname[nm_ind[egi]],
+ *mtop->groups.grpname[nm_ind[egj]],
+ &fr->nblists[m]);
+ if (ir->adress)
+ {
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ *mtop->groups.grpname[nm_ind[egi]],
+ *mtop->groups.grpname[nm_ind[egj]],
+ &fr->nblists[fr->nnblists/2+m]);
+ }
+ m++;
+ }
+ else if (fr->nnblists > 1)
+ {
+ fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
+ }
+ }
+ }
+ }
+ }
+ if (bSep14tab)
+ {
+ /* generate extra tables with plain Coulomb for 1-4 interactions only */
+ fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
+ GMX_MAKETABLES_14ONLY);
+ }
+
+ /* Read AdResS Thermo Force table if needed */
+ if (fr->adress_icor == eAdressICThermoForce)
+ {
+ /* old todo replace */
+
+ if (ir->adress->n_tf_grps > 0)
+ {
+ make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
+
+ }
+ else
+ {
+ /* load the default table */
+ snew(fr->atf_tabs, 1);
+ fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
+ }
+ }
+
+ /* Wall stuff */
+ fr->nwall = ir->nwall;
+ if (ir->nwall && ir->wall_type == ewtTABLE)
+ {
+ make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
+ }
+
+ if (fcd && tabbfn)
+ {
+ fcd->bondtab = make_bonded_tables(fp,
+ F_TABBONDS, F_TABBONDSNC,
+ mtop, tabbfn, "b");
+ fcd->angletab = make_bonded_tables(fp,
+ F_TABANGLES, -1,
+ mtop, tabbfn, "a");
+ fcd->dihtab = make_bonded_tables(fp,
+ F_TABDIHS, -1,
+ mtop, tabbfn, "d");
+ }
+ else
+ {
+ if (debug)
+ {
+ fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
+ }
+ }
+
+ /* QM/MM initialization if requested
+ */
+ if (ir->bQMMM)
+ {
+ fprintf(stderr, "QM/MM calculation requested.\n");
+ }
+
+ fr->bQMMM = ir->bQMMM;
+ fr->qr = mk_QMMMrec();
+
+ /* Set all the static charge group info */
+ fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
+ &fr->bExcl_IntraCGAll_InterCGNone);
+ if (DOMAINDECOMP(cr))
+ {
+ fr->cginfo = NULL;
+ }
+ else
+ {
+ fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
+ }
+
+ if (!DOMAINDECOMP(cr))
+ {
+ /* When using particle decomposition, the effect of the second argument,
+ * which sets fr->hcg, is corrected later in do_md and init_em.
+ */
+ forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
+ mtop->natoms, mtop->natoms, mtop->natoms);
+ }
+
+ fr->print_force = print_force;
+
+
+ /* coarse load balancing vars */
+ fr->t_fnbf = 0.;
+ fr->t_wait = 0.;
+ fr->timesteps = 0;
+
+ /* Initialize neighbor search */
+ init_ns(fp, cr, &fr->ns, fr, mtop);
+
+ if (cr->duty & DUTY_PP)
+ {
+ gmx_nonbonded_setup(fr, bGenericKernelOnly);
+ /*
+ if (ir->bAdress)
+ {
+ gmx_setup_adress_kernels(fp,bGenericKernelOnly);
+ }
+ */
+ }
+
+ /* Initialize the thread working data for bonded interactions */
+ init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
+
+ snew(fr->excl_load, fr->nthreads+1);
+
+ if (fr->cutoff_scheme == ecutsVERLET)
+ {
+ if (ir->rcoulomb != ir->rvdw)
+ {
+ gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
+ }
+
+ init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
+ }
+
+ /* fr->ic is used both by verlet and group kernels (to some extent) now */
+ init_interaction_const(fp, &fr->ic, fr, rtab);
+ if (ir->eDispCorr != edispcNO)
+ {
+ calc_enervirdiff(fp, ir->eDispCorr, fr);
+ }
+}
+
+#define pr_real(fp, r) fprintf(fp, "%s: %e\n",#r, r)
+#define pr_int(fp, i) fprintf((fp), "%s: %d\n",#i, i)
+#define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[b])
+
+void pr_forcerec(FILE *fp, t_forcerec *fr)
+{
+ int i;
+
+ pr_real(fp, fr->rlist);
+ pr_real(fp, fr->rcoulomb);
+ pr_real(fp, fr->fudgeQQ);
+ pr_bool(fp, fr->bGrid);
+ pr_bool(fp, fr->bTwinRange);
+ /*pr_int(fp,fr->cg0);
+ pr_int(fp,fr->hcg);*/
+ for (i = 0; i < fr->nnblists; i++)
+ {
+ pr_int(fp, fr->nblists[i].table_elec_vdw.n);
+ }
+ pr_real(fp, fr->rcoulomb_switch);
+ pr_real(fp, fr->rcoulomb);
+
+ fflush(fp);
+}
+
+void forcerec_set_excl_load(t_forcerec *fr,
+ const gmx_localtop_t *top, const t_commrec *cr)
+{
+ const int *ind, *a;
+ int t, i, j, ntot, n, ntarget;
+
+ if (cr != NULL && PARTDECOMP(cr))
+ {
+ /* No OpenMP with particle decomposition */
+ pd_at_range(cr,
+ &fr->excl_load[0],
+ &fr->excl_load[1]);
+
+ return;
+ }
+
+ ind = top->excls.index;
+ a = top->excls.a;
+
+ ntot = 0;
+ for (i = 0; i < top->excls.nr; i++)
+ {
+ for (j = ind[i]; j < ind[i+1]; j++)
+ {
+ if (a[j] > i)
+ {
+ ntot++;
+ }
+ }
+ }
+
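+ /* Divide the atom range over the threads such that each thread gets
+ * approximately ntot/nthreads exclusions; excl_load[t] marks the end
+ * of the atom range assigned to thread t.
+ */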
+ fr->excl_load[0] = 0;
+ n = 0;
+ i = 0;
+ for (t = 1; t <= fr->nthreads; t++)
+ {
+ ntarget = (ntot*t)/fr->nthreads;
+ while (i < top->excls.nr && n < ntarget)
+ {
+ for (j = ind[i]; j < ind[i+1]; j++)
+ {
+ if (a[j] > i)
+ {
+ n++;
+ }
+ }
+ i++;
+ }
+ fr->excl_load[t] = i;
+ }
+}
--- /dev/null
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+#include "smalloc.h"
+#include "macros.h"
+#include "vec.h"
+#include "nbnxn_consts.h"
+#include "nbnxn_internal.h"
+#include "nbnxn_search.h"
+#include "nbnxn_atomdata.h"
+#include "gmx_omp_nthreads.h"
+
+/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
+void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
+{
+ *ptr = save_malloc_aligned("ptr", __FILE__, __LINE__, nbytes, 1, NBNXN_MEM_ALIGN);
+}
+
+/* Free function for memory allocated with nbnxn_alloc_aligned */
+void nbnxn_free_aligned(void *ptr)
+{
+ sfree_aligned(ptr);
+}
+
+/* Reallocation wrapper function for nbnxn data structures */
+void nbnxn_realloc_void(void **ptr,
+ int nbytes_copy, int nbytes_new,
+ nbnxn_alloc_t *ma,
+ nbnxn_free_t *mf)
+{
+ void *ptr_new;
+
+ ma(&ptr_new, nbytes_new);
+
+ if (nbytes_new > 0 && ptr_new == NULL)
+ {
+ gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
+ }
+
+ if (nbytes_copy > 0)
+ {
+ if (nbytes_new < nbytes_copy)
+ {
+ gmx_incons("In nbnxn_realloc_void: new size less than copy size");
+ }
+ memcpy(ptr_new, *ptr, nbytes_copy);
+ }
+ if (*ptr != NULL)
+ {
+ mf(*ptr);
+ }
+ *ptr = ptr_new;
+}
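+
+/* A minimal usage sketch for the wrapper above; buf, n_old and n_new are
+ * hypothetical names, not part of this file:
+ *
+ * real *buf = NULL;
+ * nbnxn_realloc_void((void **)&buf,
+ * n_old*sizeof(*buf), n_new*sizeof(*buf),
+ * nbnxn_alloc_aligned, nbnxn_free_aligned);
+ *
+ * Unlike realloc(), the number of bytes to copy is passed explicitly,
+ * so only the data actually in use is copied into the new allocation.
+ */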
+
+/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
+void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
+{
+ int t;
+
+ nbnxn_realloc_void((void **)&nbat->type,
+ nbat->natoms*sizeof(*nbat->type),
+ n*sizeof(*nbat->type),
+ nbat->alloc, nbat->free);
+ nbnxn_realloc_void((void **)&nbat->lj_comb,
+ nbat->natoms*2*sizeof(*nbat->lj_comb),
+ n*2*sizeof(*nbat->lj_comb),
+ nbat->alloc, nbat->free);
+ if (nbat->XFormat != nbatXYZQ)
+ {
+ nbnxn_realloc_void((void **)&nbat->q,
+ nbat->natoms*sizeof(*nbat->q),
+ n*sizeof(*nbat->q),
+ nbat->alloc, nbat->free);
+ }
+ if (nbat->nenergrp > 1)
+ {
+ nbnxn_realloc_void((void **)&nbat->energrp,
+ nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
+ n/nbat->na_c*sizeof(*nbat->energrp),
+ nbat->alloc, nbat->free);
+ }
+ nbnxn_realloc_void((void **)&nbat->x,
+ nbat->natoms*nbat->xstride*sizeof(*nbat->x),
+ n*nbat->xstride*sizeof(*nbat->x),
+ nbat->alloc, nbat->free);
+ for (t = 0; t < nbat->nout; t++)
+ {
+ /* Allocate one element extra for possible signaling with CUDA */
+ nbnxn_realloc_void((void **)&nbat->out[t].f,
+ nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
+ n*nbat->fstride*sizeof(*nbat->out[t].f),
+ nbat->alloc, nbat->free);
+ }
+ nbat->nalloc = n;
+}
+
+/* Initializes an nbnxn_atomdata_output_t data structure */
+static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
+ int nb_kernel_type,
+ int nenergrp, int stride,
+ nbnxn_alloc_t *ma)
+{
+ int cj_size;
+
+ out->f = NULL;
+ ma((void **)&out->fshift, SHIFTS*DIM*sizeof(*out->fshift));
+ out->nV = nenergrp*nenergrp;
+ ma((void **)&out->Vvdw, out->nV*sizeof(*out->Vvdw));
+ ma((void **)&out->Vc, out->nV*sizeof(*out->Vc ));
+
+ if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
+ nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
+ {
+ cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
+ out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
+ ma((void **)&out->VSvdw, out->nVS*sizeof(*out->VSvdw));
+ ma((void **)&out->VSc, out->nVS*sizeof(*out->VSc ));
+ }
+ else
+ {
+ out->nVS = 0;
+ }
+}
+
+static void copy_int_to_nbat_int(const int *a, int na, int na_round,
+ const int *in, int fill, int *innb)
+{
+ int i, j;
+
+ j = 0;
+ for (i = 0; i < na; i++)
+ {
+ innb[j++] = in[a[i]];
+ }
+ /* Complete the partially filled last cell with fill */
+ for (; i < na_round; i++)
+ {
+ innb[j++] = fill;
+ }
+}
+
+static void clear_nbat_real(int na, int nbatFormat, real *xnb, int a0)
+{
+ int a, d, j, c;
+
+ switch (nbatFormat)
+ {
+ case nbatXYZ:
+ for (a = 0; a < na; a++)
+ {
+ for (d = 0; d < DIM; d++)
+ {
+ xnb[(a0+a)*STRIDE_XYZ+d] = 0;
+ }
+ }
+ break;
+ case nbatXYZQ:
+ for (a = 0; a < na; a++)
+ {
+ for (d = 0; d < DIM; d++)
+ {
+ xnb[(a0+a)*STRIDE_XYZQ+d] = 0;
+ }
+ }
+ break;
+ case nbatX4:
+ j = X4_IND_A(a0);
+ c = a0 & (PACK_X4-1);
+ for (a = 0; a < na; a++)
+ {
+ xnb[j+XX*PACK_X4] = 0;
+ xnb[j+YY*PACK_X4] = 0;
+ xnb[j+ZZ*PACK_X4] = 0;
+ j++;
+ c++;
+ if (c == PACK_X4)
+ {
+ j += (DIM-1)*PACK_X4;
+ c = 0;
+ }
+ }
+ break;
+ case nbatX8:
+ j = X8_IND_A(a0);
+ c = a0 & (PACK_X8-1);
+ for (a = 0; a < na; a++)
+ {
+ xnb[j+XX*PACK_X8] = 0;
+ xnb[j+YY*PACK_X8] = 0;
+ xnb[j+ZZ*PACK_X8] = 0;
+ j++;
+ c++;
+ if (c == PACK_X8)
+ {
+ j += (DIM-1)*PACK_X8;
+ c = 0;
+ }
+ }
+ break;
+ }
+}
+
+void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
+ rvec *x, int nbatFormat, real *xnb, int a0,
+ int cx, int cy, int cz)
+{
+ int i, j, c;
+
+/* We might need to place filler particles to fill up the cell to na_round.
+ * The coefficients (LJ and q) for such particles are zero.
+ * But we might still get NaN as 0*NaN when distances are too small.
+ * We hope that -107 nm is far enough away from zero
+ * to avoid accidental short distances to particles shifted down for pbc.
+ */
+#define NBAT_FAR_AWAY 107
+
+ switch (nbatFormat)
+ {
+ case nbatXYZ:
+ j = a0*STRIDE_XYZ;
+ for (i = 0; i < na; i++)
+ {
+ xnb[j++] = x[a[i]][XX];
+ xnb[j++] = x[a[i]][YY];
+ xnb[j++] = x[a[i]][ZZ];
+ }
+ /* Complete the partially filled last cell with particles far apart.
+ * This simplifies the bounding box calculation and avoids
+ * numerical issues with atoms that are coincidentally close.
+ */
+ for (; i < na_round; i++)
+ {
+ xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
+ xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
+ xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
+ }
+ break;
+ case nbatXYZQ:
+ j = a0*STRIDE_XYZQ;
+ for (i = 0; i < na; i++)
+ {
+ xnb[j++] = x[a[i]][XX];
+ xnb[j++] = x[a[i]][YY];
+ xnb[j++] = x[a[i]][ZZ];
+ j++;
+ }
+ /* Complete the partially filled last cell with particles far apart */
+ for (; i < na_round; i++)
+ {
+ xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
+ xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
+ xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
+ j++;
+ }
+ break;
+ case nbatX4:
+ j = X4_IND_A(a0);
+ c = a0 & (PACK_X4-1);
+ for (i = 0; i < na; i++)
+ {
+ xnb[j+XX*PACK_X4] = x[a[i]][XX];
+ xnb[j+YY*PACK_X4] = x[a[i]][YY];
+ xnb[j+ZZ*PACK_X4] = x[a[i]][ZZ];
+ j++;
+ c++;
+ if (c == PACK_X4)
+ {
+ j += (DIM-1)*PACK_X4;
+ c = 0;
+ }
+ }
+ /* Complete the partially filled last cell with particles far apart */
+ for (; i < na_round; i++)
+ {
+ xnb[j+XX*PACK_X4] = -NBAT_FAR_AWAY*(1 + cx);
+ xnb[j+YY*PACK_X4] = -NBAT_FAR_AWAY*(1 + cy);
+ xnb[j+ZZ*PACK_X4] = -NBAT_FAR_AWAY*(1 + cz + i);
+ j++;
+ c++;
+ if (c == PACK_X4)
+ {
+ j += (DIM-1)*PACK_X4;
+ c = 0;
+ }
+ }
+ break;
+ case nbatX8:
+ j = X8_IND_A(a0);
+ c = a0 & (PACK_X8 - 1);
+ for (i = 0; i < na; i++)
+ {
+ xnb[j+XX*PACK_X8] = x[a[i]][XX];
+ xnb[j+YY*PACK_X8] = x[a[i]][YY];
+ xnb[j+ZZ*PACK_X8] = x[a[i]][ZZ];
+ j++;
+ c++;
+ if (c == PACK_X8)
+ {
+ j += (DIM-1)*PACK_X8;
+ c = 0;
+ }
+ }
+ /* Complete the partially filled last cell with particles far apart */
+ for (; i < na_round; i++)
+ {
+ xnb[j+XX*PACK_X8] = -NBAT_FAR_AWAY*(1 + cx);
+ xnb[j+YY*PACK_X8] = -NBAT_FAR_AWAY*(1 + cy);
+ xnb[j+ZZ*PACK_X8] = -NBAT_FAR_AWAY*(1 + cz + i);
+ j++;
+ c++;
+ if (c == PACK_X8)
+ {
+ j += (DIM-1)*PACK_X8;
+ c = 0;
+ }
+ }
+ break;
+ default:
+ gmx_incons("Unsupported nbnxn_atomdata_t format");
+ }
+}
+
+/* Determines the combination rule (or none) to be used, stores it,
+ * and sets the LJ parameters required with the rule.
+ */
+static void set_combination_rule_data(nbnxn_atomdata_t *nbat)
+{
+ int nt, i, j;
+ real c6, c12;
+
+ nt = nbat->ntype;
+
+ switch (nbat->comb_rule)
+ {
+ case ljcrGEOM:
+ nbat->comb_rule = ljcrGEOM;
+
+ for (i = 0; i < nt; i++)
+ {
+ /* Copy the diagonal from the nbfp matrix */
+ nbat->nbfp_comb[i*2 ] = sqrt(nbat->nbfp[(i*nt+i)*2 ]);
+ nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
+ }
+ break;
+ case ljcrLB:
+ for (i = 0; i < nt; i++)
+ {
+ /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
+ c6 = nbat->nbfp[(i*nt+i)*2 ];
+ c12 = nbat->nbfp[(i*nt+i)*2+1];
+ if (c6 > 0 && c12 > 0)
+ {
+ /* We store 0.5*2^(1/6)*sigma and sqrt(4*3*eps),
+ * so we get 6*C6 and 12*C12 after combining.
+ */
+ nbat->nbfp_comb[i*2 ] = 0.5*pow(c12/c6, 1.0/6.0);
+ nbat->nbfp_comb[i*2+1] = sqrt(c6*c6/c12);
+ }
+ else
+ {
+ nbat->nbfp_comb[i*2 ] = 0;
+ nbat->nbfp_comb[i*2+1] = 0;
+ }
+ }
+ break;
+ case ljcrNONE:
+ /* nbfp_s4 stores two parameters using a stride of 4,
+ * because this would suit x86 SIMD single-precision
+ * quad-load intrinsics. There's a slight inefficiency in
+ * allocating and initializing nbfp_s4 when it might not
+ * be used, but introducing the conditional code is not
+ * really worth it. */
+ nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
+ for (i = 0; i < nt; i++)
+ {
+ for (j = 0; j < nt; j++)
+ {
+ nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
+ nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
+ nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
+ nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
+ }
+ }
+ break;
+ default:
+ gmx_incons("Unknown combination rule");
+ break;
+ }
+}
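+
+/* Worked illustration of the ljcrLB storage above, with hypothetical
+ * parameters sigma = 0.3 nm and epsilon = 0.5 kJ/mol: we store
+ * 0.5*2^(1/6)*0.3 ~= 0.168 and sqrt(12*0.5) ~= 2.449. Adding two such
+ * sigma terms gives 2^(1/6)*sigma_ij and multiplying two epsilon terms
+ * gives 12*sqrt(eps_i*eps_j) = 12*eps_ij, from which the kernel obtains
+ * 6*C6 and 12*C12 directly.
+ */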
+
+/* Initializes an nbnxn_atomdata_t data structure */
+void nbnxn_atomdata_init(FILE *fp,
+ nbnxn_atomdata_t *nbat,
+ int nb_kernel_type,
+ int ntype, const real *nbfp,
+ int n_energygroups,
+ int nout,
+ nbnxn_alloc_t *alloc,
+ nbnxn_free_t *free)
+{
+ int i, j;
+ real c6, c12, tol;
+ char *ptr;
+ gmx_bool simple, bCombGeom, bCombLB;
+
+ if (alloc == NULL)
+ {
+ nbat->alloc = nbnxn_alloc_aligned;
+ }
+ else
+ {
+ nbat->alloc = alloc;
+ }
+ if (free == NULL)
+ {
+ nbat->free = nbnxn_free_aligned;
+ }
+ else
+ {
+ nbat->free = free;
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
+ }
+ nbat->ntype = ntype + 1;
+ nbat->alloc((void **)&nbat->nbfp,
+ nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
+ nbat->alloc((void **)&nbat->nbfp_comb, nbat->ntype*2*sizeof(*nbat->nbfp_comb));
+
+ /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
+ * force-field floating point parameters.
+ */
+ tol = 1e-5;
+ ptr = getenv("GMX_LJCOMB_TOL");
+ if (ptr != NULL)
+ {
+ double dbl;
+
+ sscanf(ptr, "%lf", &dbl);
+ tol = dbl;
+ }
+ bCombGeom = TRUE;
+ bCombLB = TRUE;
+
+ /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
+ * to check for the LB rule.
+ */
+ for (i = 0; i < ntype; i++)
+ {
+ c6 = nbfp[(i*ntype+i)*2 ]/6.0;
+ c12 = nbfp[(i*ntype+i)*2+1]/12.0;
+ if (c6 > 0 && c12 > 0)
+ {
+ nbat->nbfp_comb[i*2 ] = pow(c12/c6, 1.0/6.0);
+ nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
+ }
+ else if (c6 == 0 && c12 == 0)
+ {
+ nbat->nbfp_comb[i*2 ] = 0;
+ nbat->nbfp_comb[i*2+1] = 0;
+ }
+ else
+ {
+ /* Can not use LB rule with only dispersion or repulsion */
+ bCombLB = FALSE;
+ }
+ }
+
+ for (i = 0; i < nbat->ntype; i++)
+ {
+ for (j = 0; j < nbat->ntype; j++)
+ {
+ if (i < ntype && j < ntype)
+ {
+ /* fr->nbfp has been updated, so that array now also stores c6 and c12
+ * with the 6.0/12.0 prefactors included, which saves 2 flops in the
+ * most common case (force-only).
+ */
+ c6 = nbfp[(i*ntype+j)*2 ];
+ c12 = nbfp[(i*ntype+j)*2+1];
+ nbat->nbfp[(i*nbat->ntype+j)*2 ] = c6;
+ nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;
+
+ /* Compare 6*C6 and 12*C12 for geometric combination rule */
+ bCombGeom = bCombGeom &&
+ gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2 ]*nbfp[(j*ntype+j)*2 ], tol) &&
+ gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol);
+
+ /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
+ c6 /= 6.0;
+ c12 /= 12.0;
+ bCombLB = bCombLB &&
+ ((c6 == 0 && c12 == 0 &&
+ (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
+ (c6 > 0 && c12 > 0 &&
+ gmx_within_tol(pow(c12/c6, 1.0/6.0), 0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) &&
+ gmx_within_tol(0.25*c6*c6/c12, sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol)));
+ }
+ else
+ {
+ /* Add zero parameters for the additional dummy atom type */
+ nbat->nbfp[(i*nbat->ntype+j)*2 ] = 0;
+ nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
+ }
+ }
+ }
+ if (debug)
+ {
+ fprintf(debug, "Combination rules: geometric %d Lorentz-Berthelot %d\n",
+ bCombGeom, bCombLB);
+ }
+
+ simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
+
+ if (simple)
+ {
+ /* We prefer the geometric combination rule,
+ * as that gives a slightly faster kernel than the LB rule.
+ */
+ if (bCombGeom)
+ {
+ nbat->comb_rule = ljcrGEOM;
+ }
+ else if (bCombLB)
+ {
+ nbat->comb_rule = ljcrLB;
+ }
+ else
+ {
+ nbat->comb_rule = ljcrNONE;
+
+ nbat->free(nbat->nbfp_comb);
+ }
+
+ if (fp)
+ {
+ if (nbat->comb_rule == ljcrNONE)
+ {
+ fprintf(fp, "Using full Lennard-Jones parameter combination matrix\n\n");
+ }
+ else
+ {
+ fprintf(fp, "Using %s Lennard-Jones combination rule\n\n",
+ nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
+ }
+ }
+
+ set_combination_rule_data(nbat);
+ }
+ else
+ {
+ nbat->comb_rule = ljcrNONE;
+
+ nbat->free(nbat->nbfp_comb);
+ }
+
+ nbat->natoms = 0;
+ nbat->type = NULL;
+ nbat->lj_comb = NULL;
+ if (simple)
+ {
+ int pack_x;
+
+ switch (nb_kernel_type)
+ {
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
+ pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+ nbnxn_kernel_to_cj_size(nb_kernel_type));
+ switch (pack_x)
+ {
+ case 4:
+ nbat->XFormat = nbatX4;
+ break;
+ case 8:
+ nbat->XFormat = nbatX8;
+ break;
+ default:
+ gmx_incons("Unsupported packing width");
+ }
+ break;
+ default:
+ nbat->XFormat = nbatXYZ;
+ break;
+ }
+
+ nbat->FFormat = nbat->XFormat;
+ }
+ else
+ {
+ nbat->XFormat = nbatXYZQ;
+ nbat->FFormat = nbatXYZ;
+ }
+ nbat->q = NULL;
+ nbat->nenergrp = n_energygroups;
+ if (!simple)
+ {
+ /* Energy groups not supported yet for super-sub lists */
+ if (n_energygroups > 1 && fp != NULL)
+ {
+ fprintf(fp, "\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
+ }
+ nbat->nenergrp = 1;
+ }
+ /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
+ if (nbat->nenergrp > 64)
+ {
+ gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
+ }
+ nbat->neg_2log = 1;
+ while (nbat->nenergrp > (1<<nbat->neg_2log))
+ {
+ nbat->neg_2log++;
+ }
+ nbat->energrp = NULL;
+ nbat->alloc((void **)&nbat->shift_vec, SHIFTS*sizeof(*nbat->shift_vec));
+ nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
+ nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
+ nbat->x = NULL;
+
+#ifdef GMX_NBNXN_SIMD
+ if (simple)
+ {
- * Here we store j - i for generating the mask for the first i,
++ /* Set the diagonal cluster pair interaction mask setup data.
+ * In the kernel we check 0 < j - i to generate the masks.
- * In the kernel we can subtract 1 to generate the subsequent mask.
++ * Here we store j - i for generating the mask for the first i (i=0);
+ * we subtract 0.5 to avoid rounding issues.
- const int simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
- int simd_4xn_diag_size, real_excl, simd_excl_size, j, s;
++ * In the kernel we can subtract 1 to generate the mask for the next i.
+ */
- simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
- snew_aligned(nbat->simd_4xn_diag, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_4xn_diag_size; j++)
++ const int simd_width = GMX_SIMD_WIDTH_HERE;
++ int simd_4xn_diag_ind_size, simd_interaction_size, j;
+
- nbat->simd_4xn_diag[j] = j - 0.5;
++ simd_4xn_diag_ind_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
++ snew_aligned(nbat->simd_4xn_diagonal_j_minus_i,
++ simd_4xn_diag_ind_size, NBNXN_MEM_ALIGN);
++ for (j = 0; j < simd_4xn_diag_ind_size; j++)
+ {
- snew_aligned(nbat->simd_2xnn_diag, simd_width, NBNXN_MEM_ALIGN);
++ nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
+ }
+
- nbat->simd_2xnn_diag[j] = j - 0.5;
++ snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i,
++ simd_width, NBNXN_MEM_ALIGN);
+ for (j = 0; j < simd_width/2; j++)
+ {
+ /* The j-cluster size is half the SIMD width */
- nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
++ nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5;
+ /* The next half of the SIMD width is for i + 1 */
- /* We always use 32-bit integer exclusion masks. When we use
- * double precision, we fit two integers in a double SIMD register.
++ nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
+ }
+
- real_excl = sizeof(real)/sizeof(*nbat->simd_excl_mask);
- /* Set bits for use with both 4xN and 2x(N+N) kernels */
- simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width*real_excl;
- snew_aligned(nbat->simd_excl_mask, simd_excl_size*real_excl, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_excl_size; j++)
++ /* We use up to 32 bits for exclusion masking.
++ * The same masks are used for the 4xN and 2x(N+N) kernels.
++ * The masks are read either into epi32 SIMD registers or into
++ * real SIMD registers (together with a cast).
++ * In single precision this means the real and epi32 SIMD registers
++ * are of equal size.
++ * In double precision the epi32 registers can be smaller than
++ * the real registers, so depending on the architecture, we might
++ * need to use two, identical, 32-bit masks per real.
+ */
- /* Set the consecutive bits for masking pair exclusions.
- * For double a single-bit mask would be enough.
- * But using two bits avoids endianness issues.
- */
- for (s = 0; s < real_excl; s++)
- {
- /* Set the consecutive bits for masking pair exclusions */
- nbat->simd_excl_mask[j*real_excl + s] = (1U << j);
- }
++ simd_interaction_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
++ snew_aligned(nbat->simd_exclusion_filter1, simd_interaction_size, NBNXN_MEM_ALIGN);
++ snew_aligned(nbat->simd_exclusion_filter2, simd_interaction_size*2, NBNXN_MEM_ALIGN);
++
++ for (j = 0; j < simd_interaction_size; j++)
+ {
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
-
++ /* Set the consecutive bits for the exclusion filter masks */
++ nbat->simd_exclusion_filter1[j] = (1U << j);
++ nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
++ nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
+ }
+ }
+#endif
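+
+ /* Illustration of the diagonal masking data set up above: for a 4x4
+ * kernel the stored values are j - 0.5 = -0.5, 0.5, 1.5, 2.5. Checking
+ * 0 < j - i for i = 0 leaves exactly j = 1,2,3 interacting; after
+ * subtracting 1 the same check gives the mask for i = 1, and so on,
+ * so one stored vector generates the masks for all i-rows.
+ */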
+
+ /* Initialize the output data structures */
+ nbat->nout = nout;
+ snew(nbat->out, nbat->nout);
+ nbat->nalloc = 0;
+ for (i = 0; i < nbat->nout; i++)
+ {
+ nbnxn_atomdata_output_init(&nbat->out[i],
+ nb_kernel_type,
+ nbat->nenergrp, 1<<nbat->neg_2log,
+ nbat->alloc);
+ }
+ nbat->buffer_flags.flag = NULL;
+ nbat->buffer_flags.flag_nalloc = 0;
+}
+
+static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
+ const int *type, int na,
+ real *ljparam_at)
+{
+ int is, k, i;
+
+ /* The LJ params follow the combination rule:
+ * copy the params from the type array to the atom array.
+ */
+ for (is = 0; is < na; is += PACK_X4)
+ {
+ for (k = 0; k < PACK_X4; k++)
+ {
+ i = is + k;
+ ljparam_at[is*2 +k] = ljparam_type[type[i]*2 ];
+ ljparam_at[is*2+PACK_X4+k] = ljparam_type[type[i]*2+1];
+ }
+ }
+}
+
+static void copy_lj_to_nbat_lj_comb_x8(const real *ljparam_type,
+ const int *type, int na,
+ real *ljparam_at)
+{
+ int is, k, i;
+
+ /* The LJ params follow the combination rule:
+ * copy the params from the type array to the atom array.
+ */
+ for (is = 0; is < na; is += PACK_X8)
+ {
+ for (k = 0; k < PACK_X8; k++)
+ {
+ i = is + k;
+ ljparam_at[is*2 +k] = ljparam_type[type[i]*2 ];
+ ljparam_at[is*2+PACK_X8+k] = ljparam_type[type[i]*2+1];
+ }
+ }
+}
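+
+/* Layout illustration for the copies above: with PACK_X4 = 4, each group
+ * of four atoms stores its parameters as four first (c6 or sigma) terms
+ * followed by four second (c12 or epsilon) terms,
+ * ljparam_at[is*2 .. is*2+3] and ljparam_at[is*2+4 .. is*2+7],
+ * matching the packed x/f storage so the kernels can use aligned SIMD
+ * loads; the x8 variant works the same way with PACK_X8 = 8.
+ */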
+
+/* Sets the atom type and LJ data in nbnxn_atomdata_t */
+static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t *nbat,
+ int ngrid,
+ const nbnxn_search_t nbs,
+ const int *type)
+{
+ int g, i, ncz, ash;
+ const nbnxn_grid_t *grid;
+
+ for (g = 0; g < ngrid; g++)
+ {
+ grid = &nbs->grid[g];
+
+ /* Loop over all columns and copy and fill */
+ for (i = 0; i < grid->ncx*grid->ncy; i++)
+ {
+ ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
+ ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
+
+ copy_int_to_nbat_int(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
+ type, nbat->ntype-1, nbat->type+ash);
+
+ if (nbat->comb_rule != ljcrNONE)
+ {
+ if (nbat->XFormat == nbatX4)
+ {
+ copy_lj_to_nbat_lj_comb_x4(nbat->nbfp_comb,
+ nbat->type+ash, ncz*grid->na_sc,
+ nbat->lj_comb+ash*2);
+ }
+ else if (nbat->XFormat == nbatX8)
+ {
+ copy_lj_to_nbat_lj_comb_x8(nbat->nbfp_comb,
+ nbat->type+ash, ncz*grid->na_sc,
+ nbat->lj_comb+ash*2);
+ }
+ }
+ }
+ }
+}
+
+/* Sets the charges in nbnxn_atomdata_t *nbat */
+static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat,
+ int ngrid,
+ const nbnxn_search_t nbs,
+ const real *charge)
+{
+ int g, cxy, ncz, ash, na, na_round, i, j;
+ real *q;
+ const nbnxn_grid_t *grid;
+
+ for (g = 0; g < ngrid; g++)
+ {
+ grid = &nbs->grid[g];
+
+ /* Loop over all columns and copy and fill */
+ for (cxy = 0; cxy < grid->ncx*grid->ncy; cxy++)
+ {
+ ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+ na = grid->cxy_na[cxy];
+ na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
+
+ if (nbat->XFormat == nbatXYZQ)
+ {
+ q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1;
+ for (i = 0; i < na; i++)
+ {
+ *q = charge[nbs->a[ash+i]];
+ q += STRIDE_XYZQ;
+ }
+ /* Complete the partially filled last cell with zeros */
+ for (; i < na_round; i++)
+ {
+ *q = 0;
+ q += STRIDE_XYZQ;
+ }
+ }
+ else
+ {
+ q = nbat->q + ash;
+ for (i = 0; i < na; i++)
+ {
+ *q = charge[nbs->a[ash+i]];
+ q++;
+ }
+ /* Complete the partially filled last cell with zeros */
+ for (; i < na_round; i++)
+ {
+ *q = 0;
+ q++;
+ }
+ }
+ }
+ }
+}
+
+/* Copies the energy group indices to a reordered and packed array */
+static void copy_egp_to_nbat_egps(const int *a, int na, int na_round,
+ int na_c, int bit_shift,
+ const int *in, int *innb)
+{
+ int i, j, sa, at;
+ int comb;
+
+ j = 0;
+ for (i = 0; i < na; i += na_c)
+ {
+ /* Store na_c energy group numbers into one int */
+ comb = 0;
+ for (sa = 0; sa < na_c; sa++)
+ {
+ at = a[i+sa];
+ if (at >= 0)
+ {
+ comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
+ }
+ }
+ innb[j++] = comb;
+ }
+ /* Complete the partially filled last cell with fill */
+ for (; i < na_round; i += na_c)
+ {
+ innb[j++] = 0;
+ }
+}
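+
+/* Worked example of the packing above, with hypothetical values: for
+ * na_c = 4 and bit_shift = 2 (i.e. up to 4 energy groups), the four
+ * group indices g0..g3 of a cluster are packed as
+ * comb = g0 | (g1 << 2) | (g2 << 4) | (g3 << 6)
+ * so a single int carries the group indices of the whole cluster.
+ */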
+
+/* Set the energy group indices for atoms in nbnxn_atomdata_t */
+static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat,
+ int ngrid,
+ const nbnxn_search_t nbs,
+ const int *atinfo)
+{
+ int g, i, ncz, ash;
+ const nbnxn_grid_t *grid;
+
+ for (g = 0; g < ngrid; g++)
+ {
+ grid = &nbs->grid[g];
+
+ /* Loop over all columns and copy and fill */
+ for (i = 0; i < grid->ncx*grid->ncy; i++)
+ {
+ ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
+ ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
+
+ copy_egp_to_nbat_egps(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
+ nbat->na_c, nbat->neg_2log,
+ atinfo, nbat->energrp+(ash>>grid->na_c_2log));
+ }
+ }
+}
+
+/* Sets all required atom parameter data in nbnxn_atomdata_t */
+void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat,
+ int locality,
+ const nbnxn_search_t nbs,
+ const t_mdatoms *mdatoms,
+ const int *atinfo)
+{
+ int ngrid;
+
+ if (locality == eatLocal)
+ {
+ ngrid = 1;
+ }
+ else
+ {
+ ngrid = nbs->ngrid;
+ }
+
+ nbnxn_atomdata_set_atomtypes(nbat, ngrid, nbs, mdatoms->typeA);
+
+ nbnxn_atomdata_set_charges(nbat, ngrid, nbs, mdatoms->chargeA);
+
+ if (nbat->nenergrp > 1)
+ {
+ nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo);
+ }
+}
+
+/* Copies the shift vector array to nbnxn_atomdata_t */
+void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox,
+ rvec *shift_vec,
+ nbnxn_atomdata_t *nbat)
+{
+ int i;
+
+ nbat->bDynamicBox = bDynamicBox;
+ for (i = 0; i < SHIFTS; i++)
+ {
+ copy_rvec(shift_vec[i], nbat->shift_vec[i]);
+ }
+}
+
+/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
+void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
+ int locality,
+ gmx_bool FillLocal,
+ rvec *x,
+ nbnxn_atomdata_t *nbat)
+{
+ int g0 = 0, g1 = 0;
+ int nth, th;
+
+ switch (locality)
+ {
+ case eatAll:
+ g0 = 0;
+ g1 = nbs->ngrid;
+ break;
+ case eatLocal:
+ g0 = 0;
+ g1 = 1;
+ break;
+ case eatNonlocal:
+ g0 = 1;
+ g1 = nbs->ngrid;
+ break;
+ }
+
+ if (FillLocal)
+ {
+ nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
+ }
+
+ nth = gmx_omp_nthreads_get(emntPairsearch);
+
+#pragma omp parallel for num_threads(nth) schedule(static)
+ for (th = 0; th < nth; th++)
+ {
+ int g;
+
+ for (g = g0; g < g1; g++)
+ {
+ const nbnxn_grid_t *grid;
+ int cxy0, cxy1, cxy;
+
+ grid = &nbs->grid[g];
+
+ cxy0 = (grid->ncx*grid->ncy* th +nth-1)/nth;
+ cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth;
+
+ for (cxy = cxy0; cxy < cxy1; cxy++)
+ {
+ int na, ash, na_fill;
+
+ na = grid->cxy_na[cxy];
+ ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+
+ if (g == 0 && FillLocal)
+ {
+ na_fill =
+ (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
+ }
+ else
+ {
+ /* We fill only the real particle locations.
+ * We assume the filler entries at the end have already
+ * been set properly during neighbor search.
+ */
+ na_fill = na;
+ }
+ copy_rvec_to_nbat_real(nbs->a+ash, na, na_fill, x,
+ nbat->XFormat, nbat->x, ash,
+ 0, 0, 0);
+ }
+ }
+ }
+}
+
+static void
+nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
+ int i0, int i1)
+{
+ int i;
+
+ for (i = i0; i < i1; i++)
+ {
+ dest[i] = 0;
+ }
+}
+
+static void
+nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
+ gmx_bool bDestSet,
+ real ** gmx_restrict src,
+ int nsrc,
+ int i0, int i1)
+{
+ int i, s;
+
+ if (bDestSet)
+ {
+ /* The destination buffer contains data, add to it */
+ for (i = i0; i < i1; i++)
+ {
+ for (s = 0; s < nsrc; s++)
+ {
+ dest[i] += src[s][i];
+ }
+ }
+ }
+ else
+ {
+ /* The destination buffer is uninitialized, set it first */
+ for (i = i0; i < i1; i++)
+ {
+ dest[i] = src[0][i];
+ for (s = 1; s < nsrc; s++)
+ {
+ dest[i] += src[s][i];
+ }
+ }
+ }
+}
+
+static void
+nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
+ gmx_bool bDestSet,
+ real ** gmx_restrict src,
+ int nsrc,
+ int i0, int i1)
+{
+#ifdef GMX_NBNXN_SIMD
+/* The SIMD width here is actually independent of that in the kernels,
+ * but we use the same width for simplicity (usually optimal anyhow).
+ */
+ int i, s;
+ gmx_mm_pr dest_SSE, src_SSE;
+
+ if (bDestSet)
+ {
+ for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
+ {
+ dest_SSE = gmx_load_pr(dest+i);
+ for (s = 0; s < nsrc; s++)
+ {
+ src_SSE = gmx_load_pr(src[s]+i);
+ dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
+ }
+ gmx_store_pr(dest+i, dest_SSE);
+ }
+ }
+ else
+ {
+ for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
+ {
+ dest_SSE = gmx_load_pr(src[0]+i);
+ for (s = 1; s < nsrc; s++)
+ {
+ src_SSE = gmx_load_pr(src[s]+i);
+ dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
+ }
+ gmx_store_pr(dest+i, dest_SSE);
+ }
+ }
+#endif
+}
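+
+/* Note: the SIMD reduction above assumes that i0 and i1 are multiples of
+ * GMX_SIMD_WIDTH_HERE and that dest and src point to SIMD-aligned buffers,
+ * since gmx_load_pr/gmx_store_pr are aligned accesses; the force output
+ * buffers are expected to be allocated with nbnxn_alloc_aligned.
+ */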
+
+/* Add part of the force array(s) from nbnxn_atomdata_t to f */
+static void
+nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
+ const nbnxn_atomdata_t *nbat,
+ nbnxn_atomdata_output_t *out,
+ int nfa,
+ int a0, int a1,
+ rvec *f)
+{
+ int a, i, fa;
+ const int *cell;
+ const real *fnb;
+
+ cell = nbs->cell;
+
+ /* Add the forces in the format-dependent atom order */
+ switch (nbat->FFormat)
+ {
+ case nbatXYZ:
+ case nbatXYZQ:
+ if (nfa == 1)
+ {
+ fnb = out[0].f;
+
+ for (a = a0; a < a1; a++)
+ {
+ i = cell[a]*nbat->fstride;
+
+ f[a][XX] += fnb[i];
+ f[a][YY] += fnb[i+1];
+ f[a][ZZ] += fnb[i+2];
+ }
+ }
+ else
+ {
+ for (a = a0; a < a1; a++)
+ {
+ i = cell[a]*nbat->fstride;
+
+ for (fa = 0; fa < nfa; fa++)
+ {
+ f[a][XX] += out[fa].f[i];
+ f[a][YY] += out[fa].f[i+1];
+ f[a][ZZ] += out[fa].f[i+2];
+ }
+ }
+ }
+ break;
+ case nbatX4:
+ if (nfa == 1)
+ {
+ fnb = out[0].f;
+
+ for (a = a0; a < a1; a++)
+ {
+ i = X4_IND_A(cell[a]);
+
+ f[a][XX] += fnb[i+XX*PACK_X4];
+ f[a][YY] += fnb[i+YY*PACK_X4];
+ f[a][ZZ] += fnb[i+ZZ*PACK_X4];
+ }
+ }
+ else
+ {
+ for (a = a0; a < a1; a++)
+ {
+ i = X4_IND_A(cell[a]);
+
+ for (fa = 0; fa < nfa; fa++)
+ {
+ f[a][XX] += out[fa].f[i+XX*PACK_X4];
+ f[a][YY] += out[fa].f[i+YY*PACK_X4];
+ f[a][ZZ] += out[fa].f[i+ZZ*PACK_X4];
+ }
+ }
+ }
+ break;
+ case nbatX8:
+ if (nfa == 1)
+ {
+ fnb = out[0].f;
+
+ for (a = a0; a < a1; a++)
+ {
+ i = X8_IND_A(cell[a]);
+
+ f[a][XX] += fnb[i+XX*PACK_X8];
+ f[a][YY] += fnb[i+YY*PACK_X8];
+ f[a][ZZ] += fnb[i+ZZ*PACK_X8];
+ }
+ }
+ else
+ {
+ for (a = a0; a < a1; a++)
+ {
+ i = X8_IND_A(cell[a]);
+
+ for (fa = 0; fa < nfa; fa++)
+ {
+ f[a][XX] += out[fa].f[i+XX*PACK_X8];
+ f[a][YY] += out[fa].f[i+YY*PACK_X8];
+ f[a][ZZ] += out[fa].f[i+ZZ*PACK_X8];
+ }
+ }
+ }
+ break;
+ default:
+ gmx_incons("Unsupported nbnxn_atomdata_t format");
+ }
+}
+
+/* Add the force array(s) from nbnxn_atomdata_t to f */
+void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
+ int locality,
+ const nbnxn_atomdata_t *nbat,
+ rvec *f)
+{
+ int a0 = 0, na = 0;
+ int nth, th;
+
+ nbs_cycle_start(&nbs->cc[enbsCCreducef]);
+
+ switch (locality)
+ {
+ case eatAll:
+ a0 = 0;
+ na = nbs->natoms_nonlocal;
+ break;
+ case eatLocal:
+ a0 = 0;
+ na = nbs->natoms_local;
+ break;
+ case eatNonlocal:
+ a0 = nbs->natoms_local;
+ na = nbs->natoms_nonlocal - nbs->natoms_local;
+ break;
+ }
+
+ nth = gmx_omp_nthreads_get(emntNonbonded);
+
+ if (nbat->nout > 1)
+ {
+ if (locality != eatAll)
+ {
+ gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+ }
+
+ /* Reduce the force thread output buffers into buffer 0, before adding
+ * them to the differently ordered "real" force buffer.
+ */
+#pragma omp parallel for num_threads(nth) schedule(static)
+ for (th = 0; th < nth; th++)
+ {
+ const nbnxn_buffer_flags_t *flags;
+ int b0, b1, b;
+ int i0, i1;
+ int nfptr;
+ real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
+ int out;
+
+ flags = &nbat->buffer_flags;
+
+ /* Calculate the cell-block range for our thread */
+ b0 = (flags->nflag* th )/nth;
+ b1 = (flags->nflag*(th+1))/nth;
+
+ for (b = b0; b < b1; b++)
+ {
+ i0 = b *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+ i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+
+ nfptr = 0;
+ for (out = 1; out < nbat->nout; out++)
+ {
+ if (flags->flag[b] & (1U<<out))
+ {
+ fptr[nfptr++] = nbat->out[out].f;
+ }
+ }
+ if (nfptr > 0)
+ {
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_atomdata_reduce_reals_simd
+#else
+ nbnxn_atomdata_reduce_reals
+#endif
+ (nbat->out[0].f,
+ flags->flag[b] & (1U<<0),
+ fptr, nfptr,
+ i0, i1);
+ }
+ else if (!(flags->flag[b] & (1U<<0)))
+ {
+ nbnxn_atomdata_clear_reals(nbat->out[0].f,
+ i0, i1);
+ }
+ }
+ }
+ }
+
+#pragma omp parallel for num_threads(nth) schedule(static)
+ for (th = 0; th < nth; th++)
+ {
+ nbnxn_atomdata_add_nbat_f_to_f_part(nbs, nbat,
+ nbat->out,
+ 1,
+ a0+((th+0)*na)/nth,
+ a0+((th+1)*na)/nth,
+ f);
+ }
+
+ nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
+}
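+
+/* Sketch of the buffer-flag reduction above, with illustrative values:
+ * bit out of flags->flag[b] is set when thread output buffer out wrote
+ * to cell-block b. With flag[b] = 0x5 (bits 0 and 2 set), buffer 2 is
+ * added into buffer 0, which already holds data; with flag[b] = 0x4,
+ * buffer 0 was never written for this block, so it is initialized from
+ * buffer 2 instead of being accumulated into.
+ */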
+
+/* Adds the shift forces from nbnxn_atomdata_t to fshift */
+void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
+ rvec *fshift)
+{
+ const nbnxn_atomdata_output_t *out;
+ int th;
+ int s;
+ rvec sum;
+
+ out = nbat->out;
+
+ for (s = 0; s < SHIFTS; s++)
+ {
+ clear_rvec(sum);
+ for (th = 0; th < nbat->nout; th++)
+ {
+ sum[XX] += out[th].fshift[s*DIM+XX];
+ sum[YY] += out[th].fshift[s*DIM+YY];
+ sum[ZZ] += out[th].fshift[s*DIM+ZZ];
+ }
+ rvec_inc(fshift[s], sum);
+ }
+}
--- /dev/null
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef _nbnxn_consts_h
+#define _nbnxn_consts_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* The number of pair-search sub-cells per super-cell, used for GPU */
+#define GPU_NSUBCELL_Z 2
+#define GPU_NSUBCELL_Y 2
+#define GPU_NSUBCELL_X 2
+#define GPU_NSUBCELL (GPU_NSUBCELL_Z*GPU_NSUBCELL_Y*GPU_NSUBCELL_X)
+/* In the non-bonded GPU kernel we operate on cluster-pairs, not cells.
+ * The number of clusters in a super-cluster matches the number of sub-cells
+ * in a pair-search cell, so we introduce a new name for the same value.
+ */
+#define NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER GPU_NSUBCELL
+
+/* With CPU kernels the i-cluster size is always 4 atoms.
+ * With x86 SIMD the j-cluster size can be 2, 4 or 8, otherwise 4.
+ */
+#define NBNXN_CPU_CLUSTER_I_SIZE 4
+
+#define NBNXN_CPU_CLUSTER_I_SIZE_2LOG 2
+
+/* With GPU kernels the cluster size is 8 atoms */
+#define NBNXN_GPU_CLUSTER_SIZE 8
+
+/* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
+ * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ */
+#define NBNXN_GPU_JGROUP_SIZE 4
+#define NBNXN_GPU_JGROUP_SIZE_2LOG 2
+
+/* To avoid NaN when excluded atoms are at zero distance, we add a small
+ * number to r^2. NBNXN_AVOID_SING_R2_INC^-3 should fit in real.
+ */
+#ifndef GMX_DOUBLE
+#define NBNXN_AVOID_SING_R2_INC 1.0e-12f
+#else
+/* The double prec. x86 SIMD kernels use a single prec. invsqrt, so > 1e-38 */
+#define NBNXN_AVOID_SING_R2_INC 1.0e-36
+#endif
+
+/* Coulomb force table size chosen such that it fits alongside the non-bonded
+ parameters in the texture cache. */
+#define GPU_EWALD_COULOMB_FORCE_TABLE_SIZE 1536
+
+
+/* Strides for x/f with xyz and xyzq coordinate (and charge) storage */
+#define STRIDE_XYZ 3
+#define STRIDE_XYZQ 4
+/* Size of packs of x, y or z with SSE/AVX packed coords/forces */
+#define PACK_X4 4
+#define PACK_X8 8
+/* Strides for a pack of 4 and 8 coordinates/forces */
+#define STRIDE_P4 (DIM*PACK_X4)
+#define STRIDE_P8 (DIM*PACK_X8)
+
+/* Index of atom a into the SSE/AVX coordinate/force array */
+#define X4_IND_A(a) (STRIDE_P4*((a) >> 2) + ((a) & (PACK_X4 - 1)))
+#define X8_IND_A(a) (STRIDE_P8*((a) >> 3) + ((a) & (PACK_X8 - 1)))
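+
+/* Worked example of the index macros above: for atom a = 10 with
+ * PACK_X4 = 4, X4_IND_A(10) = 12*(10 >> 2) + (10 & 3) = 26, so the x
+ * coordinate of atom 10 is at offset 26, with y at 26 + PACK_X4 and
+ * z at 26 + 2*PACK_X4 in the packed array.
+ */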
+
+
++/* Cluster-pair interaction masks for the 4xN and 2xNN kernels.
++ * Bit i*CJ_SIZE + j tells whether atoms i and j interact.
++ */
++/* The all-interaction mask is the same for all kernels */
++#define NBNXN_INTERACTION_MASK_ALL 0xffffffff
++/* 4x4 kernel diagonal mask */
++#define NBNXN_INTERACTION_MASK_DIAG 0x08ce
++/* 4x2 kernel diagonal masks */
++#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002
++#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002F
++/* 4x8 kernel diagonal masks */
++#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfe
++#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0
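++
++/* Worked decoding of NBNXN_INTERACTION_MASK_DIAG above: 0x08ce is
++ * binary 0000 1000 1100 1110. With bit i*4 + j, this sets j = 1,2,3
++ * for i = 0, j = 2,3 for i = 1 and j = 3 for i = 2: exactly the pairs
++ * above the diagonal of a 4x4 cluster pair.
++ */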
++
++
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
- #define NBNXN_MEM_ALIGN (GMX_NBNXN_SIMD_BITWIDTH/8)
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustr
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef _nbnxn_internal_h
+#define _nbnxn_internal_h
+
+#include "typedefs.h"
+#include "domdec.h"
+#include "gmx_cyclecounter.h"
+
++#ifdef GMX_NBNXN_SIMD
++/* The include below sets the SIMD instruction type (precision+width)
++ * for all nbnxn SIMD search and non-bonded kernel code.
++ */
++#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
++#define GMX_USE_HALF_WIDTH_SIMD_HERE
++#endif
++#include "gmx_simd_macros.h"
++#endif
++
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef GMX_X86_SSE2
+/* Use 4-way SIMD for bounding box calculations, which are always done in single precision */
+#define NBNXN_SEARCH_BB_SSE
+#endif
+
+
+#ifdef GMX_NBNXN_SIMD
+/* Memory alignment in bytes as required by SIMD aligned loads/stores */
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
++#define NBNXN_MEM_ALIGN (GMX_SIMD_WIDTH_HERE*sizeof(real))
+#else
+/* No alignment required, but set it so we can call the same routines */
+#define NBNXN_MEM_ALIGN 32
+#endif
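+
+/* For example, 8-wide single precision and 4-wide double precision SIMD
+ * both give NBNXN_MEM_ALIGN = 32 bytes, matching 256-bit AVX aligned
+ * load/store requirements.
+ */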
+
+
+/* A pair-search grid struct for one domain decomposition zone */
+typedef struct {
+ rvec c0; /* The lower corner of the (local) grid */
+ rvec c1; /* The upper corner of the (local) grid */
+ real atom_density; /* The atom number density for the local grid */
+
+ gmx_bool bSimple; /* Is this grid simple or super/sub */
+ int na_c; /* Number of atoms per cluster */
+ int na_cj; /* Number of atoms for list j-clusters */
+ int na_sc; /* Number of atoms per super-cluster */
+ int na_c_2log; /* 2log of na_c */
+
+ int ncx; /* Number of (super-)cells along x */
+ int ncy; /* Number of (super-)cells along y */
+ int nc; /* Total number of (super-)cells */
+
+ real sx; /* x-size of a (super-)cell */
+ real sy; /* y-size of a (super-)cell */
+ real inv_sx; /* 1/sx */
+ real inv_sy; /* 1/sy */
+
+ int cell0; /* Index in nbs->cell corresponding to cell 0 */
+
+ int *cxy_na; /* The number of atoms for each column in x,y */
+ int *cxy_ind; /* Grid (super)cell index, offset from cell0 */
+ int cxy_nalloc; /* Allocation size for cxy_na and cxy_ind */
+
+ int *nsubc; /* The number of sub cells for each super cell */
+ float *bbcz; /* Bounding boxes in z for the super cells */
+ float *bb; /* 3D bounding boxes for the sub cells */
+ float *bbj; /* 3D j-b.boxes for SSE-double or AVX-single */
+ int *flags; /* Flag for the super cells */
+ int nc_nalloc; /* Allocation size for the pointers above */
+
+ float *bbcz_simple; /* bbcz for simple grid converted from super */
+ float *bb_simple; /* bb for simple grid converted from super */
+ int *flags_simple; /* flags for simple grid converted from super */
+ int nc_nalloc_simple; /* Allocation size for the pointers above */
+
+ int nsubc_tot; /* Total number of subcells, used for printing */
+} nbnxn_grid_t;
+
+#ifdef GMX_NBNXN_SIMD
+
+typedef struct nbnxn_x_ci_simd_4xn {
+ /* The i-cluster coordinates for simple search */
+ gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
+ gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
+ gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
+ gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
+} nbnxn_x_ci_simd_4xn_t;
+
+typedef struct nbnxn_x_ci_simd_2xnn {
+ /* The i-cluster coordinates for simple search */
+ gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
+ gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
+} nbnxn_x_ci_simd_2xnn_t;
+
+#endif
+
+/* Working data for the actual i-supercell during pair search */
+typedef struct nbnxn_list_work {
+ gmx_cache_protect_t cp0; /* Protect cache between threads */
+
+ float *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
+ real *x_ci; /* The coordinates, pbc shifted, for each atom */
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
+ nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
+#endif
+ int cj_ind; /* The current cj_ind index for the current list */
+ int cj4_init; /* The first uninitialized cj4 block */
+
+ float *d2; /* Bounding box distance work array */
+
+ nbnxn_cj_t *cj; /* The j-cell list */
+ int cj_nalloc; /* Allocation size of cj */
+
+ int ncj_noq; /* Nr. of cluster pairs without Coul for flop count */
+ int ncj_hlj; /* Nr. of cluster pairs with 1/2 LJ for flop count */
+
+ int *sort; /* Sort index */
+ int sort_nalloc; /* Allocation size of sort */
+
+ nbnxn_sci_t *sci_sort; /* Second sci array, for sorting */
+ int sci_sort_nalloc; /* Allocation size of sci_sort */
+
+ gmx_cache_protect_t cp1; /* Protect cache between threads */
+} nbnxn_list_work_t;
+
+/* Function type for setting the i-atom coordinate working data */
+typedef void
+ gmx_icell_set_x_t (int ci,
+ real shx, real shy, real shz,
+ int na_c,
+ int stride, const real *x,
+ nbnxn_list_work_t *work);
+
+static gmx_icell_set_x_t icell_set_x_simple;
+#ifdef GMX_NBNXN_SIMD
+static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
+static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
+#endif
+static gmx_icell_set_x_t icell_set_x_supersub;
+#ifdef NBNXN_SEARCH_SSE
+static gmx_icell_set_x_t icell_set_x_supersub_sse8;
+#endif
+
+/* Local cycle count struct for profiling */
+typedef struct {
+ int count;
+ gmx_cycles_t c;
+ gmx_cycles_t start;
+} nbnxn_cycle_t;
+
+/* Local cycle count enum for profiling */
+enum {
+ enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr
+};
+
+/* Thread-local work struct, contains part of nbnxn_grid_t */
+typedef struct {
+ gmx_cache_protect_t cp0;
+
+ int *cxy_na;
+ int cxy_na_nalloc;
+
+ int *sort_work;
+ int sort_work_nalloc;
+
+ nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
+
+ int ndistc; /* Number of distance checks for flop counting */
+
+ nbnxn_cycle_t cc[enbsCCnr];
+
+ gmx_cache_protect_t cp1;
+} nbnxn_search_work_t;
+
+/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
+typedef struct nbnxn_search {
+ int ePBC; /* PBC type enum */
+ matrix box; /* The periodic unit-cell */
+
+ gmx_bool DomDec; /* Are we doing domain decomposition? */
+ ivec dd_dim; /* Are we doing DD in x,y,z? */
+ gmx_domdec_zones_t *zones; /* The domain decomposition zones */
+
+ int ngrid; /* The number of grids, equal to #DD-zones */
+ nbnxn_grid_t *grid; /* Array of grids, size ngrid */
+ int *cell; /* Actual allocated cell array for all grids */
+ int cell_nalloc; /* Allocation size of cell */
+ int *a; /* Atom index for grid, the inverse of cell */
+ int a_nalloc; /* Allocation size of a */
+
+ int natoms_local; /* The local atoms run from 0 to natoms_local */
+ int natoms_nonlocal; /* The non-local atoms run from natoms_local
+ * to natoms_nonlocal */
+
+ gmx_bool print_cycles;
+ int search_count;
+ nbnxn_cycle_t cc[enbsCCnr];
+
+ gmx_icell_set_x_t *icell_set_x; /* Function for setting i-coords */
+
+ int nthread_max; /* Maximum number of threads for pair-search */
+ nbnxn_search_work_t *work; /* Work array, size nthread_max */
+} nbnxn_search_t_t;
+
+
+static void nbs_cycle_start(nbnxn_cycle_t *cc)
+{
+ cc->start = gmx_cycles_read();
+}
+
+static void nbs_cycle_stop(nbnxn_cycle_t *cc)
+{
+ cc->c += gmx_cycles_read() - cc->start;
+ cc->count++;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
- #include "nbnxn_kernel_simd_2xnn.h"
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "typedefs.h"
+#include "vec.h"
+#include "smalloc.h"
+#include "force.h"
+#include "gmx_omp_nthreads.h"
+#include "../nbnxn_consts.h"
+#include "nbnxn_kernel_common.h"
+
+#ifdef GMX_NBNXN_SIMD_2XNN
+
- /* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
++/* Include the full width SIMD macros */
++#include "gmx_simd_macros.h"
++#include "gmx_simd_vec.h"
+
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #define GMX_MM128_HERE
- #else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- #define GMX_MM256_HERE
- #else
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
- #endif
++#include "nbnxn_kernel_simd_2xnn.h"
+
- const int simd_width = GMX_SIMD_WIDTH_HERE;
- const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
++#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16)
++#error "unsupported SIMD width"
+#endif
+
++
++/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
++
+/* Analytical reaction-field kernels */
+#define CALC_COUL_RF
+
+#include "nbnxn_kernel_simd_2xnn_includes.h"
+
+#undef CALC_COUL_RF
+
+/* Tabulated exclusion interaction electrostatics kernels */
+#define CALC_COUL_TAB
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_simd_2xnn_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_simd_2xnn_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_TAB
+
+/* Analytical Ewald exclusion interaction electrostatics kernels */
+#define CALC_COUL_EWALD
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_simd_2xnn_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_simd_2xnn_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_EWALD
+
+
+typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f,
+ real *fshift,
+ real *Vvdw,
+ real *Vc);
+
+typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f,
+ real *fshift);
+
+enum {
+ coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR
+};
+
+#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _ener
+static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
+{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _energrp
+static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
+{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _noener
+static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
+{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+#undef NBK_FN
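+
+/* The tables above are indexed as table[coult][nbat->comb_rule]; e.g.
+ * reaction-field electrostatics with a geometric LJ combination rule and
+ * no energies dispatches to nbnxn_kernel_simd_2xnn_rf_comb_geom_noener.
+ */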
+
+
+static void reduce_group_energies(int ng, int ng_2log,
+ const real *VSvdw, const real *VSc,
+ real *Vvdw, real *Vc)
+{
- c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width/2;
++ const int unrollj = GMX_SIMD_WIDTH_HERE/2;
++ const int unrollj_half = unrollj/2;
+ int ng_p2, i, j, j0, j1, c, s;
+
+ ng_p2 = (1<<ng_2log);
+
+ /* The size of the x86 SIMD energy group buffer array is:
+ * ng*ng*ng_p2*unrollj_half*unrollj
+ */
+ for (i = 0; i < ng; i++)
+ {
+ for (j = 0; j < ng; j++)
+ {
+ Vvdw[i*ng+j] = 0;
+ Vc[i*ng+j] = 0;
+ }
+
+ for (j1 = 0; j1 < ng; j1++)
+ {
+ for (j0 = 0; j0 < ng; j0++)
+ {
- c += simd_width/2 + 2;
++ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
+ for (s = 0; s < unrollj_half; s++)
+ {
+ Vvdw[i*ng+j0] += VSvdw[c+0];
+ Vvdw[i*ng+j1] += VSvdw[c+1];
+ Vc [i*ng+j0] += VSc [c+0];
+ Vc [i*ng+j1] += VSc [c+1];
++ c += unrollj + 2;
+ }
+ }
+ }
+ }
+}
+
+#endif /* GMX_NBNXN_SIMD_2XNN */
+
+void
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t gmx_unused *nbl_list,
+ const nbnxn_atomdata_t gmx_unused *nbat,
+ const interaction_const_t gmx_unused *ic,
+ int gmx_unused ewald_excl,
+ rvec gmx_unused *shift_vec,
+ int gmx_unused force_flags,
+ int gmx_unused clearF,
+ real gmx_unused *fshift,
+ real gmx_unused *Vc,
+ real gmx_unused *Vvdw)
+#ifdef GMX_NBNXN_SIMD_2XNN
+{
+ int nnbl;
+ nbnxn_pairlist_t **nbl;
+ int coult;
+ int nb;
+
+ nnbl = nbl_list->nnbl;
+ nbl = nbl_list->nbl;
+
+ if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ {
+ coult = coultRF;
+ }
+ else
+ {
+ if (ewald_excl == ewaldexclTable)
+ {
+ if (ic->rcoulomb == ic->rvdw)
+ {
+ coult = coultTAB;
+ }
+ else
+ {
+ coult = coultTAB_TWIN;
+ }
+ }
+ else
+ {
+ if (ic->rcoulomb == ic->rvdw)
+ {
+ coult = coultEWALD;
+ }
+ else
+ {
+ coult = coultEWALD_TWIN;
+ }
+ }
+ }
+
+#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ for (nb = 0; nb < nnbl; nb++)
+ {
+ nbnxn_atomdata_output_t *out;
+ real *fshift_p;
+
+ out = &nbat->out[nb];
+
+ if (clearF == enbvClearFYes)
+ {
+ clear_f(nbat, nb, out->f);
+ }
+
+ if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
+ {
+ fshift_p = fshift;
+ }
+ else
+ {
+ fshift_p = out->fshift;
+
+ if (clearF == enbvClearFYes)
+ {
+ clear_fshift(fshift_p);
+ }
+ }
+
+ /* With Ewald-type electrostatics the forces on excluded atom pairs
+ * should not contribute to the virial sum. The exclusion forces
+ * are not calculated in the energy kernels, but they are in the
+ * _noener kernels.
+ */
+ if (!((force_flags & GMX_FORCE_ENERGY) ||
+ (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+ {
+ /* Don't calculate energies */
+ p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
+ ic,
+ shift_vec,
+ out->f,
+ fshift_p);
+ }
+ else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+ {
+ /* No energy groups */
+ out->Vvdw[0] = 0;
+ out->Vc[0] = 0;
+
+ p_nbk_ener[coult][nbat->comb_rule](nbl[nb], nbat,
+ ic,
+ shift_vec,
+ out->f,
+ fshift_p,
+ out->Vvdw,
+ out->Vc);
+ }
+ else
+ {
+ /* Calculate energy group contributions */
+ int i;
+
+ for (i = 0; i < out->nVS; i++)
+ {
+ out->VSvdw[i] = 0;
+ }
+ for (i = 0; i < out->nVS; i++)
+ {
+ out->VSc[i] = 0;
+ }
+
+ p_nbk_energrp[coult][nbat->comb_rule](nbl[nb], nbat,
+ ic,
+ shift_vec,
+ out->f,
+ fshift_p,
+ out->VSvdw,
+ out->VSc);
+
+ reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
+ out->VSvdw, out->VSc,
+ out->Vvdw, out->Vc);
+ }
+ }
+
+ if (force_flags & GMX_FORCE_ENERGY)
+ {
+ reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
+ }
+}
+#else
+{
+ gmx_incons("nbnxn_kernel_simd_2xnn called while GROMACS was configured without 2x(N+N) SIMD kernels enabled");
+}
+#endif
--- /dev/null
- #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel duplicates the data for N j-particles in
+ * 2xN wide SIMD registers to operate on 2 i-particles at once.
+ * This leads to 4/2=2 sets of most instructions. Therefore we call
+ * this kernel 2x(N+N) = 2xnn
+ *
+ * This 2xnn kernel is basically the 4xn equivalent with half the registers
+ * and instructions removed.
+ *
+ * An alternative would be to load two different clusters of N j-particles
+ * into SIMD registers, giving a 4x(N+N) kernel. This doubles the amount
+ * of instructions, which could lead to better scheduling. But we actually
+ * observed worse scheduling for the AVX-256 4x8 normal analytical PME
+ * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
+ * It could be worth trying this option, but it takes some more effort.
+ */
+
+
+/* When calculating RF or Ewald interactions we calculate the electrostatic
+ * forces on excluded atom pairs here in the non-bonded loops.
+ * But when energies and/or the virial are required we calculate them
+ * separately, as it is then easier to separate the energy and virial
+ * contributions.
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+/* Without exclusions and energies we only need to mask the cut-off,
+ * which can be faster with blendv.
+ */
- gmx_mm_pr int_S0;
- gmx_mm_pr int_S2;
++#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV && !defined COUNT_PAIRS
+/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ * With gcc this is slower, except for RF on Sandy Bridge.
+ * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ */
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#define CUTOFF_BLENDV
+#endif
+/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ * Tested with icc 13.
+ */
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#define CUTOFF_BLENDV
+#endif
+#endif
+
+{
+ int cj, aj, ajx, ajy, ajz;
+
+#ifdef ENERGY_GROUPS
+ /* Energy group indices for two atoms packed into one int */
+ int egp_jj[UNROLLJ/2];
+#endif
+
+#ifdef CHECK_EXCLS
+ /* Interaction (non-exclusion) mask of all 1's or 0's */
- gmx_mm_pr wco_S0;
- gmx_mm_pr wco_S2;
++ gmx_mm_pb interact_S0;
++ gmx_mm_pb interact_S2;
+#endif
+
+ gmx_mm_pr jx_S, jy_S, jz_S;
+ gmx_mm_pr dx_S0, dy_S0, dz_S0;
+ gmx_mm_pr dx_S2, dy_S2, dz_S2;
+ gmx_mm_pr tx_S0, ty_S0, tz_S0;
+ gmx_mm_pr tx_S2, ty_S2, tz_S2;
+ gmx_mm_pr rsq_S0, rinv_S0, rinvsq_S0;
+ gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2;
+#ifndef CUTOFF_BLENDV
+ /* wco: within cut-off, mask of all 1's or 0's */
- gmx_mm_pr wco_vdw_S0;
++ gmx_mm_pb wco_S0;
++ gmx_mm_pb wco_S2;
+#endif
+#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr wco_vdw_S2;
++ gmx_mm_pb wco_vdw_S0;
+#ifndef HALF_LJ
- /* Load integer interaction mask */
++ gmx_mm_pb wco_vdw_S2;
+#endif
+#endif
+#ifdef CALC_COULOMB
+#ifdef CHECK_EXCLS
+ /* 1/r masked with the interaction mask */
+ gmx_mm_pr rinv_ex_S0;
+ gmx_mm_pr rinv_ex_S2;
+#endif
+ gmx_mm_pr jq_S;
+ gmx_mm_pr qq_S0;
+ gmx_mm_pr qq_S2;
+#ifdef CALC_COUL_TAB
+ /* The force (PME mesh force) we need to subtract from 1/r^2 */
+ gmx_mm_pr fsub_S0;
+ gmx_mm_pr fsub_S2;
+#endif
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr brsq_S0, brsq_S2;
+ gmx_mm_pr ewcorr_S0, ewcorr_S2;
+#endif
+
+ /* frcoul = (1/r - fsub)*r */
+ gmx_mm_pr frcoul_S0;
+ gmx_mm_pr frcoul_S2;
+#ifdef CALC_COUL_TAB
+ /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+ gmx_mm_pr r_S0, rs_S0, rf_S0, frac_S0;
+ gmx_mm_pr r_S2, rs_S2, rf_S2, frac_S2;
+ /* Table index: rs truncated to an int */
+ gmx_epi32 ti_S0, ti_S2;
+ /* Linear force table values */
+ gmx_mm_pr ctab0_S0, ctab1_S0;
+ gmx_mm_pr ctab0_S2, ctab1_S2;
+#ifdef CALC_ENERGIES
+ /* Quadratic energy table value */
+ gmx_mm_pr ctabv_S0;
+ gmx_mm_pr ctabv_S2;
+#endif
+#endif
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ /* The potential (PME mesh) we need to subtract from 1/r */
+ gmx_mm_pr vc_sub_S0;
+ gmx_mm_pr vc_sub_S2;
+#endif
+#ifdef CALC_ENERGIES
+ /* Electrostatic potential */
+ gmx_mm_pr vcoul_S0;
+ gmx_mm_pr vcoul_S2;
+#endif
+#endif
+ /* The force times 1/r */
+ gmx_mm_pr fscal_S0;
+ gmx_mm_pr fscal_S2;
+
+#ifdef CALC_LJ
+#ifdef LJ_COMB_LB
+ /* LJ sigma_j/2 and sqrt(epsilon_j) */
+ gmx_mm_pr hsig_j_S, seps_j_S;
+ /* LJ sigma_ij and epsilon_ij */
+ gmx_mm_pr sig_S0, eps_S0;
+#ifndef HALF_LJ
+ gmx_mm_pr sig_S2, eps_S2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sig2_S0, sig6_S0;
+#ifndef HALF_LJ
+ gmx_mm_pr sig2_S2, sig6_S2;
+#endif
+#endif /* CALC_ENERGIES */
+#endif /* LJ_COMB_LB */
+
+#ifdef LJ_COMB_GEOM
+ gmx_mm_pr c6s_j_S, c12s_j_S;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ /* Index for loading LJ parameters, complicated when interleaving */
+ int aj2;
+#endif
+
+#ifndef FIX_LJ_C
+ /* LJ C6 and C12 parameters, used with geometric comb. rule */
+ gmx_mm_pr c6_S0, c12_S0;
+#ifndef HALF_LJ
+ gmx_mm_pr c6_S2, c12_S2;
+#endif
+#endif
+
+ /* Intermediate variables for LJ calculation */
+#ifndef LJ_COMB_LB
+ gmx_mm_pr rinvsix_S0;
+#ifndef HALF_LJ
+ gmx_mm_pr rinvsix_S2;
+#endif
+#endif
+#ifdef LJ_COMB_LB
+ gmx_mm_pr sir_S0, sir2_S0, sir6_S0;
+#ifndef HALF_LJ
+ gmx_mm_pr sir_S2, sir2_S2, sir6_S2;
+#endif
+#endif
+
+ gmx_mm_pr FrLJ6_S0, FrLJ12_S0;
+#ifndef HALF_LJ
+ gmx_mm_pr FrLJ6_S2, FrLJ12_S2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr VLJ6_S0, VLJ12_S0, VLJ_S0;
+#ifndef HALF_LJ
+ gmx_mm_pr VLJ6_S2, VLJ12_S2, VLJ_S2;
+#endif
+#endif
+#endif /* CALC_LJ */
+
+ gmx_mm_hpr fjx_S, fjy_S, fjz_S;
+
+ /* j-cluster index */
+ cj = l_cj[cjind].cj;
+
+ /* Atom indices (of the first atom in the cluster) */
+ aj = cj*UNROLLJ;
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+#if UNROLLJ == STRIDE
+ aj2 = aj*2;
+#else
+ aj2 = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+#endif
+#if UNROLLJ == STRIDE
+ ajx = aj*DIM;
+#else
+ ajx = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+ ajy = ajx + STRIDE;
+ ajz = ajy + STRIDE;
+
+#ifdef CHECK_EXCLS
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ {
- int_S0 = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
- int_S2 = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
++ /* Load integer topology exclusion interaction mask */
++ gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
++
++ interact_S0 = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
++ interact_S2 = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
++ }
++#else
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
++ {
++ /* Integer mask set, cast to real and real mask operations */
+ gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
+
- gmx_loaddh_pr(jx_S, x+ajx);
- gmx_loaddh_pr(jy_S, x+ajy);
- gmx_loaddh_pr(jz_S, x+ajz);
++ interact_S0 = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
++ interact_S2 = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
+ }
++#else
++#error "No SIMD bitmask operation available"
++#endif
+#endif
++#endif /* CHECK_EXCLS */
+
+ /* load j atom coordinates */
- wco_S0 = gmx_and_pr(wco_S0, diag_S0);
- wco_S2 = gmx_and_pr(wco_S2, diag_S2);
++ gmx_loaddh_pr(&jx_S, x+ajx);
++ gmx_loaddh_pr(&jy_S, x+ajy);
++ gmx_loaddh_pr(&jz_S, x+ajz);
+
+ /* Calculate distance */
+ dx_S0 = gmx_sub_pr(ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(iz_S0, jz_S);
+ dx_S2 = gmx_sub_pr(ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(iz_S2, jz_S);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+
+#ifndef CUTOFF_BLENDV
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
+#endif
+
+#ifdef CHECK_EXCLS
+#ifdef EXCL_FORCES
+ /* Only remove the (sub-)diagonal to avoid double counting */
+#if UNROLLJ == UNROLLI
+ if (cj == ci_sh)
+ {
- wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
- wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask_S0);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask_S2);
+ }
+#else
+#if UNROLLJ == 2*UNROLLI
+ if (cj*2 == ci_sh)
+ {
- wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
- wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2);
+ }
+ else if (cj*2 + 1 == ci_sh)
+ {
- wco_S0 = gmx_and_pr(wco_S0, int_S0);
- wco_S2 = gmx_and_pr(wco_S2, int_S2);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2);
+ }
+#else
+#error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
+#endif
+#endif
+#else /* EXCL_FORCES */
+ /* No exclusion forces: remove all excluded atom pairs from the list */
- rsq_S0 = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
- rsq_S2 = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
++ wco_S0 = gmx_and_pb(wco_S0, interact_S0);
++ wco_S2 = gmx_and_pb(wco_S2, interact_S2);
+#endif
+#endif
+
+#ifdef COUNT_PAIRS
+ {
+ int i, j;
+ real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
+ tmp = gmx_simd_align_real(tmpa);
+ for (i = 0; i < UNROLLI; i+=2)
+ {
+ gmx_store_pr(tmp, i == 0 ? wco_S0 : wco_S2);
+ for (j = 0; j < 2*UNROLLJ; j++)
+ {
+ if (!(tmp[j] == 0))
+ {
+ npair++;
+ }
+ }
+ }
+ }
+#endif
+
+#ifdef CHECK_EXCLS
+ /* For excluded pairs add a small number to avoid r^-6 = NaN */
- gmx_loaddh_pr(jq_S, q+aj);
++ rsq_S0 = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
++ rsq_S2 = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
+#endif
+
+ /* Calculate 1/r */
+ rinv_S0 = gmx_invsqrt_pr(rsq_S0);
+ rinv_S2 = gmx_invsqrt_pr(rsq_S2);
+
+#ifdef CALC_COULOMB
+ /* Load parameters for j atom */
- load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_S0, c12_S0);
++ gmx_loaddh_pr(&jq_S, q+aj);
+ qq_S0 = gmx_mul_pr(iq_S0, jq_S);
+ qq_S2 = gmx_mul_pr(iq_S2, jq_S);
+#endif
+
+#ifdef CALC_LJ
+
+#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params2(nbfp2, nbfp3, type, aj, c6_S2, c12_S2);
++ load_lj_pair_params2(nbfp0, nbfp1, type, aj, &c6_S0, &c12_S0);
+#ifndef HALF_LJ
- gmx_loaddh_pr(c6s_j_S, ljc+aj2+0);
- gmx_loaddh_pr(c12s_j_S, ljc+aj2+STRIDE);
++ load_lj_pair_params2(nbfp2, nbfp3, type, aj, &c6_S2, &c12_S2);
+#endif
+#endif /* not defined any LJ rule */
+
+#ifdef LJ_COMB_GEOM
- gmx_loaddh_pr(hsig_j_S, ljc+aj2+0);
- gmx_loaddh_pr(seps_j_S, ljc+aj2+STRIDE);
++ gmx_loaddh_pr(&c6s_j_S, ljc+aj2+0);
++ gmx_loaddh_pr(&c12s_j_S, ljc+aj2+STRIDE);
+ c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S );
+#ifndef HALF_LJ
+ c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S );
+#endif
+ c12_S0 = gmx_mul_pr(c12s_S0, c12s_j_S);
+#ifndef HALF_LJ
+ c12_S2 = gmx_mul_pr(c12s_S2, c12s_j_S);
+#endif
+#endif /* LJ_COMB_GEOM */
+
+#ifdef LJ_COMB_LB
- rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, int_S0);
- rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, int_S2);
++ gmx_loaddh_pr(&hsig_j_S, ljc+aj2+0);
++ gmx_loaddh_pr(&seps_j_S, ljc+aj2+STRIDE);
+
+ sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S);
+ eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S);
+#ifndef HALF_LJ
+ sig_S2 = gmx_add_pr(hsig_i_S2, hsig_j_S);
+ eps_S2 = gmx_mul_pr(seps_i_S2, seps_j_S);
+#endif
+#endif /* LJ_COMB_LB */
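Written out in scalar form (an illustrative sketch, not part of the patch), the Lorentz-Berthelot combination performed here reduces to one add and one multiply, because the per-atom data already stores sigma/2 (hsig) and sqrt(epsilon) (seps):

    /* sigma_ij = sigma_i/2 + sigma_j/2, eps_ij = sqrt(eps_i)*sqrt(eps_j) */
    static void lb_combine(double hsig_i, double seps_i,
                           double hsig_j, double seps_j,
                           double *sig_ij, double *eps_ij)
    {
        *sig_ij = hsig_i + hsig_j;
        *eps_ij = seps_i*seps_j;
    }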
+
+#endif /* CALC_LJ */
+
+#ifndef CUTOFF_BLENDV
+ rinv_S0 = gmx_blendzero_pr(rinv_S0, wco_S0);
+ rinv_S2 = gmx_blendzero_pr(rinv_S2, wco_S2);
+#else
+ /* We only need to mask for the cut-off: blendv is faster */
+ rinv_S0 = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
+ rinv_S2 = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
+#endif
+
+ rinvsq_S0 = gmx_mul_pr(rinv_S0, rinv_S0);
+ rinvsq_S2 = gmx_mul_pr(rinv_S2, rinv_S2);
+
+#ifdef CALC_COULOMB
+ /* Note that here we calculate force*r, not the usual force/r.
+ * This allows avoiding masking the reaction-field contribution,
+ * as frcoul is later multiplied by rinvsq which has been
+ * masked with the cut-off check.
+ */
+
+#ifdef EXCL_FORCES
+ /* Only add 1/r for non-excluded atom pairs */
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
++ rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, interact_S0);
++ rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, interact_S2);
+#else
+ /* No exclusion forces, we always need 1/r */
+#define rinv_ex_S0 rinv_S0
+#define rinv_ex_S2 rinv_S2
+#endif
+
+#ifdef CALC_COUL_RF
+ /* Electrostatic interactions */
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
++ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
++ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
+
+#ifdef CALC_ENERGIES
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
+#endif
+#endif
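As a plain-C reference for the reaction-field expressions above (illustrative; it mirrors the constants set up in the outer loop: mrc_3 = -2*k_rf, hrc_3 = k_rf, moh_rc = -c_rf):

    /* Reaction field, scalar form; frcoul is force*r, so the later
     * multiplication by 1/r^2 yields the usual force/r.
     */
    static void rf_scalar(double qq, double r, double k_rf, double c_rf,
                          double *frcoul, double *vcoul)
    {
        double rsq = r*r;

        *frcoul = qq*(1/r - 2*k_rf*rsq);       /* force*r           */
        *vcoul  = qq*(1/r + k_rf*rsq - c_rf);  /* shifted potential */
    }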
+
+#ifdef CALC_COUL_EWALD
+ /* We need to mask (or limit) rsq for the cut-off,
+ * as large distances can cause an overflow in gmx_pmecorrF/V.
+ */
+#ifndef CUTOFF_BLENDV
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
+#else
+ /* Strangely, putting mul on a separate line is slower (icc 13) */
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
+#endif
+ ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
+ ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
- #ifdef GMX_HAVE_SIMD_FLOOR
++ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
++ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
+
+#ifdef CALC_ENERGIES
+ vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
+ vc_sub_S2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
+#endif
+
+#endif /* CALC_COUL_EWALD */
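For reference, a scalar sketch of what the analytical Ewald branch computes (illustrative: the kernel uses the polynomial approximations gmx_pmecorrF/gmx_pmecorrV rather than calling erfc/exp directly). With V(r) = qq*erfc(beta*r)/r, force*r is:

    #include <math.h>

    static double ewald_frcoul_scalar(double qq, double r, double beta)
    {
        double br = beta*r;

        /* erfc(br)/r + (2*beta/sqrt(pi))*exp(-beta^2*r^2); the kernel
         * rewrites this as 1/r + ewcorr*beta^2*r^2, i.e. ewcorr*brsq is
         * the deviation from plain Coulomb.
         */
        return qq*(erfc(br)/r + beta*M_2_SQRTPI*exp(-br*br));
    }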
+
+#ifdef CALC_COUL_TAB
+ /* Electrostatic interactions */
+ r_S0 = gmx_mul_pr(rsq_S0, rinv_S0);
+ r_S2 = gmx_mul_pr(rsq_S2, rinv_S2);
+ /* Convert r to scaled table units */
+ rs_S0 = gmx_mul_pr(r_S0, invtsp_S);
+ rs_S2 = gmx_mul_pr(r_S2, invtsp_S);
+ /* Truncate scaled r to an int */
+ ti_S0 = gmx_cvttpr_epi32(rs_S0);
+ ti_S2 = gmx_cvttpr_epi32(rs_S2);
- load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
- load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
++#ifdef GMX_SIMD_HAVE_FLOOR
+ rf_S0 = gmx_floor_pr(rs_S0);
+ rf_S2 = gmx_floor_pr(rs_S2);
+#else
+ rf_S0 = gmx_cvtepi32_pr(ti_S0);
+ rf_S2 = gmx_cvtepi32_pr(ti_S2);
+#endif
+ frac_S0 = gmx_sub_pr(rs_S0, rf_S0);
+ frac_S2 = gmx_sub_pr(rs_S2, rf_S2);
+
+ /* Load and interpolate table forces and possibly energies.
+ * Force and energy can be combined in one table, stride 4: FDV0,
+ * or in two separate tables with stride 1: F and V.
+ * Currently single precision uses FDV0, double precision uses F and V.
+ */
+#ifndef CALC_ENERGIES
- load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
++ load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
++ load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
+#else
+#ifdef TAB_FDV0
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
++ load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++ load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
+#else
- vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
- vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
++ load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++ load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
+#endif
+#endif
+ fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
+ fsub_S2 = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
+
+#ifdef CALC_ENERGIES
+ vc_sub_S0 = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
+ vc_sub_S2 = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
+#endif
+#endif /* CALC_COUL_TAB */
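A scalar reference for the table interpolation above (illustrative, written for the separate F/V stride-1 layout; invtsp is the inverse table spacing, so mhalfsp = -0.5/invtsp):

    static void tab_coul_scalar(const double *F, const double *V,
                                double qq, double r, double rinv,
                                double invtsp, double *frcoul, double *vc_sub)
    {
        double rs    = r*invtsp;
        int    ti    = (int)rs;            /* truncation == floor for r >= 0 */
        double frac  = rs - ti;
        double ctab0 = F[ti];
        double ctab1 = F[ti+1] - F[ti];    /* table difference D             */
        double fsub  = ctab0 + frac*ctab1; /* linear force interpolation     */

        *frcoul = qq*(rinv - fsub*r);
        /* quadratic energy, consistent with the linearly interpolated force */
        *vc_sub = V[ti] - 0.5*(frac/invtsp)*(ctab0 + fsub);
    }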
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+#ifndef NO_SHIFT_EWALD
+ /* Add Ewald potential shift to vc_sub for convenience */
+#ifdef CHECK_EXCLS
- rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, int_S0);
++ vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
++ vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
+#else
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S);
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S);
+#endif
+#endif
+
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
+#endif
+
+#ifdef CALC_ENERGIES
+ /* Mask energy for cut-off and diagonal */
+ vcoul_S0 = gmx_blendzero_pr(vcoul_S0, wco_S0);
+ vcoul_S2 = gmx_blendzero_pr(vcoul_S2, wco_S2);
+#endif
+
+#endif /* CALC_COULOMB */
+
+#ifdef CALC_LJ
+ /* Lennard-Jones interaction */
+
+#ifdef VDW_CUTOFF_CHECK
+ wco_vdw_S0 = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
+#ifndef HALF_LJ
+ wco_vdw_S2 = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
+#endif
+#else
+ /* Same cut-off for Coulomb and VdW, reuse the registers */
+#define wco_vdw_S0 wco_S0
+#define wco_vdw_S2 wco_S2
+#endif
+
+#ifndef LJ_COMB_LB
+ rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
+#ifdef EXCL_FORCES
- rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, int_S2);
++ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, interact_S0);
+#endif
+#ifndef HALF_LJ
+ rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
+#ifdef EXCL_FORCES
- sir6_S0 = gmx_blendzero_pr(sir6_S0, int_S0);
++ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, interact_S2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
+#ifndef HALF_LJ
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
+#endif
+#endif
+ FrLJ6_S0 = gmx_mul_pr(c6_S0, rinvsix_S0);
+#ifndef HALF_LJ
+ FrLJ6_S2 = gmx_mul_pr(c6_S2, rinvsix_S2);
+#endif
+ FrLJ12_S0 = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
+#ifndef HALF_LJ
+ FrLJ12_S2 = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
+#endif
+#endif /* not LJ_COMB_LB */
+
+#ifdef LJ_COMB_LB
+ sir_S0 = gmx_mul_pr(sig_S0, rinv_S0);
+#ifndef HALF_LJ
+ sir_S2 = gmx_mul_pr(sig_S2, rinv_S2);
+#endif
+ sir2_S0 = gmx_mul_pr(sir_S0, sir_S0);
+#ifndef HALF_LJ
+ sir2_S2 = gmx_mul_pr(sir_S2, sir_S2);
+#endif
+ sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
+#ifdef EXCL_FORCES
- sir6_S2 = gmx_blendzero_pr(sir6_S2, int_S2);
++ sir6_S0 = gmx_blendzero_pr(sir6_S0, interact_S0);
+#endif
+#ifndef HALF_LJ
+ sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
+#ifdef EXCL_FORCES
- VLJ_S0 = gmx_blendzero_pr(VLJ_S0, int_S0);
++ sir6_S2 = gmx_blendzero_pr(sir6_S2, interact_S2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
+#ifndef HALF_LJ
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
+#endif
+#endif
+ FrLJ6_S0 = gmx_mul_pr(eps_S0, sir6_S0);
+#ifndef HALF_LJ
+ FrLJ6_S2 = gmx_mul_pr(eps_S2, sir6_S2);
+#endif
+ FrLJ12_S0 = gmx_mul_pr(FrLJ6_S0, sir6_S0);
+#ifndef HALF_LJ
+ FrLJ12_S2 = gmx_mul_pr(FrLJ6_S2, sir6_S2);
+#endif
+#if defined CALC_ENERGIES
+ /* We need C6 and C12 to calculate the LJ potential shift */
+ sig2_S0 = gmx_mul_pr(sig_S0, sig_S0);
+#ifndef HALF_LJ
+ sig2_S2 = gmx_mul_pr(sig_S2, sig_S2);
+#endif
+ sig6_S0 = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
+#ifndef HALF_LJ
+ sig6_S2 = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
+#endif
+ c6_S0 = gmx_mul_pr(eps_S0, sig6_S0);
+#ifndef HALF_LJ
+ c6_S2 = gmx_mul_pr(eps_S2, sig6_S2);
+#endif
+ c12_S0 = gmx_mul_pr(c6_S0, sig6_S0);
+#ifndef HALF_LJ
+ c12_S2 = gmx_mul_pr(c6_S2, sig6_S2);
+#endif
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifdef CALC_ENERGIES
+#ifdef ENERGY_GROUPS
+ /* Extract the group pair index per j pair.
+ * Energy groups are stored per i-cluster, so things get
+ * complicated when the i- and j-cluster size don't match.
+ */
+ {
+ int egps_j;
+#if UNROLLJ == 2
+ egps_j = nbat->energrp[cj>>1];
+ egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+#else
+ /* We assume UNROLLI <= UNROLLJ */
+ int jdi;
+ for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
+ {
+ int jj;
+ egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ for (jj = 0; jj < (UNROLLI/2); jj++)
+ {
+ egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ }
+ }
+#endif
+ }
+#endif
+
+#ifdef CALC_COULOMB
+#ifndef ENERGY_GROUPS
+ vctot_S = gmx_add_pr(vctot_S, gmx_add_pr(vcoul_S0, vcoul_S2));
+#else
+ add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
+ add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
+#endif
+#endif
+
+#ifdef CALC_LJ
+ /* Calculate the LJ energies */
+ VLJ6_S0 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
+#ifndef HALF_LJ
+ VLJ6_S2 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
+#endif
+ VLJ12_S0 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
+#ifndef HALF_LJ
+ VLJ12_S2 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
+#endif
+
+ VLJ_S0 = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
+#endif
+ /* The potential shift should be removed for pairs beyond cut-off */
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
+#endif
+#ifdef CHECK_EXCLS
+ /* The potential shift should be removed for excluded pairs */
- VLJ_S2 = gmx_blendzero_pr(VLJ_S2, int_S2);
++ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, interact_S0);
+#ifndef HALF_LJ
- gmx_load_hpr(fjx_S, f+ajx);
- gmx_load_hpr(fjy_S, f+ajy);
- gmx_load_hpr(fjz_S, f+ajz);
++ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, interact_S2);
+#endif
+#endif
+#ifndef ENERGY_GROUPS
+ Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
+#ifndef HALF_LJ
+ gmx_add_pr(VLJ_S0, VLJ_S2)
+#else
+ VLJ_S0
+#endif
+ );
+#else
+ add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
+#ifndef HALF_LJ
+ add_ener_grp_halves(VLJ_S2, vvdwtp[2], vvdwtp[3], egp_jj);
+#endif
+#endif
+#endif /* CALC_LJ */
+#endif /* CALC_ENERGIES */
+
+#ifdef CALC_LJ
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_S0,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+#else
+ fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0);
+#endif /* CALC_LJ */
+#if defined CALC_LJ && !defined HALF_LJ
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_S2,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+#else
+ /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
+ fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2);
+#endif
+
+ /* Calculate temporary vectorial force */
+ tx_S0 = gmx_mul_pr(fscal_S0, dx_S0);
+ tx_S2 = gmx_mul_pr(fscal_S2, dx_S2);
+ ty_S0 = gmx_mul_pr(fscal_S0, dy_S0);
+ ty_S2 = gmx_mul_pr(fscal_S2, dy_S2);
+ tz_S0 = gmx_mul_pr(fscal_S0, dz_S0);
+ tz_S2 = gmx_mul_pr(fscal_S2, dz_S2);
+
+ /* Increment i atom force */
+ fix_S0 = gmx_add_pr(fix_S0, tx_S0);
+ fix_S2 = gmx_add_pr(fix_S2, tx_S2);
+ fiy_S0 = gmx_add_pr(fiy_S0, ty_S0);
+ fiy_S2 = gmx_add_pr(fiy_S2, ty_S2);
+ fiz_S0 = gmx_add_pr(fiz_S0, tz_S0);
+ fiz_S2 = gmx_add_pr(fiz_S2, tz_S2);
+
+ /* Decrement j atom force */
++ gmx_load_hpr(&fjx_S, f+ajx);
++ gmx_load_hpr(&fjy_S, f+ajy);
++ gmx_load_hpr(&fjz_S, f+ajz);
+ gmx_store_hpr(f+ajx, gmx_sub_hpr(fjx_S, gmx_sum4_hpr(tx_S0, tx_S2)));
+ gmx_store_hpr(f+ajy, gmx_sub_hpr(fjy_S, gmx_sum4_hpr(ty_S0, ty_S2)));
+ gmx_store_hpr(f+ajz, gmx_sub_hpr(fjz_S, gmx_sum4_hpr(tz_S0, tz_S2)));
+}
+
+#undef rinv_ex_S0
+#undef rinv_ex_S2
+
+#undef wco_vdw_S0
+#undef wco_vdw_S2
+
+#undef CUTOFF_BLENDV
+
+#undef EXCL_FORCES
--- /dev/null
- /* Include the full width SIMD macros */
- #include "gmx_simd_macros.h"
-
-
- /* Define a few macros for half-width SIMD */
- #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
-
- /* Half-width SIMD real type */
- #define gmx_mm_hpr __m128
-
- /* Half-width SIMD operations */
- /* Load reals at half-width aligned pointer b into half-width SIMD register a */
- #define gmx_load_hpr(a, b) a = _mm_load_ps(b)
- /* Load one real at pointer b into half-width SIMD register a */
- #define gmx_load1_hpr(a, b) a = _mm_load1_ps(b)
- /* Load one real at b and one real at b+1 into halves of a, respectively */
- #define gmx_load1p1_pr(a, b) a = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)), _mm_load1_ps(b+1), 0x1)
- /* Load reals at half-width aligned pointer b into two halves of a */
- #define gmx_loaddh_pr(a, b) a = gmx_mm256_load4_ps(b)
- /* To half-width SIMD register b into half width aligned memory a */
- #define gmx_store_hpr(a, b) _mm_store_ps(a, b)
- #define gmx_add_hpr _mm_add_ps
- #define gmx_sub_hpr _mm_sub_ps
- /* Horizontal sum over a half SIMD register */
- #define gmx_sum4_hpr gmx_mm256_sum4h_m128
-
- #else
- #error "Half-width SIMD macros are not yet defined"
- #endif
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+
- #define SIMD_MASK_ALL 0xffffffff
++/* Half-width SIMD operations are required here.
++ * As the 4xn kernels are the "standard" kernels and some special operations
++ * are required only here, we define those in nbnxn_kernel_simd_utils_...
++ *
++ * Half-width SIMD real type:
++ * gmx_mm_hpr
++ *
++ * Half-width SIMD operations
++ * Load reals at half-width aligned pointer b into half-width SIMD register a:
++ * gmx_load_hpr(a, b)
++ * Set all entries in half-width SIMD register *a to b:
++ * gmx_set1_hpr(a, b)
++ * Load one real at b and one real at b+1 into halves of a, respectively:
++ * gmx_load1p1_pr(a, b)
++ * Load reals at half-width aligned pointer b into two halves of a:
++ * gmx_loaddh_pr(a, b)
++ * Store half-width SIMD register b into half width aligned memory a:
++ * gmx_store_hpr(a, b)
++ * gmx_add_hpr(a, b)
++ * gmx_sub_hpr(a, b)
++ * Sum over 4 half SIMD registers:
++ * gmx_sum4_hpr(a, b)
++ * Sum the elements of the halves of each input register and store the sums in out:
++ * gmx_mm_transpose_sum4h_pr(a, b)
++ * Extract two half-width registers *b, *c from a full width register a:
++ * gmx_pr_to_2hpr(a, b, c)
++ */
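For AVX-256 single precision these could be bound roughly as below (a sketch based on the inline macros this patch removes from the kernel file; the real definitions live in nbnxn_kernel_simd_utils_..., and gmx_loaddh_pr via _mm256_broadcast_ps is just one possible implementation):

    #include <immintrin.h>

    #define gmx_mm_hpr          __m128    /* half of a __m256 */

    #define gmx_load_hpr(a, b)  (*(a) = _mm_load_ps(b))
    #define gmx_set1_hpr(a, b)  (*(a) = _mm_set1_ps(b))
    #define gmx_store_hpr(a, b) _mm_store_ps(a, b)
    #define gmx_add_hpr         _mm_add_ps
    #define gmx_sub_hpr         _mm_sub_ps
    /* Load 4 reals into both 128-bit halves of a 256-bit register */
    #define gmx_loaddh_pr(a, b) (*(a) = _mm256_broadcast_ps((const __m128 *)(b)))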
+
+
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
+
+/* The stride of all the atom data arrays is equal to half the SIMD width */
+#define STRIDE (GMX_SIMD_WIDTH_HERE/2)
+
+#if GMX_SIMD_WIDTH_HERE == 8
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#else
+#if GMX_SIMD_WIDTH_HERE == 16
+/* This is getting ridiculous; SIMD horizontal adds would help,
+ * but this is not performance critical (it is only used to reduce energies).
+ */
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
+#else
+#error "unsupported kernel configuration"
+#endif
+#endif
+
+
+#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+/* AVX-256 single precision 2x(4+4) kernel,
+ * we can do half SIMD-width aligned FDV0 table loads.
+ */
+#define TAB_FDV0
+#endif
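The FDV0 layout assumed here packs, for each table point, the force value, the force difference to the next point, the energy and a padding zero into four consecutive reals, so one aligned 4-wide load fetches F, D and V together. This layout is consistent with the tab_coul_F[2] self-energy lookup further down; the packing helper below is illustrative (it assumes F has n+1 points):

    static void pack_fdv0(int n, const real *F, const real *V, real *fdv0)
    {
        int i;

        for (i = 0; i < n; i++)
        {
            fdv0[4*i+0] = F[i];
            fdv0[4*i+1] = F[i+1] - F[i];   /* D */
            fdv0[4*i+2] = V[i];
            fdv0[4*i+3] = 0;               /* padding */
        }
    }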
+
++/* Currently a stride of 4 for the 2 LJ parameters is hard-coded */
++#define NBFP_STRIDE 4
+
- #if UNROLLJ >= 4
- #ifndef GMX_DOUBLE
- __m128 fix_S, fiy_S, fiz_S;
- #else
- __m256d fix_S, fiy_S, fiz_S;
- #endif
- #else
- __m128d fix0_S, fiy0_S, fiz0_S;
- __m128d fix2_S, fiy2_S, fiz2_S;
- #endif
+
+#include "nbnxn_kernel_simd_utils.h"
+
+/* All functionality defines are set here, except for:
+ * CALC_ENERGIES and ENERGY_GROUPS, which are defined before;
+ * CHECK_EXCLS, which is set just before including the inner loop contents.
+ * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
+ * set before calling the kernel function. We might want to move that
+ * to inside the n-loop and have a different combination rule for different
+ * ci's, as using no combination rule gives a 50% performance hit for LJ.
+ */
+
+/* We always calculate shift forces, because it's cheap anyhow */
+#define CALC_SHIFTFORCES
+
+/* Assumes all LJ parameters are identical */
+/* #define FIX_LJ_C */
+
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
+
+#if defined LJ_COMB_GEOM
+#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
+#else
+#if defined LJ_COMB_LB
+#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
+#else
+#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
+#endif
+#endif
+
+#ifdef CALC_COUL_RF
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
+#endif
+#ifdef CALC_COUL_TAB
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
+#else
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
+#else
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
+#endif
+#endif
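For example, with CALC_COUL_RF and LJ_COMB_GEOM defined and plain energies requested, the token pasting resolves as follows (a worked expansion, matching the name pattern used in the function-pointer tables of the kernel driver):

    /* NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener)
     *  -> NBK_FUNC_NAME_C(nbnxn_kernel_simd_2xnn, rf, ener)
     *  -> NBK_FUNC_NAME_C_LJC(nbnxn_kernel_simd_2xnn, rf, geom, ener)
     *  -> nbnxn_kernel_simd_2xnn_rf_comb_geom_ener
     */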
+
+static void
+#ifndef CALC_ENERGIES
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, noener)
+#else
+#ifndef ENERGY_GROUPS
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener)
+#else
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, energrp)
+#endif
+#endif
+#undef NBK_FUNC_NAME
+#undef NBK_FUNC_NAME_C
+#undef NBK_FUNC_NAME_C_LJC
+(const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f
+#ifdef CALC_SHIFTFORCES
+ ,
+ real *fshift
+#endif
+#ifdef CALC_ENERGIES
+ ,
+ real *Vvdw,
+ real *Vc
+#endif
+)
+{
+ const nbnxn_ci_t *nbln;
+ const nbnxn_cj_t *l_cj;
+ const int *type;
+ const real *q;
+ const real *shiftvec;
+ const real *x;
+ const real *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
+ real facel;
+ real *nbfp_ptr;
+ int nbfp_stride;
+ int n, ci, ci_sh;
+ int ish, ish3;
+ gmx_bool do_LJ, half_LJ, do_coul;
+ int sci, scix, sciy, sciz, sci2;
+ int cjind0, cjind1, cjind;
+ int ip, jp;
+
+#ifdef ENERGY_GROUPS
+ int Vstride_i;
+ int egps_ishift, egps_imask;
+ int egps_jshift, egps_jmask, egps_jstride;
+ int egps_i;
+ real *vvdwtp[UNROLLI];
+ real *vctp[UNROLLI];
+#endif
+
+ gmx_mm_pr shX_S;
+ gmx_mm_pr shY_S;
+ gmx_mm_pr shZ_S;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
+ gmx_mm_pr fix_S0, fiy_S0, fiz_S0;
+ gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
- gmx_mm_pr diag_jmi_S;
++ /* We use an i-force SIMD register width of 4;
++ * the pr4 types and operations are defined in nbnxn_kernel_simd_utils.h.
++ */
++ gmx_mm_pr4 fix_S, fiy_S, fiz_S;
+
- gmx_mm_pr diag_S0, diag_S2;
++ gmx_mm_pr diagonal_jmi_S;
+#if UNROLLI == UNROLLJ
- gmx_mm_pr diag0_S0, diag0_S2;
- gmx_mm_pr diag1_S0, diag1_S2;
++ gmx_mm_pb diagonal_mask_S0, diagonal_mask_S2;
+#else
- gmx_mm_pr mask_S0, mask_S2;
++ gmx_mm_pb diagonal_mask0_S0, diagonal_mask0_S2;
++ gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S2;
+#endif
+
- #ifndef GMX_DOUBLE
++ unsigned *excl_filter;
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++ gmx_epi32 filter_S0, filter_S2;
++#else
++ gmx_mm_pr filter_S0, filter_S2;
++#endif
+
+ gmx_mm_pr zero_S = gmx_set1_pr(0);
+
+ gmx_mm_pr one_S = gmx_set1_pr(1.0);
+ gmx_mm_pr iq_S0 = gmx_setzero_pr();
+ gmx_mm_pr iq_S2 = gmx_setzero_pr();
+ gmx_mm_pr mrc_3_S;
+#ifdef CALC_ENERGIES
+ gmx_mm_pr hrc_3_S, moh_rc_S;
+#endif
+
+#ifdef CALC_COUL_TAB
+ /* Coulomb table variables */
+ gmx_mm_pr invtsp_S;
+ const real *tab_coul_F;
+#ifndef TAB_FDV0
+ const real *tab_coul_V;
+#endif
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2;
+#ifdef CALC_ENERGIES
+ gmx_mm_pr mhalfsp_S;
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr beta2_S, beta_S;
+#endif
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ gmx_mm_pr sh_ewald_S;
+#endif
+
+#ifdef LJ_COMB_LB
+ const real *ljc;
+
+ gmx_mm_pr hsig_i_S0, seps_i_S0;
+ gmx_mm_pr hsig_i_S2, seps_i_S2;
+#else
+#ifdef FIX_LJ_C
+ real pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE];
+ real *pvdw_c6, *pvdw_c12;
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S2, c12_S2;
+#endif
+
+#ifdef LJ_COMB_GEOM
+ const real *ljc;
+
+ gmx_mm_pr c6s_S0, c12s_S0;
+ gmx_mm_pr c6s_S1, c12s_S1;
+ gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
+#endif
+#endif /* LJ_COMB_LB */
+
+ gmx_mm_pr vctot_S, Vvdwtot_S;
+ gmx_mm_pr sixth_S, twelveth_S;
+
+ gmx_mm_pr avoid_sing_S;
+ gmx_mm_pr rc2_S;
+#ifdef VDW_CUTOFF_CHECK
+ gmx_mm_pr rcvdw2_S;
+#endif
+
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sh_invrc6_S, sh_invrc12_S;
+
+ /* cppcheck-suppress unassignedVariable */
+ real tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum;
+#endif
+#ifdef CALC_SHIFTFORCES
+ /* cppcheck-suppress unassignedVariable */
+ real shf_array[2*GMX_SIMD_WIDTH_HERE], *shf;
+#endif
+
+ int ninner;
+
+#ifdef COUNT_PAIRS
+ int npair = 0;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ ljc = nbat->lj_comb;
+#else
+ /* No combination rule used */
- #define NBFP_STRIDE 4
++#if NBFP_STRIDE == 2
++ nbfp_ptr = nbat->nbfp;
++#else
++#if NBFP_STRIDE == 4
+ nbfp_ptr = nbat->nbfp_s4;
- nbfp_ptr = nbat->nbfp;
- #define NBFP_STRIDE 2
+#else
- diag_jmi_S = gmx_load_pr(nbat->simd_2xnn_diag);
++#error "Only NBFP_STRIDE 2 and 4 are currently supported"
++#endif
+#endif
+ nbfp_stride = NBFP_STRIDE;
+#endif
+
+ /* Load j-i for the first i */
- diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
++ diagonal_jmi_S = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i);
+ /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
- diag0_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag0_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag1_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag1_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
++ diagonal_mask_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+#else
+#if 2*UNROLLI == UNROLLJ
- mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*2*UNROLLJ);
- mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*2*UNROLLJ);
++ diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+#endif
+#endif
+
+ /* Load masks for topology exclusion masking */
- #if UNROLLJ == 2
- if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
- #endif
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#define FILTER_STRIDE (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
++#else
++#ifdef GMX_DOUBLE
++#define FILTER_STRIDE 2
++#else
++#define FILTER_STRIDE 1
++#endif
++#endif
++#if FILTER_STRIDE == 1
++ excl_filter = nbat->simd_exclusion_filter1;
++#else
++ excl_filter = nbat->simd_exclusion_filter2;
++#endif
++ /* Here we cast the exclusion filters from unsigned * to int * or real *.
++ * Since we only check bits, the actual value they represent does not
++ * matter, as long as both filter and mask data are treated the same way.
++ */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++ filter_S0 = gmx_load_si((int *)excl_filter + 0*2*UNROLLJ*FILTER_STRIDE);
++ filter_S2 = gmx_load_si((int *)excl_filter + 1*2*UNROLLJ*FILTER_STRIDE);
++#else
++ filter_S0 = gmx_load_pr((real *)excl_filter + 0*2*UNROLLJ);
++ filter_S2 = gmx_load_pr((real *)excl_filter + 1*2*UNROLLJ);
++#endif
++#undef FILTER_STRIDE
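The bit-check these filters feed is easiest to see in scalar form (an illustrative model, not the kernel code): each j-slot owns one bit of the pair-list exclusion word, and a pair interacts only when its bit is set.

    static void check_excl_scalar(unsigned excl, const unsigned *filter,
                                  int n, int interact[])
    {
        int j;

        for (j = 0; j < n; j++)
        {
            /* all-ones when the bit is set, zero for an excluded pair */
            interact[j] = (excl & filter[j]) ? ~0 : 0;
        }
    }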
+
+#ifdef CALC_COUL_TAB
+ /* Generate aligned table index pointers */
+ ti0 = gmx_simd_align_int(ti0_array);
+ ti2 = gmx_simd_align_int(ti2_array);
+
+ invtsp_S = gmx_set1_pr(ic->tabq_scale);
+#ifdef CALC_ENERGIES
+ mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
+#endif
+
+#ifdef TAB_FDV0
+ tab_coul_F = ic->tabq_coul_FDV0;
+#else
+ tab_coul_F = ic->tabq_coul_F;
+ tab_coul_V = ic->tabq_coul_V;
+#endif
+#endif /* CALC_COUL_TAB */
+
+#ifdef CALC_COUL_EWALD
+ beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_S = gmx_set1_pr(ic->ewaldcoeff);
+#endif
+
+#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+ sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
+#endif
+
+ q = nbat->q;
+ type = nbat->type;
+ facel = ic->epsfac;
+ shiftvec = shift_vec[0];
+ x = nbat->x;
+
+ avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+
+ /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+ rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+#ifdef VDW_CUTOFF_CHECK
+ rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
+#endif
+
+#ifdef CALC_ENERGIES
+ sixth_S = gmx_set1_pr(1.0/6.0);
+ twelveth_S = gmx_set1_pr(1.0/12.0);
+
+ sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+#endif
+
+ mrc_3_S = gmx_set1_pr(-2*ic->k_rf);
+
+#ifdef CALC_ENERGIES
+ hrc_3_S = gmx_set1_pr(ic->k_rf);
+
+ moh_rc_S = gmx_set1_pr(-ic->c_rf);
+#endif
+
+#ifdef CALC_ENERGIES
+ tmpsum = gmx_simd_align_real(tmpsum_array);
+#endif
+#ifdef CALC_SHIFTFORCES
+ shf = gmx_simd_align_real(shf_array);
+#endif
+
+#ifdef FIX_LJ_C
+ pvdw_c6 = gmx_simd_align_real(pvdw_array);
+ pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+
+ for (jp = 0; jp < UNROLLJ; jp++)
+ {
+ pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+
+ pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ }
+ c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+#endif /* FIX_LJ_C */
+
+#ifdef ENERGY_GROUPS
+ egps_ishift = nbat->neg_2log;
+ egps_imask = (1<<egps_ishift) - 1;
+ egps_jshift = 2*nbat->neg_2log;
+ egps_jmask = (1<<egps_jshift) - 1;
+ egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+ /* Major division is over i-particle energy groups, determine the stride */
+ Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+#endif
+
+ l_cj = nbl->cj;
+
+ ninner = 0;
+ for (n = 0; n < nbl->nci; n++)
+ {
+ nbln = &nbl->ci[n];
+
+ ish = (nbln->shift & NBNXN_CI_SHIFT);
+ ish3 = ish*3;
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
+ ci = nbln->ci;
+ ci_sh = (ish == CENTRAL ? ci : -1);
+
+ shX_S = gmx_load1_pr(shiftvec+ish3);
+ shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_S = gmx_load1_pr(shiftvec+ish3+2);
+
+#if UNROLLJ <= 4
+ sci = ci*STRIDE;
+ scix = sci*DIM;
+ sci2 = sci*2;
+#else
+ sci = (ci>>1)*STRIDE;
+ scix = sci*DIM + (ci & 1)*(STRIDE>>1);
+ sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
+ sci += (ci & 1)*(STRIDE>>1);
+#endif
+
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
+ do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
+
+#ifdef ENERGY_GROUPS
+ egps_i = nbat->energrp[ci];
+ {
+ int ia, egp_ia;
+
+ for (ia = 0; ia < UNROLLI; ia++)
+ {
+ egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
+ vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+ vctp[ia] = Vc + egp_ia*Vstride_i;
+ }
+ }
+#endif
+#if defined CALC_ENERGIES
+#if UNROLLJ == 4
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+#endif
- gmx_load1p1_pr(ix_S0, x+scix);
- gmx_load1p1_pr(ix_S2, x+scix+2);
- gmx_load1p1_pr(iy_S0, x+sciy);
- gmx_load1p1_pr(iy_S2, x+sciy+2);
- gmx_load1p1_pr(iz_S0, x+sciz);
- gmx_load1p1_pr(iz_S2, x+sciz+2);
+#if UNROLLJ == 8
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+#endif
+ {
+ int ia;
+ real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+ Vc_sub_self = 0.5*ic->c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef TAB_FDV0
+ Vc_sub_self = 0.5*tab_coul_F[2];
+#else
+ Vc_sub_self = 0.5*tab_coul_V[0];
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+ /* beta/sqrt(pi) */
+ Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+#endif
+
+ for (ia = 0; ia < UNROLLI; ia++)
+ {
+ real qi;
+
+ qi = q[sci+ia];
+#ifdef ENERGY_GROUPS
+ vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+#else
+ Vc[0]
+#endif
+ -= facel*qi*qi*Vc_sub_self;
+ }
+ }
+#endif
+
+ /* Load i atom data */
+ sciy = scix + STRIDE;
+ sciz = sciy + STRIDE;
- gmx_load1p1_pr(iq_S0, q+sci);
- gmx_load1p1_pr(iq_S2, q+sci+2);
++ gmx_load1p1_pr(&ix_S0, x+scix);
++ gmx_load1p1_pr(&ix_S2, x+scix+2);
++ gmx_load1p1_pr(&iy_S0, x+sciy);
++ gmx_load1p1_pr(&iy_S2, x+sciy+2);
++ gmx_load1p1_pr(&iz_S0, x+sciz);
++ gmx_load1p1_pr(&iz_S2, x+sciz+2);
+ ix_S0 = gmx_add_pr(ix_S0, shX_S);
+ ix_S2 = gmx_add_pr(ix_S2, shX_S);
+ iy_S0 = gmx_add_pr(iy_S0, shY_S);
+ iy_S2 = gmx_add_pr(iy_S2, shY_S);
+ iz_S0 = gmx_add_pr(iz_S0, shZ_S);
+ iz_S2 = gmx_add_pr(iz_S2, shZ_S);
+
+ if (do_coul)
+ {
+ gmx_mm_pr facel_S;
+
+ facel_S = gmx_set1_pr(facel);
+
- gmx_load1p1_pr(hsig_i_S0, ljc+sci2+0);
- gmx_load1p1_pr(hsig_i_S2, ljc+sci2+2);
- gmx_load1p1_pr(seps_i_S0, ljc+sci2+STRIDE+0);
- gmx_load1p1_pr(seps_i_S2, ljc+sci2+STRIDE+2);
++ gmx_load1p1_pr(&iq_S0, q+sci);
++ gmx_load1p1_pr(&iq_S2, q+sci+2);
+ iq_S0 = gmx_mul_pr(facel_S, iq_S0);
+ iq_S2 = gmx_mul_pr(facel_S, iq_S2);
+ }
+
+#ifdef LJ_COMB_LB
- gmx_load1p1_pr(c6s_S0, ljc+sci2+0);
++ gmx_load1p1_pr(&hsig_i_S0, ljc+sci2+0);
++ gmx_load1p1_pr(&hsig_i_S2, ljc+sci2+2);
++ gmx_load1p1_pr(&seps_i_S0, ljc+sci2+STRIDE+0);
++ gmx_load1p1_pr(&seps_i_S2, ljc+sci2+STRIDE+2);
+#else
+#ifdef LJ_COMB_GEOM
- gmx_load1p1_pr(c6s_S2, ljc+sci2+2);
++ gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
+ if (!half_LJ)
+ {
- gmx_load1p1_pr(c12s_S0, ljc+sci2+STRIDE+0);
++ gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
+ }
- gmx_load1p1_pr(c12s_S2, ljc+sci2+STRIDE+2);
++ gmx_load1p1_pr(&c12s_S0, ljc+sci2+STRIDE+0);
+ if (!half_LJ)
+ {
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++ gmx_load1p1_pr(&c12s_S2, ljc+sci2+STRIDE+2);
+ }
+#else
+ nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
+ nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+ if (!half_LJ)
+ {
+ nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+ nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+ }
+#endif
+#endif
+
+ /* Zero the potential energy for this list */
+ Vvdwtot_S = gmx_setzero_pr();
+ vctot_S = gmx_setzero_pr();
+
+ /* Clear i atom forces */
+ fix_S0 = gmx_setzero_pr();
+ fix_S2 = gmx_setzero_pr();
+ fiy_S0 = gmx_setzero_pr();
+ fiy_S2 = gmx_setzero_pr();
+ fiz_S0 = gmx_setzero_pr();
+ fiz_S2 = gmx_setzero_pr();
+
+ cjind = cjind0;
+
+ /* Currently all kernels use (at least half) LJ */
+#define CALC_LJ
+ if (half_LJ)
+ {
+#define CALC_COULOMB
+#define HALF_LJ
+#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for (; (cjind < cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+#undef HALF_LJ
+#undef CALC_COULOMB
+ }
+ else if (do_coul)
+ {
+#define CALC_COULOMB
+#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for (; (cjind < cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+#undef CALC_COULOMB
+ }
+ else
+ {
+#define CHECK_EXCLS
- #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
- #define gmx_load_pr4 _mm_load_ps
- #define gmx_store_pr4 _mm_store_ps
- #define gmx_add_pr4 _mm_add_ps
- #else
- #error "You need to define 4-width SIM macros for i-force reduction"
- #endif
- GMX_MM_TRANSPOSE_SUM4H_PR(fix_S0, fix_S2, fix_S);
++ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for (; (cjind < cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+ }
+ }
+#undef CALC_LJ
+ ninner += cjind1 - cjind0;
+
+ /* Add accumulated i-forces to the force array */
- GMX_MM_TRANSPOSE_SUM4H_PR(fiy_S0, fiy_S2, fiy_S);
++ fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
+ gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
+
- GMX_MM_TRANSPOSE_SUM4H_PR(fiz_S0, fiz_S2, fiz_S);
++ fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
+ gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
+
- #undef gmx_load_pr4
- #undef gmx_store_pr4
- #undef gmx_store_pr4
-
++ fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
+ gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
+
+#ifdef CALC_SHIFTFORCES
+ gmx_store_pr4(shf, fix_S);
+ fshift[ish3+0] += SUM_SIMD4(shf);
+ gmx_store_pr4(shf, fiy_S);
+ fshift[ish3+1] += SUM_SIMD4(shf);
+ gmx_store_pr4(shf, fiz_S);
+ fshift[ish3+2] += SUM_SIMD4(shf);
+#endif
+
+#ifdef CALC_ENERGIES
+ if (do_coul)
+ {
+ gmx_store_pr(tmpsum, vctot_S);
+ *Vc += SUM_SIMD(tmpsum);
+ }
+
+ gmx_store_pr(tmpsum, Vvdwtot_S);
+ *Vvdw += SUM_SIMD(tmpsum);
+#endif
+
+ /* Outer loop uses 6 flops/iteration */
+ }
+
+#ifdef COUNT_PAIRS
+ printf("atom pairs %d\n", npair);
+#endif
+}
+
+
-
- #undef gmx_mm_hpr
-
- #undef gmx_load_hpr
- #undef gmx_load1_hpr
- #undef gmx_load1p1_pr
- #undef gmx_loaddh_pr
- #undef gmx_store_hpr
- #undef gmx_add_hpr
- #undef gmx_sub_hpr
-
- #undef gmx_sum4_hpr
+#undef CALC_SHIFTFORCES
+
+#undef UNROLLI
+#undef UNROLLJ
+#undef STRIDE
+#undef TAB_FDV0
+#undef NBFP_STRIDE
--- /dev/null
- #include "nbnxn_kernel_simd_4xn.h"
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "typedefs.h"
+#include "vec.h"
+#include "smalloc.h"
+#include "force.h"
+#include "gmx_omp_nthreads.h"
+#include "../nbnxn_consts.h"
+#include "nbnxn_kernel_common.h"
+
+#ifdef GMX_NBNXN_SIMD_4XN
+
- /* Include all flavors of the SSE or AVX 4xN kernel loops */
++#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
++#define GMX_USE_HALF_WIDTH_SIMD_HERE
++#endif
++#include "gmx_simd_macros.h"
++#include "gmx_simd_vec.h"
+
- #if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
++#include "nbnxn_kernel_simd_4xn.h"
+
- const int simd_width = GMX_SIMD_WIDTH_HERE;
- const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
++#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8)
++#error "unsupported SIMD width"
+#endif
+
++
++/* Include all flavors of the SSE or AVX 4xN kernel loops */
++
+/* Analytical reaction-field kernels */
+#define CALC_COUL_RF
+
+#include "nbnxn_kernel_simd_4xn_includes.h"
+
+#undef CALC_COUL_RF
+
+/* Tabulated exclusion interaction electrostatics kernels */
+#define CALC_COUL_TAB
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_simd_4xn_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_simd_4xn_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_TAB
+
+/* Analytical Ewald exclusion interaction electrostatics kernels */
+#define CALC_COUL_EWALD
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_simd_4xn_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_simd_4xn_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_EWALD
+
+
+typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f,
+ real *fshift,
+ real *Vvdw,
+ real *Vc);
+
+typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f,
+ real *fshift);
+
+enum {
+ coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR
+};
+
+#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _ener
+static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
+{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _energrp
+static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
+{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _noener
+static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
+{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
+ { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
+ { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
+ { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
+ { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
+#undef NBK_FN
+
+
+static void reduce_group_energies(int ng, int ng_2log,
+ const real *VSvdw, const real *VSc,
+ real *Vvdw, real *Vc)
+{
- c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width;
++ const int unrollj = GMX_SIMD_WIDTH_HERE;
++ const int unrollj_half = unrollj/2;
+ int ng_p2, i, j, j0, j1, c, s;
+
+ ng_p2 = (1<<ng_2log);
+
+ /* The size of the x86 SIMD energy group buffer array is:
+ * ng*ng*ng_p2*unrollj_half*unrollj
+ */
+ for (i = 0; i < ng; i++)
+ {
+ for (j = 0; j < ng; j++)
+ {
+ Vvdw[i*ng+j] = 0;
+ Vc[i*ng+j] = 0;
+ }
+
+ for (j1 = 0; j1 < ng; j1++)
+ {
+ for (j0 = 0; j0 < ng; j0++)
+ {
- c += simd_width + 2;
++ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
+ for (s = 0; s < unrollj_half; s++)
+ {
+ Vvdw[i*ng+j0] += VSvdw[c+0];
+ Vvdw[i*ng+j1] += VSvdw[c+1];
+ Vc [i*ng+j0] += VSc [c+0];
+ Vc [i*ng+j1] += VSc [c+1];
++ c += unrollj + 2;
+ }
+ }
+ }
+ }
+}
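A small driver (hypothetical sizes, for illustration only) makes the buffer walk explicit: each (i, j1, j0) block is unrollj_half*unrollj reals long, and within it each step of s reads the interleaved (j0, j1) energy pair at offsets 0 and 1 before advancing by unrollj + 2.

    #include <stdio.h>

    static void print_energrp_layout_example(void)
    {
        /* Assumed sizes: ng = 2 groups, ng_2log = 1 (ng_p2 = 2), unrollj = 4 */
        int ng = 2, ng_p2 = 2, unrollj = 4, unrollj_half = 2;
        int i, j0, j1, s;

        for (i = 0; i < ng; i++)
        {
            for (j1 = 0; j1 < ng; j1++)
            {
                for (j0 = 0; j0 < ng; j0++)
                {
                    int c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;

                    for (s = 0; s < unrollj_half; s++)
                    {
                        printf("i=%d j0=%d -> VS[%d], i=%d j1=%d -> VS[%d]\n",
                               i, j0, c, i, j1, c + 1);
                        c += unrollj + 2;
                    }
                }
            }
        }
    }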
+
+#endif /* GMX_NBNXN_SIMD_4XN */
+
+void
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t *nbl_list,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ int ewald_excl,
+ rvec *shift_vec,
+ int force_flags,
+ int clearF,
+ real *fshift,
+ real *Vc,
+ real *Vvdw)
+#ifdef GMX_NBNXN_SIMD_4XN
+{
+ int nnbl;
+ nbnxn_pairlist_t **nbl;
+ int coult;
+ int nb;
+
+ nnbl = nbl_list->nnbl;
+ nbl = nbl_list->nbl;
+
+ if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ {
+ coult = coultRF;
+ }
+ else
+ {
+ if (ewald_excl == ewaldexclTable)
+ {
+ if (ic->rcoulomb == ic->rvdw)
+ {
+ coult = coultTAB;
+ }
+ else
+ {
+ coult = coultTAB_TWIN;
+ }
+ }
+ else
+ {
+ if (ic->rcoulomb == ic->rvdw)
+ {
+ coult = coultEWALD;
+ }
+ else
+ {
+ coult = coultEWALD_TWIN;
+ }
+ }
+ }
+
+#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ for (nb = 0; nb < nnbl; nb++)
+ {
+ nbnxn_atomdata_output_t *out;
+ real *fshift_p;
+
+ out = &nbat->out[nb];
+
+ if (clearF == enbvClearFYes)
+ {
+ clear_f(nbat, nb, out->f);
+ }
+
+ if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
+ {
+ fshift_p = fshift;
+ }
+ else
+ {
+ fshift_p = out->fshift;
+
+ if (clearF == enbvClearFYes)
+ {
+ clear_fshift(fshift_p);
+ }
+ }
+
+ /* With Ewald type electrostatics the forces for excluded atom pairs
+ * should not contribute to the virial sum. The exclusion forces
+ * are not calculated in the energy kernels, but they are in _noener.
+ */
+ if (!((force_flags & GMX_FORCE_ENERGY) ||
+ (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+ {
+ /* Don't calculate energies */
+ p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
+ ic,
+ shift_vec,
+ out->f,
+ fshift_p);
+ }
+ else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+ {
+ /* No energy groups */
+ out->Vvdw[0] = 0;
+ out->Vc[0] = 0;
+
+ p_nbk_ener[coult][nbat->comb_rule](nbl[nb], nbat,
+ ic,
+ shift_vec,
+ out->f,
+ fshift_p,
+ out->Vvdw,
+ out->Vc);
+ }
+ else
+ {
+ /* Calculate energy group contributions */
+ int i;
+
+ for (i = 0; i < out->nVS; i++)
+ {
+ out->VSvdw[i] = 0;
+ }
+ for (i = 0; i < out->nVS; i++)
+ {
+ out->VSc[i] = 0;
+ }
+
+ p_nbk_energrp[coult][nbat->comb_rule](nbl[nb], nbat,
+ ic,
+ shift_vec,
+ out->f,
+ fshift_p,
+ out->VSvdw,
+ out->VSc);
+
+ reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
+ out->VSvdw, out->VSc,
+ out->Vvdw, out->Vc);
+ }
+ }
+
+ if (force_flags & GMX_FORCE_ENERGY)
+ {
+ reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
+ }
+}
+#else
+{
+ gmx_incons("nbnxn_kernel_simd_4xn called while GROMACS was configured without 4xN SIMD kernels enabled");
+}
+#endif
--- /dev/null
- #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel calculates interactions of 4 i-atoms
+ * with N j-atoms stored in N-wide SIMD registers.
+ */
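+
+/* In scalar form, one cluster pair evaluated here corresponds to
+ * (a sketch, not additional code):
+ * for (i = 0; i < UNROLLI; i++) // unrolled below as _S0.._S3
+ * for (j = 0; j < UNROLLJ; j++) // one SIMD operation per i
+ * f_ij = F(|x_i - x_j|);
+ * i.e. each SIMD instruction processes one i-atom against all j-atoms.
+ */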
+
+
+/* When calculating RF or Ewald interactions we calculate the electrostatic
+ * forces on excluded atom pairs here in the non-bonded loops.
+ * But when energies and/or the virial are required, we calculate them
+ * separately, as it is then easier to separate the energy and virial
+ * contributions.
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+/* Without exclusions and energies we only need to mask for the cut-off;
+ * this can be faster when gmx_blendv_pr is defined, i.e. an instruction
+ * that selects from two SIMD registers based on the contents of a third.
+ */
- gmx_mm_pr int_S0;
- gmx_mm_pr int_S1;
- gmx_mm_pr int_S2;
- gmx_mm_pr int_S3;
++#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV
+/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ * With gcc this is slower, except for RF on Sandy Bridge.
+ * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ */
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#define NBNXN_CUTOFF_USE_BLENDV
+#endif
+/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ * Tested with icc 13.
+ */
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#define NBNXN_CUTOFF_USE_BLENDV
+#endif
+#endif
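+
+/* A scalar sketch of the two cut-off masking variants chosen above:
+ * cmp+and: wco = (rsq < rc2) ? all-bits : 0; rinv = rinv & wco;
+ * sub+blendv: rinv = blendv(rinv, 0, rc2 - rsq); // sign bit selects
+ * Both zero rinv beyond the cut-off; blendv avoids the separate boolean
+ * register, which can be faster on some compiler/CPU combinations.
+ */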
+
+{
+ int cj, aj, ajx, ajy, ajz;
+
+#ifdef ENERGY_GROUPS
+ /* Energy group indices for two atoms packed into one int */
+ int egp_jj[UNROLLJ/2];
+#endif
+
+#ifdef CHECK_EXCLS
+ /* Interaction (non-exclusion) mask of all 1's or 0's */
- gmx_mm_pr wco_S0;
- gmx_mm_pr wco_S1;
- gmx_mm_pr wco_S2;
- gmx_mm_pr wco_S3;
++ gmx_mm_pb interact_S0;
++ gmx_mm_pb interact_S1;
++ gmx_mm_pb interact_S2;
++ gmx_mm_pb interact_S3;
+#endif
+
+ gmx_mm_pr jx_S, jy_S, jz_S;
+ gmx_mm_pr dx_S0, dy_S0, dz_S0;
+ gmx_mm_pr dx_S1, dy_S1, dz_S1;
+ gmx_mm_pr dx_S2, dy_S2, dz_S2;
+ gmx_mm_pr dx_S3, dy_S3, dz_S3;
+ gmx_mm_pr tx_S0, ty_S0, tz_S0;
+ gmx_mm_pr tx_S1, ty_S1, tz_S1;
+ gmx_mm_pr tx_S2, ty_S2, tz_S2;
+ gmx_mm_pr tx_S3, ty_S3, tz_S3;
+ gmx_mm_pr rsq_S0, rinv_S0, rinvsq_S0;
+ gmx_mm_pr rsq_S1, rinv_S1, rinvsq_S1;
+ gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2;
+ gmx_mm_pr rsq_S3, rinv_S3, rinvsq_S3;
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ /* wco: within cut-off, mask of all 1's or 0's */
- gmx_mm_pr wco_vdw_S0;
- gmx_mm_pr wco_vdw_S1;
++ gmx_mm_pb wco_S0;
++ gmx_mm_pb wco_S1;
++ gmx_mm_pb wco_S2;
++ gmx_mm_pb wco_S3;
+#endif
+#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr wco_vdw_S2;
- gmx_mm_pr wco_vdw_S3;
++ gmx_mm_pb wco_vdw_S0;
++ gmx_mm_pb wco_vdw_S1;
+#ifndef HALF_LJ
- #ifdef gmx_checkbitmask_epi32
++ gmx_mm_pb wco_vdw_S2;
++ gmx_mm_pb wco_vdw_S3;
+#endif
+#endif
+#ifdef CALC_COULOMB
+#ifdef CHECK_EXCLS
+ /* 1/r masked with the interaction mask */
+ gmx_mm_pr rinv_ex_S0;
+ gmx_mm_pr rinv_ex_S1;
+ gmx_mm_pr rinv_ex_S2;
+ gmx_mm_pr rinv_ex_S3;
+#endif
+ gmx_mm_pr jq_S;
+ gmx_mm_pr qq_S0;
+ gmx_mm_pr qq_S1;
+ gmx_mm_pr qq_S2;
+ gmx_mm_pr qq_S3;
+#ifdef CALC_COUL_TAB
+ /* The force (PME mesh force) we need to subtract from 1/r^2 */
+ gmx_mm_pr fsub_S0;
+ gmx_mm_pr fsub_S1;
+ gmx_mm_pr fsub_S2;
+ gmx_mm_pr fsub_S3;
+#endif
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr brsq_S0, brsq_S1, brsq_S2, brsq_S3;
+ gmx_mm_pr ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
+#endif
+
+ /* frcoul = (1/r - fsub)*r */
+ gmx_mm_pr frcoul_S0;
+ gmx_mm_pr frcoul_S1;
+ gmx_mm_pr frcoul_S2;
+ gmx_mm_pr frcoul_S3;
+#ifdef CALC_COUL_TAB
+ /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+ gmx_mm_pr r_S0, rs_S0, rf_S0, frac_S0;
+ gmx_mm_pr r_S1, rs_S1, rf_S1, frac_S1;
+ gmx_mm_pr r_S2, rs_S2, rf_S2, frac_S2;
+ gmx_mm_pr r_S3, rs_S3, rf_S3, frac_S3;
+ /* Table index: rs truncated to an int */
+ gmx_epi32 ti_S0, ti_S1, ti_S2, ti_S3;
+ /* Linear force table values */
+ gmx_mm_pr ctab0_S0, ctab1_S0;
+ gmx_mm_pr ctab0_S1, ctab1_S1;
+ gmx_mm_pr ctab0_S2, ctab1_S2;
+ gmx_mm_pr ctab0_S3, ctab1_S3;
+#ifdef CALC_ENERGIES
+ /* Quadratic energy table value */
+ gmx_mm_pr ctabv_S0;
+ gmx_mm_pr ctabv_S1;
+ gmx_mm_pr ctabv_S2;
+ gmx_mm_pr ctabv_S3;
+#endif
+#endif
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ /* The potential (PME mesh) we need to subtract from 1/r */
+ gmx_mm_pr vc_sub_S0;
+ gmx_mm_pr vc_sub_S1;
+ gmx_mm_pr vc_sub_S2;
+ gmx_mm_pr vc_sub_S3;
+#endif
+#ifdef CALC_ENERGIES
+ /* Electrostatic potential */
+ gmx_mm_pr vcoul_S0;
+ gmx_mm_pr vcoul_S1;
+ gmx_mm_pr vcoul_S2;
+ gmx_mm_pr vcoul_S3;
+#endif
+#endif
+ /* The force times 1/r */
+ gmx_mm_pr fscal_S0;
+ gmx_mm_pr fscal_S1;
+ gmx_mm_pr fscal_S2;
+ gmx_mm_pr fscal_S3;
+
+#ifdef CALC_LJ
+#ifdef LJ_COMB_LB
+ /* LJ sigma_j/2 and sqrt(epsilon_j) */
+ gmx_mm_pr hsig_j_S, seps_j_S;
+ /* LJ sigma_ij and epsilon_ij */
+ gmx_mm_pr sig_S0, eps_S0;
+ gmx_mm_pr sig_S1, eps_S1;
+#ifndef HALF_LJ
+ gmx_mm_pr sig_S2, eps_S2;
+ gmx_mm_pr sig_S3, eps_S3;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sig2_S0, sig6_S0;
+ gmx_mm_pr sig2_S1, sig6_S1;
+#ifndef HALF_LJ
+ gmx_mm_pr sig2_S2, sig6_S2;
+ gmx_mm_pr sig2_S3, sig6_S3;
+#endif
+#endif /* LJ_COMB_LB */
+#endif /* CALC_LJ */
+
+#ifdef LJ_COMB_GEOM
+ gmx_mm_pr c6s_j_S, c12s_j_S;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ /* Index for loading LJ parameters, complicated when interleaving */
+ int aj2;
+#endif
+
+#ifndef FIX_LJ_C
+ /* LJ C6 and C12 parameters, used with geometric comb. rule */
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S1, c12_S1;
+#ifndef HALF_LJ
+ gmx_mm_pr c6_S2, c12_S2;
+ gmx_mm_pr c6_S3, c12_S3;
+#endif
+#endif
+
+ /* Intermediate variables for LJ calculation */
+#ifndef LJ_COMB_LB
+ gmx_mm_pr rinvsix_S0;
+ gmx_mm_pr rinvsix_S1;
+#ifndef HALF_LJ
+ gmx_mm_pr rinvsix_S2;
+ gmx_mm_pr rinvsix_S3;
+#endif
+#endif
+#ifdef LJ_COMB_LB
+ gmx_mm_pr sir_S0, sir2_S0, sir6_S0;
+ gmx_mm_pr sir_S1, sir2_S1, sir6_S1;
+#ifndef HALF_LJ
+ gmx_mm_pr sir_S2, sir2_S2, sir6_S2;
+ gmx_mm_pr sir_S3, sir2_S3, sir6_S3;
+#endif
+#endif
+
+ gmx_mm_pr FrLJ6_S0, FrLJ12_S0;
+ gmx_mm_pr FrLJ6_S1, FrLJ12_S1;
+#ifndef HALF_LJ
+ gmx_mm_pr FrLJ6_S2, FrLJ12_S2;
+ gmx_mm_pr FrLJ6_S3, FrLJ12_S3;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr VLJ6_S0, VLJ12_S0, VLJ_S0;
+ gmx_mm_pr VLJ6_S1, VLJ12_S1, VLJ_S1;
+#ifndef HALF_LJ
+ gmx_mm_pr VLJ6_S2, VLJ12_S2, VLJ_S2;
+ gmx_mm_pr VLJ6_S3, VLJ12_S3, VLJ_S3;
+#endif
+#endif
+#endif /* CALC_LJ */
+
+ /* j-cluster index */
+ cj = l_cj[cjind].cj;
+
+ /* Atom indices (of the first atom in the cluster) */
+ aj = cj*UNROLLJ;
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+#if UNROLLJ == STRIDE
+ aj2 = aj*2;
+#else
+ aj2 = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+#endif
+#if UNROLLJ == STRIDE
+ ajx = aj*DIM;
+#else
+ ajx = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+ ajy = ajx + STRIDE;
+ ajz = ajy + STRIDE;
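+
+ /* Index sketch for the offsets above: with UNROLLJ == STRIDE the
+ * j-data are laid out linearly, so ajx = aj*DIM; with UNROLLJ < STRIDE
+ * two j-clusters are interleaved in one STRIDE-wide block, so (cj>>1)
+ * selects the block and (cj & 1)*UNROLLJ the half within it.
+ */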
+
+#ifdef CHECK_EXCLS
- /* Integer mask set and operations, cast result to real */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ {
- int_S0 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S0));
- int_S1 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S1));
- int_S2 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S2));
- int_S3 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S3));
++ /* Load integer topology exclusion interaction mask */
+ gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
+
- int_S0 = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
- int_S1 = gmx_checkbitmask_pr(mask_pr_S, mask_S1);
- int_S2 = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
- int_S3 = gmx_checkbitmask_pr(mask_pr_S, mask_S3);
++ interact_S0 = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
++ interact_S1 = gmx_checkbitmask_epi32(mask_pr_S, filter_S1);
++ interact_S2 = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
++ interact_S3 = gmx_checkbitmask_epi32(mask_pr_S, filter_S3);
+ }
+#else
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
+ {
+ /* Set the mask as an integer, cast to real and use real mask operations */
+ gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
+
- wco_S0 = gmx_and_pr(wco_S0, diag_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag_S3);
++ interact_S0 = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
++ interact_S1 = gmx_checkbitmask_pr(mask_pr_S, filter_S1);
++ interact_S2 = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
++ interact_S3 = gmx_checkbitmask_pr(mask_pr_S, filter_S3);
+ }
++#else
++#error "No SIMD bitmask operation available"
+#endif
+#endif
++#endif /* CHECK_EXCLS */
+
+ /* load j atom coordinates */
+ jx_S = gmx_load_pr(x+ajx);
+ jy_S = gmx_load_pr(x+ajy);
+ jz_S = gmx_load_pr(x+ajz);
+
+ /* Calculate distance */
+ dx_S0 = gmx_sub_pr(ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(iz_S0, jz_S);
+ dx_S1 = gmx_sub_pr(ix_S1, jx_S);
+ dy_S1 = gmx_sub_pr(iy_S1, jy_S);
+ dz_S1 = gmx_sub_pr(iz_S1, jz_S);
+ dx_S2 = gmx_sub_pr(ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(iz_S2, jz_S);
+ dx_S3 = gmx_sub_pr(ix_S3, jx_S);
+ dy_S3 = gmx_sub_pr(iy_S3, jy_S);
+ dz_S3 = gmx_sub_pr(iz_S3, jz_S);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
+ wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S);
+#endif
+
+#ifdef CHECK_EXCLS
+#ifdef EXCL_FORCES
+ /* Only remove the (sub-)diagonal to avoid double counting */
+#if UNROLLJ == UNROLLI
+ if (cj == ci_sh)
+ {
- wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag0_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag0_S3);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask_S0);
++ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask_S1);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask_S2);
++ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask_S3);
+ }
+#else
+#if UNROLLJ < UNROLLI
+ if (cj == ci_sh*2)
+ {
- wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag1_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag1_S3);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0);
++ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask0_S1);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2);
++ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask0_S3);
+ }
+ if (cj == ci_sh*2 + 1)
+ {
- wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag0_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag0_S3);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0);
++ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask1_S1);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2);
++ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask1_S3);
+ }
+#else
+ if (cj*2 == ci_sh)
+ {
- wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag1_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag1_S3);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0);
++ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask0_S1);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2);
++ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask0_S3);
+ }
+ else if (cj*2 + 1 == ci_sh)
+ {
- wco_S0 = gmx_and_pr(wco_S0, int_S0);
- wco_S1 = gmx_and_pr(wco_S1, int_S1);
- wco_S2 = gmx_and_pr(wco_S2, int_S2);
- wco_S3 = gmx_and_pr(wco_S3, int_S3);
++ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0);
++ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask1_S1);
++ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2);
++ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask1_S3);
+ }
+#endif
+#endif
+#else /* EXCL_FORCES */
+ /* No exclusion forces: remove all excluded atom pairs from the list */
- gmx_store_pr(tmp, i == 0 ? wco_S0 : (i == 1 ? wco_S1 : (i == 2 ? wco_S2 : wco_S3)));
++ wco_S0 = gmx_and_pb(wco_S0, interact_S0);
++ wco_S1 = gmx_and_pb(wco_S1, interact_S1);
++ wco_S2 = gmx_and_pb(wco_S2, interact_S2);
++ wco_S3 = gmx_and_pb(wco_S3, interact_S3);
+#endif
+#endif
+
+#ifdef COUNT_PAIRS
+ {
+ int i, j;
+ real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
+ tmp = gmx_simd_align_real(tmpa);
+ for (i = 0; i < UNROLLI; i++)
+ {
- if (!(tmp[j] == 0))
++ gmx_store_pr(tmp, gmx_sub_pr(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
+ for (j = 0; j < UNROLLJ; j++)
+ {
- rsq_S0 = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
- rsq_S1 = gmx_add_pr(rsq_S1, gmx_andnot_pr(int_S1, avoid_sing_S));
- rsq_S2 = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
- rsq_S3 = gmx_add_pr(rsq_S3, gmx_andnot_pr(int_S3, avoid_sing_S));
++ if (tmp[j] >= 0)
+ {
+ npair++;
+ }
+ }
+ }
+ }
+#endif
+
+#ifdef CHECK_EXCLS
+ /* For excluded pairs add a small number to avoid r^-6 = NaN */
- GMX_MM_INVSQRT2_PD(rsq_S0, rsq_S1, rinv_S0, rinv_S1);
- GMX_MM_INVSQRT2_PD(rsq_S2, rsq_S3, rinv_S2, rinv_S3);
++ rsq_S0 = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
++ rsq_S1 = gmx_masknot_add_pr(interact_S1, rsq_S1, avoid_sing_S);
++ rsq_S2 = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
++ rsq_S3 = gmx_masknot_add_pr(interact_S3, rsq_S3, avoid_sing_S);
+#endif
+
+ /* Calculate 1/r */
+#ifndef GMX_DOUBLE
+ rinv_S0 = gmx_invsqrt_pr(rsq_S0);
+ rinv_S1 = gmx_invsqrt_pr(rsq_S1);
+ rinv_S2 = gmx_invsqrt_pr(rsq_S2);
+ rinv_S3 = gmx_invsqrt_pr(rsq_S3);
+#else
- load_lj_pair_params(nbfp0, type, aj, c6_S0, c12_S0);
- load_lj_pair_params(nbfp1, type, aj, c6_S1, c12_S1);
++ gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
++ gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
+#endif
+
+#ifdef CALC_COULOMB
+ /* Load parameters for j atom */
+ jq_S = gmx_load_pr(q+aj);
+ qq_S0 = gmx_mul_pr(iq_S0, jq_S);
+ qq_S1 = gmx_mul_pr(iq_S1, jq_S);
+ qq_S2 = gmx_mul_pr(iq_S2, jq_S);
+ qq_S3 = gmx_mul_pr(iq_S3, jq_S);
+#endif
+
+#ifdef CALC_LJ
+
+#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params(nbfp2, type, aj, c6_S2, c12_S2);
- load_lj_pair_params(nbfp3, type, aj, c6_S3, c12_S3);
++ load_lj_pair_params(nbfp0, type, aj, &c6_S0, &c12_S0);
++ load_lj_pair_params(nbfp1, type, aj, &c6_S1, &c12_S1);
+#ifndef HALF_LJ
- rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, int_S0);
- rinv_ex_S1 = gmx_blendzero_pr(rinv_S1, int_S1);
- rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, int_S2);
- rinv_ex_S3 = gmx_blendzero_pr(rinv_S3, int_S3);
++ load_lj_pair_params(nbfp2, type, aj, &c6_S2, &c12_S2);
++ load_lj_pair_params(nbfp3, type, aj, &c6_S3, &c12_S3);
+#endif
+#endif /* not defined any LJ rule */
+
+#ifdef LJ_COMB_GEOM
+ c6s_j_S = gmx_load_pr(ljc+aj2+0);
+ c12s_j_S = gmx_load_pr(ljc+aj2+STRIDE);
+ c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S );
+ c6_S1 = gmx_mul_pr(c6s_S1, c6s_j_S );
+#ifndef HALF_LJ
+ c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S );
+ c6_S3 = gmx_mul_pr(c6s_S3, c6s_j_S );
+#endif
+ c12_S0 = gmx_mul_pr(c12s_S0, c12s_j_S);
+ c12_S1 = gmx_mul_pr(c12s_S1, c12s_j_S);
+#ifndef HALF_LJ
+ c12_S2 = gmx_mul_pr(c12s_S2, c12s_j_S);
+ c12_S3 = gmx_mul_pr(c12s_S3, c12s_j_S);
+#endif
+#endif /* LJ_COMB_GEOM */
+
+#ifdef LJ_COMB_LB
+ hsig_j_S = gmx_load_pr(ljc+aj2+0);
+ seps_j_S = gmx_load_pr(ljc+aj2+STRIDE);
+
+ sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S);
+ sig_S1 = gmx_add_pr(hsig_i_S1, hsig_j_S);
+ eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S);
+ eps_S1 = gmx_mul_pr(seps_i_S1, seps_j_S);
+#ifndef HALF_LJ
+ sig_S2 = gmx_add_pr(hsig_i_S2, hsig_j_S);
+ sig_S3 = gmx_add_pr(hsig_i_S3, hsig_j_S);
+ eps_S2 = gmx_mul_pr(seps_i_S2, seps_j_S);
+ eps_S3 = gmx_mul_pr(seps_i_S3, seps_j_S);
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ rinv_S0 = gmx_blendzero_pr(rinv_S0, wco_S0);
+ rinv_S1 = gmx_blendzero_pr(rinv_S1, wco_S1);
+ rinv_S2 = gmx_blendzero_pr(rinv_S2, wco_S2);
+ rinv_S3 = gmx_blendzero_pr(rinv_S3, wco_S3);
+#else
+ /* We only need to mask for the cut-off: blendv is faster */
+ rinv_S0 = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
+ rinv_S1 = gmx_blendv_pr(rinv_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1));
+ rinv_S2 = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
+ rinv_S3 = gmx_blendv_pr(rinv_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3));
+#endif
+
+ rinvsq_S0 = gmx_mul_pr(rinv_S0, rinv_S0);
+ rinvsq_S1 = gmx_mul_pr(rinv_S1, rinv_S1);
+ rinvsq_S2 = gmx_mul_pr(rinv_S2, rinv_S2);
+ rinvsq_S3 = gmx_mul_pr(rinv_S3, rinv_S3);
+
+#ifdef CALC_COULOMB
+ /* Note that here we calculate force*r, not the usual force/r.
+ * This avoids having to mask the reaction-field contribution,
+ * as frcoul is later multiplied by rinvsq, which has been
+ * masked with the cut-off check.
+ */
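+ /* In scalar form this is (a sketch of the algebra, not extra code):
+ * f_i += F(r)/r * (x_i - x_j), with frcoul = F(r)*r and
+ * fscal = frcoul*rinvsq = F(r)/r; since rinvsq is already zero
+ * beyond the cut-off, the RF term inside frcoul needs no mask of its own.
+ */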
+
+#ifdef EXCL_FORCES
+ /* Only add 1/r for non-excluded atom pairs */
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
- frcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(rsq_S1, mrc_3_S)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
- frcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(rsq_S3, mrc_3_S)));
++ rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, interact_S0);
++ rinv_ex_S1 = gmx_blendzero_pr(rinv_S1, interact_S1);
++ rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, interact_S2);
++ rinv_ex_S3 = gmx_blendzero_pr(rinv_S3, interact_S3);
+#else
+ /* No exclusion forces, we always need 1/r */
+#define rinv_ex_S0 rinv_S0
+#define rinv_ex_S1 rinv_S1
+#define rinv_ex_S2 rinv_S2
+#define rinv_ex_S3 rinv_S3
+#endif
+
+#ifdef CALC_COUL_RF
+ /* Electrostatic interactions */
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
- frcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(ewcorr_S1, brsq_S1)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
- frcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(ewcorr_S3, brsq_S3)));
++ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
++ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_madd_pr(rsq_S1, mrc_3_S, rinv_ex_S1));
++ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
++ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_madd_pr(rsq_S3, mrc_3_S, rinv_ex_S3));
+
+#ifdef CALC_ENERGIES
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
+ vcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_add_pr(gmx_mul_pr(rsq_S1, hrc_3_S), moh_rc_S)));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
+ vcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_add_pr(gmx_mul_pr(rsq_S3, hrc_3_S), moh_rc_S)));
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+ /* We need to mask (or limit) rsq for the cut-off,
+ * as large distances can cause an overflow in gmx_pmecorrF/V.
+ */
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
+ brsq_S1 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S1, wco_S1));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
+ brsq_S3 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S3, wco_S3));
+#else
+ /* Strangely, putting mul on a separate line is slower (icc 13) */
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
+ brsq_S1 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1)));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
+ brsq_S3 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3)));
+#endif
+ ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
+ ewcorr_S1 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S1), beta_S);
+ ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
+ ewcorr_S3 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S3), beta_S);
- #ifdef GMX_HAVE_SIMD_FLOOR
++ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
++ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_madd_pr(ewcorr_S1, brsq_S1, rinv_ex_S1));
++ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
++ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_madd_pr(ewcorr_S3, brsq_S3, rinv_ex_S3));
+
+#ifdef CALC_ENERGIES
+ vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
+ vc_sub_S1 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S1), beta_S);
+ vc_sub_S2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
+ vc_sub_S3 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S3), beta_S);
+#endif
+
+#endif /* CALC_COUL_EWALD */
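+
+/* A scalar sketch of the analytical Ewald path above, assuming
+ * gmx_pmecorrF_pr returns the signed PME mesh force correction as a
+ * function of brsq = (beta*r)^2:
+ * frcoul = qq*(1/r + beta*pmecorrF(brsq)*brsq) = F(r)*r
+ * so the mesh contribution is folded directly into the force*r form.
+ */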
+
+#ifdef CALC_COUL_TAB
+ /* Electrostatic interactions */
+ r_S0 = gmx_mul_pr(rsq_S0, rinv_S0);
+ r_S1 = gmx_mul_pr(rsq_S1, rinv_S1);
+ r_S2 = gmx_mul_pr(rsq_S2, rinv_S2);
+ r_S3 = gmx_mul_pr(rsq_S3, rinv_S3);
+ /* Convert r to scaled table units */
+ rs_S0 = gmx_mul_pr(r_S0, invtsp_S);
+ rs_S1 = gmx_mul_pr(r_S1, invtsp_S);
+ rs_S2 = gmx_mul_pr(r_S2, invtsp_S);
+ rs_S3 = gmx_mul_pr(r_S3, invtsp_S);
+ /* Truncate scaled r to an int */
+ ti_S0 = gmx_cvttpr_epi32(rs_S0);
+ ti_S1 = gmx_cvttpr_epi32(rs_S1);
+ ti_S2 = gmx_cvttpr_epi32(rs_S2);
+ ti_S3 = gmx_cvttpr_epi32(rs_S3);
- load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
- load_table_f(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1);
- load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
- load_table_f(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3);
++#ifdef GMX_SIMD_HAVE_FLOOR
+ /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
+ rf_S0 = gmx_floor_pr(rs_S0);
+ rf_S1 = gmx_floor_pr(rs_S1);
+ rf_S2 = gmx_floor_pr(rs_S2);
+ rf_S3 = gmx_floor_pr(rs_S3);
+#else
+ rf_S0 = gmx_cvtepi32_pr(ti_S0);
+ rf_S1 = gmx_cvtepi32_pr(ti_S1);
+ rf_S2 = gmx_cvtepi32_pr(ti_S2);
+ rf_S3 = gmx_cvtepi32_pr(ti_S3);
+#endif
+ frac_S0 = gmx_sub_pr(rs_S0, rf_S0);
+ frac_S1 = gmx_sub_pr(rs_S1, rf_S1);
+ frac_S2 = gmx_sub_pr(rs_S2, rf_S2);
+ frac_S3 = gmx_sub_pr(rs_S3, rf_S3);
+
+ /* Load and interpolate table forces and possibly energies.
+ * Force and energy can be combined in one table with stride 4 (FDV0),
+ * or kept in two separate tables with stride 1 (F and V).
+ * Currently single precision uses FDV0, double precision uses F and V.
+ */
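+ /* A scalar sketch of the linear force interpolation below, assuming
+ * ctab1 holds the table difference F[ti+1]-F[ti] (the D of FDV0):
+ * rs = r*tabq_scale; ti = (int)rs; frac = rs - ti;
+ * fsub = F[ti] + frac*ctab1[ti];
+ * frcoul = qq*(1/r - fsub*r);
+ */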
+#ifndef CALC_ENERGIES
- load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
- load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
- load_table_f_v(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
++ load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
++ load_table_f(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1);
++ load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
++ load_table_f(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3);
+#else
+#ifdef TAB_FDV0
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
++ load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++ load_table_f_v(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
++ load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
++ load_table_f_v(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
+#else
- vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
- vc_sub_S1 = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, int_S1));
- vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
- vc_sub_S3 = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, int_S3));
++ load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
++ load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
++ load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
++ load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
+#endif
+#endif
+ fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
+ fsub_S1 = gmx_add_pr(ctab0_S1, gmx_mul_pr(frac_S1, ctab1_S1));
+ fsub_S2 = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
+ fsub_S3 = gmx_add_pr(ctab0_S3, gmx_mul_pr(frac_S3, ctab1_S3));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
+ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, gmx_mul_pr(fsub_S1, r_S1)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
+ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, gmx_mul_pr(fsub_S3, r_S3)));
+
+#ifdef CALC_ENERGIES
+ vc_sub_S0 = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
+ vc_sub_S1 = gmx_add_pr(ctabv_S1, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S1), gmx_add_pr(ctab0_S1, fsub_S1)));
+ vc_sub_S2 = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
+ vc_sub_S3 = gmx_add_pr(ctabv_S3, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S3), gmx_add_pr(ctab0_S3, fsub_S3)));
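+ /* The interpolation above integrates the linear force analytically
+ * (a sketch): vc_sub = Vtab[ti] - 0.5/tabq_scale * frac * (F[ti] + fsub),
+ * which is why mhalfsp_S is set to -0.5/tabq_scale.
+ */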
+#endif
+#endif /* CALC_COUL_TAB */
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+#ifndef NO_SHIFT_EWALD
+ /* Add Ewald potential shift to vc_sub for convenience */
+#ifdef CHECK_EXCLS
- rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, int_S0);
- rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, int_S1);
++ vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
++ vc_sub_S1 = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, interact_S1));
++ vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
++ vc_sub_S3 = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, interact_S3));
+#else
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S);
+ vc_sub_S1 = gmx_add_pr(vc_sub_S1, sh_ewald_S);
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S);
+ vc_sub_S3 = gmx_add_pr(vc_sub_S3, sh_ewald_S);
+#endif
+#endif
+
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
+ vcoul_S1 = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, vc_sub_S1));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
+ vcoul_S3 = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, vc_sub_S3));
+
+#endif
+
+#ifdef CALC_ENERGIES
+ /* Mask energy for cut-off and diagonal */
+ vcoul_S0 = gmx_blendzero_pr(vcoul_S0, wco_S0);
+ vcoul_S1 = gmx_blendzero_pr(vcoul_S1, wco_S1);
+ vcoul_S2 = gmx_blendzero_pr(vcoul_S2, wco_S2);
+ vcoul_S3 = gmx_blendzero_pr(vcoul_S3, wco_S3);
+#endif
+
+#endif /* CALC_COULOMB */
+
+#ifdef CALC_LJ
+ /* Lennard-Jones interaction */
+
+#ifdef VDW_CUTOFF_CHECK
+ wco_vdw_S0 = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
+ wco_vdw_S1 = gmx_cmplt_pr(rsq_S1, rcvdw2_S);
+#ifndef HALF_LJ
+ wco_vdw_S2 = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
+ wco_vdw_S3 = gmx_cmplt_pr(rsq_S3, rcvdw2_S);
+#endif
+#else
+ /* Same cut-off for Coulomb and VdW, reuse the registers */
+#define wco_vdw_S0 wco_S0
+#define wco_vdw_S1 wco_S1
+#define wco_vdw_S2 wco_S2
+#define wco_vdw_S3 wco_S3
+#endif
+
+#ifndef LJ_COMB_LB
+ rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
+ rinvsix_S1 = gmx_mul_pr(rinvsq_S1, gmx_mul_pr(rinvsq_S1, rinvsq_S1));
+#ifdef EXCL_FORCES
- rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, int_S2);
- rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, int_S3);
++ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, interact_S0);
++ rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, interact_S1);
+#endif
+#ifndef HALF_LJ
+ rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
+ rinvsix_S3 = gmx_mul_pr(rinvsq_S3, gmx_mul_pr(rinvsq_S3, rinvsq_S3));
+#ifdef EXCL_FORCES
- sir6_S0 = gmx_blendzero_pr(sir6_S0, int_S0);
- sir6_S1 = gmx_blendzero_pr(sir6_S1, int_S1);
++ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, interact_S2);
++ rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, interact_S3);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
+ rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, wco_vdw_S1);
+#ifndef HALF_LJ
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
+ rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, wco_vdw_S3);
+#endif
+#endif
+ FrLJ6_S0 = gmx_mul_pr(c6_S0, rinvsix_S0);
+ FrLJ6_S1 = gmx_mul_pr(c6_S1, rinvsix_S1);
+#ifndef HALF_LJ
+ FrLJ6_S2 = gmx_mul_pr(c6_S2, rinvsix_S2);
+ FrLJ6_S3 = gmx_mul_pr(c6_S3, rinvsix_S3);
+#endif
+ FrLJ12_S0 = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
+ FrLJ12_S1 = gmx_mul_pr(c12_S1, gmx_mul_pr(rinvsix_S1, rinvsix_S1));
+#ifndef HALF_LJ
+ FrLJ12_S2 = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
+ FrLJ12_S3 = gmx_mul_pr(c12_S3, gmx_mul_pr(rinvsix_S3, rinvsix_S3));
+#endif
+#endif /* not LJ_COMB_LB */
+
+#ifdef LJ_COMB_LB
+ sir_S0 = gmx_mul_pr(sig_S0, rinv_S0);
+ sir_S1 = gmx_mul_pr(sig_S1, rinv_S1);
+#ifndef HALF_LJ
+ sir_S2 = gmx_mul_pr(sig_S2, rinv_S2);
+ sir_S3 = gmx_mul_pr(sig_S3, rinv_S3);
+#endif
+ sir2_S0 = gmx_mul_pr(sir_S0, sir_S0);
+ sir2_S1 = gmx_mul_pr(sir_S1, sir_S1);
+#ifndef HALF_LJ
+ sir2_S2 = gmx_mul_pr(sir_S2, sir_S2);
+ sir2_S3 = gmx_mul_pr(sir_S3, sir_S3);
+#endif
+ sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
+ sir6_S1 = gmx_mul_pr(sir2_S1, gmx_mul_pr(sir2_S1, sir2_S1));
+#ifdef EXCL_FORCES
- sir6_S2 = gmx_blendzero_pr(sir6_S2, int_S2);
- sir6_S3 = gmx_blendzero_pr(sir6_S3, int_S3);
++ sir6_S0 = gmx_blendzero_pr(sir6_S0, interact_S0);
++ sir6_S1 = gmx_blendzero_pr(sir6_S1, interact_S1);
+#endif
+#ifndef HALF_LJ
+ sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
+ sir6_S3 = gmx_mul_pr(sir2_S3, gmx_mul_pr(sir2_S3, sir2_S3));
+#ifdef EXCL_FORCES
- VLJ_S0 = gmx_blendzero_pr(VLJ_S0, int_S0);
- VLJ_S1 = gmx_blendzero_pr(VLJ_S1, int_S1);
++ sir6_S2 = gmx_blendzero_pr(sir6_S2, interact_S2);
++ sir6_S3 = gmx_blendzero_pr(sir6_S3, interact_S3);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
+ sir6_S1 = gmx_blendzero_pr(sir6_S1, wco_vdw_S1);
+#ifndef HALF_LJ
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
+ sir6_S3 = gmx_blendzero_pr(sir6_S3, wco_vdw_S3);
+#endif
+#endif
+ FrLJ6_S0 = gmx_mul_pr(eps_S0, sir6_S0);
+ FrLJ6_S1 = gmx_mul_pr(eps_S1, sir6_S1);
+#ifndef HALF_LJ
+ FrLJ6_S2 = gmx_mul_pr(eps_S2, sir6_S2);
+ FrLJ6_S3 = gmx_mul_pr(eps_S3, sir6_S3);
+#endif
+ FrLJ12_S0 = gmx_mul_pr(FrLJ6_S0, sir6_S0);
+ FrLJ12_S1 = gmx_mul_pr(FrLJ6_S1, sir6_S1);
+#ifndef HALF_LJ
+ FrLJ12_S2 = gmx_mul_pr(FrLJ6_S2, sir6_S2);
+ FrLJ12_S3 = gmx_mul_pr(FrLJ6_S3, sir6_S3);
+#endif
+#if defined CALC_ENERGIES
+ /* We need C6 and C12 to calculate the LJ potential shift */
+ sig2_S0 = gmx_mul_pr(sig_S0, sig_S0);
+ sig2_S1 = gmx_mul_pr(sig_S1, sig_S1);
+#ifndef HALF_LJ
+ sig2_S2 = gmx_mul_pr(sig_S2, sig_S2);
+ sig2_S3 = gmx_mul_pr(sig_S3, sig_S3);
+#endif
+ sig6_S0 = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
+ sig6_S1 = gmx_mul_pr(sig2_S1, gmx_mul_pr(sig2_S1, sig2_S1));
+#ifndef HALF_LJ
+ sig6_S2 = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
+ sig6_S3 = gmx_mul_pr(sig2_S3, gmx_mul_pr(sig2_S3, sig2_S3));
+#endif
+ c6_S0 = gmx_mul_pr(eps_S0, sig6_S0);
+ c6_S1 = gmx_mul_pr(eps_S1, sig6_S1);
+#ifndef HALF_LJ
+ c6_S2 = gmx_mul_pr(eps_S2, sig6_S2);
+ c6_S3 = gmx_mul_pr(eps_S3, sig6_S3);
+#endif
+ c12_S0 = gmx_mul_pr(c6_S0, sig6_S0);
+ c12_S1 = gmx_mul_pr(c6_S1, sig6_S1);
+#ifndef HALF_LJ
+ c12_S2 = gmx_mul_pr(c6_S2, sig6_S2);
+ c12_S3 = gmx_mul_pr(c6_S3, sig6_S3);
+#endif
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifdef CALC_ENERGIES
+#ifdef ENERGY_GROUPS
+ /* Extract the group pair index per j pair.
+ * Energy groups are stored per i-cluster, so things get
+ * complicated when the i- and j-cluster size don't match.
+ */
+ {
+ int egps_j;
+#if UNROLLJ == 2
+ egps_j = nbat->energrp[cj>>1];
+ egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+#else
+ /* We assume UNROLLI <= UNROLLJ */
+ int jdi;
+ for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
+ {
+ int jj;
+ egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ for (jj = 0; jj < (UNROLLI/2); jj++)
+ {
+ egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ }
+ }
+#endif
+ }
+#endif
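+
+ /* Packing sketch for the extraction above: nbat->energrp holds the
+ * group indices of a cluster packed into one int, neg_2log bits per
+ * atom; egp_jj extracts them two j-atoms at a time (egps_jshift =
+ * 2*neg_2log bits), masks with egps_jmask and scales by egps_jstride
+ * to get an offset into the group-pair energy buffers.
+ */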
+
+#ifdef CALC_COULOMB
+#ifndef ENERGY_GROUPS
+ vctot_S = gmx_add_pr(vctot_S, gmx_sum4_pr(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3));
+#else
+ add_ener_grp(vcoul_S0, vctp[0], egp_jj);
+ add_ener_grp(vcoul_S1, vctp[1], egp_jj);
+ add_ener_grp(vcoul_S2, vctp[2], egp_jj);
+ add_ener_grp(vcoul_S3, vctp[3], egp_jj);
+#endif
+#endif
+
+#ifdef CALC_LJ
+ /* Calculate the LJ energies */
+ VLJ6_S0 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
+ VLJ6_S1 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S1, gmx_mul_pr(c6_S1, sh_invrc6_S)));
+#ifndef HALF_LJ
+ VLJ6_S2 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
+ VLJ6_S3 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S3, gmx_mul_pr(c6_S3, sh_invrc6_S)));
+#endif
+ VLJ12_S0 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
+ VLJ12_S1 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S1, gmx_mul_pr(c12_S1, sh_invrc12_S)));
+#ifndef HALF_LJ
+ VLJ12_S2 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
+ VLJ12_S3 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S3, gmx_mul_pr(c12_S3, sh_invrc12_S)));
+#endif
+
+ VLJ_S0 = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
+ VLJ_S1 = gmx_sub_pr(VLJ12_S1, VLJ6_S1);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
+ VLJ_S3 = gmx_sub_pr(VLJ12_S3, VLJ6_S3);
+#endif
+ /* The potential shift should be removed for pairs beyond cut-off */
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
+ VLJ_S1 = gmx_blendzero_pr(VLJ_S1, wco_vdw_S1);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
+ VLJ_S3 = gmx_blendzero_pr(VLJ_S3, wco_vdw_S3);
+#endif
+#ifdef CHECK_EXCLS
+ /* The potential shift should be removed for excluded pairs */
- VLJ_S2 = gmx_blendzero_pr(VLJ_S2, int_S2);
- VLJ_S3 = gmx_blendzero_pr(VLJ_S3, int_S3);
++ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, interact_S0);
++ VLJ_S1 = gmx_blendzero_pr(VLJ_S1, interact_S1);
+#ifndef HALF_LJ
++ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, interact_S2);
++ VLJ_S3 = gmx_blendzero_pr(VLJ_S3, interact_S3);
+#endif
+#endif
+#ifndef ENERGY_GROUPS
+ Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
+#ifndef HALF_LJ
+ gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
+#else
+ gmx_add_pr(VLJ_S0, VLJ_S1)
+#endif
+ );
+#else
+ add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
+ add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
+#ifndef HALF_LJ
+ add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
+ add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
+#endif
+#endif
+#endif /* CALC_LJ */
+#endif /* CALC_ENERGIES */
+
+#ifdef CALC_LJ
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_S0,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+ fscal_S1 = gmx_mul_pr(rinvsq_S1,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_S1,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
+#else
+ fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0);
+ fscal_S1 = gmx_mul_pr(rinvsq_S1, frcoul_S1);
+#endif /* CALC_LJ */
+#if defined CALC_LJ && !defined HALF_LJ
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_S2,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+ fscal_S3 = gmx_mul_pr(rinvsq_S3,
+#ifdef CALC_COULOMB
+ gmx_add_pr(frcoul_S3,
+#else
+ (
+#endif
+ gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
+#else
+ /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
+ fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2);
+ fscal_S3 = gmx_mul_pr(rinvsq_S3, frcoul_S3);
+#endif
+
+ /* Calculate temporary vectorial force */
+ tx_S0 = gmx_mul_pr(fscal_S0, dx_S0);
+ tx_S1 = gmx_mul_pr(fscal_S1, dx_S1);
+ tx_S2 = gmx_mul_pr(fscal_S2, dx_S2);
+ tx_S3 = gmx_mul_pr(fscal_S3, dx_S3);
+ ty_S0 = gmx_mul_pr(fscal_S0, dy_S0);
+ ty_S1 = gmx_mul_pr(fscal_S1, dy_S1);
+ ty_S2 = gmx_mul_pr(fscal_S2, dy_S2);
+ ty_S3 = gmx_mul_pr(fscal_S3, dy_S3);
+ tz_S0 = gmx_mul_pr(fscal_S0, dz_S0);
+ tz_S1 = gmx_mul_pr(fscal_S1, dz_S1);
+ tz_S2 = gmx_mul_pr(fscal_S2, dz_S2);
+ tz_S3 = gmx_mul_pr(fscal_S3, dz_S3);
+
+ /* Increment i atom force */
+ fix_S0 = gmx_add_pr(fix_S0, tx_S0);
+ fix_S1 = gmx_add_pr(fix_S1, tx_S1);
+ fix_S2 = gmx_add_pr(fix_S2, tx_S2);
+ fix_S3 = gmx_add_pr(fix_S3, tx_S3);
+ fiy_S0 = gmx_add_pr(fiy_S0, ty_S0);
+ fiy_S1 = gmx_add_pr(fiy_S1, ty_S1);
+ fiy_S2 = gmx_add_pr(fiy_S2, ty_S2);
+ fiy_S3 = gmx_add_pr(fiy_S3, ty_S3);
+ fiz_S0 = gmx_add_pr(fiz_S0, tz_S0);
+ fiz_S1 = gmx_add_pr(fiz_S1, tz_S1);
+ fiz_S2 = gmx_add_pr(fiz_S2, tz_S2);
+ fiz_S3 = gmx_add_pr(fiz_S3, tz_S3);
+
+ /* Decrement j atom force */
+ gmx_store_pr(f+ajx,
+ gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_S0, tx_S1, tx_S2, tx_S3) ));
+ gmx_store_pr(f+ajy,
+ gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_S0, ty_S1, ty_S2, ty_S3) ));
+ gmx_store_pr(f+ajz,
+ gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_S0, tz_S1, tz_S2, tz_S3) ));
+}
+
+#undef rinv_ex_S0
+#undef rinv_ex_S1
+#undef rinv_ex_S2
+#undef rinv_ex_S3
+
+#undef wco_vdw_S0
+#undef wco_vdw_S1
+#undef wco_vdw_S2
+#undef wco_vdw_S3
+
+#undef NBNXN_CUTOFF_USE_BLENDV
+
+#undef EXCL_FORCES
--- /dev/null
- #if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
- #endif
-
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
-
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
- #define SIMD_MASK_ALL 0xffffffff
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ GMX_SIMD_WIDTH_HERE
+
+/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
+#if GMX_SIMD_WIDTH_HERE >= UNROLLI
+#define STRIDE GMX_SIMD_WIDTH_HERE
+#else
+#define STRIDE UNROLLI
+#endif
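+
+/* Worked example, assuming the usual SIMD widths and
+ * NBNXN_CPU_CLUSTER_I_SIZE = 4: with 256-bit AVX in single precision
+ * GMX_SIMD_WIDTH_HERE is 8, so UNROLLJ = 8 and STRIDE = 8; in double
+ * precision the width is 4 and STRIDE = max(4,4) = 4.
+ */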
+
+#if GMX_SIMD_WIDTH_HERE == 2
+#define SUM_SIMD(x) (x[0]+x[1])
+#else
+#if GMX_SIMD_WIDTH_HERE == 4
+#define SUM_SIMD(x) SUM_SIMD4(x)
+#else
+#if GMX_SIMD_WIDTH_HERE == 8
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#else
+#error "unsupported kernel configuration"
+#endif
+#endif
+#endif
+
+
+/* Decide if we should use the FDV0 table layout */
+#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+#if GMX_SIMD_WIDTH_HERE/2 == 4
+#define TAB_FDV0
+#endif
+#else
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
+#endif
+#endif
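+
+/* The FDV0 layout assumed here packs, per table point, the force F, the
+ * difference D to the next point, the energy V and a zero, so one aligned
+ * 4-wide load fetches everything for a point; the loaders live in
+ * nbnxn_kernel_simd_utils.h.
+ */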
+
++/* Decide the stride for the two LJ parameters (c6 and c12) */
++#ifdef GMX_X86_SSE2
++#ifdef GMX_DOUBLE
++#define NBFP_STRIDE 2
++#else
++#define NBFP_STRIDE 4
++#endif
++#else
++#if GMX_SIMD_WIDTH_HERE > 4
++#define NBFP_STRIDE 4
++#else
++#define NBFP_STRIDE GMX_SIMD_WIDTH_HERE
++#endif
++#endif
+
- #ifndef GMX_DOUBLE
- __m128 fix_S, fiy_S, fiz_S;
+
+#include "nbnxn_kernel_simd_utils.h"
+
+/* All functionality defines are set here, except for:
+ * CALC_ENERGIES and ENERGY_GROUPS, which are defined before this point;
+ * CHECK_EXCLS, which is set just before including the inner loop contents.
+ * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
+ * set before calling the kernel function. We might want to move that
+ * to inside the n-loop and have a different combination rule for different
+ * ci's, since using no combination rule gives a 50% performance hit for LJ.
+ */
+
+/* We always calculate shift forces, because they are cheap anyhow */
+#define CALC_SHIFTFORCES
+
+/* Assumes all LJ parameters are identical */
+/* #define FIX_LJ_C */
+
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
+
+#if defined LJ_COMB_GEOM
+#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
+#else
+#if defined LJ_COMB_LB
+#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
+#else
+#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
+#endif
+#endif
+
+#ifdef CALC_COUL_RF
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
+#endif
+#ifdef CALC_COUL_TAB
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
+#else
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
+#else
+#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
+#endif
+#endif
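+
+/* Example of the generated names: with CALC_COUL_RF, LJ_COMB_GEOM and
+ * CALC_ENERGIES set (without ENERGY_GROUPS), the macros expand
+ * NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener) to
+ * nbnxn_kernel_simd_4xn_rf_comb_geom_ener.
+ */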
+
+static void
+#ifndef CALC_ENERGIES
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, noener)
+#else
+#ifndef ENERGY_GROUPS
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener)
+#else
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, energrp)
+#endif
+#endif
+#undef NBK_FUNC_NAME
+#undef NBK_FUNC_NAME_C
+#undef NBK_FUNC_NAME_C_LJC
+(const nbnxn_pairlist_t *nbl,
+ const nbnxn_atomdata_t *nbat,
+ const interaction_const_t *ic,
+ rvec *shift_vec,
+ real *f
+#ifdef CALC_SHIFTFORCES
+ ,
+ real *fshift
+#endif
+#ifdef CALC_ENERGIES
+ ,
+ real *Vvdw,
+ real *Vc
+#endif
+)
+{
+ const nbnxn_ci_t *nbln;
+ const nbnxn_cj_t *l_cj;
+ const int *type;
+ const real *q;
+ const real *shiftvec;
+ const real *x;
+ const real *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
+ real facel;
+ real *nbfp_ptr;
+ int nbfp_stride;
+ int n, ci, ci_sh;
+ int ish, ish3;
+ gmx_bool do_LJ, half_LJ, do_coul;
+ int sci, scix, sciy, sciz, sci2;
+ int cjind0, cjind1, cjind;
+ int ip, jp;
+
+#ifdef ENERGY_GROUPS
+ int Vstride_i;
+ int egps_ishift, egps_imask;
+ int egps_jshift, egps_jmask, egps_jstride;
+ int egps_i;
+ real *vvdwtp[UNROLLI];
+ real *vctp[UNROLLI];
+#endif
+
+ gmx_mm_pr shX_S;
+ gmx_mm_pr shY_S;
+ gmx_mm_pr shZ_S;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S1, iy_S1, iz_S1;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
+ gmx_mm_pr ix_S3, iy_S3, iz_S3;
+ gmx_mm_pr fix_S0, fiy_S0, fiz_S0;
+ gmx_mm_pr fix_S1, fiy_S1, fiz_S1;
+ gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
+ gmx_mm_pr fix_S3, fiy_S3, fiz_S3;
+#if UNROLLJ >= 4
- __m256d fix_S, fiy_S, fiz_S;
++ /* We use an i-force SIMD register width of 4 */
++#if UNROLLJ == 4
++#define gmx_mm_pr4 gmx_mm_pr
++#define gmx_load_pr4 gmx_load_pr
++#define gmx_store_pr4 gmx_store_pr
++#define gmx_add_pr4 gmx_add_pr
+#else
- __m128d fix0_S, fiy0_S, fiz0_S;
- __m128d fix2_S, fiy2_S, fiz2_S;
++ /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
+#endif
++ gmx_mm_pr4 fix_S, fiy_S, fiz_S;
+#else
- gmx_mm_pr diag_jmi_S;
++ /* We use an i-force SIMD register width of 2 */
++ gmx_mm_pr fix0_S, fiy0_S, fiz0_S;
++ gmx_mm_pr fix2_S, fiy2_S, fiz2_S;
+#endif
+
- gmx_mm_pr diag_S0, diag_S1, diag_S2, diag_S3;
++ gmx_mm_pr diagonal_jmi_S;
+#if UNROLLI == UNROLLJ
- gmx_mm_pr diag0_S0, diag0_S1, diag0_S2, diag0_S3;
- gmx_mm_pr diag1_S0, diag1_S1, diag1_S2, diag1_S3;
++ gmx_mm_pb diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
+#else
- #ifdef gmx_checkbitmask_epi32
- gmx_epi32 mask_S0, mask_S1, mask_S2, mask_S3;
++ gmx_mm_pb diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
++ gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
+#endif
+
- gmx_mm_pr mask_S0, mask_S1, mask_S2, mask_S3;
++ unsigned *excl_filter;
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++ gmx_epi32 filter_S0, filter_S1, filter_S2, filter_S3;
+#else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
++ gmx_mm_pr filter_S0, filter_S1, filter_S2, filter_S3;
+#endif
+
+ gmx_mm_pr zero_S = gmx_set1_pr(0);
+
+ gmx_mm_pr one_S = gmx_set1_pr(1.0);
+ gmx_mm_pr iq_S0 = gmx_setzero_pr();
+ gmx_mm_pr iq_S1 = gmx_setzero_pr();
+ gmx_mm_pr iq_S2 = gmx_setzero_pr();
+ gmx_mm_pr iq_S3 = gmx_setzero_pr();
+ gmx_mm_pr mrc_3_S;
+#ifdef CALC_ENERGIES
+ gmx_mm_pr hrc_3_S, moh_rc_S;
+#endif
+
+#ifdef CALC_COUL_TAB
+ /* Coulomb table variables */
+ gmx_mm_pr invtsp_S;
+ const real *tab_coul_F;
+#ifndef TAB_FDV0
+ const real *tab_coul_V;
+#endif
- #ifndef GMX_DOUBLE
++#if GMX_SIMD_WIDTH_HERE >= 8 || (defined GMX_DOUBLE && GMX_SIMD_WIDTH_HERE >= 4)
++#define STORE_TABLE_INDICES
++#endif
++#ifdef STORE_TABLE_INDICES
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
+ int ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
+ int ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
++#else
++ /* Table indices are not used here, but the table load functions require the argument */
++ int *ti0 = NULL, *ti1 = NULL, *ti2 = NULL, *ti3 = NULL;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_mm_pr mhalfsp_S;
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+ gmx_mm_pr beta2_S, beta_S;
+#endif
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+ gmx_mm_pr sh_ewald_S;
+#endif
+
+#ifdef LJ_COMB_LB
+ const real *ljc;
+
+ gmx_mm_pr hsig_i_S0, seps_i_S0;
+ gmx_mm_pr hsig_i_S1, seps_i_S1;
+ gmx_mm_pr hsig_i_S2, seps_i_S2;
+ gmx_mm_pr hsig_i_S3, seps_i_S3;
+#else
+#ifdef FIX_LJ_C
+ real pvdw_array[2*UNROLLI*UNROLLJ+3];
+ real *pvdw_c6, *pvdw_c12;
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S1, c12_S1;
+ gmx_mm_pr c6_S2, c12_S2;
+ gmx_mm_pr c6_S3, c12_S3;
+#endif
+
+#ifdef LJ_COMB_GEOM
+ const real *ljc;
+
+ gmx_mm_pr c6s_S0, c12s_S0;
+ gmx_mm_pr c6s_S1, c12s_S1;
+ gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
+#endif
+#endif /* LJ_COMB_LB */
+
+ gmx_mm_pr vctot_S, Vvdwtot_S;
+ gmx_mm_pr sixth_S, twelveth_S;
+
+ gmx_mm_pr avoid_sing_S;
+ gmx_mm_pr rc2_S;
+#ifdef VDW_CUTOFF_CHECK
+ gmx_mm_pr rcvdw2_S;
+#endif
+
+#ifdef CALC_ENERGIES
+ gmx_mm_pr sh_invrc6_S, sh_invrc12_S;
+
+ /* cppcheck-suppress unassignedVariable */
+ real tmpsum_array[15], *tmpsum;
+#endif
+#ifdef CALC_SHIFTFORCES
+ /* cppcheck-suppress unassignedVariable */
+ real shf_array[15], *shf;
+#endif
+
+ int ninner;
+
+#ifdef COUNT_PAIRS
+ int npair = 0;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+ ljc = nbat->lj_comb;
+#else
+ /* No combination rule used */
- #define NBFP_STRIDE 4
++#if NBFP_STRIDE == 2
++ nbfp_ptr = nbat->nbfp;
++#else
++#if NBFP_STRIDE == 4
+ nbfp_ptr = nbat->nbfp_s4;
- nbfp_ptr = nbat->nbfp;
- #define NBFP_STRIDE 2
+#else
- diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
++#error "Only NBFP_STRIDE 2 and 4 are currently supported"
++#endif
+#endif
+ nbfp_stride = NBFP_STRIDE;
+#endif
+
+ /* Load j-i for the first i */
- diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
++ diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i);
+ /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
- diag0_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag0_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag0_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag0_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
++ diagonal_mask_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+#else
+#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
- diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
++ diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+
+#if UNROLLI == 2*UNROLLJ
+ /* Load j-i for the second half of the j-cluster */
- diag1_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag1_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag1_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag1_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
++ diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
+#endif
+
- #ifdef gmx_checkbitmask_epi32
- mask_S0 = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
- mask_S1 = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
- mask_S2 = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
- mask_S3 = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
++ diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
++ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
++ diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+#endif
+#endif
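+
+ /* A sketch of the mask generation above: simd_4xn_diagonal_j_minus_i
+ * holds j - i for i = 0, so gmx_cmplt_pr(zero_S, ...) yields a mask
+ * selecting j > i, and subtracting one between compares advances i.
+ * On the diagonal cluster pair this keeps only the strict upper
+ * triangle, avoiding self interactions and double counting.
+ */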
+
+ /* Load masks for topology exclusion masking */
- mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
- mask_S1 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
- mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
- mask_S3 = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++#define FILTER_STRIDE (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
++#else
++#ifdef GMX_DOUBLE
++#define FILTER_STRIDE 2
++#else
++#define FILTER_STRIDE 1
++#endif
++#endif
++#if FILTER_STRIDE == 1
++ excl_filter = nbat->simd_exclusion_filter1;
+#else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
++ excl_filter = nbat->simd_exclusion_filter2;
++#endif
++ /* Here we cast the exclusion filters from unsigned * to int * or real *.
++ * Since we only check bits, the actual value they represent does not
++ * matter, as long as both filter and mask data are treated the same way.
++ */
++#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
++ filter_S0 = gmx_load_si((int *)excl_filter + 0*UNROLLJ*FILTER_STRIDE);
++ filter_S1 = gmx_load_si((int *)excl_filter + 1*UNROLLJ*FILTER_STRIDE);
++ filter_S2 = gmx_load_si((int *)excl_filter + 2*UNROLLJ*FILTER_STRIDE);
++ filter_S3 = gmx_load_si((int *)excl_filter + 3*UNROLLJ*FILTER_STRIDE);
++#else
++ filter_S0 = gmx_load_pr((real *)excl_filter + 0*UNROLLJ);
++ filter_S1 = gmx_load_pr((real *)excl_filter + 1*UNROLLJ);
++ filter_S2 = gmx_load_pr((real *)excl_filter + 2*UNROLLJ);
++ filter_S3 = gmx_load_pr((real *)excl_filter + 3*UNROLLJ);
+#endif
++#undef FILTER_STRIDE
+
+#ifdef CALC_COUL_TAB
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++#ifdef STORE_TABLE_INDICES
+ /* Generate aligned table index pointers */
+ ti0 = gmx_simd_align_int(ti0_array);
+ ti1 = gmx_simd_align_int(ti1_array);
+ ti2 = gmx_simd_align_int(ti2_array);
+ ti3 = gmx_simd_align_int(ti3_array);
+#endif
+
+ invtsp_S = gmx_set1_pr(ic->tabq_scale);
+#ifdef CALC_ENERGIES
+ mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
+#endif
+
+#ifdef TAB_FDV0
+ tab_coul_F = ic->tabq_coul_FDV0;
+#else
+ tab_coul_F = ic->tabq_coul_F;
+ tab_coul_V = ic->tabq_coul_V;
+#endif
+#endif /* CALC_COUL_TAB */
+
+#ifdef CALC_COUL_EWALD
+ beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_S = gmx_set1_pr(ic->ewaldcoeff);
+#endif
+
+#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+ sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
+#endif
+
+ q = nbat->q;
+ type = nbat->type;
+ facel = ic->epsfac;
+ shiftvec = shift_vec[0];
+ x = nbat->x;
+
+ avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+
+ /* The kernel supports either rcoulomb = rvdw or rcoulomb >= rvdw */
+ rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+#ifdef VDW_CUTOFF_CHECK
+ rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
+#endif
+
+#ifdef CALC_ENERGIES
+ sixth_S = gmx_set1_pr(1.0/6.0);
+ twelveth_S = gmx_set1_pr(1.0/12.0);
+
+ sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+#endif
+
+ mrc_3_S = gmx_set1_pr(-2*ic->k_rf);
+
+#ifdef CALC_ENERGIES
+ hrc_3_S = gmx_set1_pr(ic->k_rf);
+
+ moh_rc_S = gmx_set1_pr(-ic->c_rf);
+#endif
+
+#ifdef CALC_ENERGIES
+ tmpsum = gmx_simd_align_real(tmpsum_array);
+#endif
+#ifdef CALC_SHIFTFORCES
+ shf = gmx_simd_align_real(shf_array);
+#endif
+
+#ifdef FIX_LJ_C
+ pvdw_c6 = gmx_simd_align_real(pvdw_array+3);
+ pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+
+ for (jp = 0; jp < UNROLLJ; jp++)
+ {
+ pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+ pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+
+ pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+ }
+ c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+#endif /* FIX_LJ_C */
+
+#ifdef ENERGY_GROUPS
+ egps_ishift = nbat->neg_2log;
+ egps_imask = (1<<egps_ishift) - 1;
+ egps_jshift = 2*nbat->neg_2log;
+ egps_jmask = (1<<egps_jshift) - 1;
+ egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+ /* Major division is over i-particle energy groups, determine the stride */
+ Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+#endif
+
+ l_cj = nbl->cj;
+
+ ninner = 0;
+ for (n = 0; n < nbl->nci; n++)
+ {
+ nbln = &nbl->ci[n];
+
+ ish = (nbln->shift & NBNXN_CI_SHIFT);
+ ish3 = ish*3;
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
+ ci = nbln->ci;
+ ci_sh = (ish == CENTRAL ? ci : -1);
+
+ shX_S = gmx_load1_pr(shiftvec+ish3);
+ shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_S = gmx_load1_pr(shiftvec+ish3+2);
+
+#if UNROLLJ <= 4
+ sci = ci*STRIDE;
+ scix = sci*DIM;
+ sci2 = sci*2;
+#else
+ sci = (ci>>1)*STRIDE;
+ scix = sci*DIM + (ci & 1)*(STRIDE>>1);
+ sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
+ sci += (ci & 1)*(STRIDE>>1);
+#endif
+
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
+ do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
+
+#ifdef ENERGY_GROUPS
+ egps_i = nbat->energrp[ci];
+ {
+ int ia, egp_ia;
+
+ for (ia = 0; ia < UNROLLI; ia++)
+ {
+ egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
+ vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+ vctp[ia] = Vc + egp_ia*Vstride_i;
+ }
+ }
+#endif
+#if defined CALC_ENERGIES
+#if UNROLLJ == 4
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+#endif
+#if UNROLLJ == 2
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
+#endif
+#if UNROLLJ == 8
+ if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+#endif
+ {
+ int ia;
+ real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+ Vc_sub_self = 0.5*ic->c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef TAB_FDV0
+ Vc_sub_self = 0.5*tab_coul_F[2];
+#else
+ Vc_sub_self = 0.5*tab_coul_V[0];
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+ /* beta/sqrt(pi) */
+ Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+#endif
+
+ for (ia = 0; ia < UNROLLI; ia++)
+ {
+ real qi;
+
+ qi = q[sci+ia];
+#ifdef ENERGY_GROUPS
+ vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+#else
+ Vc[0]
+#endif
+ -= facel*qi*qi*Vc_sub_self;
+ }
+ }
+#endif
+
+ /* Load i atom data */
+ sciy = scix + STRIDE;
+ sciz = sciy + STRIDE;
+ ix_S0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
+ ix_S1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
+ ix_S2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
+ ix_S3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
+ iy_S0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
+ iy_S1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
+ iy_S2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
+ iy_S3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
+ iz_S0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
+ iz_S1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
+ iz_S2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
+ iz_S3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
+
+ if (do_coul)
+ {
+ iq_S0 = gmx_set1_pr(facel*q[sci]);
+ iq_S1 = gmx_set1_pr(facel*q[sci+1]);
+ iq_S2 = gmx_set1_pr(facel*q[sci+2]);
+ iq_S3 = gmx_set1_pr(facel*q[sci+3]);
+ }
+
+#ifdef LJ_COMB_LB
+ hsig_i_S0 = gmx_load1_pr(ljc+sci2+0);
+ hsig_i_S1 = gmx_load1_pr(ljc+sci2+1);
+ hsig_i_S2 = gmx_load1_pr(ljc+sci2+2);
+ hsig_i_S3 = gmx_load1_pr(ljc+sci2+3);
+ seps_i_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ seps_i_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ seps_i_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ seps_i_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+#else
+#ifdef LJ_COMB_GEOM
+ c6s_S0 = gmx_load1_pr(ljc+sci2+0);
+ c6s_S1 = gmx_load1_pr(ljc+sci2+1);
+ if (!half_LJ)
+ {
+ c6s_S2 = gmx_load1_pr(ljc+sci2+2);
+ c6s_S3 = gmx_load1_pr(ljc+sci2+3);
+ }
+ c12s_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ c12s_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ if (!half_LJ)
+ {
+ c12s_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ c12s_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ }
+#else
+ nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
+ nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+ if (!half_LJ)
+ {
+ nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+ nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+ }
+#endif
+#endif
+
+ /* Zero the potential energy for this list */
+ Vvdwtot_S = gmx_setzero_pr();
+ vctot_S = gmx_setzero_pr();
+
+ /* Clear i atom forces */
+ fix_S0 = gmx_setzero_pr();
+ fix_S1 = gmx_setzero_pr();
+ fix_S2 = gmx_setzero_pr();
+ fix_S3 = gmx_setzero_pr();
+ fiy_S0 = gmx_setzero_pr();
+ fiy_S1 = gmx_setzero_pr();
+ fiy_S2 = gmx_setzero_pr();
+ fiy_S3 = gmx_setzero_pr();
+ fiz_S0 = gmx_setzero_pr();
+ fiz_S1 = gmx_setzero_pr();
+ fiz_S2 = gmx_setzero_pr();
+ fiz_S3 = gmx_setzero_pr();
+
+ cjind = cjind0;
+
+ /* Currently all kernels use (at least half) LJ */
+#define CALC_LJ
+ if (half_LJ)
+ {
+#define CALC_COULOMB
+#define HALF_LJ
+#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_4xn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for (; (cjind < cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_4xn_inner.h"
+ }
+#undef HALF_LJ
+#undef CALC_COULOMB
+ }
+ else if (do_coul)
+ {
+#define CALC_COULOMB
+#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
++ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_4xn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for (; (cjind < cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_4xn_inner.h"
+ }
+#undef CALC_COULOMB
+ }
+ else
+ {
+#define CHECK_EXCLS
- #ifndef GMX_DOUBLE
- #define gmx_load_pr4 _mm_load_ps
- #define gmx_store_pr4 _mm_store_ps
- #define gmx_add_pr4 _mm_add_ps
- #else
- #define gmx_load_pr4 _mm256_load_pd
- #define gmx_store_pr4 _mm256_store_pd
- #define gmx_add_pr4 _mm256_add_pd
- #endif
- GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
++ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+#include "nbnxn_kernel_simd_4xn_inner.h"
+ cjind++;
+ }
+#undef CHECK_EXCLS
+ for (; (cjind < cjind1); cjind++)
+ {
+#include "nbnxn_kernel_simd_4xn_inner.h"
+ }
+ }
+#undef CALC_LJ
+ ninner += cjind1 - cjind0;
+
+ /* Add accumulated i-forces to the force array */
+#if UNROLLJ >= 4
- GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
++ fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
+ gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
+
- GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
++ fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
+ gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
+
- GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
- _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
- GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
- _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));
++ fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
+ gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
+
+#ifdef CALC_SHIFTFORCES
+ gmx_store_pr4(shf, fix_S);
+ fshift[ish3+0] += SUM_SIMD4(shf);
+ gmx_store_pr4(shf, fiy_S);
+ fshift[ish3+1] += SUM_SIMD4(shf);
+ gmx_store_pr4(shf, fiz_S);
+ fshift[ish3+2] += SUM_SIMD4(shf);
+#endif
+#else
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
- _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
- _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));
++ fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
++ gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix)));
++ fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
++ gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2)));
+
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
- _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
- _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));
++ fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
++ gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy)));
++ fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
++ gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2)));
+
- _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
++ fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
++ gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz)));
++ fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
++ gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2)));
+
+#ifdef CALC_SHIFTFORCES
- _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
++ gmx_store_pr(shf, gmx_add_pr(fix0_S, fix2_S));
+ fshift[ish3+0] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
++ gmx_store_pr(shf, gmx_add_pr(fiy0_S, fiy2_S));
+ fshift[ish3+1] += shf[0] + shf[1];
++ gmx_store_pr(shf, gmx_add_pr(fiz0_S, fiz2_S));
+ fshift[ish3+2] += shf[0] + shf[1];
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+ if (do_coul)
+ {
+ gmx_store_pr(tmpsum, vctot_S);
+ *Vc += SUM_SIMD(tmpsum);
+ }
+
+ gmx_store_pr(tmpsum, Vvdwtot_S);
+ *Vvdw += SUM_SIMD(tmpsum);
+#endif
+
+ /* Outer loop uses 6 flops/iteration */
+ }
+
+#ifdef COUNT_PAIRS
+ printf("atom pairs %d\n", npair);
+#endif
+}
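For reference, a scalar sketch of the i-force reduction in the outer loop
above (an illustration, not kernel code): gmx_mm_transpose_sum4_pr lane-sums
the four per-i-atom accumulators into one register, after which a single
4-wide load/add/store updates f[scix..scix+3]. The 4-atom i-cluster and
GMX_SIMD_WIDTH_HERE follow the conventions of the kernel headers.

    static void reduce_i_forces_scalar(real acc[4][GMX_SIMD_WIDTH_HERE],
                                       real *f)
    {
        int i, s;

        for (i = 0; i < 4; i++)        /* the UNROLLI = 4 i-atoms */
        {
            real sum = 0;

            for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
            {
                sum += acc[i][s];      /* lane-sum one accumulator */
            }
            f[i] += sum;               /* one element of the 4-wide store */
        }
    }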
+
+
++#if UNROLLJ == 4
++#undef gmx_mm_pr4
+#undef gmx_load_pr4
+#undef gmx_store_pr4
+#undef gmx_add_pr4
++#endif
++
++#undef STORE_TABLE_INDICES
+
+#undef CALC_SHIFTFORCES
+
+#undef UNROLLI
+#undef UNROLLJ
+#undef STRIDE
+#undef TAB_FDV0
+#undef NBFP_STRIDE
+
+#undef GMX_USE_HALF_WIDTH_SIMD_HERE
--- /dev/null
- #ifdef GMX_X86_SSE2
-
- /* Transpose 2 double precision registers */
- #define GMX_MM_TRANSPOSE2_OP_PD(in0, in1, out0, out1) \
- { \
- out0 = _mm_unpacklo_pd(in0, in1); \
- out1 = _mm_unpackhi_pd(in0, in1); \
- }
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_sse_utils_h_
+#define _nbnxn_kernel_sse_utils_h_
+
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 || !defined GMX_DOUBLE
- /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
- #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0, in1, in2, in3, out0, out1) \
- { \
- __m128 _c01, _c23; \
- _c01 = _mm_movelh_ps(in0, in1); \
- _c23 = _mm_movelh_ps(in2, in3); \
- out0 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0)); \
- out1 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1)); \
- }
+
- /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
- #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0, in1, in2, in3, out0, out1) \
- { \
- __m256d _c01, _c23; \
- _c01 = _mm256_shuffle_pd(in0, in1, _MM_SHUFFLE(1, 0, 1, 0)); \
- _c23 = _mm256_shuffle_pd(in2, in3, _MM_SHUFFLE(1, 0, 1, 0)); \
- out0 = _mm256_shuffle_pd(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0)); \
- out1 = _mm256_shuffle_pd(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1)); \
- }
- #endif
-
- /* Collect element 2 of the 4 inputs to out */
- #define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0, in1, in2, in3, out) \
- { \
- __m128 _c01, _c23; \
- _c01 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 2, 3, 2)); \
- _c23 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 2, 3, 2)); \
- out = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0)); \
- }
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #ifndef GMX_DOUBLE
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out) \
- { \
- _MM_TRANSPOSE4_PS(in0, in1, in2, in3); \
- in0 = _mm_add_ps(in0, in1); \
- in2 = _mm_add_ps(in2, in3); \
- out = _mm_add_ps(in0, in2); \
- }
++/* Include SIMD architecture-specific versions of the 4/5 functions above */
++#ifdef GMX_SIMD_REFERENCE_PLAIN_C
++#include "nbnxn_kernel_simd_utils_ref.h"
+#else
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM2_PD(in0, in1, out) \
- { \
- GMX_MM_TRANSPOSE2_PD(in0, in1); \
- out = _mm_add_pd(in0, in1); \
- }
++#ifdef GMX_X86_SSE2
++/* Include x86 SSE2 compatible SIMD functions */
++#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
++#ifdef GMX_DOUBLE
++#include "nbnxn_kernel_simd_utils_x86_256d.h"
+#else
- #ifndef GMX_DOUBLE
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out) \
- { \
- in0 = _mm256_hadd_ps(in0, in1); \
- in2 = _mm256_hadd_ps(in2, in3); \
- in1 = _mm256_hadd_ps(in0, in2); \
- out = _mm_add_ps(_mm256_castps256_ps128(in1), _mm256_extractf128_ps(in1, 1)); \
- }
- /* Sum the elements of halfs of each input register and store sums in out */
- #define GMX_MM_TRANSPOSE_SUM4H_PR(in0, in2, out) \
- { \
- in0 = _mm256_hadd_ps(in0, _mm256_setzero_ps()); \
- in2 = _mm256_hadd_ps(in2, _mm256_setzero_ps()); \
- in0 = _mm256_hadd_ps(in0, in2); \
- in2 = _mm256_permute_ps(in0, _MM_SHUFFLE(2, 3, 0, 1)); \
- out = _mm_add_ps(_mm256_castps256_ps128(in0), _mm256_extractf128_ps(in2, 1)); \
- }
++#include "nbnxn_kernel_simd_utils_x86_256s.h"
+#endif
+#else
- /* Sum the elements within each input register and store the sums in out */
- #define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out) \
- { \
- in0 = _mm256_hadd_pd(in0, in1); \
- in2 = _mm256_hadd_pd(in2, in3); \
- out = _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31)); \
- }
- #endif
- #endif
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
-
- static inline __m128
- gmx_mm128_invsqrt_ps_single(__m128 x)
- {
- const __m128 half = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
- const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
- __m128 lu = _mm_rsqrt_ps(x);
-
- return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
- }
-
- /* Do 2 double precision invsqrt operations.
- * Doing the SIMD rsqrt and the first Newton Raphson iteration
- * in single precision gives full double precision accuracy.
- * The speed is more than double that of two gmx_mm_invsqrt_pd calls.
- */
- #define GMX_MM128_INVSQRT2_PD(in0, in1, out0, out1) \
- { \
- const __m128d half = _mm_set1_pd(0.5); \
- const __m128d three = _mm_set1_pd(3.0); \
- __m128 s, ir; \
- __m128d lu0, lu1; \
- \
- s = _mm_movelh_ps(_mm_cvtpd_ps(in0), _mm_cvtpd_ps(in1)); \
- ir = gmx_mm128_invsqrt_ps_single(s); \
- lu0 = _mm_cvtps_pd(ir); \
- lu1 = _mm_cvtps_pd(_mm_movehl_ps(ir, ir)); \
- out0 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu0, lu0), in0)), lu0)); \
- out1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), in1)), lu1)); \
- }
-
- #define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
-
- #endif
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
-
- static inline __m256
- gmx_mm256_invsqrt_ps_single(__m256 x)
- {
- const __m256 half = _mm256_set_ps(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5);
- const __m256 three = _mm256_set_ps(3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0);
-
- __m256 lu = _mm256_rsqrt_ps(x);
-
- return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
- }
-
- /* Do 4 double precision invsqrt operations.
- * Doing the SIMD rsqrt and the first Newton Raphson iteration
- * in single precision gives full double precision accuracy.
- */
- #define GMX_MM256_INVSQRT2_PD(in0, in1, out0, out1) \
- { \
- const __m256d half = _mm256_set1_pd(0.5); \
- const __m256d three = _mm256_set1_pd(3.0); \
- __m256 s, ir; \
- __m256d lu0, lu1; \
- \
- s = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(in0)), _mm256_cvtpd_ps(in1), 1); \
- ir = gmx_mm256_invsqrt_ps_single(s); \
- lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir)); \
- lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir, 1)); \
- out0 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu0, lu0), in0)), lu0)); \
- out1 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu1, lu1), in1)), lu1)); \
- }
-
- #define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
-
- #endif
-
- /* Force and energy table load and interpolation routines */
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
-
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- gmx_mm_pr clj_SSE[UNROLLJ]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0], clj_SSE[1], clj_SSE[2], clj_SSE[3], c6_SSE, c12_SSE); \
- }
-
- #endif
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
-
- /* Put two 128-bit 4-float registers into one 256-bit 8-float register */
- #define GMX_2_MM_TO_M256(in0, in1, out) \
- { \
- out = _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1); \
- }
-
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- __m128 clj_SSE[UNROLLJ], c6t_SSE[2], c12t_SSE[2]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0], clj_SSE[1], clj_SSE[2], clj_SSE[3], c6t_SSE[0], c12t_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4], clj_SSE[5], clj_SSE[6], clj_SSE[7], c6t_SSE[1], c12t_SSE[1]); \
- \
- GMX_2_MM_TO_M256(c6t_SSE[0], c6t_SSE[1], c6_SSE); \
- GMX_2_MM_TO_M256(c12t_SSE[0], c12t_SSE[1], c12_SSE); \
- }
-
- #define load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_SSE, c12_SSE) \
- { \
- __m128 clj_SSE0[UNROLLJ], clj_SSE1[UNROLLJ], c6t_SSE[2], c12t_SSE[2]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE); \
- } \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE0[0], clj_SSE0[1], clj_SSE0[2], clj_SSE0[3], c6t_SSE[0], c12t_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE1[0], clj_SSE1[1], clj_SSE1[2], clj_SSE1[3], c6t_SSE[1], c12t_SSE[1]); \
- \
- GMX_2_MM_TO_M256(c6t_SSE[0], c6t_SSE[1], c6_SSE); \
- GMX_2_MM_TO_M256(c12t_SSE[0], c12t_SSE[1], c12_SSE); \
- }
-
++#ifdef GMX_DOUBLE
++#include "nbnxn_kernel_simd_utils_x86_128d.h"
+#else
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
-
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- gmx_mm_pr clj_SSE[UNROLLJ]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0], clj_SSE[1], c6_SSE, c12_SSE); \
- }
-
++#include "nbnxn_kernel_simd_utils_x86_128s.h"
+#endif
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
-
- #define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- __m128d clj_SSE[UNROLLJ], c6t_SSE[2], c12t_SSE[2]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- clj_SSE[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0], clj_SSE[1], c6t_SSE[0], c12t_SSE[0]); \
- GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[2], clj_SSE[3], c6t_SSE[1], c12t_SSE[1]); \
- GMX_2_M128D_TO_M256D(c6t_SSE[0], c6t_SSE[1], c6_SSE); \
- GMX_2_M128D_TO_M256D(c12t_SSE[0], c12t_SSE[1], c12_SSE); \
- }
-
+#endif
-
-
- /* The load_table functions below are performance critical.
- * The routines issue UNROLLI*UNROLLJ _mm_load_ps calls.
- * As these all have latencies, scheduling is crucial.
- * The Intel compilers and CPUs seem to do a good job at this.
- * But AMD CPUs perform significantly worse with gcc than with icc.
- * Performance is improved a bit by using the extract function UNROLLJ times,
- * instead of doing an _mm_store_si128 for every i-particle.
- * This is only faster when we use FDV0 formatted tables, where we also need
- * to multiple the index by 4, which can be done by a SIMD bit shift.
- * With single precision AVX, 8 extracts are much slower than 1 store.
- * Because of this, the load_table_f macro always takes the ti parameter,
- * but it is only used with AVX.
- */
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
-
- #define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- int idx[4]; \
- __m128 ctab_SSE[4]; \
- \
- /* Table has 4 entries, left-shift index by 2 */ \
- ti_SSE = _mm_slli_epi32(ti_SSE, 2); \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]); \
- idx[2] = gmx_mm_extract_epi32(ti_SSE, 2); \
- ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]); \
- idx[3] = gmx_mm_extract_epi32(ti_SSE, 3); \
- ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctab0_SSE, ctab1_SSE); \
- }
-
- #define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- int idx[4]; \
- __m128 ctab_SSE[4]; \
- \
- /* Table has 4 entries, left-shift index by 2 */ \
- ti_SSE = _mm_slli_epi32(ti_SSE, 2); \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]); \
- idx[2] = gmx_mm_extract_epi32(ti_SSE, 2); \
- ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]); \
- idx[3] = gmx_mm_extract_epi32(ti_SSE, 3); \
- ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctab0_SSE, ctab1_SSE); \
- /* Shuffle the energy table entries to a convenient order */ \
- GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabv_SSE); \
- }
-
+#endif
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
-
- #define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- __m128 ctab_SSE[8], ctabt_SSE[4]; \
- int j; \
- \
- /* Bit shifting would be faster, but AVX doesn't support that */ \
- _mm256_store_si256((__m256i *)ti, ti_SSE); \
- for (j = 0; j < 8; j++) \
- { \
- ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabt_SSE[0], ctabt_SSE[2]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabt_SSE[1], ctabt_SSE[3]); \
- \
- GMX_2_MM_TO_M256(ctabt_SSE[0], ctabt_SSE[1], ctab0_SSE); \
- GMX_2_MM_TO_M256(ctabt_SSE[2], ctabt_SSE[3], ctab1_SSE); \
- }
+#endif
+
- #define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- __m128 ctab_SSE[8], ctabt_SSE[4], ctabvt_SSE[2]; \
- int j; \
- \
- /* Bit shifting would be faster, but AVX doesn't support that */ \
- _mm256_store_si256((__m256i *)ti, ti_SSE); \
- for (j = 0; j < 8; j++) \
- { \
- ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabt_SSE[0], ctabt_SSE[2]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabt_SSE[1], ctabt_SSE[3]); \
- \
- GMX_2_MM_TO_M256(ctabt_SSE[0], ctabt_SSE[1], ctab0_SSE); \
- GMX_2_MM_TO_M256(ctabt_SSE[2], ctabt_SSE[3], ctab1_SSE); \
- \
- GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabvt_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabvt_SSE[1]); \
- \
- GMX_2_MM_TO_M256(ctabvt_SSE[0], ctabvt_SSE[1], ctabv_SSE); \
- }
-
- #endif
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
-
- #define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- int idx[2]; \
- __m128d ctab_SSE[2]; \
- \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], ctab0_SSE, ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm_sub_pd(ctab1_SSE, ctab0_SSE); \
- }
-
- #define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- int idx[2]; \
- __m128d ctab_SSE[4]; \
- \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], ctab0_SSE, ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm_sub_pd(ctab1_SSE, ctab0_SSE); \
- \
- ctab_SSE[2] = _mm_loadu_pd(tab_coul_V+idx[0]); \
- ctab_SSE[3] = _mm_loadu_pd(tab_coul_V+idx[1]); \
- \
- /* Shuffle the energy table entries to a single register */ \
- ctabv_SSE = _mm_shuffle_pd(ctab_SSE[2], ctab_SSE[3], _MM_SHUFFLE2(0, 0)); \
- }
-
- #endif
-
- #if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
-
- /* Put two 128-bit 2-double registers into one 256-bit 4-ouble register */
- #define GMX_2_M128D_TO_M256D(in0, in1, out) \
- { \
- out = _mm256_insertf128_pd(_mm256_castpd128_pd256(in0), in1, 1); \
- }
-
- #define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- __m128d ctab_SSE[4], tr_SSE[4]; \
- int j; \
- \
- _mm_store_si128((__m128i *)ti, ti_SSE); \
- for (j = 0; j < 4; j++) \
- { \
- ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]); \
- } \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], tr_SSE[0], tr_SSE[1]); \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2], ctab_SSE[3], tr_SSE[2], tr_SSE[3]); \
- GMX_2_M128D_TO_M256D(tr_SSE[0], tr_SSE[2], ctab0_SSE); \
- GMX_2_M128D_TO_M256D(tr_SSE[1], tr_SSE[3], ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm256_sub_pd(ctab1_SSE, ctab0_SSE); \
- }
-
- #define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- __m128d ctab_SSE[8], tr_SSE[4]; \
- int j; \
- \
- _mm_store_si128((__m128i *)ti, ti_SSE); \
- for (j = 0; j < 4; j++) \
- { \
- ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]); \
- } \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], tr_SSE[0], tr_SSE[1]); \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2], ctab_SSE[3], tr_SSE[2], tr_SSE[3]); \
- GMX_2_M128D_TO_M256D(tr_SSE[0], tr_SSE[2], ctab0_SSE); \
- GMX_2_M128D_TO_M256D(tr_SSE[1], tr_SSE[3], ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm256_sub_pd(ctab1_SSE, ctab0_SSE); \
- \
- for (j = 0; j < 4; j++) \
- { \
- ctab_SSE[4+j] = _mm_loadu_pd(tab_coul_V+ti[j]); \
- } \
- /* Shuffle the energy table entries to a single register */ \
- GMX_2_M128D_TO_M256D(_mm_shuffle_pd(ctab_SSE[4], ctab_SSE[5], _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(ctab_SSE[6], ctab_SSE[7], _MM_SHUFFLE2(0, 0)), ctabv_SSE); \
- }
-
- #endif
-
-
- /* Add energy register to possibly multiple terms in the energy array.
- * This function is the same for SSE/AVX single/double.
- */
- static inline void add_ener_grp(gmx_mm_pr e_SSE, real *v, const int *offset_jj)
+
- gmx_mm_pr v_SSE;
++#ifdef UNROLLJ
++/* Add energy register to possibly multiple terms in the energy array */
++static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
+{
+ int jj;
+
+ /* We need to balance the number of store operations with
+ * the rapidly increasing number of combinations of energy groups.
+ * We add to a temporary buffer for 1 i-group vs 2 j-groups.
+ */
+ for (jj = 0; jj < (UNROLLJ/2); jj++)
+ {
- v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
- gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_SSE, e_SSE));
++ gmx_mm_pr v_S;
+
- #if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8 && defined gmx_mm_hpr
- /* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
++ v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
++ gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S));
+ }
+}
++#endif
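For reference, a scalar transcription of add_ener_grp above (a sketch under
the same UNROLLJ and GMX_SIMD_WIDTH_HERE conventions, not patch content):
each of the UNROLLJ/2 j-group pairs contributes one SIMD-width worth of adds
at the buffer position selected by offset_jj.

    static void add_ener_grp_scalar(const real *e, real *v,
                                    const int *offset_jj)
    {
        int jj, s;

        for (jj = 0; jj < (UNROLLJ/2); jj++)
        {
            for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
            {
                v[offset_jj[jj] + jj*GMX_SIMD_WIDTH_HERE + s] += e[s];
            }
        }
    }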
+
- static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
- real *v0, real *v1, const int *offset_jj)
++#if defined GMX_NBNXN_SIMD_2XNN && defined UNROLLJ
++/* As add_ener_grp, but for two groups of UNROLLJ/2 stored in
+ * a single SIMD register.
+ */
- gmx_mm_hpr e_SSE0, e_SSE1;
++static inline void
++add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
+{
- e_SSE0 = _mm256_extractf128_ps(e_SSE, 0);
- e_SSE1 = _mm256_extractf128_ps(e_SSE, 1);
++ gmx_mm_hpr e_S0, e_S1;
+ int jj;
+
- gmx_mm_hpr v_SSE;
++ gmx_pr_to_2hpr(e_S, &e_S0, &e_S1);
+
+ for (jj = 0; jj < (UNROLLJ/2); jj++)
+ {
- gmx_load_hpr(v_SSE, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
- gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE0));
++ gmx_mm_hpr v_S;
+
- gmx_mm_hpr v_SSE;
++ gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
++ gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0));
+ }
+ for (jj = 0; jj < (UNROLLJ/2); jj++)
+ {
- gmx_load_hpr(v_SSE, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
- gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE1));
++ gmx_mm_hpr v_S;
+
- #endif /* GMX_X86_SSE2 */
-
++ gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
++ gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1));
+ }
+}
+#endif
+
+#endif /* _nbnxn_kernel_sse_utils_h_ */
--- /dev/null
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #ifdef GMX_DOUBLE
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+#include "sysstuff.h"
+#include "smalloc.h"
+#include "macros.h"
+#include "maths.h"
+#include "vec.h"
+#include "pbc.h"
+#include "nbnxn_consts.h"
++/* nbnxn_internal.h includes gmx_simd_macros.h */
+#include "nbnxn_internal.h"
++#ifdef GMX_NBNXN_SIMD
++#include "gmx_simd_vec.h"
++#endif
+#include "nbnxn_atomdata.h"
+#include "nbnxn_search.h"
+#include "gmx_cyclecounter.h"
+#include "gmxfio.h"
+#include "gmx_omp_nthreads.h"
+#include "nrnb.h"
+
+
+/* Pair search box lower and upper corner in x,y,z.
+ * Store this in 4 instead of 3 reals, which is useful with SSE.
+ * To avoid complicating the code we also use 4 without SSE.
+ */
+#define NNBSBB_C 4
+#define NNBSBB_B (2*NNBSBB_C)
+/* Pair search box lower and upper bound in z only. */
+#define NNBSBB_D 2
+/* Pair search box lower and upper corner x,y,z indices */
+#define BBL_X 0
+#define BBL_Y 1
+#define BBL_Z 2
+#define BBU_X 4
+#define BBU_Y 5
+#define BBU_Z 6
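Under these defines one simple-cell bounding box occupies NNBSBB_B = 8
floats, lower corner first, with one padding float per corner so each corner
is 4-float aligned for SSE loads. An illustrative instance (values invented
for the example, not patch content):

    static const float bb_example[NNBSBB_B] = {
        0.0f, 0.0f, 0.0f, 0.0f, /* lower corner: BBL_X, BBL_Y, BBL_Z, pad */
        1.5f, 1.5f, 3.0f, 0.0f  /* upper corner: BBU_X, BBU_Y, BBU_Z, pad */
    };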
+
+
+#ifdef NBNXN_SEARCH_BB_SSE
+/* We use SSE or AVX-128bit for bounding box calculations */
+
+#ifndef GMX_DOUBLE
+/* Single precision BBs + coordinates, we can also load coordinates using SSE */
+#define NBNXN_SEARCH_SSE_SINGLE
+#endif
+
+/* Include basic SSE2 stuff */
+#include <emmintrin.h>
+
+#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+#define NBNXN_PBB_SSE
+#endif
+
+/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
+ * Here AVX-256 turns out to be slightly slower than AVX-128.
+ */
+#define STRIDE_PBB 4
+#define STRIDE_PBB_2LOG 2
+
+#endif /* NBNXN_SEARCH_BB_SSE */
+
+#ifdef GMX_NBNXN_SIMD
+
+/* The functions below are macros as they are performance sensitive */
+
+/* 4x4 list, pack=4: no complex conversion required */
+/* i-cluster to j-cluster conversion */
+#define CI_TO_CJ_J4(ci) (ci)
+/* cluster index to coordinate array index conversion */
+#define X_IND_CI_J4(ci) ((ci)*STRIDE_P4)
+#define X_IND_CJ_J4(cj) ((cj)*STRIDE_P4)
+
+/* 4x2 list, pack=4: j-cluster size is half the packing width */
+/* i-cluster to j-cluster conversion */
+#define CI_TO_CJ_J2(ci) ((ci)<<1)
+/* cluster index to coordinate array index conversion */
+#define X_IND_CI_J2(ci) ((ci)*STRIDE_P4)
+#define X_IND_CJ_J2(cj) (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
+
+/* 4x8 list, pack=8: i-cluster size is half the packing width */
+/* i-cluster to j-cluster conversion */
+#define CI_TO_CJ_J8(ci) ((ci)>>1)
+/* cluster index to coordinate array index conversion */
+#define X_IND_CI_J8(ci) (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
+#define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
+
+/* The j-cluster size is matched to the SIMD width */
- #define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
- #define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
- #define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
- #endif
- #else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- #ifdef GMX_DOUBLE
++#if GMX_SIMD_WIDTH_HERE == 2
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj)
+#else
- #endif
++#if GMX_SIMD_WIDTH_HERE == 4
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
+#else
++#if GMX_SIMD_WIDTH_HERE == 8
+#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci)
+#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci)
+#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj)
+/* Half SIMD with j-cluster size */
+#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
- /* Interaction masks for 4xN atom interactions.
- * Bit i*CJ_SIZE + j tells if atom i and j interact.
- */
- /* All interaction mask is the same for all kernels */
- #define NBNXN_INT_MASK_ALL 0xffffffff
- /* 4x4 kernel diagonal mask */
- #define NBNXN_INT_MASK_DIAG 0x08ce
- /* 4x2 kernel diagonal masks */
- #define NBNXN_INT_MASK_DIAG_J2_0 0x0002
- #define NBNXN_INT_MASK_DIAG_J2_1 0x002F
- /* 4x8 kernel diagonal masks */
- #define NBNXN_INT_MASK_DIAG_J8_0 0xf0f8fcfe
- #define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
-
-
++#else
++#if GMX_SIMD_WIDTH_HERE == 16
++#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
++#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
++#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
+#else
+#error "unsupported GMX_NBNXN_SIMD_WIDTH"
+#endif
+#endif
++#endif
++#endif
+
+#endif /* GMX_NBNXN_SIMD */
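The J2 mapping above is the least obvious one: two 2-atom j-clusters share a
single pack of PACK_X4 coordinates, so odd j-clusters start half-way into a
pack. A standalone sketch, assuming PACK_X4 == 4 and STRIDE_P4 == DIM*PACK_X4
as in nbnxn_consts.h:

    #include <stdio.h>

    #define DIM       3
    #define PACK_X4   4
    #define STRIDE_P4 (DIM*PACK_X4)

    #define CI_TO_CJ_J2(ci) ((ci)<<1)
    #define X_IND_CJ_J2(cj) (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))

    int main(void)
    {
        int cj;

        /* prints 0, 2, 12, 14: clusters 0/1 share pack 0, 2/3 share pack 1 */
        for (cj = 0; cj < 4; cj++)
        {
            printf("cj %d -> x index %d\n", cj, X_IND_CJ_J2(cj));
        }
        /* i-cluster 1 (atoms 4-7) starts at j-cluster 2 (atoms 4-5) */
        printf("ci 1 -> first cj %d\n", CI_TO_CJ_J2(1));
        return 0;
    }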
+
+
- nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#ifdef NBNXN_SEARCH_BB_SSE
+/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
+#define NBNXN_BBXXXX
+/* Size of bounding box corners quadruplet */
+#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_PBB)
+#endif
+
+/* We shift the i-particles backward for PBC.
+ * This leads to more conditionals than shifting forward.
+ * We do this to get more balanced pair lists.
+ */
+#define NBNXN_SHIFT_BACKWARD
+
+
+/* This define is a lazy way to avoid interdependence of the grid
+ * and searching data structures.
+ */
+#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
+
+
+static void nbs_cycle_clear(nbnxn_cycle_t *cc)
+{
+ int i;
+
+ for (i = 0; i < enbsCCnr; i++)
+ {
+ cc[i].count = 0;
+ cc[i].c = 0;
+ }
+}
+
+static double Mcyc_av(const nbnxn_cycle_t *cc)
+{
+ return (double)cc->c*1e-6/cc->count;
+}
+
+static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
+{
+ int n;
+ int t;
+
+ fprintf(fp, "\n");
+ fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
+ nbs->cc[enbsCCgrid].count,
+ Mcyc_av(&nbs->cc[enbsCCgrid]),
+ Mcyc_av(&nbs->cc[enbsCCsearch]),
+ Mcyc_av(&nbs->cc[enbsCCreducef]));
+
+ if (nbs->nthread_max > 1)
+ {
+ if (nbs->cc[enbsCCcombine].count > 0)
+ {
+ fprintf(fp, " comb %5.2f",
+ Mcyc_av(&nbs->cc[enbsCCcombine]));
+ }
+ fprintf(fp, " s. th");
+ for (t = 0; t < nbs->nthread_max; t++)
+ {
+ fprintf(fp, " %4.1f",
+ Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
+ }
+ }
+ fprintf(fp, "\n");
+}
+
+static void nbnxn_grid_init(nbnxn_grid_t * grid)
+{
+ grid->cxy_na = NULL;
+ grid->cxy_ind = NULL;
+ grid->cxy_nalloc = 0;
+ grid->bb = NULL;
+ grid->bbj = NULL;
+ grid->nc_nalloc = 0;
+}
+
+static int get_2log(int n)
+{
+ int log2;
+
+ log2 = 0;
+ while ((1<<log2) < n)
+ {
+ log2++;
+ }
+ if ((1<<log2) != n)
+ {
+ gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
+ }
+
+ return log2;
+}
+
+static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
+{
+ switch (nb_kernel_type)
+ {
+ case nbnxnk4x4_PlainC:
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
+ return NBNXN_CPU_CLUSTER_I_SIZE;
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
+ /* The cluster size for super/sub lists is only set here.
+ * Any value should work for the pair-search and atomdata code.
+ * The kernels, of course, might require a particular value.
+ */
+ return NBNXN_GPU_CLUSTER_SIZE;
+ default:
+ gmx_incons("unknown kernel type");
+ }
+
+ return 0;
+}
+
+int nbnxn_kernel_to_cj_size(int nb_kernel_type)
+{
+ int nbnxn_simd_width = 0;
+ int cj_size = 0;
+
+#ifdef GMX_NBNXN_SIMD
- #ifdef NBNXN_SEARCH_BB_SSE
-
++ nbnxn_simd_width = GMX_SIMD_WIDTH_HERE;
+#endif
+
+ switch (nb_kernel_type)
+ {
+ case nbnxnk4x4_PlainC:
+ cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
+ break;
+ case nbnxnk4xN_SIMD_4xN:
+ cj_size = nbnxn_simd_width;
+ break;
+ case nbnxnk4xN_SIMD_2xNN:
+ cj_size = nbnxn_simd_width/2;
+ break;
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
+ cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
+ break;
+ default:
+ gmx_incons("unknown kernel type");
+ }
+
+ return cj_size;
+}
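A worked instance of this mapping, assuming 8-wide single-precision SIMD
(e.g. AVX-256) and NBNXN_CPU_CLUSTER_I_SIZE == 4 as in nbnxn_consts.h:
nbnxnk4xN_SIMD_4xN gives cj_size = 8 (a 4x8 kernel layout),
nbnxnk4xN_SIMD_2xNN gives 8/2 = 4 (a 4x4 layout), and the plain-C and GPU
kernels take their cj size from the corresponding i-cluster size.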
+
+static int ci_to_cj(int na_cj_2log, int ci)
+{
+ switch (na_cj_2log)
+ {
+ case 2: return ci; break;
+ case 1: return (ci<<1); break;
+ case 3: return (ci>>1); break;
+ }
+
+ return 0;
+}
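ci_to_cj is the run-time counterpart of the CI_TO_CJ_J{2,4,8} macros above,
keyed on log2 of the j-cluster size: na_cj_2log == 2 (4 atoms) is the
identity, 1 (2 atoms) doubles the index and 3 (8 atoms) halves it; any other
size falls through and returns 0.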
+
+gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
+{
+ if (nb_kernel_type == nbnxnkNotSet)
+ {
+ gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
+ }
+
+ switch (nb_kernel_type)
+ {
+ case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_PlainC:
+ return FALSE;
+
+ case nbnxnk4x4_PlainC:
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
+ return TRUE;
+
+ default:
+ gmx_incons("Invalid nonbonded kernel type passed!");
+ return FALSE;
+ }
+}
+
+void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
+ ivec *n_dd_cells,
+ gmx_domdec_zones_t *zones,
+ int nthread_max)
+{
+ nbnxn_search_t nbs;
+ int d, g, t;
+
+ snew(nbs, 1);
+ *nbs_ptr = nbs;
+
+ nbs->DomDec = (n_dd_cells != NULL);
+
+ clear_ivec(nbs->dd_dim);
+ nbs->ngrid = 1;
+ if (nbs->DomDec)
+ {
+ nbs->zones = zones;
+
+ for (d = 0; d < DIM; d++)
+ {
+ if ((*n_dd_cells)[d] > 1)
+ {
+ nbs->dd_dim[d] = 1;
+ /* Each grid matches a DD zone */
+ nbs->ngrid *= 2;
+ }
+ }
+ }
+
+ snew(nbs->grid, nbs->ngrid);
+ for (g = 0; g < nbs->ngrid; g++)
+ {
+ nbnxn_grid_init(&nbs->grid[g]);
+ }
+ nbs->cell = NULL;
+ nbs->cell_nalloc = 0;
+ nbs->a = NULL;
+ nbs->a_nalloc = 0;
+
+ nbs->nthread_max = nthread_max;
+
+ /* Initialize the work data structures for each thread */
+ snew(nbs->work, nbs->nthread_max);
+ for (t = 0; t < nbs->nthread_max; t++)
+ {
+ nbs->work[t].cxy_na = NULL;
+ nbs->work[t].cxy_na_nalloc = 0;
+ nbs->work[t].sort_work = NULL;
+ nbs->work[t].sort_work_nalloc = 0;
+ }
+
+ /* Initialize detailed nbsearch cycle counting */
+ nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
+ nbs->search_count = 0;
+ nbs_cycle_clear(nbs->cc);
+ for (t = 0; t < nbs->nthread_max; t++)
+ {
+ nbs_cycle_clear(nbs->work[t].cc);
+ }
+}
+
+static real grid_atom_density(int n, rvec corner0, rvec corner1)
+{
+ rvec size;
+
+ rvec_sub(corner1, corner0, size);
+
+ return n/(size[XX]*size[YY]*size[ZZ]);
+}
+
+static int set_grid_size_xy(const nbnxn_search_t nbs,
+ nbnxn_grid_t *grid,
+ int dd_zone,
+ int n, rvec corner0, rvec corner1,
+ real atom_density)
+{
+ rvec size;
+ int na_c;
+ real adens, tlen, tlen_x, tlen_y, nc_max;
+ int t;
+
+ rvec_sub(corner1, corner0, size);
+
+ if (n > grid->na_sc)
+ {
+ /* target cell length */
+ if (grid->bSimple)
+ {
+ /* To minimize the zero interactions, we should make
+ * the larger of the i/j cells cubic.
+ */
+ na_c = max(grid->na_c, grid->na_cj);
+
+ /* Approximately cubic cells */
+ tlen = pow(na_c/atom_density, 1.0/3.0);
+ tlen_x = tlen;
+ tlen_y = tlen;
+ }
+ else
+ {
+ /* Approximately cubic sub cells */
+ tlen = pow(grid->na_c/atom_density, 1.0/3.0);
+ tlen_x = tlen*GPU_NSUBCELL_X;
+ tlen_y = tlen*GPU_NSUBCELL_Y;
+ }
+ /* We round ncx and ncy down, because we get fewer cell pairs
+ * in the pair list when the fixed cell dimensions (x,y) are
+ * larger than the variable one (z), rather than the other way around.
+ */
+ grid->ncx = max(1, (int)(size[XX]/tlen_x));
+ grid->ncy = max(1, (int)(size[YY]/tlen_y));
+ }
+ else
+ {
+ grid->ncx = 1;
+ grid->ncy = 1;
+ }
+
+ grid->sx = size[XX]/grid->ncx;
+ grid->sy = size[YY]/grid->ncy;
+ grid->inv_sx = 1/grid->sx;
+ grid->inv_sy = 1/grid->sy;
+
+ if (dd_zone > 0)
+ {
+ /* This is a non-home zone, add an extra row of cells
+ * for particles communicated for bonded interactions.
+ * These can be beyond the cut-off. It doesn't matter where
+ * they end up on the grid, but for performance it's better
+ * if they don't end up in cells that can be within cut-off range.
+ */
+ grid->ncx++;
+ grid->ncy++;
+ }
+
+ /* We need one additional cell entry for particles moved by DD */
+ if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
+ {
+ grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
+ srenew(grid->cxy_na, grid->cxy_nalloc);
+ srenew(grid->cxy_ind, grid->cxy_nalloc+1);
+ }
+ for (t = 0; t < nbs->nthread_max; t++)
+ {
+ if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
+ {
+ nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
+ srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
+ }
+ }
+
+ /* Worst case scenario of 1 atom in each last cell */
+ if (grid->na_cj <= grid->na_c)
+ {
+ nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
+ }
+ else
+ {
+ nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
+ }
+
+ if (nc_max > grid->nc_nalloc)
+ {
+ int bb_nalloc;
+
+ grid->nc_nalloc = over_alloc_large(nc_max);
+ srenew(grid->nsubc, grid->nc_nalloc);
+ srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
+#ifdef NBNXN_PBB_SSE
+ bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
+#else
+ bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
+#endif
+ sfree_aligned(grid->bb);
+ /* This snew also zeros the contents, which avoids possible
+ * floating-point exceptions in SSE with the unused bb elements.
+ */
+ snew_aligned(grid->bb, bb_nalloc, 16);
+
+ if (grid->bSimple)
+ {
+ if (grid->na_cj == grid->na_c)
+ {
+ grid->bbj = grid->bb;
+ }
+ else
+ {
+ sfree_aligned(grid->bbj);
+ snew_aligned(grid->bbj, bb_nalloc*grid->na_c/grid->na_cj, 16);
+ }
+ }
+
+ srenew(grid->flags, grid->nc_nalloc);
+ }
+
+ copy_rvec(corner0, grid->c0);
+ copy_rvec(corner1, grid->c1);
+
+ return nc_max;
+}
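A worked instance of the cell sizing above: with na_c = 4 and an atom
density of roughly 100 nm^-3 (about that of water), the target cell length
is tlen = (4/100)^(1/3) ~ 0.34 nm, so a 3 nm box edge yields
ncx = ncy = (int)(3/0.34) = 8 grid columns.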
+
+/* We need to sort particles in grid columns on z-coordinate.
+ * As particles are very often distributed homogeneously, we use a sorting
+ * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ * by a factor, cast to an int and try to store in that hole. If the hole
+ * is full, we move this or another particle. A second pass is needed to
+ * make the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes
+ * to particles. 4 is the optimal value for a homogeneous particle
+ * distribution and allows for an O(#particles) sort down to distributions
+ * where all particles are concentrated in 1/4 of the space. No NlogN
+ * fallback is implemented, as it can be expensive to detect inhomogeneous
+ * particle distributions.
+ * SGSF is the maximum ratio of holes used; in the worst case all particles
+ * end up in the last hole and we need #particles extra holes at the end.
+ */
+#define SORT_GRID_OVERSIZE 4
+#define SGSF (SORT_GRID_OVERSIZE + 1)
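The choice of 4 can be checked as follows: with SORT_GRID_OVERSIZE holes per
expected particle, particles concentrated in a fraction f of the range still
find about SORT_GRID_OVERSIZE*f holes each, so collisions stay cheap as long
as SORT_GRID_OVERSIZE*f >= 1, i.e. down to f = 1/4 as claimed above.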
+
+/* Sort particle index a on coordinates x along dim.
+ * Backwards tells if we want decreasing instead of increasing coordinates.
+ * h0 is the minimum of the coordinate range.
+ * invh is the 1/length of the sorting range.
+ * n_per_h (>=n) is the expected average number of particles per 1/invh
+ * sort is the sorting work array.
+ * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
+ * or easier, allocate at least n*SGSF elements.
+ */
+static void sort_atoms(int dim, gmx_bool Backwards,
+ int *a, int n, rvec *x,
+ real h0, real invh, int n_per_h,
+ int *sort)
+{
+ int nsort, i, c;
+ int zi, zim, zi_min, zi_max;
+ int cp, tmp;
+
+ if (n <= 1)
+ {
+ /* Nothing to do */
+ return;
+ }
+
+#ifndef NDEBUG
+ if (n > n_per_h)
+ {
+ gmx_incons("n > n_per_h");
+ }
+#endif
+
+ /* Transform the inverse range height into the inverse hole height */
+ invh *= n_per_h*SORT_GRID_OVERSIZE;
+
+ /* Set nsort to the maximum possible number of holes used.
+ * In worst case all n elements end up in the last bin.
+ */
+ nsort = n_per_h*SORT_GRID_OVERSIZE + n;
+
+ /* Determine the index range used, so we can limit it for the second pass */
+ zi_min = INT_MAX;
+ zi_max = -1;
+
+ /* Sort the particles using a simple index sort */
+ for (i = 0; i < n; i++)
+ {
+ /* The cast takes care of floating-point rounding effects below zero.
+ * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
+ * times the box height out of the box.
+ */
+ zi = (int)((x[a[i]][dim] - h0)*invh);
+
+#ifndef NDEBUG
+ /* As we can have rounding effects, we use > instead of >= here */
+ if (zi < 0 || zi > n_per_h*SORT_GRID_OVERSIZE)
+ {
+ gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
+ a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
+ n_per_h, SORT_GRID_OVERSIZE);
+ }
+#endif
+
+ /* Ideally this particle should go in sort cell zi,
+ * but that might already be in use,
+ * in that case find the first empty cell higher up
+ */
+ if (sort[zi] < 0)
+ {
+ sort[zi] = a[i];
+ zi_min = min(zi_min, zi);
+ zi_max = max(zi_max, zi);
+ }
+ else
+ {
+ /* We have multiple atoms in the same sorting slot.
+ * Sort on real z for minimal bounding box size.
+ * There is an extra check for identical z to ensure
+ * well-defined output order, independent of input order
+ * to ensure binary reproducibility after restarts.
+ */
+ while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
+ (x[a[i]][dim] == x[sort[zi]][dim] &&
+ a[i] > sort[zi])))
+ {
+ zi++;
+ }
+
+ if (sort[zi] >= 0)
+ {
+ /* Shift all elements by one slot until we find an empty slot */
+ cp = sort[zi];
+ zim = zi + 1;
+ while (sort[zim] >= 0)
+ {
+ tmp = sort[zim];
+ sort[zim] = cp;
+ cp = tmp;
+ zim++;
+ }
+ sort[zim] = cp;
+ zi_max = max(zi_max, zim);
+ }
+ sort[zi] = a[i];
+ zi_max = max(zi_max, zi);
+ }
+ }
+
+ c = 0;
+ if (!Backwards)
+ {
+ for (zi = 0; zi < nsort; zi++)
+ {
+ if (sort[zi] >= 0)
+ {
+ a[c++] = sort[zi];
+ sort[zi] = -1;
+ }
+ }
+ }
+ else
+ {
+ for (zi = zi_max; zi >= zi_min; zi--)
+ {
+ if (sort[zi] >= 0)
+ {
+ a[c++] = sort[zi];
+ sort[zi] = -1;
+ }
+ }
+ }
+ if (c < n)
+ {
+ gmx_incons("Lost particles while sorting");
+ }
+}
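A minimal usage sketch for sort_atoms (the wrapper and its framing are
illustrative, not patch content): sorting one grid column on increasing z
with the simple n*SGSF buffer rule from the comment above. The work array
must start with every slot empty (-1); the drain loops in sort_atoms reset
the slots they used, so a persistent buffer only needs this initialization
once.

    static void sort_column_z(int *a, int n, rvec *x, real z0, real height)
    {
        int *sort_work;
        int  i;

        snew(sort_work, n*SGSF);   /* always-sufficient size, see above */
        for (i = 0; i < n*SGSF; i++)
        {
            sort_work[i] = -1;     /* all holes start empty */
        }

        sort_atoms(ZZ, FALSE, a, n, x, z0, 1.0/height, n, sort_work);

        sfree(sort_work);
    }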
+
+#ifdef GMX_DOUBLE
+#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
+#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
+#else
+#define R2F_D(x) (x)
+#define R2F_U(x) (x)
+#endif
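In double precision R2F_D and R2F_U nudge the value by one float epsilon
towards -infinity and +infinity, respectively, before the float conversion,
so the single-precision bounding box is guaranteed to enclose the
double-precision coordinates: R2F_D(xl) <= xl for the lower corner and
R2F_U(xh) >= xh for the upper one. In single precision the conversion is
exact and both macros are the identity.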
+
+/* Coordinate order x,y,z, bb order xyz0 */
+static void calc_bounding_box(int na, int stride, const real *x, float *bb)
+{
+ int i, j;
+ real xl, xh, yl, yh, zl, zh;
+
+ i = 0;
+ xl = x[i+XX];
+ xh = x[i+XX];
+ yl = x[i+YY];
+ yh = x[i+YY];
+ zl = x[i+ZZ];
+ zh = x[i+ZZ];
+ i += stride;
+ for (j = 1; j < na; j++)
+ {
+ xl = min(xl, x[i+XX]);
+ xh = max(xh, x[i+XX]);
+ yl = min(yl, x[i+YY]);
+ yh = max(yh, x[i+YY]);
+ zl = min(zl, x[i+ZZ]);
+ zh = max(zh, x[i+ZZ]);
+ i += stride;
+ }
+ /* Note: possible double to float conversion here */
+ bb[BBL_X] = R2F_D(xl);
+ bb[BBL_Y] = R2F_D(yl);
+ bb[BBL_Z] = R2F_D(zl);
+ bb[BBU_X] = R2F_U(xh);
+ bb[BBU_Y] = R2F_U(yh);
+ bb[BBU_Z] = R2F_U(zh);
+}
+
+/* Packed coordinates, bb order xyz0 */
+static void calc_bounding_box_x_x4(int na, const real *x, float *bb)
+{
+ int j;
+ real xl, xh, yl, yh, zl, zh;
+
+ xl = x[XX*PACK_X4];
+ xh = x[XX*PACK_X4];
+ yl = x[YY*PACK_X4];
+ yh = x[YY*PACK_X4];
+ zl = x[ZZ*PACK_X4];
+ zh = x[ZZ*PACK_X4];
+ for (j = 1; j < na; j++)
+ {
+ xl = min(xl, x[j+XX*PACK_X4]);
+ xh = max(xh, x[j+XX*PACK_X4]);
+ yl = min(yl, x[j+YY*PACK_X4]);
+ yh = max(yh, x[j+YY*PACK_X4]);
+ zl = min(zl, x[j+ZZ*PACK_X4]);
+ zh = max(zh, x[j+ZZ*PACK_X4]);
+ }
+ /* Note: possible double to float conversion here */
+ bb[BBL_X] = R2F_D(xl);
+ bb[BBL_Y] = R2F_D(yl);
+ bb[BBL_Z] = R2F_D(zl);
+ bb[BBU_X] = R2F_U(xh);
+ bb[BBU_Y] = R2F_U(yh);
+ bb[BBU_Z] = R2F_U(zh);
+}
+
+/* Packed coordinates, bb order xyz0 */
+static void calc_bounding_box_x_x8(int na, const real *x, float *bb)
+{
+ int j;
+ real xl, xh, yl, yh, zl, zh;
+
+ xl = x[XX*PACK_X8];
+ xh = x[XX*PACK_X8];
+ yl = x[YY*PACK_X8];
+ yh = x[YY*PACK_X8];
+ zl = x[ZZ*PACK_X8];
+ zh = x[ZZ*PACK_X8];
+ for (j = 1; j < na; j++)
+ {
+ xl = min(xl, x[j+XX*PACK_X8]);
+ xh = max(xh, x[j+XX*PACK_X8]);
+ yl = min(yl, x[j+YY*PACK_X8]);
+ yh = max(yh, x[j+YY*PACK_X8]);
+ zl = min(zl, x[j+ZZ*PACK_X8]);
+ zh = max(zh, x[j+ZZ*PACK_X8]);
+ }
+ /* Note: possible double to float conversion here */
+ bb[BBL_X] = R2F_D(xl);
+ bb[BBL_Y] = R2F_D(yl);
+ bb[BBL_Z] = R2F_D(zl);
+ bb[BBU_X] = R2F_U(xh);
+ bb[BBU_Y] = R2F_U(yh);
+ bb[BBU_Z] = R2F_U(zh);
+}
+
- #ifdef NBNXN_SEARCH_BB_SSE
+/* Packed coordinates, bb order xyz0 */
+static void calc_bounding_box_x_x4_halves(int na, const real *x,
+ float *bb, float *bbj)
+{
++#ifndef NBNXN_SEARCH_BB_SSE
++ int i;
++#endif
++
+ calc_bounding_box_x_x4(min(na, 2), x, bbj);
+
+ if (na > 2)
+ {
+ calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+NNBSBB_B);
+ }
+ else
+ {
+ /* Set the "empty" bounding box to the same as the first one,
+ * so we don't need to treat special cases in the rest of the code.
+ */
++#ifdef NBNXN_SEARCH_BB_SSE
+ _mm_store_ps(bbj+NNBSBB_B, _mm_load_ps(bbj));
+ _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C, _mm_load_ps(bbj+NNBSBB_C));
++#else
++ for (i = 0; i < NNBSBB_B; i++)
++ {
++ bbj[NNBSBB_B + i] = bbj[i];
++ }
++#endif
+ }
+
++#ifdef NBNXN_SEARCH_BB_SSE
+ _mm_store_ps(bb, _mm_min_ps(_mm_load_ps(bbj),
+ _mm_load_ps(bbj+NNBSBB_B)));
+ _mm_store_ps(bb+NNBSBB_C, _mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
+ _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
++#else
++ for (i = 0; i < NNBSBB_C; i++)
++ {
++ bb[ i] = min(bbj[ i], bbj[NNBSBB_B + i]);
++ bb[NNBSBB_C + i] = max(bbj[NNBSBB_C + i], bbj[NNBSBB_B + NNBSBB_C + i]);
++ }
++#endif
+}
+
++#ifdef NBNXN_SEARCH_BB_SSE
++
+/* Coordinate order xyz, bb order xxxxyyyyzzzz */
+static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
+{
+ int i, j;
+ real xl, xh, yl, yh, zl, zh;
+
+ i = 0;
+ xl = x[i+XX];
+ xh = x[i+XX];
+ yl = x[i+YY];
+ yh = x[i+YY];
+ zl = x[i+ZZ];
+ zh = x[i+ZZ];
+ i += stride;
+ for (j = 1; j < na; j++)
+ {
+ xl = min(xl, x[i+XX]);
+ xh = max(xh, x[i+XX]);
+ yl = min(yl, x[i+YY]);
+ yh = max(yh, x[i+YY]);
+ zl = min(zl, x[i+ZZ]);
+ zh = max(zh, x[i+ZZ]);
+ i += stride;
+ }
+ /* Note: possible double to float conversion here */
+ bb[0*STRIDE_PBB] = R2F_D(xl);
+ bb[1*STRIDE_PBB] = R2F_D(yl);
+ bb[2*STRIDE_PBB] = R2F_D(zl);
+ bb[3*STRIDE_PBB] = R2F_U(xh);
+ bb[4*STRIDE_PBB] = R2F_U(yh);
+ bb[5*STRIDE_PBB] = R2F_U(zh);
+}
+
+#endif /* NBNXN_SEARCH_BB_SSE */
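The xxxxyyyyzzzz layout written by calc_bounding_box_xxxx above interleaves
STRIDE_PBB = 4 boxes: four lower-x values, then four lower-y, four lower-z,
followed by the three upper components in the same fashion. One 4-wide load
therefore fetches the same corner component of four sub-cell boxes at once,
which is what the strided stores (bb[0*STRIDE_PBB] etc.) arrange for.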
+
+#ifdef NBNXN_SEARCH_SSE_SINGLE
+
+/* Coordinate order xyz?, bb order xyz0 */
+static void calc_bounding_box_sse(int na, const float *x, float *bb)
+{
+ __m128 bb_0_SSE, bb_1_SSE;
+ __m128 x_SSE;
+
+ int i;
+
+ bb_0_SSE = _mm_load_ps(x);
+ bb_1_SSE = bb_0_SSE;
+
+ for (i = 1; i < na; i++)
+ {
+ x_SSE = _mm_load_ps(x+i*NNBSBB_C);
+ bb_0_SSE = _mm_min_ps(bb_0_SSE, x_SSE);
+ bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE);
+ }
+
+ _mm_store_ps(bb, bb_0_SSE);
+ _mm_store_ps(bb+4, bb_1_SSE);
+}
+
+/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
+static void calc_bounding_box_xxxx_sse(int na, const float *x,
+ float *bb_work,
+ real *bb)
+{
+ calc_bounding_box_sse(na, x, bb_work);
+
+ bb[0*STRIDE_PBB] = bb_work[BBL_X];
+ bb[1*STRIDE_PBB] = bb_work[BBL_Y];
+ bb[2*STRIDE_PBB] = bb_work[BBL_Z];
+ bb[3*STRIDE_PBB] = bb_work[BBU_X];
+ bb[4*STRIDE_PBB] = bb_work[BBU_Y];
+ bb[5*STRIDE_PBB] = bb_work[BBU_Z];
+}
+
+#endif /* NBNXN_SEARCH_SSE_SINGLE */
+
- __m128 min_SSE, max_SSE;
+
+/* Combines pairs of consecutive bounding boxes */
+static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const float *bb)
+{
+ int i, j, sc2, nc2, c2;
- #endif
-
+
+ for (i = 0; i < grid->ncx*grid->ncy; i++)
+ {
+ /* Starting bb in a column is expected to be 2-aligned */
+ sc2 = grid->cxy_ind[i]>>1;
+ /* For odd numbers skip the last bb here */
+ nc2 = (grid->cxy_na[i]+3)>>(2+1);
+ for (c2 = sc2; c2 < sc2+nc2; c2++)
+ {
++#ifdef NBNXN_SEARCH_BB_SSE
++ __m128 min_SSE, max_SSE;
++
+ min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
+ _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
+ max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
+ _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
+ _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C, min_SSE);
+ _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C, max_SSE);
++#else
++ for (j = 0; j < NNBSBB_C; j++)
++ {
++ grid->bbj[(c2*2+0)*NNBSBB_C+j] = min(bb[(c2*4+0)*NNBSBB_C+j],
++ bb[(c2*4+2)*NNBSBB_C+j]);
++ grid->bbj[(c2*2+1)*NNBSBB_C+j] = max(bb[(c2*4+1)*NNBSBB_C+j],
++ bb[(c2*4+3)*NNBSBB_C+j]);
++ }
++#endif
+ }
+ if (((grid->cxy_na[i]+3)>>2) & 1)
+ {
+ /* Copy the last bb for odd bb count in this column */
+ for (j = 0; j < NNBSBB_C; j++)
+ {
+ grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
+ grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
+ }
+ }
+ }
+}
+
- #if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
+
+/* Prints the average bb size, used for debug output */
+static void print_bbsizes_simple(FILE *fp,
+ const nbnxn_search_t nbs,
+ const nbnxn_grid_t *grid)
+{
+ int c, d;
+ dvec ba;
+
+ clear_dvec(ba);
+ for (c = 0; c < grid->nc; c++)
+ {
+ for (d = 0; d < DIM; d++)
+ {
+ ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
+ }
+ }
+ dsvmul(1.0/grid->nc, ba, ba);
+
+ fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
+ nbs->box[XX][XX]/grid->ncx,
+ nbs->box[YY][YY]/grid->ncy,
+ nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
+ ba[XX], ba[YY], ba[ZZ],
+ ba[XX]*grid->ncx/nbs->box[XX][XX],
+ ba[YY]*grid->ncy/nbs->box[YY][YY],
+ ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
+}
+
+/* Prints the average bb size, used for debug output */
+static void print_bbsizes_supersub(FILE *fp,
+ const nbnxn_search_t nbs,
+ const nbnxn_grid_t *grid)
+{
+ int ns, c, s;
+ dvec ba;
+
+ clear_dvec(ba);
+ ns = 0;
+ for (c = 0; c < grid->nc; c++)
+ {
+#ifdef NBNXN_BBXXXX
+ for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
+ {
+ int cs_w, i, d;
+
+ cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+ for (i = 0; i < STRIDE_PBB; i++)
+ {
+ for (d = 0; d < DIM; d++)
+ {
+ ba[d] +=
+ grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+ grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
+ }
+ }
+ }
+#else
+ for (s = 0; s < grid->nsubc[c]; s++)
+ {
+ int cs, d;
+
+ cs = c*GPU_NSUBCELL + s;
+ for (d = 0; d < DIM; d++)
+ {
+ ba[d] +=
+ grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
+ grid->bb[cs*NNBSBB_B +d];
+ }
+ }
+#endif
+ ns += grid->nsubc[c];
+ }
+ dsvmul(1.0/ns, ba, ba);
+
+ fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
+ nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
+ nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
+ nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
+ ba[XX], ba[YY], ba[ZZ],
+ ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
+ ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
+ ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
+}
+
+/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
+ * Also sets interaction flags.
+ */
+void sort_on_lj(int na_c,
+ int a0, int a1, const int *atinfo,
+ int *order,
+ int *flags)
+{
+ int subc, s, a, n1, n2, a_lj_max, i, j;
+ int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
+ int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
+ gmx_bool haveQ;
+
+ *flags = 0;
+
+ subc = 0;
+ for (s = a0; s < a1; s += na_c)
+ {
+ /* Make lists for this (sub-)cell on atoms with and without LJ */
+ n1 = 0;
+ n2 = 0;
+ haveQ = FALSE;
+ a_lj_max = -1;
+ for (a = s; a < min(s+na_c, a1); a++)
+ {
+ haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
+
+ if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
+ {
+ sort1[n1++] = order[a];
+ a_lj_max = a;
+ }
+ else
+ {
+ sort2[n2++] = order[a];
+ }
+ }
+
+ /* If we don't have atoms with LJ, there's nothing to sort */
+ if (n1 > 0)
+ {
+ *flags |= NBNXN_CI_DO_LJ(subc);
+
+ if (2*n1 <= na_c)
+ {
+ /* Only sort when strictly necessary. Ordering particles
+ * can lead to less accurate summation due to rounding,
+ * both for LJ and Coulomb interactions.
+ */
+ if (2*(a_lj_max - s) >= na_c)
+ {
+ for (i = 0; i < n1; i++)
+ {
+ order[a0+i] = sort1[i];
+ }
+ for (j = 0; j < n2; j++)
+ {
+ order[a0+n1+j] = sort2[j];
+ }
+ }
+
+ *flags |= NBNXN_CI_HALF_LJ(subc);
+ }
+ }
+ if (haveQ)
+ {
+ *flags |= NBNXN_CI_DO_COUL(subc);
+ }
+ subc++;
+ }
+}
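+
+/* Illustration of the flags set above (a sketch, assuming an 8-atom
+ * cluster): with 3 atoms having LJ parameters, 2*3 <= 8 holds, so
+ * NBNXN_CI_HALF_LJ(subc) is set and, when needed, the LJ atoms are
+ * sorted to the front; kernels can then skip the LJ interactions for
+ * the second half of the i-cluster.
+ */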
+
+/* Fill a pair search cell with atoms.
+ * Potentially sorts atoms and sets the interaction flags.
+ */
+void fill_cell(const nbnxn_search_t nbs,
+ nbnxn_grid_t *grid,
+ nbnxn_atomdata_t *nbat,
+ int a0, int a1,
+ const int *atinfo,
+ rvec *x,
+ int sx, int sy, int sz,
+ float *bb_work)
+{
+ int na, a;
+ size_t offset;
+ float *bb_ptr;
+
+ na = a1 - a0;
+
+ if (grid->bSimple)
+ {
+ sort_on_lj(grid->na_c, a0, a1, atinfo, nbs->a,
+ grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
+ }
+
+ /* Now we have sorted the atoms, set the cell indices */
+ for (a = a0; a < a1; a++)
+ {
+ nbs->cell[nbs->a[a]] = a;
+ }
+
+ copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
+ nbat->XFormat, nbat->x, a0,
+ sx, sy, sz);
+
+ if (nbat->XFormat == nbatX4)
+ {
+ /* Store the bounding boxes as xyz.xyz. */
+ offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ bb_ptr = grid->bb + offset;
+
- #ifdef NBNXN_SEARCH_BB_SSE
++#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
+ if (2*grid->na_cj == grid->na_c)
+ {
+ calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
+ grid->bbj+offset*2);
+ }
+ else
+#endif
+ {
+ calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
+ }
+ }
+ else if (nbat->XFormat == nbatX8)
+ {
+ /* Store the bounding boxes as xyz.xyz. */
+ offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ bb_ptr = grid->bb + offset;
+
+ calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
+ }
+#ifdef NBNXN_BBXXXX
+ else if (!grid->bSimple)
+ {
+ /* Store the bounding boxes in a format convenient
+ * for SSE calculations: xxxxyyyyzzzz...
+ */
+ bb_ptr =
+ grid->bb +
+ ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+ (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
+
+#ifdef NBNXN_SEARCH_SSE_SINGLE
+ if (nbat->XFormat == nbatXYZQ)
+ {
+ calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
+ bb_work, bb_ptr);
+ }
+ else
+#endif
+ {
+ calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
+ bb_ptr);
+ }
+ if (gmx_debug_at)
+ {
+ fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
+ sx, sy, sz,
+ bb_ptr[0*STRIDE_PBB], bb_ptr[3*STRIDE_PBB],
+ bb_ptr[1*STRIDE_PBB], bb_ptr[4*STRIDE_PBB],
+ bb_ptr[2*STRIDE_PBB], bb_ptr[5*STRIDE_PBB]);
+ }
+ }
+#endif
+ else
+ {
+ /* Store the bounding boxes as xyz.xyz. */
+ bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+
+ calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
+ bb_ptr);
+
+ if (gmx_debug_at)
+ {
+ int bbo;
+ bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
+ fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
+ sx, sy, sz,
+ (grid->bb+bbo*NNBSBB_B)[BBL_X],
+ (grid->bb+bbo*NNBSBB_B)[BBU_X],
+ (grid->bb+bbo*NNBSBB_B)[BBL_Y],
+ (grid->bb+bbo*NNBSBB_B)[BBU_Y],
+ (grid->bb+bbo*NNBSBB_B)[BBL_Z],
+ (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
+ }
+ }
+}
+
+/* Spatially sort the atoms within one grid column */
+static void sort_columns_simple(const nbnxn_search_t nbs,
+ int dd_zone,
+ nbnxn_grid_t *grid,
+ int a0, int a1,
+ const int *atinfo,
+ rvec *x,
+ nbnxn_atomdata_t *nbat,
+ int cxy_start, int cxy_end,
+ int *sort_work)
+{
+ int cxy;
+ int cx, cy, cz, ncz, cfilled, c;
+ int na, ash, ind, a;
+ int na_c, ash_c;
+
+ if (debug)
+ {
+ fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
+ grid->cell0, cxy_start, cxy_end, a0, a1);
+ }
+
+ /* Sort the atoms within each x,y column in 3 dimensions */
+ for (cxy = cxy_start; cxy < cxy_end; cxy++)
+ {
+ cx = cxy/grid->ncy;
+ cy = cxy - cx*grid->ncy;
+
+ na = grid->cxy_na[cxy];
+ ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
+ ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+
+ /* Sort the atoms within each x,y column on z coordinate */
+ sort_atoms(ZZ, FALSE,
+ nbs->a+ash, na, x,
+ grid->c0[ZZ],
+ 1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
+ sort_work);
+
+ /* Fill the ncz cells in this column */
+ cfilled = grid->cxy_ind[cxy];
+ for (cz = 0; cz < ncz; cz++)
+ {
+ c = grid->cxy_ind[cxy] + cz;
+
+ ash_c = ash + cz*grid->na_sc;
+ na_c = min(grid->na_sc, na-(ash_c-ash));
+
+ fill_cell(nbs, grid, nbat,
+ ash_c, ash_c+na_c, atinfo, x,
+ grid->na_sc*cx + (dd_zone >> 2),
+ grid->na_sc*cy + (dd_zone & 3),
+ grid->na_sc*cz,
+ NULL);
+
+ /* This copy to bbcz is not really necessary.
+ * But it allows us to use the same grid search code
+ * for the simple and supersub cell setups.
+ */
+ if (na_c > 0)
+ {
+ cfilled = c;
+ }
+ grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled*NNBSBB_B+2];
+ grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
+ }
+
+ /* Set the unused atom indices to -1 */
+ for (ind = na; ind < ncz*grid->na_sc; ind++)
+ {
+ nbs->a[ash+ind] = -1;
+ }
+ }
+}
+
+/* Spatially sort the atoms within one grid column */
+static void sort_columns_supersub(const nbnxn_search_t nbs,
+ int dd_zone,
+ nbnxn_grid_t *grid,
+ int a0, int a1,
+ const int *atinfo,
+ rvec *x,
+ nbnxn_atomdata_t *nbat,
+ int cxy_start, int cxy_end,
+ int *sort_work)
+{
+ int cxy;
+ int cx, cy, cz = -1, c = -1, ncz;
+ int na, ash, na_c, ind, a;
+ int subdiv_z, sub_z, na_z, ash_z;
+ int subdiv_y, sub_y, na_y, ash_y;
+ int subdiv_x, sub_x, na_x, ash_x;
+
+ /* cppcheck-suppress unassignedVariable */
+ float bb_work_array[NNBSBB_B+3], *bb_work_align;
+
+ bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
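+ /* The masking above rounds the pointer down to a 16-byte boundary;
+ * starting from bb_work_array+3 guarantees that the result still
+ * points into bb_work_array and leaves room for NNBSBB_B aligned
+ * floats.
+ */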
+
+ if (debug)
+ {
+ fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
+ grid->cell0, cxy_start, cxy_end, a0, a1);
+ }
+
+ subdiv_x = grid->na_c;
+ subdiv_y = GPU_NSUBCELL_X*subdiv_x;
+ subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
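+ /* E.g. with GPU_NSUBCELL_X = GPU_NSUBCELL_Y = 2 and na_c = 8:
+ * subdiv_x = 8, subdiv_y = 16, subdiv_z = 32, so each z-slab of
+ * 32 atoms is split into 2 y-slabs of 16 atoms, each of which is
+ * split into 2 x-subcells of 8 atoms.
+ */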
+
+ /* Sort the atoms within each x,y column in 3 dimensions */
+ for (cxy = cxy_start; cxy < cxy_end; cxy++)
+ {
+ cx = cxy/grid->ncy;
+ cy = cxy - cx*grid->ncy;
+
+ na = grid->cxy_na[cxy];
+ ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
+ ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
+
+ /* Sort the atoms within each x,y column on z coordinate */
+ sort_atoms(ZZ, FALSE,
+ nbs->a+ash, na, x,
+ grid->c0[ZZ],
+ 1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
+ sort_work);
+
+ /* This loop goes over the supercells and subcells along z at once */
+ for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
+ {
+ ash_z = ash + sub_z*subdiv_z;
+ na_z = min(subdiv_z, na-(ash_z-ash));
+
+ /* We have already sorted on z */
+
+ if (sub_z % GPU_NSUBCELL_Z == 0)
+ {
+ cz = sub_z/GPU_NSUBCELL_Z;
+ c = grid->cxy_ind[cxy] + cz;
+
+ /* The number of atoms in this supercell */
+ na_c = min(grid->na_sc, na-(ash_z-ash));
+
+ grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
+
+ /* Store the z-boundaries of the super cell */
+ grid->bbcz[c*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
+ grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
+ }
+
+#if GPU_NSUBCELL_Y > 1
+ /* Sort the atoms along y */
+ sort_atoms(YY, (sub_z & 1),
+ nbs->a+ash_z, na_z, x,
+ grid->c0[YY]+cy*grid->sy,
+ grid->inv_sy, subdiv_z,
+ sort_work);
+#endif
+
+ for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
+ {
+ ash_y = ash_z + sub_y*subdiv_y;
+ na_y = min(subdiv_y, na-(ash_y-ash));
+
+#if GPU_NSUBCELL_X > 1
+ /* Sort the atoms along x */
+ sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
+ nbs->a+ash_y, na_y, x,
+ grid->c0[XX]+cx*grid->sx,
+ grid->inv_sx, subdiv_y,
+ sort_work);
+#endif
+
+ for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
+ {
+ ash_x = ash_y + sub_x*subdiv_x;
+ na_x = min(subdiv_x, na-(ash_x-ash));
+
+ fill_cell(nbs, grid, nbat,
+ ash_x, ash_x+na_x, atinfo, x,
+ grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
+ grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
+ grid->na_c*sub_z,
+ bb_work_align);
+ }
+ }
+ }
+
+ /* Set the unused atom indices to -1 */
+ for (ind = na; ind < ncz*grid->na_sc; ind++)
+ {
+ nbs->a[ash+ind] = -1;
+ }
+ }
+}
+
+/* Determine in which grid column atoms should go */
+static void calc_column_indices(nbnxn_grid_t *grid,
+ int a0, int a1,
+ rvec *x,
+ int dd_zone, const int *move,
+ int thread, int nthread,
+ int *cell,
+ int *cxy_na)
+{
+ int n0, n1, i;
+ int cx, cy;
+
+ /* We add one extra cell for particles which moved during DD */
+ for (i = 0; i < grid->ncx*grid->ncy+1; i++)
+ {
+ cxy_na[i] = 0;
+ }
+
+ n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
+ n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
+ if (dd_zone == 0)
+ {
+ /* Home zone */
+ for (i = n0; i < n1; i++)
+ {
+ if (move == NULL || move[i] >= 0)
+ {
+ /* We need to be careful with rounding,
+ * particles might be a few bits outside the local zone.
+ * The int cast takes care of the lower bound,
+ * we will explicitly take care of the upper bound.
+ */
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+#ifndef NDEBUG
+ if (cx < 0 || cx > grid->ncx ||
+ cy < 0 || cy > grid->ncy)
+ {
+ gmx_fatal(FARGS,
+ "grid cell cx %d cy %d out of range (max %d %d)\n"
+ "atom %f %f %f, grid->c0 %f %f",
+ cx, cy, grid->ncx, grid->ncy,
+ x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
+ }
+#endif
+ /* Take care of potential rounding issues */
+ cx = min(cx, grid->ncx - 1);
+ cy = min(cy, grid->ncy - 1);
+
+ /* For the moment cell contains only the grid-local
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+ }
+ else
+ {
+ /* Put this moved particle after the end of the grid,
+ * so we can process it later without using conditionals.
+ */
+ cell[i] = grid->ncx*grid->ncy;
+ }
+
+ cxy_na[cell[i]]++;
+ }
+ }
+ else
+ {
+ /* Non-home zone */
+ for (i = n0; i < n1; i++)
+ {
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+ /* For non-home zones there could be particles outside
+ * the non-bonded cut-off range, which have been communicated
+ * for bonded interactions only. For the result it doesn't
+ * matter where these end up on the grid. For performance
+ * we put them in an extra row at the border.
+ */
+ cx = max(cx, 0);
+ cx = min(cx, grid->ncx - 1);
+ cy = max(cy, 0);
+ cy = min(cy, grid->ncy - 1);
+
+ /* For the moment cell contains only the grid-local
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+
+ cxy_na[cell[i]]++;
+ }
+ }
+}
+
+/* Determine in which grid cells the atoms should go */
+static void calc_cell_indices(const nbnxn_search_t nbs,
+ int dd_zone,
+ nbnxn_grid_t *grid,
+ int a0, int a1,
+ const int *atinfo,
+ rvec *x,
+ const int *move,
+ nbnxn_atomdata_t *nbat)
+{
+ int n0, n1, i;
+ int cx, cy, cxy, ncz_max, ncz;
+ int nthread, thread;
+ int *cxy_na, cxy_na_i;
+
+ nthread = gmx_omp_nthreads_get(emntPairsearch);
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for (thread = 0; thread < nthread; thread++)
+ {
+ calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
+ nbs->cell, nbs->work[thread].cxy_na);
+ }
+
+ /* Make the cell index as a function of x and y */
+ ncz_max = 0;
+ ncz = 0;
+ grid->cxy_ind[0] = 0;
+ for (i = 0; i < grid->ncx*grid->ncy+1; i++)
+ {
+ /* We set ncz_max at the beginning of the loop instead of at the end
+ * to skip i=grid->ncx*grid->ncy, which holds the moved particles
+ * that do not need to be ordered on the grid.
+ */
+ if (ncz > ncz_max)
+ {
+ ncz_max = ncz;
+ }
+ cxy_na_i = nbs->work[0].cxy_na[i];
+ for (thread = 1; thread < nthread; thread++)
+ {
+ cxy_na_i += nbs->work[thread].cxy_na[i];
+ }
+ ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
+ if (nbat->XFormat == nbatX8)
+ {
+ /* Make the number of cells a multiple of 2 */
+ ncz = (ncz + 1) & ~1;
+ }
+ grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
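+ /* cxy_ind[] thus becomes a prefix sum: the first cell index of
+ * each x,y column, with cxy_ind[ncx*ncy] the total cell count.
+ */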
+ /* Clear cxy_na, so we can reuse the array below */
+ grid->cxy_na[i] = 0;
+ }
+ grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
+
+ nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
+
+ if (debug)
+ {
+ fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
+ grid->na_sc, grid->na_c, grid->nc,
+ grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
+ ncz_max);
+ if (gmx_debug_at)
+ {
+ i = 0;
+ for (cy = 0; cy < grid->ncy; cy++)
+ {
+ for (cx = 0; cx < grid->ncx; cx++)
+ {
+ fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
+ i++;
+ }
+ fprintf(debug, "\n");
+ }
+ }
+ }
+
+ /* Make sure the work array for sorting is large enough */
+ if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
+ {
+ for (thread = 0; thread < nbs->nthread_max; thread++)
+ {
+ nbs->work[thread].sort_work_nalloc =
+ over_alloc_large(ncz_max*grid->na_sc*SGSF);
+ srenew(nbs->work[thread].sort_work,
+ nbs->work[thread].sort_work_nalloc);
+ /* When not in use, all elements should be -1 */
+ for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
+ {
+ nbs->work[thread].sort_work[i] = -1;
+ }
+ }
+ }
+
+ /* Now that we know the dimensions, we can fill the grid.
+ * This is the first, unsorted fill. We sort the columns after this.
+ */
+ for (i = a0; i < a1; i++)
+ {
+ /* At this point nbs->cell contains the local grid x,y indices */
+ cxy = nbs->cell[i];
+ nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
+ }
+
+ if (dd_zone == 0)
+ {
+ /* Set the cell indices for the moved particles */
+ n0 = grid->nc*grid->na_sc;
+ n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+ for (i = n0; i < n1; i++)
+ {
+ nbs->cell[nbs->a[i]] = i;
+ }
+ }
+
+ /* Sort the super-cell columns along z into the sub-cells. */
+#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
+ for (thread = 0; thread < nbs->nthread_max; thread++)
+ {
+ if (grid->bSimple)
+ {
+ sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
+ ((thread+0)*grid->ncx*grid->ncy)/nthread,
+ ((thread+1)*grid->ncx*grid->ncy)/nthread,
+ nbs->work[thread].sort_work);
+ }
+ else
+ {
+ sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
+ ((thread+0)*grid->ncx*grid->ncy)/nthread,
+ ((thread+1)*grid->ncx*grid->ncy)/nthread,
+ nbs->work[thread].sort_work);
+ }
+ }
+
- #endif
+ if (grid->bSimple && nbat->XFormat == nbatX8)
+ {
+ combine_bounding_box_pairs(grid, grid->bb);
+ }
- #ifdef NBNXN_SEARCH_BB_SSE
+
+ if (!grid->bSimple)
+ {
+ grid->nsubc_tot = 0;
+ for (i = 0; i < grid->nc; i++)
+ {
+ grid->nsubc_tot += grid->nsubc[i];
+ }
+ }
+
+ if (debug)
+ {
+ if (grid->bSimple)
+ {
+ print_bbsizes_simple(debug, nbs, grid);
+ }
+ else
+ {
+ fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
+ grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
+
+ print_bbsizes_supersub(debug, nbs, grid);
+ }
+ }
+}
+
+static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
+ int natoms)
+{
+ int b;
+
+ flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
+ if (flags->nflag > flags->flag_nalloc)
+ {
+ flags->flag_nalloc = over_alloc_large(flags->nflag);
+ srenew(flags->flag, flags->flag_nalloc);
+ }
+ for (b = 0; b < flags->nflag; b++)
+ {
+ flags->flag[b] = 0;
+ }
+}
+
+/* Sets up a grid and puts the atoms on the grid.
+ * This function only operates on one domain of the domain decomposition.
+ * Note that without domain decomposition there is only one domain.
+ */
+void nbnxn_put_on_grid(nbnxn_search_t nbs,
+ int ePBC, matrix box,
+ int dd_zone,
+ rvec corner0, rvec corner1,
+ int a0, int a1,
+ real atom_density,
+ const int *atinfo,
+ rvec *x,
+ int nmoved, int *move,
+ int nb_kernel_type,
+ nbnxn_atomdata_t *nbat)
+{
+ nbnxn_grid_t *grid;
+ int n;
+ int nc_max_grid, nc_max;
+
+ grid = &nbs->grid[dd_zone];
+
+ nbs_cycle_start(&nbs->cc[enbsCCgrid]);
+
+ grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
+
+ grid->na_c = nbnxn_kernel_to_ci_size(nb_kernel_type);
+ grid->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
+ grid->na_sc = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
+ grid->na_c_2log = get_2log(grid->na_c);
+
+ nbat->na_c = grid->na_c;
+
+ if (dd_zone == 0)
+ {
+ grid->cell0 = 0;
+ }
+ else
+ {
+ grid->cell0 =
+ (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
+ nbs->grid[dd_zone-1].na_sc/grid->na_sc;
+ }
+
+ n = a1 - a0;
+
+ if (dd_zone == 0)
+ {
+ nbs->ePBC = ePBC;
+ copy_mat(box, nbs->box);
+
+ if (atom_density >= 0)
+ {
+ grid->atom_density = atom_density;
+ }
+ else
+ {
+ grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
+ }
+
+ grid->cell0 = 0;
+
+ nbs->natoms_local = a1 - nmoved;
+ /* We assume that nbnxn_put_on_grid is called first
+ * for the local atoms (dd_zone=0).
+ */
+ nbs->natoms_nonlocal = a1 - nmoved;
+ }
+ else
+ {
+ nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);
+ }
+
+ nc_max_grid = set_grid_size_xy(nbs, grid,
+ dd_zone, n-nmoved, corner0, corner1,
+ nbs->grid[0].atom_density);
+
+ nc_max = grid->cell0 + nc_max_grid;
+
+ if (a1 > nbs->cell_nalloc)
+ {
+ nbs->cell_nalloc = over_alloc_large(a1);
+ srenew(nbs->cell, nbs->cell_nalloc);
+ }
+
+ /* To avoid conditionals we store the moved particles at the end of a;
+ * make sure we have enough space.
+ */
+ if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
+ {
+ nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
+ srenew(nbs->a, nbs->a_nalloc);
+ }
+
+ /* We need padding up to a multiple of the buffer flag size: simply add */
+ if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
+ {
+ nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
+ }
+
+ calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
+
+ if (dd_zone == 0)
+ {
+ nbat->natoms_local = nbat->natoms;
+ }
+
+ nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
+}
+
+/* Calls nbnxn_put_on_grid for all non-local domains */
+void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
+ const gmx_domdec_zones_t *zones,
+ const int *atinfo,
+ rvec *x,
+ int nb_kernel_type,
+ nbnxn_atomdata_t *nbat)
+{
+ int zone, d;
+ rvec c0, c1;
+
+ for (zone = 1; zone < zones->n; zone++)
+ {
+ for (d = 0; d < DIM; d++)
+ {
+ c0[d] = zones->size[zone].bb_x0[d];
+ c1[d] = zones->size[zone].bb_x1[d];
+ }
+
+ nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
+ zone, c0, c1,
+ zones->cg_range[zone],
+ zones->cg_range[zone+1],
+ -1,
+ atinfo,
+ x,
+ 0, NULL,
+ nb_kernel_type,
+ nbat);
+ }
+}
+
+/* Add simple grid type information to the local super/sub grid */
+void nbnxn_grid_add_simple(nbnxn_search_t nbs,
+ nbnxn_atomdata_t *nbat)
+{
+ nbnxn_grid_t *grid;
+ float *bbcz, *bb;
+ int ncd, sc;
+
+ grid = &nbs->grid[0];
+
+ if (grid->bSimple)
+ {
+ gmx_incons("nbnxn_grid_simple called with a simple grid");
+ }
+
+ ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
+
+ if (grid->nc*ncd > grid->nc_nalloc_simple)
+ {
+ grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
+ srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
+ srenew(grid->bb_simple, grid->nc_nalloc_simple*NNBSBB_B);
+ srenew(grid->flags_simple, grid->nc_nalloc_simple);
+ if (nbat->XFormat)
+ {
+ sfree_aligned(grid->bbj);
+ snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
+ }
+ }
+
+ bbcz = grid->bbcz_simple;
+ bb = grid->bb_simple;
+
+#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ for (sc = 0; sc < grid->nc; sc++)
+ {
+ int c, tx, na;
+
+ for (c = 0; c < ncd; c++)
+ {
+ tx = sc*ncd + c;
+
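+ /* Count the real atoms; trailing filler particles in the cell
+ * are assigned the last atom type, nbat->ntype-1.
+ */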
+ na = NBNXN_CPU_CLUSTER_I_SIZE;
+ while (na > 0 &&
+ nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
+ {
+ na--;
+ }
+
+ if (na > 0)
+ {
+ switch (nbat->XFormat)
+ {
+ case nbatX4:
+ /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
+ calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
+ bb+tx*NNBSBB_B);
+ break;
+ case nbatX8:
+ /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
+ calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
+ bb+tx*NNBSBB_B);
+ break;
+ default:
+ calc_bounding_box(na, nbat->xstride,
+ nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
+ bb+tx*NNBSBB_B);
+ break;
+ }
+ bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B +ZZ];
+ bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
+
+ /* No interaction optimization yet here */
+ grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
+ }
+ else
+ {
+ grid->flags_simple[tx] = 0;
+ }
+ }
+ }
+
- #endif
+ if (grid->bSimple && nbat->XFormat == nbatX8)
+ {
+ combine_bounding_box_pairs(grid, grid->bb_simple);
+ }
- excl->pair[t] = NBNXN_INT_MASK_ALL;
+}
+
+void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
+{
+ *ncx = nbs->grid[0].ncx;
+ *ncy = nbs->grid[0].ncy;
+}
+
+void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
+{
+ const nbnxn_grid_t *grid;
+
+ grid = &nbs->grid[0];
+
+ /* Return the atom order for the home cell (index 0) */
+ *a = nbs->a;
+
+ *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
+}
+
+void nbnxn_set_atomorder(nbnxn_search_t nbs)
+{
+ nbnxn_grid_t *grid;
+ int ao, cx, cy, cxy, cz, j;
+
+ /* Set the atom order for the home cell (index 0) */
+ grid = &nbs->grid[0];
+
+ ao = 0;
+ for (cx = 0; cx < grid->ncx; cx++)
+ {
+ for (cy = 0; cy < grid->ncy; cy++)
+ {
+ cxy = cx*grid->ncy + cy;
+ j = grid->cxy_ind[cxy]*grid->na_sc;
+ for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
+ {
+ nbs->a[j] = ao;
+ nbs->cell[ao] = j;
+ ao++;
+ j++;
+ }
+ }
+ }
+}
+
+/* Determines the cell range along one dimension that
+ * the bounding box b0 - b1 sees.
+ */
+static void get_cell_range(real b0, real b1,
+ int nc, real c0, real s, real invs,
+ real d2, real r2, int *cf, int *cl)
+{
+ *cf = max((int)((b0 - c0)*invs), 0);
+
+ while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
+ {
+ (*cf)--;
+ }
+
+ *cl = min((int)((b1 - c0)*invs), nc-1);
+ while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
+ {
+ (*cl)++;
+ }
+}
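+
+/* Example (illustrative numbers): with c0 = 0, s = 0.5, invs = 2 and
+ * a bb spanning b0 = 1.2 to b1 = 1.7, the initial range is cf = 2,
+ * cl = 3; the while loops then widen the range as long as the squared
+ * corner distance d2 + sqr(...) stays below the cut-off r2.
+ */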
+
+/* Reference code calculating the distance^2 between two bounding boxes */
+static float box_dist2(float bx0, float bx1, float by0,
+ float by1, float bz0, float bz1,
+ const float *bb)
+{
+ float d2;
+ float dl, dh, dm, dm0;
+
+ d2 = 0;
+
+ dl = bx0 - bb[BBU_X];
+ dh = bb[BBL_X] - bx1;
+ dm = max(dl, dh);
+ dm0 = max(dm, 0);
+ d2 += dm0*dm0;
+
+ dl = by0 - bb[BBU_Y];
+ dh = bb[BBL_Y] - by1;
+ dm = max(dl, dh);
+ dm0 = max(dm, 0);
+ d2 += dm0*dm0;
+
+ dl = bz0 - bb[BBU_Z];
+ dh = bb[BBL_Z] - bz1;
+ dm = max(dl, dh);
+ dm0 = max(dm, 0);
+ d2 += dm0*dm0;
+
+ return d2;
+}
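+
+/* Per dimension the code above computes max(lo_i - hi_j, lo_j - hi_i, 0),
+ * i.e. the gap between the two intervals (zero when they overlap),
+ * and sums the squared gaps into d2.
+ */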
+
+/* Plain C code calculating the distance^2 between two bounding boxes */
+static float subc_bb_dist2(int si, const float *bb_i_ci,
+ int csj, const float *bb_j_all)
+{
+ const float *bb_i, *bb_j;
+ float d2;
+ float dl, dh, dm, dm0;
+
+ bb_i = bb_i_ci + si*NNBSBB_B;
+ bb_j = bb_j_all + csj*NNBSBB_B;
+
+ d2 = 0;
+
+ dl = bb_i[BBL_X] - bb_j[BBU_X];
+ dh = bb_j[BBL_X] - bb_i[BBU_X];
+ dm = max(dl, dh);
+ dm0 = max(dm, 0);
+ d2 += dm0*dm0;
+
+ dl = bb_i[BBL_Y] - bb_j[BBU_Y];
+ dh = bb_j[BBL_Y] - bb_i[BBU_Y];
+ dm = max(dl, dh);
+ dm0 = max(dm, 0);
+ d2 += dm0*dm0;
+
+ dl = bb_i[BBL_Z] - bb_j[BBU_Z];
+ dh = bb_j[BBL_Z] - bb_i[BBU_Z];
+ dm = max(dl, dh);
+ dm0 = max(dm, 0);
+ d2 += dm0*dm0;
+
+ return d2;
+}
+
+#ifdef NBNXN_SEARCH_BB_SSE
+
+/* SSE code for bb distance for bb format xyz0 */
+static float subc_bb_dist2_sse(int si, const float *bb_i_ci,
+ int csj, const float *bb_j_all)
+{
+ const float *bb_i, *bb_j;
+
+ __m128 bb_i_SSE0, bb_i_SSE1;
+ __m128 bb_j_SSE0, bb_j_SSE1;
+ __m128 dl_SSE;
+ __m128 dh_SSE;
+ __m128 dm_SSE;
+ __m128 dm0_SSE;
+ __m128 d2_SSE;
+#ifndef GMX_X86_SSE4_1
+ float d2_array[7], *d2_align;
+
+ d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
+#else
+ float d2;
+#endif
+
+ bb_i = bb_i_ci + si*NNBSBB_B;
+ bb_j = bb_j_all + csj*NNBSBB_B;
+
+ bb_i_SSE0 = _mm_load_ps(bb_i);
+ bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
+ bb_j_SSE0 = _mm_load_ps(bb_j);
+ bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
+
+ dl_SSE = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
+ dh_SSE = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
+
+ dm_SSE = _mm_max_ps(dl_SSE, dh_SSE);
+ dm0_SSE = _mm_max_ps(dm_SSE, _mm_setzero_ps());
+#ifndef GMX_X86_SSE4_1
+ d2_SSE = _mm_mul_ps(dm0_SSE, dm0_SSE);
+
+ _mm_store_ps(d2_align, d2_SSE);
+
+ return d2_align[0] + d2_align[1] + d2_align[2];
+#else
+ /* SSE4.1 dot product of components 0,1,2 */
+ d2_SSE = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71);
+
+ _mm_store_ss(&d2, d2_SSE);
+
+ return d2;
+#endif
+}
+
+/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
+#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \
+ { \
+ int shi; \
+ \
+ __m128 dx_0, dy_0, dz_0; \
+ __m128 dx_1, dy_1, dz_1; \
+ \
+ __m128 mx, my, mz; \
+ __m128 m0x, m0y, m0z; \
+ \
+ __m128 d2x, d2y, d2z; \
+ __m128 d2s, d2t; \
+ \
+ shi = si*NNBSBB_D*DIM; \
+ \
+ xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB); \
+ yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB); \
+ zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB); \
+ xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB); \
+ yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB); \
+ zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB); \
+ \
+ dx_0 = _mm_sub_ps(xi_l, xj_h); \
+ dy_0 = _mm_sub_ps(yi_l, yj_h); \
+ dz_0 = _mm_sub_ps(zi_l, zj_h); \
+ \
+ dx_1 = _mm_sub_ps(xj_l, xi_h); \
+ dy_1 = _mm_sub_ps(yj_l, yi_h); \
+ dz_1 = _mm_sub_ps(zj_l, zi_h); \
+ \
+ mx = _mm_max_ps(dx_0, dx_1); \
+ my = _mm_max_ps(dy_0, dy_1); \
+ mz = _mm_max_ps(dz_0, dz_1); \
+ \
+ m0x = _mm_max_ps(mx, zero); \
+ m0y = _mm_max_ps(my, zero); \
+ m0z = _mm_max_ps(mz, zero); \
+ \
+ d2x = _mm_mul_ps(m0x, m0x); \
+ d2y = _mm_mul_ps(m0y, m0y); \
+ d2z = _mm_mul_ps(m0z, m0z); \
+ \
+ d2s = _mm_add_ps(d2x, d2y); \
+ d2t = _mm_add_ps(d2s, d2z); \
+ \
+ _mm_store_ps(d2+si, d2t); \
+ }
+
+/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
+static void subc_bb_dist2_sse_xxxx(const float *bb_j,
+ int nsi, const float *bb_i,
+ float *d2)
+{
+ __m128 xj_l, yj_l, zj_l;
+ __m128 xj_h, yj_h, zj_h;
+ __m128 xi_l, yi_l, zi_l;
+ __m128 xi_h, yi_h, zi_h;
+
+ __m128 zero;
+
+ zero = _mm_setzero_ps();
+
+ xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+ yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+ zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+ xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+ yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+ zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
+
+ /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
+ * But as we know the number of iterations is 1 or 2, we unroll manually.
+ */
+ SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2);
+ if (STRIDE_PBB < nsi)
+ {
+ SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2);
+ }
+}
+
+#endif /* NBNXN_SEARCH_BB_SSE */
+
+/* Plain C function which determines if any atom pair between two cells
+ * is within distance sqrt(rl2).
+ */
+static gmx_bool subc_in_range_x(int na_c,
+ int si, const real *x_i,
+ int csj, int stride, const real *x_j,
+ real rl2)
+{
+ int i, j, i0, j0;
+ real d2;
+
+ for (i = 0; i < na_c; i++)
+ {
+ i0 = (si*na_c + i)*DIM;
+ for (j = 0; j < na_c; j++)
+ {
+ j0 = (csj*na_c + j)*stride;
+
+ d2 = sqr(x_i[i0 ] - x_j[j0 ]) +
+ sqr(x_i[i0+1] - x_j[j0+1]) +
+ sqr(x_i[i0+2] - x_j[j0+2]);
+
+ if (d2 < rl2)
+ {
+ return TRUE;
+ }
+ }
+ }
+
+ return FALSE;
+}
+
++#ifdef NBNXN_SEARCH_SSE_SINGLE
++/* When we make separate single/double precision SIMD vector operation
++ * include files, this function should be moved there (also using FMA).
++ */
++static inline __m128
++gmx_mm_calc_rsq_ps(__m128 x, __m128 y, __m128 z)
++{
++ return _mm_add_ps( _mm_add_ps( _mm_mul_ps(x, x), _mm_mul_ps(y, y) ), _mm_mul_ps(z, z) );
++}
++#endif
++
+/* SSE function which determines if any atom pair between two cells,
+ * both with 8 atoms, is within distance sqrt(rl2).
++ * Not performance critical, so only uses plain SSE.
+ */
+static gmx_bool subc_in_range_sse8(int na_c,
+ int si, const real *x_i,
+ int csj, int stride, const real *x_j,
+ real rl2)
+{
+#ifdef NBNXN_SEARCH_SSE_SINGLE
+ __m128 ix_SSE0, iy_SSE0, iz_SSE0;
+ __m128 ix_SSE1, iy_SSE1, iz_SSE1;
+
+ __m128 rc2_SSE;
+
+ int na_c_sse;
+ int j0, j1;
+
+ rc2_SSE = _mm_set1_ps(rl2);
+
+ na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+ ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+ iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+ iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+ ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+ iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+ iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
+
+ /* We loop from the outer to the inner particles to maximize
+ * the chance that we find a pair in range quickly and return.
+ */
+ j0 = csj*na_c;
+ j1 = j0 + na_c - 1;
+ while (j0 < j1)
+ {
+ __m128 jx0_SSE, jy0_SSE, jz0_SSE;
+ __m128 jx1_SSE, jy1_SSE, jz1_SSE;
+
+ __m128 dx_SSE0, dy_SSE0, dz_SSE0;
+ __m128 dx_SSE1, dy_SSE1, dz_SSE1;
+ __m128 dx_SSE2, dy_SSE2, dz_SSE2;
+ __m128 dx_SSE3, dy_SSE3, dz_SSE3;
+
+ __m128 rsq_SSE0;
+ __m128 rsq_SSE1;
+ __m128 rsq_SSE2;
+ __m128 rsq_SSE3;
+
+ __m128 wco_SSE0;
+ __m128 wco_SSE1;
+ __m128 wco_SSE2;
+ __m128 wco_SSE3;
+ __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+
+ jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
+ jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
+ jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
+
+ jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
+ jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
+ jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
+
+ /* Calculate distance */
+ dx_SSE0 = _mm_sub_ps(ix_SSE0, jx0_SSE);
+ dy_SSE0 = _mm_sub_ps(iy_SSE0, jy0_SSE);
+ dz_SSE0 = _mm_sub_ps(iz_SSE0, jz0_SSE);
+ dx_SSE1 = _mm_sub_ps(ix_SSE1, jx0_SSE);
+ dy_SSE1 = _mm_sub_ps(iy_SSE1, jy0_SSE);
+ dz_SSE1 = _mm_sub_ps(iz_SSE1, jz0_SSE);
+ dx_SSE2 = _mm_sub_ps(ix_SSE0, jx1_SSE);
+ dy_SSE2 = _mm_sub_ps(iy_SSE0, jy1_SSE);
+ dz_SSE2 = _mm_sub_ps(iz_SSE0, jz1_SSE);
+ dx_SSE3 = _mm_sub_ps(ix_SSE1, jx1_SSE);
+ dy_SSE3 = _mm_sub_ps(iy_SSE1, jy1_SSE);
+ dz_SSE3 = _mm_sub_ps(iz_SSE1, jz1_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+ rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+ rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+ rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+
+ wco_SSE0 = _mm_cmplt_ps(rsq_SSE0, rc2_SSE);
+ wco_SSE1 = _mm_cmplt_ps(rsq_SSE1, rc2_SSE);
+ wco_SSE2 = _mm_cmplt_ps(rsq_SSE2, rc2_SSE);
+ wco_SSE3 = _mm_cmplt_ps(rsq_SSE3, rc2_SSE);
+
+ wco_any_SSE01 = _mm_or_ps(wco_SSE0, wco_SSE1);
+ wco_any_SSE23 = _mm_or_ps(wco_SSE2, wco_SSE3);
+ wco_any_SSE = _mm_or_ps(wco_any_SSE01, wco_any_SSE23);
+
+ if (_mm_movemask_ps(wco_any_SSE))
+ {
+ return TRUE;
+ }
+
+ j0++;
+ j1--;
+ }
+ return FALSE;
+
+#else
+ /* No SSE */
+ gmx_incons("SSE function called without SSE support");
+
+ return TRUE;
+#endif
+}
+
+/* Returns the j sub-cell for index cj_ind */
+static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
+{
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
+}
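+
+/* Illustrative (reference-only) plain C equivalent of the packed cj4
+ * lookup above, assuming NBNXN_GPU_JGROUP_SIZE == 4 (so the 2log is 2):
+ *
+ * static int nbl_cj_ref(const nbnxn_pairlist_t *nbl, int cj_ind)
+ * {
+ * return nbl->cj4[cj_ind/4].cj[cj_ind%4];
+ * }
+ *
+ * e.g. cj_ind = 11 maps to cj4[2].cj[3].
+ */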
+
+/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
+static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
+{
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
+}
+
+/* Ensures there is enough space for extra exclusion masks */
+static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
+{
+ if (nbl->nexcl+extra > nbl->excl_nalloc)
+ {
+ nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
+ nbnxn_realloc_void((void **)&nbl->excl,
+ nbl->nexcl*sizeof(*nbl->excl),
+ nbl->excl_nalloc*sizeof(*nbl->excl),
+ nbl->alloc, nbl->free);
+ }
+}
+
+/* Ensures there is enough space for ncell extra j-cells in the list */
+static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
+ int ncell)
+{
+ int cj_max;
+
+ cj_max = nbl->ncj + ncell;
+
+ if (cj_max > nbl->cj_nalloc)
+ {
+ nbl->cj_nalloc = over_alloc_small(cj_max);
+ nbnxn_realloc_void((void **)&nbl->cj,
+ nbl->ncj*sizeof(*nbl->cj),
+ nbl->cj_nalloc*sizeof(*nbl->cj),
+ nbl->alloc, nbl->free);
+ }
+}
+
+/* Ensures there is enough space for ncell extra j-subcells in the list */
+static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
+ int nsupercell)
+{
+ int ncj4_max, j4, j, w, t;
+
+#define NWARP 2
+#define WARP_SIZE 32
+
+ /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
+ /* We can store 4 j-subcell - i-supercell pairs in one struct.
+ * The shift below rounds down, so we add NBNXN_GPU_JGROUP_SIZE-1
+ * to round the required count up to whole structs.
+ */
+ ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
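+ /* Example (illustrative numbers): with cj_ind = 5, nsupercell = 3
+ * and GPU_NSUBCELL = 8, at most 5+24 = 29 cj entries are needed,
+ * giving ncj4_max = (29+3)>>2 = 8 cj4 structs.
+ */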
+
+ if (ncj4_max > nbl->cj4_nalloc)
+ {
+ nbl->cj4_nalloc = over_alloc_small(ncj4_max);
+ nbnxn_realloc_void((void **)&nbl->cj4,
+ nbl->work->cj4_init*sizeof(*nbl->cj4),
+ nbl->cj4_nalloc*sizeof(*nbl->cj4),
+ nbl->alloc, nbl->free);
+ }
+
+ if (ncj4_max > nbl->work->cj4_init)
+ {
+ for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
+ {
+ /* No i-subcells and no excl's in the list initially */
+ for (w = 0; w < NWARP; w++)
+ {
+ nbl->cj4[j4].imei[w].imask = 0U;
+ nbl->cj4[j4].imei[w].excl_ind = 0;
+
+ }
+ }
+ nbl->work->cj4_init = ncj4_max;
+ }
+}
+
+/* Set all excl masks for one GPU warp to no exclusions */
+static void set_no_excls(nbnxn_excl_t *excl)
+{
+ int t;
+
+ for (t = 0; t < WARP_SIZE; t++)
+ {
+ /* Turn all interaction bits on */
- nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
++ excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
+ }
+}
+
+/* Initializes a single nbnxn_pairlist_t data structure */
+static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
+ gmx_bool bSimple,
+ nbnxn_alloc_t *alloc,
+ nbnxn_free_t *free)
+{
+ if (alloc == NULL)
+ {
+ nbl->alloc = nbnxn_alloc_aligned;
+ }
+ else
+ {
+ nbl->alloc = alloc;
+ }
+ if (free == NULL)
+ {
+ nbl->free = nbnxn_free_aligned;
+ }
+ else
+ {
+ nbl->free = free;
+ }
+
+ nbl->bSimple = bSimple;
+ nbl->na_sc = 0;
+ nbl->na_ci = 0;
+ nbl->na_cj = 0;
+ nbl->nci = 0;
+ nbl->ci = NULL;
+ nbl->ci_nalloc = 0;
+ nbl->ncj = 0;
+ nbl->cj = NULL;
+ nbl->cj_nalloc = 0;
+ nbl->ncj4 = 0;
+ /* We need one element extra in sj, so alloc initially with 1 */
+ nbl->cj4_nalloc = 0;
+ nbl->cj4 = NULL;
+ nbl->nci_tot = 0;
+
+ if (!nbl->bSimple)
+ {
+ nbl->excl = NULL;
+ nbl->excl_nalloc = 0;
+ nbl->nexcl = 0;
+ check_excl_space(nbl, 1);
+ nbl->nexcl = 1;
+ set_no_excls(&nbl->excl[0]);
+ }
+
+ snew(nbl->work, 1);
+#ifdef NBNXN_BBXXXX
+ snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
+#else
+ snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL*NNBSBB_B, NBNXN_MEM_ALIGN);
+#endif
+ snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
+#ifdef GMX_NBNXN_SIMD
+ snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
+#endif
+ snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
+
+ nbl->work->sort = NULL;
+ nbl->work->sort_nalloc = 0;
+ nbl->work->sci_sort = NULL;
+ nbl->work->sci_sort_nalloc = 0;
+}
+
+void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
+ gmx_bool bSimple, gmx_bool bCombined,
+ nbnxn_alloc_t *alloc,
+ nbnxn_free_t *free)
+{
+ int i;
+
+ nbl_list->bSimple = bSimple;
+ nbl_list->bCombined = bCombined;
+
+ nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
+
+ if (!nbl_list->bCombined &&
+ nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
+ {
+ gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
+ nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+ }
+
+ snew(nbl_list->nbl, nbl_list->nnbl);
+ /* Execute in order to avoid memory interleaving between threads */
+#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
+ for (i = 0; i < nbl_list->nnbl; i++)
+ {
+ /* Allocate the nblist data structure locally on each thread
+ * to optimize memory access for NUMA architectures.
+ */
+ snew(nbl_list->nbl[i], 1);
+
+ /* Only list 0 is used on the GPU, use normal allocation for i>0 */
+ if (i == 0)
+ {
+ nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
+ }
+ else
+ {
+ nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL);
+ }
+ }
+}
+
+/* Print statistics of a pair list, used for debug output */
+static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
+ const nbnxn_search_t nbs, real rl)
+{
+ const nbnxn_grid_t *grid;
+ int cs[SHIFTS];
+ int s, i, j;
+ int npexcl;
+
+ /* This code only produces correct statistics without domain decomposition */
+ grid = &nbs->grid[0];
+
+ fprintf(fp, "nbl nci %d ncj %d\n",
+ nbl->nci, nbl->ncj);
+ fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
+ nbl->ncj/(double)grid->nc*grid->na_sc,
+ nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
+
+ fprintf(fp, "nbl average j cell list length %.1f\n",
+ 0.25*nbl->ncj/(double)nbl->nci);
+
+ for (s = 0; s < SHIFTS; s++)
+ {
+ cs[s] = 0;
+ }
+ npexcl = 0;
+ for (i = 0; i < nbl->nci; i++)
+ {
+ cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
+ nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
+
+ j = nbl->ci[i].cj_ind_start;
+ while (j < nbl->ci[i].cj_ind_end &&
- return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
++ nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+ npexcl++;
+ j++;
+ }
+ }
+ fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
+ nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
+ for (s = 0; s < SHIFTS; s++)
+ {
+ if (cs[s] > 0)
+ {
+ fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
+ }
+ }
+}
+
+/* Print statistics of a pair lists, used for debug output */
+static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
+ const nbnxn_search_t nbs, real rl)
+{
+ const nbnxn_grid_t *grid;
+ int i, j4, j, si, b;
+ int c[GPU_NSUBCELL+1];
+
+ /* This code only produces correct statistics without domain decomposition */
+ grid = &nbs->grid[0];
+
+ fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
+ nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
+ fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
+ nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
+ nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
+
+ fprintf(fp, "nbl average j super cell list length %.1f\n",
+ 0.25*nbl->ncj4/(double)nbl->nsci);
+ fprintf(fp, "nbl average i sub cell list length %.1f\n",
+ nbl->nci_tot/((double)nbl->ncj4));
+
+ for (si = 0; si <= GPU_NSUBCELL; si++)
+ {
+ c[si] = 0;
+ }
+ for (i = 0; i < nbl->nsci; i++)
+ {
+ for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
+ {
+ for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
+ {
+ b = 0;
+ for (si = 0; si < GPU_NSUBCELL; si++)
+ {
+ if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
+ {
+ b++;
+ }
+ }
+ c[b]++;
+ }
+ }
+ }
+ for (b = 0; b <= GPU_NSUBCELL; b++)
+ {
+ fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
+ b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
+ }
+}
+
+/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
+static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
+ int warp, nbnxn_excl_t **excl)
+{
+ if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
+ {
+ /* No exclusions set, make a new list entry */
+ nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
+ nbl->nexcl++;
+ *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
+ set_no_excls(*excl);
+ }
+ else
+ {
+ /* We already have some exclusions, new ones can be added to the list */
+ *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
+ }
+}
+
+/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
+ * allocates extra memory, if necessary.
+ */
+static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
+ int warp, nbnxn_excl_t **excl)
+{
+ if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
+ {
+ /* We need to make a new list entry, check if we have space */
+ check_excl_space(nbl, 1);
+ }
+ low_get_nbl_exclusions(nbl, cj4, warp, excl);
+}
+
+/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
+ * allocates extra memory, if necessary.
+ */
+static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
+ nbnxn_excl_t **excl_w0,
+ nbnxn_excl_t **excl_w1)
+{
+ /* Check for space we might need */
+ check_excl_space(nbl, 2);
+
+ low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
+ low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
+}
+
+/* Sets the self exclusions i=j and pair exclusions i>j */
+static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
+ int cj4_ind, int sj_offset,
+ int si)
+{
+ nbnxn_excl_t *excl[2];
+ int ei, ej, w;
+
+ /* Here we set only the self and double pair exclusions */
+
+ get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
+
+ /* Clear the pair bits for ei >= ej, so only minor < major bits stay set */
+ for (ej = 0; ej < nbl->na_ci; ej++)
+ {
+ w = (ej>>2);
+ for (ei = ej; ei < nbl->na_ci; ei++)
+ {
+ excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+ ~(1U << (sj_offset*GPU_NSUBCELL + si));
+ }
+ }
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
+static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
+{
- /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
- static unsigned int get_imask_simd128(gmx_bool rdiag, int ci, int cj)
++ return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
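+
+/* In these masks bit (i*na_cj + j) switches the interaction of i-atom i
+ * with j-atom j on or off, matching the bit layout used in
+ * set_ci_top_excls below: the _ALL mask keeps every pair, while the
+ * _DIAG masks clear the self and sub-diagonal pairs so that each pair
+ * is counted only once when ci == cj.
+ */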
+
- #ifndef GMX_DOUBLE /* cj-size = 4 */
- return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
- #else /* cj-size = 2 */
- return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
- (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
- NBNXN_INT_MASK_ALL));
- #endif
++/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
++static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
+{
- /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
- static unsigned int get_imask_simd256(gmx_bool rdiag, int ci, int cj)
++ return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
++ (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
++ NBNXN_INTERACTION_MASK_ALL));
+}
+
- #ifndef GMX_DOUBLE /* cj-size = 8 */
- return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
- (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
- NBNXN_INT_MASK_ALL));
- #else /* cj-size = 4 */
- return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
- #endif
++/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
++static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
+{
- #if GMX_NBNXN_SIMD_BITWIDTH == 128
- #define get_imask_simd_4xn get_imask_simd128
- #else
- #if GMX_NBNXN_SIMD_BITWIDTH == 256
- #define get_imask_simd_4xn get_imask_simd256
- #define get_imask_simd_2xnn get_imask_simd128
- #else
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
++ return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
++}
++
++/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
++static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
++{
++ return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
++ (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
++ NBNXN_INTERACTION_MASK_ALL));
+}
+
+#ifdef GMX_NBNXN_SIMD
- if (cj[j].excl != NBNXN_INT_MASK_ALL)
++#if GMX_SIMD_WIDTH_HERE == 2
++#define get_imask_simd_4xn get_imask_simd_j2
++#endif
++#if GMX_SIMD_WIDTH_HERE == 4
++#define get_imask_simd_4xn get_imask_simd_j4
++#endif
++#if GMX_SIMD_WIDTH_HERE == 8
++#define get_imask_simd_4xn get_imask_simd_j8
++#define get_imask_simd_2xnn get_imask_simd_j4
+#endif
++#if GMX_SIMD_WIDTH_HERE == 16
++#define get_imask_simd_2xnn get_imask_simd_j8
+#endif
+#endif
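+
+/* For example, with 8-wide single-precision SIMD (256-bit AVX) the 4xN
+ * layout uses the j8 mask, while the 2xNN layout, in which two 4-atom
+ * halves share one register, uses the j4 mask.
+ */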
+
+/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
+ * Checks bounding box distances and possibly atom pair distances.
+ */
+static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int ci, int cjf, int cjl,
+ gmx_bool remove_sub_diag,
+ const real *x_j,
+ real rl2, float rbb2,
+ int *ndistc)
+{
+ const nbnxn_list_work_t *work;
+
+ const float *bb_ci;
+ const real *x_ci;
+
+ gmx_bool InRange;
+ real d2;
+ int cjf_gl, cjl_gl, cj;
+
+ work = nbl->work;
+
+ bb_ci = nbl->work->bb_ci;
+ x_ci = nbl->work->x_ci;
+
+ InRange = FALSE;
+ while (!InRange && cjf <= cjl)
+ {
+ d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ int i, j;
+
+ cjf_gl = gridj->cell0 + cjf;
+ for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
+ {
+ for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
+ {
+ InRange = InRange ||
+ (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
+ sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
+ sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
+ }
+ }
+ *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
+ }
+ if (!InRange)
+ {
+ cjf++;
+ }
+ }
+ if (!InRange)
+ {
+ return;
+ }
+
+ InRange = FALSE;
+ while (!InRange && cjl > cjf)
+ {
+ d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ int i, j;
+
+ cjl_gl = gridj->cell0 + cjl;
+ for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
+ {
+ for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
+ {
+ InRange = InRange ||
+ (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
+ sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
+ sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
+ }
+ }
+ *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
+ }
+ if (!InRange)
+ {
+ cjl--;
+ }
+ }
+
+ if (cjf <= cjl)
+ {
+ for (cj = cjf; cj <= cjl; cj++)
+ {
+ /* Store cj and the interaction mask */
+ nbl->cj[nbl->ncj].cj = gridj->cell0 + cj;
+ nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
+ nbl->ncj++;
+ }
+ /* Increase the closing index in i super-cell list */
+ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ }
+}
+
+#ifdef GMX_NBNXN_SIMD_4XN
+#include "nbnxn_search_simd_4xn.h"
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+#include "nbnxn_search_simd_2xnn.h"
+#endif
+
+/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
+ * Checks bounding box distances and possibly atom pair distances.
+ */
+static void make_cluster_list_supersub(const nbnxn_grid_t *gridi,
+ const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int sci, int scj,
+ gmx_bool sci_equals_scj,
+ int stride, const real *x,
+ real rl2, float rbb2,
+ int *ndistc)
+{
+ int na_c;
+ int npair;
+ int cjo, ci1, ci, cj, cj_gl;
+ int cj4_ind, cj_offset;
+ unsigned imask;
+ nbnxn_cj4_t *cj4;
+ const float *bb_ci;
+ const real *x_ci;
+ float *d2l, d2;
+ int w;
+#define PRUNE_LIST_CPU_ONE
+#ifdef PRUNE_LIST_CPU_ONE
+ int ci_last = -1;
+#endif
+
+ d2l = nbl->work->d2;
+
+ bb_ci = nbl->work->bb_ci;
+ x_ci = nbl->work->x_ci;
+
+ na_c = gridj->na_c;
+
+ for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
+ {
+ cj4_ind = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
+ cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
+ cj4 = &nbl->cj4[cj4_ind];
+
+ cj = scj*GPU_NSUBCELL + cjo;
+
+ cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
+
+ /* Initialize this j-subcell i-subcell list */
+ cj4->cj[cj_offset] = cj_gl;
+ imask = 0;
+
+ if (sci_equals_scj)
+ {
+ ci1 = cjo + 1;
+ }
+ else
+ {
+ ci1 = gridi->nsubc[sci];
+ }
+
+#ifdef NBNXN_BBXXXX
+ /* Determine all ci1 bb distances in one call with SSE */
+ subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
+ ci1, bb_ci, d2l);
+ *ndistc += na_c*2;
+#endif
+
+ npair = 0;
+ /* We use a fixed upper-bound instead of ci1 to help optimization */
+ for (ci = 0; ci < GPU_NSUBCELL; ci++)
+ {
+ if (ci == ci1)
+ {
+ break;
+ }
+
+#ifndef NBNXN_BBXXXX
+ /* Determine the bb distance between ci and cj */
+ d2l[ci] = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
+ *ndistc += 2;
+#endif
+ d2 = d2l[ci];
+
+#ifdef PRUNE_LIST_CPU_ALL
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off. This check is very costly.
+ */
+ *ndistc += na_c*na_c;
+ if (d2 < rbb2 ||
+ (d2 < rl2 &&
+#ifdef NBNXN_PBB_SSE
+ subc_in_range_sse8
+#else
+ subc_in_range_x
+#endif
+ (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
+#else
+ /* Check if the distance between the two bounding boxes
+ * is within the pair-list cut-off.
+ */
+ if (d2 < rl2)
+#endif
+ {
+ /* Flag this i-subcell to be taken into account */
+ imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
+
+#ifdef PRUNE_LIST_CPU_ONE
+ ci_last = ci;
+#endif
+
+ npair++;
+ }
+ }
+
+#ifdef PRUNE_LIST_CPU_ONE
+ /* If we only found 1 pair, check if any atoms are actually
+ * within the cut-off, so we could get rid of it.
+ */
+ if (npair == 1 && d2l[ci_last] >= rbb2)
+ {
+ /* Avoid using function pointers here, as it's slower */
+ if (
+#ifdef NBNXN_PBB_SSE
+ !subc_in_range_sse8
+#else
+ !subc_in_range_x
+#endif
+ (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
+ {
+ imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
+ npair--;
+ }
+ }
+#endif
+
+ if (npair > 0)
+ {
+ /* We have a useful sj entry, close it now */
+
+ /* Set the exclusions for the ci == sj entry.
+ * Here we don't bother to check if this entry is actually flagged,
+ * as it will nearly always be in the list.
+ */
+ if (sci_equals_scj)
+ {
+ set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
+ }
+
+ /* Copy the cluster interaction mask to the list */
+ for (w = 0; w < NWARP; w++)
+ {
+ cj4->imei[w].imask |= imask;
+ }
+
+ nbl->work->cj_ind++;
+
+ /* Keep the count */
+ nbl->nci_tot += npair;
+
+ /* Increase the closing index in i super-cell list */
+ nbl->sci[nbl->nsci].cj4_ind_end =
+ ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
+ }
+ }
+}
+
+/* Set all atom-pair exclusions from the topology stored in excl
+ * as masks in the pair-list for simple list i-entry nbl_ci
+ */
+static void set_ci_top_excls(const nbnxn_search_t nbs,
+ nbnxn_pairlist_t *nbl,
+ gmx_bool diagRemoved,
+ int na_ci_2log,
+ int na_cj_2log,
+ const nbnxn_ci_t *nbl_ci,
+ const t_blocka *excl)
+{
+ const int *cell;
+ int ci;
+ int cj_ind_first, cj_ind_last;
+ int cj_first, cj_last;
+ int ndirect;
+ int i, ai, aj, si, eind, ge, se;
+ int found, cj_ind_0, cj_ind_1, cj_ind_m;
+ int cj_m;
+ gmx_bool Found_si;
+ int si_ind;
+ nbnxn_excl_t *nbl_excl;
+ int inner_i, inner_e;
+
+ cell = nbs->cell;
+
+ if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
+ {
+ /* Empty list */
+ return;
+ }
+
+ ci = nbl_ci->ci;
+
+ cj_ind_first = nbl_ci->cj_ind_start;
+ cj_ind_last = nbl->ncj - 1;
+
+ cj_first = nbl->cj[cj_ind_first].cj;
+ cj_last = nbl->cj[cj_ind_last].cj;
+
+ /* Determine how many contiguous j-cells we have starting
+ * from the first i-cell. This number can be used to directly
+ * calculate j-cell indices for excluded atoms.
+ */
+ ndirect = 0;
+ if (na_ci_2log == na_cj_2log)
+ {
+ while (cj_ind_first + ndirect <= cj_ind_last &&
+ nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
+ {
+ ndirect++;
+ }
+ }
+#ifdef NBNXN_SEARCH_BB_SSE
+ else
+ {
+ while (cj_ind_first + ndirect <= cj_ind_last &&
+ nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
+ {
+ ndirect++;
+ }
+ }
+#endif
+
+ /* Loop over the atoms in the i super-cell */
+ for (i = 0; i < nbl->na_sc; i++)
+ {
+ ai = nbs->a[ci*nbl->na_sc+i];
+ if (ai >= 0)
+ {
+ si = (i>>na_ci_2log);
+
+ /* Loop over the topology-based exclusions for this i-atom */
+ for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
+ {
+ aj = excl->a[eind];
+
+ if (aj == ai)
+ {
+ /* The self exclusions are already set, which saves some time */
+ continue;
+ }
+
+ ge = cell[aj];
+
+ /* Without shifts we only calculate interactions j>i
+ * for one-way pair-lists.
+ */
+ if (diagRemoved && ge <= ci*nbl->na_sc + i)
+ {
+ continue;
+ }
+
+ se = (ge >> na_cj_2log);
+
+ /* Could the cluster se be in our list? */
+ if (se >= cj_first && se <= cj_last)
+ {
+ if (se < cj_first + ndirect)
+ {
+ /* We can calculate cj_ind directly from se */
+ found = cj_ind_first + se - cj_first;
+ }
+ else
+ {
+ /* Search for se using bisection */
+ found = -1;
+ cj_ind_0 = cj_ind_first + ndirect;
+ cj_ind_1 = cj_ind_last + 1;
+ while (found == -1 && cj_ind_0 < cj_ind_1)
+ {
+ cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
+
+ cj_m = nbl->cj[cj_ind_m].cj;
+
+ if (se == cj_m)
+ {
+ found = cj_ind_m;
+ }
+ else if (se < cj_m)
+ {
+ cj_ind_1 = cj_ind_m;
+ }
+ else
+ {
+ cj_ind_0 = cj_ind_m + 1;
+ }
+ }
+ }
+
+ if (found >= 0)
+ {
+ inner_i = i - (si << na_ci_2log);
+ inner_e = ge - (se << na_cj_2log);
+
+ nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Set all atom-pair exclusions from the topology stored in excl
+ * as masks in the pair-list for i-super-cell entry nbl_sci
+ */
+static void set_sci_top_excls(const nbnxn_search_t nbs,
+ nbnxn_pairlist_t *nbl,
+ gmx_bool diagRemoved,
+ int na_c_2log,
+ const nbnxn_sci_t *nbl_sci,
+ const t_blocka *excl)
+{
+ const int *cell;
+ int na_c;
+ int sci;
+ int cj_ind_first, cj_ind_last;
+ int cj_first, cj_last;
+ int ndirect;
+ int i, ai, aj, si, eind, ge, se;
+ int found, cj_ind_0, cj_ind_1, cj_ind_m;
+ int cj_m;
+ gmx_bool Found_si;
+ int si_ind;
+ nbnxn_excl_t *nbl_excl;
+ int inner_i, inner_e, w;
+
+ cell = nbs->cell;
+
+ na_c = nbl->na_ci;
+
+ if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
+ {
+ /* Empty list */
+ return;
+ }
+
+ sci = nbl_sci->sci;
+
+ cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
+ cj_ind_last = nbl->work->cj_ind - 1;
+
+ cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
+ cj_last = nbl_cj(nbl, cj_ind_last);
+
+ /* Determine how many contiguous j-clusters we have starting
+ * from the first i-cluster. This number can be used to directly
+ * calculate j-cluster indices for excluded atoms.
+ */
+ ndirect = 0;
+ while (cj_ind_first + ndirect <= cj_ind_last &&
+ nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
+ {
+ ndirect++;
+ }
+
+ /* Loop over the atoms in the i super-cell */
+ for (i = 0; i < nbl->na_sc; i++)
+ {
+ ai = nbs->a[sci*nbl->na_sc+i];
+ if (ai >= 0)
+ {
+ si = (i>>na_c_2log);
+
+ /* Loop over the topology-based exclusions for this i-atom */
+ for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
+ {
+ aj = excl->a[eind];
+
+ if (aj == ai)
+ {
+ /* The self exclusions are already set, so we save some time here */
+ continue;
+ }
+
+ ge = cell[aj];
+
+ /* Without shifts we only calculate interactions j>i
+ * for one-way pair-lists.
+ */
+ if (diagRemoved && ge <= sci*nbl->na_sc + i)
+ {
+ continue;
+ }
+
+ se = ge>>na_c_2log;
+ /* Could the cluster se be in our list? */
+ if (se >= cj_first && se <= cj_last)
+ {
+ if (se < cj_first + ndirect)
+ {
+ /* We can calculate cj_ind directly from se */
+ found = cj_ind_first + se - cj_first;
+ }
+ else
+ {
+ /* Search for se using bisection */
+ found = -1;
+ cj_ind_0 = cj_ind_first + ndirect;
+ cj_ind_1 = cj_ind_last + 1;
+ while (found == -1 && cj_ind_0 < cj_ind_1)
+ {
+ cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
+
+ cj_m = nbl_cj(nbl, cj_ind_m);
+
+ if (se == cj_m)
+ {
+ found = cj_ind_m;
+ }
+ else if (se < cj_m)
+ {
+ cj_ind_1 = cj_ind_m;
+ }
+ else
+ {
+ cj_ind_0 = cj_ind_m + 1;
+ }
+ }
+ }
+
+ if (found >= 0)
+ {
+ inner_i = i - si*na_c;
+ inner_e = ge - se*na_c;
+
+/* Macro for getting the index of a cj entry within its cj4 group */
+#define AMODCJ4(a) ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
+/* Macro for converting a cj index to a cj4 group index */
+#define A2CJ4(a) ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+/* Macro for getting a j-atom's index within its warp half of the cluster */
+#define AMODWI(a) ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
+
+ if (nbl_imask0(nbl, found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
+ {
+ w = (inner_e >> 2);
+
+ get_nbl_exclusions_1(nbl, A2CJ4(found), w, &nbl_excl);
+
+ nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+ ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
+ }
+
+#undef AMODCJ4
+#undef A2CJ4
+#undef AMODWI
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Reallocate the simple ci list for at least n entries */
+static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
+{
+ nbl->ci_nalloc = over_alloc_small(n);
+ nbnxn_realloc_void((void **)&nbl->ci,
+ nbl->nci*sizeof(*nbl->ci),
+ nbl->ci_nalloc*sizeof(*nbl->ci),
+ nbl->alloc, nbl->free);
+}
+
+/* Reallocate the super-cell sci list for at least n entries */
+static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
+{
+ nbl->sci_nalloc = over_alloc_small(n);
+ nbnxn_realloc_void((void **)&nbl->sci,
+ nbl->nsci*sizeof(*nbl->sci),
+ nbl->sci_nalloc*sizeof(*nbl->sci),
+ nbl->alloc, nbl->free);
+}
+
+/* Make a new ci entry at index nbl->nci */
+static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags)
+{
+ if (nbl->nci + 1 > nbl->ci_nalloc)
+ {
+ nb_realloc_ci(nbl, nbl->nci+1);
+ }
+ nbl->ci[nbl->nci].ci = ci;
+ nbl->ci[nbl->nci].shift = shift;
+ /* Store the interaction flags along with the shift */
+ nbl->ci[nbl->nci].shift |= flags;
+ nbl->ci[nbl->nci].cj_ind_start = nbl->ncj;
+ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+}
+
+/* Make a new sci entry at index nbl->nsci */
+static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift)
+{
+ if (nbl->nsci + 1 > nbl->sci_nalloc)
+ {
+ nb_realloc_sci(nbl, nbl->nsci+1);
+ }
+ nbl->sci[nbl->nsci].sci = sci;
+ nbl->sci[nbl->nsci].shift = shift;
+ nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
+ nbl->sci[nbl->nsci].cj4_ind_end = nbl->ncj4;
+}
+
+/* Sort the simple j-list cj on exclusions.
+ * Entries with exclusions will all be sorted to the beginning of the list.
+ */
+static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
+ nbnxn_list_work_t *work)
+{
+ int jnew, j;
+
+ if (ncj > work->cj_nalloc)
+ {
+ work->cj_nalloc = over_alloc_large(ncj);
+ srenew(work->cj, work->cj_nalloc);
+ }
+
+ /* Make a list of the j-cells involving exclusions */
+ jnew = 0;
+ for (j = 0; j < ncj; j++)
+ {
- if (cj[j].excl != NBNXN_INT_MASK_ALL)
++ if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+ work->cj[jnew++] = cj[j];
+ }
+ }
+ /* Check if there are exclusions at all, or only in the first entry */
+ if (!((jnew == 0) ||
- (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
++ (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
+ {
+ for (j = 0; j < ncj; j++)
+ {
- if (cj[j].excl == NBNXN_INT_MASK_ALL)
++ if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
+ {
+ work->cj[jnew++] = cj[j];
+ }
+ }
+ for (j = 0; j < ncj; j++)
+ {
+ cj[j] = work->cj[j];
+ }
+ }
+}
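+
+/* A minimal sketch of the stable two-pass partition used by sort_cj_excl
+ * above, shown on plain ints for clarity. example_partition_excl is a
+ * hypothetical helper, not part of GROMACS: entries matching the
+ * predicate (non-zero here, standing in for "has exclusions") keep their
+ * relative order and end up at the front, exactly like the cj entries.
+ */
+static void example_partition_excl(int *v, int n, int *tmp)
+{
+    int k = 0, j;
+
+    /* First pass: collect entries with "exclusions" (non-zero here) */
+    for (j = 0; j < n; j++)
+    {
+        if (v[j] != 0)
+        {
+            tmp[k++] = v[j];
+        }
+    }
+    /* Second pass: append the fully interacting entries */
+    for (j = 0; j < n; j++)
+    {
+        if (v[j] == 0)
+        {
+            tmp[k++] = v[j];
+        }
+    }
+    /* Copy the partitioned order back */
+    for (j = 0; j < n; j++)
+    {
+        v[j] = tmp[j];
+    }
+}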
+
+/* Close this simple list i entry */
+static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
+{
+ int jlen;
+
+ /* All content of the new ci entry has already been filled correctly;
+ * we only need to increase the count here (for non-empty lists).
+ */
+ jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
+ if (jlen > 0)
+ {
+ sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);
+
+ /* The counts below are used for non-bonded pair/flop counts
+ * and should therefore match the available kernel setups.
+ */
+ if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+ {
+ nbl->work->ncj_noq += jlen;
+ }
+ else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+ !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
+ {
+ nbl->work->ncj_hlj += jlen;
+ }
+
+ nbl->nci++;
+ }
+}
+
+/* Split sci entry for load balancing on the GPU.
+ * Splitting ensures we have enough lists to fully utilize the whole GPU.
+ * With progBal we generate progressively smaller lists, which improves
+ * load balancing. As we only know the current count on our own thread,
+ * we will need to estimate the current total amount of i-entries.
+ * As the lists get concatenated later, this estimate depends
+ * both on nthread and our own thread index.
+ */
+static void split_sci_entry(nbnxn_pairlist_t *nbl,
+ int nsp_max_av, gmx_bool progBal, int nc_bal,
+ int thread, int nthread)
+{
+ int nsci_est;
+ int nsp_max;
+ int cj4_start, cj4_end, j4len, cj4;
+ int sci;
+ int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
+ int p;
+
+ if (progBal)
+ {
+ /* Estimate the total number of ci's of the nblist combined
+ * over all threads using the target number of ci's.
+ */
+ nsci_est = nc_bal*thread/nthread + nbl->nsci;
+
+ /* The first ci blocks should be larger, to avoid overhead.
+ * The last ci blocks should be smaller, to improve load balancing.
+ */
+ nsp_max = max(1,
+ nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
+ }
+ else
+ {
+ nsp_max = nsp_max_av;
+ }
+
+ cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
+ cj4_end = nbl->sci[nbl->nsci-1].cj4_ind_end;
+ j4len = cj4_end - cj4_start;
+
+ if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
+ {
+ /* Remove the last ci entry and process the cj4's again */
+ nbl->nsci -= 1;
+
+ sci = nbl->nsci;
+ nsp = 0;
+ nsp_sci = 0;
+ nsp_cj4_e = 0;
+ nsp_cj4 = 0;
+ for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
+ {
+ nsp_cj4_p = nsp_cj4;
+ /* Count the number of cluster pairs in this cj4 group */
+ nsp_cj4 = 0;
+ for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
+ {
+ nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
+ }
+
+ if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
+ {
+ /* Split the list at cj4 */
+ nbl->sci[sci].cj4_ind_end = cj4;
+ /* Create a new sci entry */
+ sci++;
+ nbl->nsci++;
+ if (nbl->nsci+1 > nbl->sci_nalloc)
+ {
+ nb_realloc_sci(nbl, nbl->nsci+1);
+ }
+ nbl->sci[sci].sci = nbl->sci[nbl->nsci-1].sci;
+ nbl->sci[sci].shift = nbl->sci[nbl->nsci-1].shift;
+ nbl->sci[sci].cj4_ind_start = cj4;
+ nsp_sci = nsp;
+ nsp_cj4_e = nsp_cj4_p;
+ nsp = 0;
+ }
+ nsp += nsp_cj4;
+ }
+
+ /* Put the remaining cj4's in the last sci entry */
+ nbl->sci[sci].cj4_ind_end = cj4_end;
+
+ /* Possibly balance out the last two sci's
+ * by moving the last cj4 of the second last sci.
+ */
+ if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
+ {
+ nbl->sci[sci-1].cj4_ind_end--;
+ nbl->sci[sci].cj4_ind_start--;
+ }
+
+ nbl->nsci++;
+ }
+}
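+
+/* Worked example of the progressive-balancing formula above (numbers
+ * are illustrative only): with nsp_max_av = 80 and nc_bal = 1000, an
+ * early estimate nsci_est = 100 gives
+ * nsp_max = 80*1000*3/(2*(99 + 1000)) ~= 109, i.e. larger-than-average
+ * blocks, while a late estimate nsci_est = 2000 gives nsp_max ~= 40,
+ * i.e. progressively smaller blocks, as intended.
+ */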
+
+/* Close this super/sub list i entry */
+static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
+ int nsp_max_av,
+ gmx_bool progBal, int nc_bal,
+ int thread, int nthread)
+{
+ int j4len, tlen;
+ int nb, b;
+
+ /* All content of the new ci entry has already been filled correctly;
+ * we only need to increase the count here (for non-empty lists).
+ */
+ j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
+ if (j4len > 0)
+ {
+ /* We can only have complete blocks of 4 j-entries in a list,
+ * so round the count up before closing.
+ */
+ nbl->ncj4 = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
+ nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
+
+ nbl->nsci++;
+
+ if (nsp_max_av > 0)
+ {
+ /* Measure the size of the new entry and potentially split it */
+ split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
+ }
+ }
+}
+
+/* Syncs the working array before adding another grid pair to the list */
+static void sync_work(nbnxn_pairlist_t *nbl)
+{
+ if (!nbl->bSimple)
+ {
+ nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
+ nbl->work->cj4_init = nbl->ncj4;
+ }
+}
+
+/* Clears an nbnxn_pairlist_t data structure */
+static void clear_pairlist(nbnxn_pairlist_t *nbl)
+{
+ nbl->nci = 0;
+ nbl->nsci = 0;
+ nbl->ncj = 0;
+ nbl->ncj4 = 0;
+ nbl->nci_tot = 0;
+ nbl->nexcl = 1;
+
+ nbl->work->ncj_noq = 0;
+ nbl->work->ncj_hlj = 0;
+}
+
+/* Sets a simple list i-cell bounding box, including PBC shift */
+static void set_icell_bb_simple(const float *bb, int ci,
+ real shx, real shy, real shz,
+ float *bb_ci)
+{
+ int ia;
+
+ ia = ci*NNBSBB_B;
+ bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
+ bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
+ bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
+ bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
+ bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
+ bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
+}
+
+/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
+static void set_icell_bb_supersub(const float *bb, int ci,
+ real shx, real shy, real shz,
+ float *bb_ci)
+{
+ int ia, m, i;
+
+#ifdef NBNXN_BBXXXX
+ ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+ for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
+ {
+ for (i = 0; i < STRIDE_PBB; i++)
+ {
+ bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+ bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+ bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+ bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+ bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+ bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
+ }
+ }
+#else
+ ia = ci*GPU_NSUBCELL*NNBSBB_B;
+ for (i = 0; i < GPU_NSUBCELL*NNBSBB_B; i += NNBSBB_B)
+ {
+ bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
+ bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
+ bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
+ bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
+ bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
+ bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
+ }
+#endif
+}
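+
+/* For reference: in the packed NBNXN_BBXXXX layout handled above, the
+ * bounds of STRIDE_PBB sub-cells are stored transposed, as STRIDE_PBB
+ * lower-x values, then lower-y, lower-z, upper-x, upper-y and upper-z,
+ * so one SIMD load fetches the same bound for STRIDE_PBB boxes at once.
+ */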
+
+/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
+static void icell_set_x_simple(int ci,
+ real shx, real shy, real shz,
+ int gmx_unused na_c,
+ int stride, const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia, i;
+
+ ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
+
+ for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
+ {
+ work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
+ work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
+ work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
+ }
+}
+
+/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
+static void icell_set_x_supersub(int ci,
+ real shx, real shy, real shz,
+ int na_c,
+ int stride, const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia, i;
+ real *x_ci;
+
+ x_ci = work->x_ci;
+
+ ia = ci*GPU_NSUBCELL*na_c;
+ for (i = 0; i < GPU_NSUBCELL*na_c; i++)
+ {
+ x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
+ x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
+ x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
+ }
+}
+
+#ifdef NBNXN_SEARCH_BB_SSE
+/* Copies PBC shifted super-cell packed atom coordinates to working array */
+static void icell_set_x_supersub_sse8(int ci,
+ real shx, real shy, real shz,
+ int na_c,
+ int stride, const real *x,
+ nbnxn_list_work_t *work)
+{
+ int si, io, ia, i, j;
+ real *x_ci;
+
+ x_ci = work->x_ci;
+
+ for (si = 0; si < GPU_NSUBCELL; si++)
+ {
+ for (i = 0; i < na_c; i += STRIDE_PBB)
+ {
+ io = si*na_c + i;
+ ia = ci*GPU_NSUBCELL*na_c + io;
+ for (j = 0; j < STRIDE_PBB; j++)
+ {
+ x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+ x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+ x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
+ }
+ }
+ }
+}
+#endif
+
+static real nbnxn_rlist_inc_nonloc_fac = 0.6;
+
+/* Due to the cluster size the effective pair-list cut-off is longer than
+ * that of a simple atom pair-list. This function gives the extra distance.
+ */
+real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density)
+{
+ return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density), 1.0/3.0));
+}
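+
+/* Worked example (illustrative numbers): for cluster_size = 4 and
+ * atom_density = 100 nm^-3 this gives
+ * (0.5 + 0.6)*(3/4)^2*(4/100)^(1/3) ~= 1.1*0.5625*0.342 ~= 0.21 nm
+ * of extra distance on top of the nominal list cut-off.
+ */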
+
+/* Estimates the interaction volume^2 for non-local interactions */
+static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
+{
+ int z, d;
+ real cl, ca, za;
+ real vold_est;
+ real vol2_est_tot;
+
+ vol2_est_tot = 0;
+
+ /* Here we simply add up the non-home interaction volume^2 of the
+ * 1, 2 or 3 zones that lie a single step away along one decomposition
+ * dimension. As these volumes are not additive, this is an overestimate,
+ * but it is only significant in the limit of small cells, where we
+ * anyhow need to split the lists into parts as small as possible.
+ */
+
+ for (z = 0; z < zones->n; z++)
+ {
+ if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
+ {
+ cl = 0;
+ ca = 1;
+ za = 1;
+ for (d = 0; d < DIM; d++)
+ {
+ if (zones->shift[z][d] == 0)
+ {
+ cl += 0.5*ls[d];
+ ca *= ls[d];
+ za *= zones->size[z].x1[d] - zones->size[z].x0[d];
+ }
+ }
+
+ /* 4 octants of a sphere */
+ vold_est = 0.25*M_PI*r*r*r*r;
+ /* 4 quarter pie slices on the edges */
+ vold_est += 4*cl*M_PI/6.0*r*r*r;
+ /* One rectangular volume on a face */
+ vold_est += ca*0.5*r*r;
+
+ vol2_est_tot += vold_est*za;
+ }
+ }
+
+ return vol2_est_tot;
+}
+
+/* Estimates the average size of a full j-list for super/sub setup */
+static int get_nsubpair_max(const nbnxn_search_t nbs,
+ int iloc,
+ real rlist,
+ int min_ci_balanced)
+{
+ const nbnxn_grid_t *grid;
+ rvec ls;
+ real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
+ int nsubpair_max;
+
+ grid = &nbs->grid[0];
+
+ ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
+ ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
+ ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
+
+ /* The average squared length of the diagonal of a sub cell */
+ xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
+
+ /* The formulas below are a heuristic estimate of the average nsj per si */
+ r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
+
+ if (!nbs->DomDec || nbs->zones->n == 1)
+ {
+ nsp_est_nl = 0;
+ }
+ else
+ {
+ nsp_est_nl =
+ sqr(grid->atom_density/grid->na_c)*
+ nonlocal_vol2(nbs->zones, ls, r_eff_sup);
+ }
+
+ if (LOCAL_I(iloc))
+ {
+ /* Sub-cell interacts with itself */
+ vol_est = ls[XX]*ls[YY]*ls[ZZ];
+ /* 6/2 rectangular volume on the faces */
+ vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
+ /* 12/2 quarter pie slices on the edges */
+ vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
+ /* 4 octants of a sphere */
+ vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
+
+ nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
+
+ /* Subtract the non-local pair count */
+ nsp_est -= nsp_est_nl;
+
+ if (debug)
+ {
+ fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
+ nsp_est, nsp_est_nl);
+ }
+ }
+ else
+ {
+ nsp_est = nsp_est_nl;
+ }
+
+ if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
+ {
+ /* We don't need to worry */
+ nsubpair_max = -1;
+ }
+ else
+ {
+ /* Thus the (average) maximum j-list size should be as follows */
+ nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));
+
+ /* Since the target value is a maximum, not an average (this avoids
+ * high outliers, which lead to load imbalance), we add half the
+ * number of pairs in a cj4 block to get the average about right.
+ */
+ nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
+ nsp_est, nsubpair_max);
+ }
+
+ return nsubpair_max;
+}
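+
+/* Worked example of the balancing target above (illustrative numbers,
+ * assuming GPU_NSUBCELL = 8 and NBNXN_GPU_JGROUP_SIZE = 4): with
+ * nsp_est = 50000 and min_ci_balanced = 400 we get
+ * nsubpair_max = max(1, (int)(50000/400 + 0.5)) + 8*4/2 = 125 + 16 = 141.
+ */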
+
+/* Debug list print function */
+static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
+{
+ int i, j;
+
+ for (i = 0; i < nbl->nci; i++)
+ {
+ fprintf(fp, "ci %4d shift %2d ncj %3d\n",
+ nbl->ci[i].ci, nbl->ci[i].shift,
+ nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
+
+ for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
+ {
+ fprintf(fp, " cj %5d imask %x\n",
+ nbl->cj[j].cj,
+ nbl->cj[j].excl);
+ }
+ }
+}
+
+/* Debug list print function */
+static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
+{
+ int i, j4, j, ncp, si;
+
+ for (i = 0; i < nbl->nsci; i++)
+ {
+ fprintf(fp, "ci %4d shift %2d ncj4 %2d\n",
+ nbl->sci[i].sci, nbl->sci[i].shift,
+ nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
+
+ ncp = 0;
+ for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
+ {
+ for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
+ {
+ fprintf(fp, " sj %5d imask %x\n",
+ nbl->cj4[j4].cj[j],
+ nbl->cj4[j4].imei[0].imask);
+ for (si = 0; si < GPU_NSUBCELL; si++)
+ {
+ if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
+ {
+ ncp++;
+ }
+ }
+ }
+ }
+ fprintf(fp, "ci %4d shift %2d ncj4 %2d ncp %3d\n",
+ nbl->sci[i].sci, nbl->sci[i].shift,
+ nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
+ ncp);
+ }
+}
+
+/* Combine pair lists *nbl generated on multiple threads into nblc */
+static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
+ nbnxn_pairlist_t *nblc)
+{
+ int nsci, ncj4, nexcl;
+ int n, i;
+
+ if (nblc->bSimple)
+ {
+ gmx_incons("combine_nblists does not support simple lists");
+ }
+
+ nsci = nblc->nsci;
+ ncj4 = nblc->ncj4;
+ nexcl = nblc->nexcl;
+ for (i = 0; i < nnbl; i++)
+ {
+ nsci += nbl[i]->nsci;
+ ncj4 += nbl[i]->ncj4;
+ nexcl += nbl[i]->nexcl;
+ }
+
+ if (nsci > nblc->sci_nalloc)
+ {
+ nb_realloc_sci(nblc, nsci);
+ }
+ if (ncj4 > nblc->cj4_nalloc)
+ {
+ nblc->cj4_nalloc = over_alloc_small(ncj4);
+ nbnxn_realloc_void((void **)&nblc->cj4,
+ nblc->ncj4*sizeof(*nblc->cj4),
+ nblc->cj4_nalloc*sizeof(*nblc->cj4),
+ nblc->alloc, nblc->free);
+ }
+ if (nexcl > nblc->excl_nalloc)
+ {
+ nblc->excl_nalloc = over_alloc_small(nexcl);
+ nbnxn_realloc_void((void **)&nblc->excl,
+ nblc->nexcl*sizeof(*nblc->excl),
+ nblc->excl_nalloc*sizeof(*nblc->excl),
+ nblc->alloc, nblc->free);
+ }
+
+ /* Each thread should copy its own data to the combined arrays,
+ * as otherwise data will go back and forth between different caches.
+ */
+#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ for (n = 0; n < nnbl; n++)
+ {
+ int sci_offset;
+ int cj4_offset;
+ int ci_offset;
+ int excl_offset;
+ int i, j4;
+ const nbnxn_pairlist_t *nbli;
+
+ /* Determine the offset in the combined data for our thread */
+ sci_offset = nblc->nsci;
+ cj4_offset = nblc->ncj4;
+ ci_offset = nblc->nci_tot;
+ excl_offset = nblc->nexcl;
+
+ for (i = 0; i < n; i++)
+ {
+ sci_offset += nbl[i]->nsci;
+ cj4_offset += nbl[i]->ncj4;
+ ci_offset += nbl[i]->nci_tot;
+ excl_offset += nbl[i]->nexcl;
+ }
+
+ nbli = nbl[n];
+
+ for (i = 0; i < nbli->nsci; i++)
+ {
+ nblc->sci[sci_offset+i] = nbli->sci[i];
+ nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
+ nblc->sci[sci_offset+i].cj4_ind_end += cj4_offset;
+ }
+
+ for (j4 = 0; j4 < nbli->ncj4; j4++)
+ {
+ nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
+ nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
+ nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
+ }
+
+ for (j4 = 0; j4 < nbli->nexcl; j4++)
+ {
+ nblc->excl[excl_offset+j4] = nbli->excl[j4];
+ }
+ }
+
+ for (n = 0; n < nnbl; n++)
+ {
+ nblc->nsci += nbl[n]->nsci;
+ nblc->ncj4 += nbl[n]->ncj4;
+ nblc->nci_tot += nbl[n]->nci_tot;
+ nblc->nexcl += nbl[n]->nexcl;
+ }
+}
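+
+/* Illustrative offset example (hypothetical counts): if nblc already
+ * holds 2 sci entries and threads 0 and 1 produced 5 and 3 entries,
+ * thread 1 copies its sci data to positions 2+5 = 7..9, with its
+ * cj4_ind_start/end shifted by the summed cj4 counts in the same way.
+ */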
+
+/* Returns the next ci to be processed by our thread */
+static gmx_bool next_ci(const nbnxn_grid_t *grid,
+ int conv,
+ int nth, int ci_block,
+ int *ci_x, int *ci_y,
+ int *ci_b, int *ci)
+{
+ (*ci_b)++;
+ (*ci)++;
+
+ if (*ci_b == ci_block)
+ {
+ /* Jump to the next block assigned to this task */
+ *ci += (nth - 1)*ci_block;
+ *ci_b = 0;
+ }
+
+ if (*ci >= grid->nc*conv)
+ {
+ return FALSE;
+ }
+
+ while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
+ {
+ *ci_y += 1;
+ if (*ci_y == grid->ncy)
+ {
+ *ci_x += 1;
+ *ci_y = 0;
+ }
+ }
+
+ return TRUE;
+}
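+
+/* Illustrative: with ci_block = 2, nth = 3 threads and conv = 1,
+ * thread 0 visits cells 0,1, 6,7, 12,13, ..., thread 1 visits 2,3, 8,9,
+ * ..., i.e. each thread walks blocks of 2 cells spaced nth blocks apart.
+ */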
+
+/* Returns the distance^2 for which we put cell pairs in the list
+ * without checking atom pair distances. This is usually < rlist^2.
+ */
+static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
+ const nbnxn_grid_t *gridj,
+ real rlist,
+ gmx_bool simple)
+{
+ /* If the distance between two sub-cell bounding boxes is less
+ * than this distance, do not check the distance between
+ * all particle pairs in the sub-cell, since then it is likely
+ * that the box pair has atom pairs within the cut-off.
+ * We use the nblist cut-off minus 0.5 times the average x/y diagonal
+ * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
+ * Using more than 0.5 gains at most 0.5%.
+ * If forces are calculated more than twice, the performance gain
+ * in the force calculation outweighs the cost of checking.
+ * Note that with sub-cell lists, the atom-pair distance check
+ * is only performed when only 1 out of the 8 sub-cells is within range;
+ * this is because the GPU is much faster than the CPU.
+ */
+ real bbx, bby;
+ real rbb2;
+
+ bbx = 0.5*(gridi->sx + gridj->sx);
+ bby = 0.5*(gridi->sy + gridj->sy);
+ if (!simple)
+ {
+ bbx /= GPU_NSUBCELL_X;
+ bby /= GPU_NSUBCELL_Y;
+ }
+
+ rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
+
+#ifndef GMX_DOUBLE
+ return rbb2;
+#else
+ return (float)((1+GMX_FLOAT_EPS)*rbb2);
+#endif
+}
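+
+/* Worked example (illustrative numbers): with rlist = 1.0 nm and average
+ * sub-cell spacings sx = sy = 0.25 nm, the x/y diagonal is
+ * sqrt(2)*0.25 ~= 0.354 nm, so rbb2 = (1.0 - 0.5*0.354)^2 ~= 0.68 nm^2;
+ * box pairs closer than this are listed without the atom-pair check.
+ */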
+
+static int get_ci_block_size(const nbnxn_grid_t *gridi,
+ gmx_bool bDomDec, int nth)
+{
+ const int ci_block_enum = 5;
+ const int ci_block_denom = 11;
+ const int ci_block_min_atoms = 16;
+ int ci_block;
+
+ /* Here we decide how to distribute the blocks over the threads.
+ * We use prime numbers to try to avoid the grid size becoming
+ * a multiple of the number of threads, which would lead to some
+ * threads getting "inner" pairs and others getting boundary pairs,
+ * which in turn would lead to load imbalance between threads.
+ * Set the block size as 5/11/ntask times the average number of cells
+ * in a y,z slab. This should ensure a quite uniform distribution
+ * of the grid parts of the different threads along all three grid
+ * zone boundaries with 3D domain decomposition. At the same time
+ * the blocks will not become too small.
+ */
+ ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
+
+ /* Ensure the blocks are not too small: avoids cache invalidation */
+ if (ci_block*gridi->na_sc < ci_block_min_atoms)
+ {
+ ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
+ }
+
+ /* Without domain decomposition
+ * or with fewer than 3 blocks per task, divide into nth blocks.
+ */
+ if (!bDomDec || ci_block*3*nth > gridi->nc)
+ {
+ ci_block = (gridi->nc + nth - 1)/nth;
+ }
+
+ return ci_block;
+}
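+
+/* Worked example (illustrative numbers): a grid with nc = 4000 cells,
+ * ncx = 20 columns and nth = 8 threads gives
+ * ci_block = 4000*5/(11*20*8) = 11 cells per block. With domain
+ * decomposition this is kept, since 11*3*8 = 264 <= 4000; without it
+ * the grid is simply divided into 8 blocks of 500 cells each.
+ */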
+
+/* Generates the part of pair-list nbl assigned to our thread */
+static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
+ const nbnxn_grid_t *gridi,
+ const nbnxn_grid_t *gridj,
+ nbnxn_search_work_t *work,
+ const nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ real rlist,
+ int nb_kernel_type,
+ int ci_block,
+ gmx_bool bFBufferFlag,
+ int nsubpair_max,
+ gmx_bool progBal,
+ int min_ci_balanced,
+ int th, int nth,
+ nbnxn_pairlist_t *nbl)
+{
+ int na_cj_2log;
+ matrix box;
+ real rl2;
+ float rbb2;
+ int d;
+ int ci_b, ci, ci_x, ci_y, ci_xy, cj;
+ ivec shp;
+ int tx, ty, tz;
+ int shift;
+ gmx_bool bMakeList;
+ real shx, shy, shz;
+ int conv_i, cell0_i;
+ const float *bb_i, *bbcz_i, *bbcz_j;
+ const int *flags_i;
+ real bx0, bx1, by0, by1, bz0, bz1;
+ real bz1_frac;
+ real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
+ int cxf, cxl, cyf, cyf_x, cyl;
+ int cx, cy;
+ int c0, c1, cs, cf, cl;
+ int ndistc;
+ int ncpcheck;
+ int gridi_flag_shift = 0, gridj_flag_shift = 0;
+ unsigned *gridj_flag = NULL;
+ int ncj_old_i, ncj_old_j;
+
+ nbs_cycle_start(&work->cc[enbsCCsearch]);
+
+ if (gridj->bSimple != nbl->bSimple)
+ {
+ gmx_incons("Grid incompatible with pair-list");
+ }
+
+ sync_work(nbl);
+ nbl->na_sc = gridj->na_sc;
+ nbl->na_ci = gridj->na_c;
+ nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
+ na_cj_2log = get_2log(nbl->na_cj);
+
+ nbl->rlist = rlist;
+
+ if (bFBufferFlag)
+ {
+ /* Determine conversion of clusters to flag blocks */
+ gridi_flag_shift = 0;
+ while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
+ {
+ gridi_flag_shift++;
+ }
+ gridj_flag_shift = 0;
+ while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
+ {
+ gridj_flag_shift++;
+ }
+
+ gridj_flag = work->buffer_flags.flag;
+ }
+
+ copy_mat(nbs->box, box);
+
+ rl2 = nbl->rlist*nbl->rlist;
+
+ rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);
+
+ if (debug)
+ {
+ fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
+ }
+
+ /* Set the shift range */
+ for (d = 0; d < DIM; d++)
+ {
+ /* Check if we need periodicity shifts.
+ * Without PBC, or with domain decomposition along this dimension,
+ * we don't need them.
+ */
+ if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
+ {
+ shp[d] = 0;
+ }
+ else
+ {
+ if (d == XX &&
+ box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
+ {
+ shp[d] = 2;
+ }
+ else
+ {
+ shp[d] = 1;
+ }
+ }
+ }
+
+ if (nbl->bSimple && !gridi->bSimple)
+ {
+ conv_i = gridi->na_sc/gridj->na_sc;
+ bb_i = gridi->bb_simple;
+ bbcz_i = gridi->bbcz_simple;
+ flags_i = gridi->flags_simple;
+ }
+ else
+ {
+ conv_i = 1;
+ bb_i = gridi->bb;
+ bbcz_i = gridi->bbcz;
+ flags_i = gridi->flags;
+ }
+ cell0_i = gridi->cell0*conv_i;
+
+ bbcz_j = gridj->bbcz;
+
+ if (conv_i != 1)
+ {
+ /* Blocks of the conversion factor - 1 give a large repeat count
+ * combined with a small block size. This should result in good
+ * load balancing for both small and large domains.
+ */
+ ci_block = conv_i - 1;
+ }
+ if (debug)
+ {
+ fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
+ gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
+ }
+
+ ndistc = 0;
+ ncpcheck = 0;
+
+ /* Initially set ci_b and ci to 1 before where we want them to start,
+ * as they will both be incremented in next_ci.
+ */
+ ci_b = -1;
+ ci = th*ci_block - 1;
+ ci_x = 0;
+ ci_y = 0;
+ while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
+ {
+ if (nbl->bSimple && flags_i[ci] == 0)
+ {
+ continue;
+ }
+
+ ncj_old_i = nbl->ncj;
+
+ d2cx = 0;
+ if (gridj != gridi && shp[XX] == 0)
+ {
+ if (nbl->bSimple)
+ {
+ bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
+ }
+ else
+ {
+ bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
+ }
+ if (bx1 < gridj->c0[XX])
+ {
+ d2cx = sqr(gridj->c0[XX] - bx1);
+
+ if (d2cx >= rl2)
+ {
+ continue;
+ }
+ }
+ }
+
+ ci_xy = ci_x*gridi->ncy + ci_y;
+
+ /* Loop over shift vectors in three dimensions */
+ for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
+ {
+ shz = tz*box[ZZ][ZZ];
+
+ bz0 = bbcz_i[ci*NNBSBB_D ] + shz;
+ bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
+
+ if (tz == 0)
+ {
+ d2z = 0;
+ }
+ else if (tz < 0)
+ {
+ d2z = sqr(bz1);
+ }
+ else
+ {
+ d2z = sqr(bz0 - box[ZZ][ZZ]);
+ }
+
+ d2z_cx = d2z + d2cx;
+
+ if (d2z_cx >= rl2)
+ {
+ continue;
+ }
+
+ bz1_frac =
+ bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
+ if (bz1_frac < 0)
+ {
+ bz1_frac = 0;
+ }
+ /* The check with bz1_frac close to or larger than 1 comes later */
+
+ for (ty = -shp[YY]; ty <= shp[YY]; ty++)
+ {
+ shy = ty*box[YY][YY] + tz*box[ZZ][YY];
+
+ if (nbl->bSimple)
+ {
+ by0 = bb_i[ci*NNBSBB_B +YY] + shy;
+ by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
+ }
+ else
+ {
+ by0 = gridi->c0[YY] + (ci_y )*gridi->sy + shy;
+ by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
+ }
+
+ get_cell_range(by0, by1,
+ gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
+ d2z_cx, rl2,
+ &cyf, &cyl);
+
+ if (cyf > cyl)
+ {
+ continue;
+ }
+
+ d2z_cy = d2z;
+ if (by1 < gridj->c0[YY])
+ {
+ d2z_cy += sqr(gridj->c0[YY] - by1);
+ }
+ else if (by0 > gridj->c1[YY])
+ {
+ d2z_cy += sqr(by0 - gridj->c1[YY]);
+ }
+
+ for (tx = -shp[XX]; tx <= shp[XX]; tx++)
+ {
+ shift = XYZ2IS(tx, ty, tz);
+
+#ifdef NBNXN_SHIFT_BACKWARD
+ if (gridi == gridj && shift > CENTRAL)
+ {
+ continue;
+ }
+#endif
+
+ shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
+
+ if (nbl->bSimple)
+ {
+ bx0 = bb_i[ci*NNBSBB_B +XX] + shx;
+ bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
+ }
+ else
+ {
+ bx0 = gridi->c0[XX] + (ci_x )*gridi->sx + shx;
+ bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
+ }
+
+ get_cell_range(bx0, bx1,
+ gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
+ d2z_cy, rl2,
+ &cxf, &cxl);
+
+ if (cxf > cxl)
+ {
+ continue;
+ }
+
+ if (nbl->bSimple)
+ {
+ new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci]);
+ }
+ else
+ {
+ new_sci_entry(nbl, cell0_i+ci, shift);
+ }
+
+#ifndef NBNXN_SHIFT_BACKWARD
+ if (cxf < ci_x)
+#else
+ if (shift == CENTRAL && gridi == gridj &&
+ cxf < ci_x)
+#endif
+ {
+ /* Leave the pairs with i > j.
+ * x is the major index, so skip half of it.
+ */
+ cxf = ci_x;
+ }
+
+ if (nbl->bSimple)
+ {
+ set_icell_bb_simple(bb_i, ci, shx, shy, shz,
+ nbl->work->bb_ci);
+ }
+ else
+ {
+ set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
+ nbl->work->bb_ci);
+ }
+
+ nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
+ gridi->na_c, nbat->xstride, nbat->x,
+ nbl->work);
+
+ for (cx = cxf; cx <= cxl; cx++)
+ {
+ d2zx = d2z;
+ if (gridj->c0[XX] + cx*gridj->sx > bx1)
+ {
+ d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
+ }
+ else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
+ {
+ d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
+ }
+
+#ifndef NBNXN_SHIFT_BACKWARD
+ if (gridi == gridj &&
+ cx == 0 && cyf < ci_y)
+#else
+ if (gridi == gridj &&
+ cx == 0 && shift == CENTRAL && cyf < ci_y)
+#endif
+ {
+ /* Leave the pairs with i > j.
+ * Skip half of y when i and j have the same x.
+ */
+ cyf_x = ci_y;
+ }
+ else
+ {
+ cyf_x = cyf;
+ }
+
+ for (cy = cyf_x; cy <= cyl; cy++)
+ {
+ c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
+ c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
+#ifdef NBNXN_SHIFT_BACKWARD
+ if (gridi == gridj &&
+ shift == CENTRAL && c0 < ci)
+ {
+ c0 = ci;
+ }
+#endif
+
+ d2zxy = d2zx;
+ if (gridj->c0[YY] + cy*gridj->sy > by1)
+ {
+ d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
+ }
+ else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
+ {
+ d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
+ }
+ if (c1 > c0 && d2zxy < rl2)
+ {
+ cs = c0 + (int)(bz1_frac*(c1 - c0));
+ if (cs >= c1)
+ {
+ cs = c1 - 1;
+ }
+
+ d2xy = d2zxy - d2z;
+
+ /* Find the lowest cell that can possibly
+ * be within range.
+ */
+ cf = cs;
+ while (cf > c0 &&
+ (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
+ d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
+ {
+ cf--;
+ }
+
+ /* Find the highest cell that can possibly
+ * be within range.
+ */
+ cl = cs;
+ while (cl < c1-1 &&
+ (bbcz_j[cl*NNBSBB_D] <= bz1 ||
+ d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
+ {
+ cl++;
+ }
+
+#ifdef NBNXN_REFCODE
+ {
+ /* Simple reference code, for debugging,
+ * overrides the more complex code above.
+ */
+ int k;
+ cf = c1;
+ cl = -1;
+ for (k = c0; k < c1; k++)
+ {
+ if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
+ bb+k*NNBSBB_B) < rl2 &&
+ k < cf)
+ {
+ cf = k;
+ }
+ if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
+ bb+k*NNBSBB_B) < rl2 &&
+ k > cl)
+ {
+ cl = k;
+ }
+ }
+ }
+#endif
+
+ if (gridi == gridj)
+ {
+ /* We want each atom/cell pair only once,
+ * only use cj >= ci.
+ */
+#ifndef NBNXN_SHIFT_BACKWARD
+ cf = max(cf, ci);
+#else
+ if (shift == CENTRAL)
+ {
+ cf = max(cf, ci);
+ }
+#endif
+ }
+
+ if (cf <= cl)
+ {
+ /* For f buffer flags with simple lists */
+ ncj_old_j = nbl->ncj;
+
+ switch (nb_kernel_type)
+ {
+ case nbnxnk4x4_PlainC:
+ check_subcell_list_space_simple(nbl, cl-cf+1);
+
+ make_cluster_list_simple(gridj,
+ nbl, ci, cf, cl,
+ (gridi == gridj && shift == CENTRAL),
+ nbat->x,
+ rl2, rbb2,
+ &ndistc);
+ break;
+#ifdef GMX_NBNXN_SIMD_4XN
+ case nbnxnk4xN_SIMD_4xN:
+ check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
+ make_cluster_list_simd_4xn(gridj,
+ nbl, ci, cf, cl,
+ (gridi == gridj && shift == CENTRAL),
+ nbat->x,
+ rl2, rbb2,
+ &ndistc);
+ break;
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case nbnxnk4xN_SIMD_2xNN:
+ check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
+ make_cluster_list_simd_2xnn(gridj,
+ nbl, ci, cf, cl,
+ (gridi == gridj && shift == CENTRAL),
+ nbat->x,
+ rl2, rbb2,
+ &ndistc);
+ break;
+#endif
+ case nbnxnk8x8x8_PlainC:
+ case nbnxnk8x8x8_CUDA:
+ check_subcell_list_space_supersub(nbl, cl-cf+1);
+ for (cj = cf; cj <= cl; cj++)
+ {
+ make_cluster_list_supersub(gridi, gridj,
+ nbl, ci, cj,
+ (gridi == gridj && shift == CENTRAL && ci == cj),
+ nbat->xstride, nbat->x,
+ rl2, rbb2,
+ &ndistc);
+ }
+ break;
+ }
+ ncpcheck += cl - cf + 1;
+
+ if (bFBufferFlag && nbl->ncj > ncj_old_j)
+ {
+ int cbf, cbl, cb;
+
+ cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
+ cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
+ for (cb = cbf; cb <= cbl; cb++)
+ {
+ gridj_flag[cb] = 1U<<th;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* Set the exclusions for this ci list */
+ if (nbl->bSimple)
+ {
+ set_ci_top_excls(nbs,
+ nbl,
+ shift == CENTRAL && gridi == gridj,
+ gridj->na_c_2log,
+ na_cj_2log,
+ &(nbl->ci[nbl->nci]),
+ excl);
+ }
+ else
+ {
+ set_sci_top_excls(nbs,
+ nbl,
+ shift == CENTRAL && gridi == gridj,
+ gridj->na_c_2log,
+ &(nbl->sci[nbl->nsci]),
+ excl);
+ }
+
+ /* Close this ci list */
+ if (nbl->bSimple)
+ {
+ close_ci_entry_simple(nbl);
+ }
+ else
+ {
+ close_ci_entry_supersub(nbl,
+ nsubpair_max,
+ progBal, min_ci_balanced,
+ th, nth);
+ }
+ }
+ }
+ }
+
+ if (bFBufferFlag && nbl->ncj > ncj_old_i)
+ {
+ work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
+ }
+ }
+
+ work->ndistc = ndistc;
+
+ nbs_cycle_stop(&work->cc[enbsCCsearch]);
+
+ if (debug)
+ {
+ fprintf(debug, "number of distance checks %d\n", ndistc);
+ fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
+ ncpcheck);
+
+ if (nbl->bSimple)
+ {
+ print_nblist_statistics_simple(debug, nbl, nbs, rlist);
+ }
+ else
+ {
+ print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
+ }
+
+ }
+}
+
+static void reduce_buffer_flags(const nbnxn_search_t nbs,
+ int nsrc,
+ const nbnxn_buffer_flags_t *dest)
+{
+ int s, b;
+ const unsigned *flag;
+
+ for (s = 0; s < nsrc; s++)
+ {
+ flag = nbs->work[s].buffer_flags.flag;
+
+ for (b = 0; b < dest->nflag; b++)
+ {
+ dest->flag[b] |= flag[b];
+ }
+ }
+}
+
+static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
+{
+ int nelem, nkeep, ncopy, nred, b, c, out;
+
+ nelem = 0;
+ nkeep = 0;
+ ncopy = 0;
+ nred = 0;
+ for (b = 0; b < flags->nflag; b++)
+ {
+ if (flags->flag[b] == 1)
+ {
+ /* Only flag 0 is set, no copy or reduction required */
+ nelem++;
+ nkeep++;
+ }
+ else if (flags->flag[b] > 0)
+ {
+ c = 0;
+ for (out = 0; out < nout; out++)
+ {
+ if (flags->flag[b] & (1U<<out))
+ {
+ c++;
+ }
+ }
+ nelem += c;
+ if (c == 1)
+ {
+ ncopy++;
+ }
+ else
+ {
+ nred += c;
+ }
+ }
+ }
+
+ fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
+ flags->nflag, nout,
+ nelem/(double)(flags->nflag),
+ nkeep/(double)(flags->nflag),
+ ncopy/(double)(flags->nflag),
+ nred/(double)(flags->nflag));
+}
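+
+/* Example of the flag accounting above (illustrative): with nout = 4
+ * output buffers, flags->flag[b] == 0x5 means outputs 0 and 2 wrote to
+ * block b, so c = 2 and the block needs a reduction over 2 buffers;
+ * flag[b] == 0x4 (c = 1) only needs a copy, and flag[b] == 0x1 needs
+ * neither, as the data is already in output 0.
+ */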
+
+/* Perform a count (linear) sort to sort the smaller lists to the end.
+ * This avoids load imbalance on the GPU, as large lists will be
+ * scheduled and executed first and the smaller lists later.
+ * Load balancing between multi-processors only happens at the end
+ * and there smaller lists lead to more effective load balancing.
+ * The sorting is done on the cj4 count, not on the actual pair counts.
+ * Not only does this make the sort faster, but it also results in
+ * better load balancing than using a list sorted on exact load.
+ * This function swaps the sci pointers in the pair list to avoid a copy operation.
+ */
+static void sort_sci(nbnxn_pairlist_t *nbl)
+{
+ nbnxn_list_work_t *work;
+ int m, i, s, s0, s1;
+ nbnxn_sci_t *sci_sort;
+
+ if (nbl->ncj4 <= nbl->nsci)
+ {
+ /* nsci = 0 or all sci have size 1, sorting won't change the order */
+ return;
+ }
+
+ work = nbl->work;
+
+ /* We will distinguish differences up to double the average */
+ m = (2*nbl->ncj4)/nbl->nsci;
+
+ if (m + 1 > work->sort_nalloc)
+ {
+ work->sort_nalloc = over_alloc_large(m + 1);
+ srenew(work->sort, work->sort_nalloc);
+ }
+
+ if (work->sci_sort_nalloc != nbl->sci_nalloc)
+ {
+ work->sci_sort_nalloc = nbl->sci_nalloc;
+ nbnxn_realloc_void((void **)&work->sci_sort,
+ 0,
+ work->sci_sort_nalloc*sizeof(*work->sci_sort),
+ nbl->alloc, nbl->free);
+ }
+
+ /* Count the entries of each size */
+ for (i = 0; i <= m; i++)
+ {
+ work->sort[i] = 0;
+ }
+ for (s = 0; s < nbl->nsci; s++)
+ {
+ i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
+ work->sort[i]++;
+ }
+ /* Calculate the offset for each count */
+ s0 = work->sort[m];
+ work->sort[m] = 0;
+ for (i = m - 1; i >= 0; i--)
+ {
+ s1 = work->sort[i];
+ work->sort[i] = work->sort[i + 1] + s0;
+ s0 = s1;
+ }
+
+ /* Sort entries directly into place */
+ sci_sort = work->sci_sort;
+ for (s = 0; s < nbl->nsci; s++)
+ {
+ i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
+ sci_sort[work->sort[i]++] = nbl->sci[s];
+ }
+
+ /* Swap the sci pointers so we use the new, sorted list */
+ work->sci_sort = nbl->sci;
+ nbl->sci = sci_sort;
+}
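+
+/* A minimal sketch of the descending count sort above on plain sizes,
+ * with the same capping at m; example_sort_desc and its arguments are
+ * hypothetical, not part of GROMACS. With sizes {3, 1, 4, 1, 2} and
+ * m = 4, the entries are emitted in size order 4, 3, 2, 1, 1: large
+ * lists first, equal sizes kept in their original order.
+ */
+static void example_sort_desc(const int *size, int n, int m,
+                              int *count, int *order)
+{
+    int i, s, s0, s1;
+
+    /* Count the entries of each (capped) size */
+    for (i = 0; i <= m; i++)
+    {
+        count[i] = 0;
+    }
+    for (s = 0; s < n; s++)
+    {
+        count[size[s] < m ? size[s] : m]++;
+    }
+    /* Turn the counts into offsets, largest size class first */
+    s0       = count[m];
+    count[m] = 0;
+    for (i = m - 1; i >= 0; i--)
+    {
+        s1       = count[i];
+        count[i] = count[i + 1] + s0;
+        s0       = s1;
+    }
+    /* Scatter the entries directly into their sorted positions */
+    for (s = 0; s < n; s++)
+    {
+        i                 = size[s] < m ? size[s] : m;
+        order[count[i]++] = s;
+    }
+}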
+
+/* Make a local or non-local pair-list, depending on iloc */
+void nbnxn_make_pairlist(const nbnxn_search_t nbs,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ real rlist,
+ int min_ci_balanced,
+ nbnxn_pairlist_set_t *nbl_list,
+ int iloc,
+ int nb_kernel_type,
+ t_nrnb *nrnb)
+{
+ nbnxn_grid_t *gridi, *gridj;
+ gmx_bool bGPUCPU;
+ int nzi, zi, zj0, zj1, zj;
+ int nsubpair_max;
+ int th;
+ int nnbl;
+ nbnxn_pairlist_t **nbl;
+ int ci_block;
+ gmx_bool CombineNBLists;
+ gmx_bool progBal;
+ int np_tot, np_noq, np_hlj, nap;
+
+ /* Check if we are running hybrid GPU + CPU nbnxn mode */
+ bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
+
+ nnbl = nbl_list->nnbl;
+ nbl = nbl_list->nbl;
+ CombineNBLists = nbl_list->bCombined;
+
+ if (debug)
+ {
+ fprintf(debug, "ns making %d nblists\n", nnbl);
+ }
+
+ nbat->bUseBufferFlags = (nbat->nout > 1);
+ /* We should re-init the flags before making the first list */
+ if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
+ {
+ init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
+ }
+
+ if (nbl_list->bSimple)
+ {
+ switch (nb_kernel_type)
+ {
+#ifdef GMX_NBNXN_SIMD_4XN
+ case nbnxnk4xN_SIMD_4xN:
+ nbs->icell_set_x = icell_set_x_simd_4xn;
+ break;
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case nbnxnk4xN_SIMD_2xNN:
+ nbs->icell_set_x = icell_set_x_simd_2xnn;
+ break;
+#endif
+ default:
+ nbs->icell_set_x = icell_set_x_simple;
+ break;
+ }
+ }
+ else
+ {
+#ifdef NBNXN_SEARCH_BB_SSE
+ nbs->icell_set_x = icell_set_x_supersub_sse8;
+#else
+ nbs->icell_set_x = icell_set_x_supersub;
+#endif
+ }
+
+ if (LOCAL_I(iloc))
+ {
+ /* Only zone (grid) 0 vs 0 */
+ nzi = 1;
+ zj0 = 0;
+ zj1 = 1;
+ }
+ else
+ {
+ nzi = nbs->zones->nizone;
+ }
+
+ if (!nbl_list->bSimple && min_ci_balanced > 0)
+ {
+ nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
+ }
+ else
+ {
+ nsubpair_max = 0;
+ }
+
+ /* Clear all pair-lists */
+ for (th = 0; th < nnbl; th++)
+ {
+ clear_pairlist(nbl[th]);
+ }
+
+ for (zi = 0; zi < nzi; zi++)
+ {
+ gridi = &nbs->grid[zi];
+
+ if (NONLOCAL_I(iloc))
+ {
+ zj0 = nbs->zones->izone[zi].j0;
+ zj1 = nbs->zones->izone[zi].j1;
+ if (zi == 0)
+ {
+ zj0++;
+ }
+ }
+ for (zj = zj0; zj < zj1; zj++)
+ {
+ gridj = &nbs->grid[zj];
+
+ if (debug)
+ {
+ fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
+ }
+
+ nbs_cycle_start(&nbs->cc[enbsCCsearch]);
+
+ if (nbl[0]->bSimple && !gridi->bSimple)
+ {
+ /* Hybrid list, determine blocking later */
+ ci_block = 0;
+ }
+ else
+ {
+ ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
+ }
+
+#pragma omp parallel for num_threads(nnbl) schedule(static)
+ for (th = 0; th < nnbl; th++)
+ {
+ /* Re-init the thread-local work flag data before making
+ * the first list (not an elegant conditional).
+ */
+ if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
+ (bGPUCPU && zi == 0 && zj == 1)))
+ {
+ init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
+ }
+
+ if (CombineNBLists && th > 0)
+ {
+ clear_pairlist(nbl[th]);
+ }
+
+ /* With GPU: generate progressively smaller lists for
+ * load balancing for local only or non-local with 2 zones.
+ */
+ progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
+
+ /* Divide the i super cells equally over the nblists */
+ nbnxn_make_pairlist_part(nbs, gridi, gridj,
+ &nbs->work[th], nbat, excl,
+ rlist,
+ nb_kernel_type,
+ ci_block,
+ nbat->bUseBufferFlags,
+ nsubpair_max,
+ progBal, min_ci_balanced,
+ th, nnbl,
+ nbl[th]);
+ }
+ nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
+
+ np_tot = 0;
+ np_noq = 0;
+ np_hlj = 0;
+ for (th = 0; th < nnbl; th++)
+ {
+ inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
+
+ if (nbl_list->bSimple)
+ {
+ np_tot += nbl[th]->ncj;
+ np_noq += nbl[th]->work->ncj_noq;
+ np_hlj += nbl[th]->work->ncj_hlj;
+ }
+ else
+ {
+ /* This count ignores potential subsequent pair pruning */
+ np_tot += nbl[th]->nci_tot;
+ }
+ }
+ nap = nbl[0]->na_ci*nbl[0]->na_cj;
+ nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
+ nbl_list->natpair_lj = np_noq*nap;
+ nbl_list->natpair_q = np_hlj*nap/2;
+
+ if (CombineNBLists && nnbl > 1)
+ {
+ nbs_cycle_start(&nbs->cc[enbsCCcombine]);
+
+ combine_nblists(nnbl-1, nbl+1, nbl[0]);
+
+ nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
+ }
+ }
+ }
+
+ if (!nbl_list->bSimple)
+ {
+ /* Sort the entries on size, large ones first */
+ if (CombineNBLists || nnbl == 1)
+ {
+ sort_sci(nbl[0]);
+ }
+ else
+ {
+#pragma omp parallel for num_threads(nnbl) schedule(static)
+ for (th = 0; th < nnbl; th++)
+ {
+ sort_sci(nbl[th]);
+ }
+ }
+ }
+
+ if (nbat->bUseBufferFlags)
+ {
+ reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
+ }
+
+ /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
+ if (LOCAL_I(iloc))
+ {
+ nbs->search_count++;
+ }
+ if (nbs->print_cycles &&
+ (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
+ nbs->search_count % 100 == 0)
+ {
+ nbs_cycle_print(stderr, nbs);
+ }
+
+ if (debug && (CombineNBLists && nnbl > 1))
+ {
+ if (nbl[0]->bSimple)
+ {
+ print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
+ }
+ else
+ {
+ print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
+ }
+ }
+
+ if (debug)
+ {
+ if (gmx_debug_at)
+ {
+ if (nbl[0]->bSimple)
+ {
+ print_nblist_ci_cj(debug, nbl[0]);
+ }
+ else
+ {
+ print_nblist_sci_cj(debug, nbl[0]);
+ }
+ }
+
+ if (nbat->bUseBufferFlags)
+ {
+ print_reduction_cost(&nbat->buffer_flags, nnbl);
+ }
+ }
+}
--- /dev/null
- #if GMX_NBNXN_SIMD_BITWIDTH != 256
- #error "unsupported SIMD width"
- #endif
-
- #include "gmx_simd_macros.h"
-
- /* Define a few macros for half-width SIMD */
- #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
- /* Half-width SIMD real type */
- #define gmx_mm_hpr __m128
- /* Half-width SIMD operations */
- /* Load reals at half-width aligned pointer b into half-width SIMD register a */
- #define gmx_load_hpr(a,b) a = _mm_load_ps(b)
- #define gmx_set1_hpr _mm_set1_ps
- /* Load reals at half-width aligned pointer b into two halves of a */
- #define gmx_loaddh_pr(a, b) a = gmx_mm256_load4_ps(b)
- /* Store half width SIMD registers b and c in ful width register a */
- #define gmx_2hpr_to_pr(a, b, c) a = _mm256_insertf128_ps(_mm256_castps128_ps256(b), c, 0x1)
- #else
- #error "Half-width SIMD macros are not yet defined"
- #endif
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
++/* Get the half-width SIMD stuff from the kernel utils files */
++#include "nbnxn_kernels/nbnxn_kernel_simd_utils.h"
+
+
+#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE/2)
+#else
+#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
+{
+ gmx_mm_hpr a_S;
+ gmx_mm_pr a_a_S;
+
- gmx_load_hpr(a_S, a);
++ gmx_load_hpr(&a_S, a);
+
- gmx_2hpr_to_pr(a_a_S, a_S, a_S);
++ gmx_2hpr_to_pr(a_S, a_S, &a_a_S);
+
+ return a_a_S;
+}
+
+static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a, real shift)
+{
+ gmx_mm_hpr a0_S, a1_S;
+ gmx_mm_pr a0_a1_S;
+
- a0_S = gmx_set1_hpr(a[0] + shift);
- a1_S = gmx_set1_hpr(a[1] + shift);
++ gmx_set1_hpr(&a0_S, a[0] + shift);
++ gmx_set1_hpr(&a1_S, a[1] + shift);
+
- gmx_2hpr_to_pr(a0_a1_S, a0_S, a1_S);
++ gmx_2hpr_to_pr(a0_S, a1_S, &a0_a1_S);
+
+ return a0_a1_S;
+}
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_2xnn(int ci,
+ real shx, real shy, real shz,
+ int gmx_unused na_c,
+ int gmx_unused stride, const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia;
+ nbnxn_x_ci_simd_2xnn_t *x_ci;
+
+ x_ci = work->x_ci_simd_2xnn;
+
+ ia = X_IND_CI_SIMD_2XNN(ci);
+
+ x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
+ x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
+ x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
+ x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
+ x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
+ x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
+}
+
- #ifndef GMX_HAVE_SIMD_ANYTRUE
++#ifndef GMX_SIMD_HAVE_ANYTRUE
+/* Fallback function in case gmx_anytrue_pb is not present */
+static gmx_inline gmx_bool
- gmx_anytrue_2xn_pr(gmx_mm_pr bool_S)
++gmx_anytrue_2xn_pb(gmx_mm_pb bool_S)
+{
+ real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
+ gmx_bool any;
+ int s;
+
+ bools = gmx_simd_align_real(bools_array);
+
- gmx_store_pr(bools, bool_S);
++ gmx_store_pb(bools, bool_S);
+
+ any = FALSE;
+ for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
+ {
+ if (GMX_SIMD_IS_TRUE(bools[s]))
+ {
+ any = TRUE;
+ }
+ }
+
+ return any;
+}
+#endif
+
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ * for coordinates in packed format.
+ * Checks bounding box distances and possibly atom pair distances.
+ * This is an accelerated version of make_cluster_list_simple.
+ */
+static gmx_inline void
+make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int ci, int cjf, int cjl,
+ gmx_bool remove_sub_diag,
+ const real *x_j,
+ real rl2, float rbb2,
+ int *ndistc)
+{
+ const nbnxn_x_ci_simd_2xnn_t *work;
+ const float *bb_ci;
+
+ gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
+
+ gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
+ gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
+
+ gmx_mm_pr rsq_SSE0;
+ gmx_mm_pr rsq_SSE2;
+
- gmx_mm_pr wco_SSE0;
- gmx_mm_pr wco_SSE2;
- gmx_mm_pr wco_any_SSE;
++ gmx_mm_pb wco_SSE0;
++ gmx_mm_pb wco_SSE2;
++ gmx_mm_pb wco_any_SSE;
+
+ gmx_mm_pr rc2_SSE;
+
+ gmx_bool InRange;
+ float d2;
+ int xind_f, xind_l, cj;
+
+ cjf = CI_TO_CJ_SIMD_2XNN(cjf);
+ cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
+
+ work = nbl->work->x_ci_simd_2xnn;
+
+ bb_ci = nbl->work->bb_ci;
+
+ rc2_SSE = gmx_set1_pr(rl2);
+
+ InRange = FALSE;
+ while (!InRange && cjf <= cjl)
+ {
- d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++#ifdef NBNXN_SEARCH_BB_SSE
++ d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++#else
++ d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
++#endif
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
+
+ jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
+ jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
+ jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+
- wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
++ wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2);
+
- #ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++ InRange = gmx_anytrue_pb(wco_any_SSE);
+#else
- InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
++ InRange = gmx_anytrue_2xn_pb(wco_any_SSE);
+#endif
+
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjf++;
+ }
+ }
+ if (!InRange)
+ {
+ return;
+ }
+
+ InRange = FALSE;
+ while (!InRange && cjl > cjf)
+ {
- d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++#ifdef NBNXN_SEARCH_BB_SSE
++ d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++#else
++ d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
++#endif
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_l = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
+
+ jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
+ jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
+ jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+
- wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
++ wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2);
+
- #ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++ InRange = gmx_anytrue_pb(wco_any_SSE);
+#else
- InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
++ InRange = gmx_anytrue_2xn_pb(wco_any_SSE);
+#endif
+
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjl--;
+ }
+ }
+
+ if (cjf <= cjl)
+ {
+ for (cj = cjf; cj <= cjl; cj++)
+ {
+ /* Store cj and the interaction mask */
+ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
+ nbl->cj[nbl->ncj].excl = get_imask_simd_2xnn(remove_sub_diag, ci, cj);
+ nbl->ncj++;
+ }
+ /* Increase the closing index in i super-cell list */
+ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ }
+}
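+
+/* Illustrative pruning example: if ci could see j-clusters 0..7 but only
+ * 2..5 are actually within rl2, the first while loop above advances cjf
+ * from 0 to 2 and the second lowers cjl from 7 to 5, so only cj = 2..5
+ * are stored; the interior clusters are listed without further distance
+ * checks, and the non-bonded kernel applies the exact cut-off later.
+ */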
+
-
- #undef gmx_mm_hpr
- #undef gmx_load_hpr
- #undef gmx_set1_hpr
- #undef gmx_2hpr_to_pr
+#undef STRIDE_S
--- /dev/null
- #if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
- #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
- #endif
-
- #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
- #define GMX_USE_HALF_WIDTH_SIMD_HERE
- #endif
- #include "gmx_simd_macros.h"
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+
+#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE)
+#else
+#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_4xn(int ci,
+ real shx, real shy, real shz,
+ int gmx_unused na_c,
+ int gmx_unused stride, const real *x,
+ nbnxn_list_work_t *work)
+{
+ int ia;
+ nbnxn_x_ci_simd_4xn_t *x_ci;
+
+ x_ci = work->x_ci_simd_4xn;
+
+ ia = X_IND_CI_SIMD_4XN(ci);
+
+ x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx);
+ x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy);
+ x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz);
+ x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
+ x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
+ x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
+ x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
+ x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
+ x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
+ x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
+ x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
+ x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
+}
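The indexing above assumes the x/y/z-packed cluster layout: STRIDE_S x values, then STRIDE_S y values, then STRIDE_S z values per cluster. In scalar form (hypothetical helper, for illustration only):

/* Coordinates of atom k of the packed cluster starting at index ia */
static void get_cluster_atom_xyz(const real *x, int ia, int k,
                                 real *px, real *py, real *pz)
{
    *px = x[ia + 0*STRIDE_S + k];
    *py = x[ia + 1*STRIDE_S + k];
    *pz = x[ia + 2*STRIDE_S + k];
}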
+
- gmx_anytrue_4xn_pr(gmx_mm_pr bool_S)
++#ifndef GMX_SIMD_HAVE_ANYTRUE
+/* Fallback function in case gmx_anytrue_pb is not present */
+static gmx_inline gmx_bool
- gmx_store_pr(bools, bool_S);
++gmx_anytrue_4xn_pb(gmx_mm_pb bool_S)
+{
+ real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
+ gmx_bool any;
+ int s;
+
+ bools = gmx_simd_align_real(bools_array);
+
- gmx_mm_pr wco_SSE0;
- gmx_mm_pr wco_SSE1;
- gmx_mm_pr wco_SSE2;
- gmx_mm_pr wco_SSE3;
- gmx_mm_pr wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
++ gmx_store_pb(bools, bool_S);
+
+ any = FALSE;
+ for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
+ {
+ if (GMX_SIMD_IS_TRUE(bools[s]))
+ {
+ any = TRUE;
+ }
+ }
+
+ return any;
+}
+#endif
+
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ * for coordinates in packed format.
+ * Checks bounding box distances and possibly atom pair distances.
+ * This is an accelerated version of make_cluster_list_simple.
+ */
+static gmx_inline void
+make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
+ nbnxn_pairlist_t *nbl,
+ int ci, int cjf, int cjl,
+ gmx_bool remove_sub_diag,
+ const real *x_j,
+ real rl2, float rbb2,
+ int *ndistc)
+{
+ const nbnxn_x_ci_simd_4xn_t *work;
+ const float *bb_ci;
+
+ gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
+
+ gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
+ gmx_mm_pr dx_SSE1, dy_SSE1, dz_SSE1;
+ gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
+ gmx_mm_pr dx_SSE3, dy_SSE3, dz_SSE3;
+
+ gmx_mm_pr rsq_SSE0;
+ gmx_mm_pr rsq_SSE1;
+ gmx_mm_pr rsq_SSE2;
+ gmx_mm_pr rsq_SSE3;
+
- d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++ gmx_mm_pb wco_SSE0;
++ gmx_mm_pb wco_SSE1;
++ gmx_mm_pb wco_SSE2;
++ gmx_mm_pb wco_SSE3;
++ gmx_mm_pb wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+
+ gmx_mm_pr rc2_SSE;
+
+ gmx_bool InRange;
+ float d2;
+ int xind_f, xind_l, cj;
+
+ cjf = CI_TO_CJ_SIMD_4XN(cjf);
+ cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
+
+ work = nbl->work->x_ci_simd_4xn;
+
+ bb_ci = nbl->work->bb_ci;
+
+ rc2_SSE = gmx_set1_pr(rl2);
+
+ InRange = FALSE;
+ while (!InRange && cjf <= cjl)
+ {
- wco_any_SSE01 = gmx_or_pr(wco_SSE0, wco_SSE1);
- wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
- wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
++#ifdef NBNXN_SEARCH_BB_SSE
++ d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
++#else
++ d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
++#endif
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
+
+ jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
+ jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
+ jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
+
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE);
+ dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE);
+ dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE);
+ dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE);
+ dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
+
- #ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
++ wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1);
++ wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3);
++ wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
+
- InRange = gmx_anytrue_4xn_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++ InRange = gmx_anytrue_pb(wco_any_SSE);
+#else
- d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++ InRange = gmx_anytrue_4xn_pb(wco_any_SSE);
+#endif
+
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjf++;
+ }
+ }
+ if (!InRange)
+ {
+ return;
+ }
+
+ InRange = FALSE;
+ while (!InRange && cjl > cjf)
+ {
- wco_any_SSE01 = gmx_or_pr(wco_SSE0, wco_SSE1);
- wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
- wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
++#ifdef NBNXN_SEARCH_BB_SSE
++ d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
++#else
++ d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
++#endif
+ *ndistc += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rl2)
+ {
+ xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
+
+ jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
+ jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
+ jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
+
+ /* Calculate distance */
+ dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
+ dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
+ dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
+ dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE);
+ dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE);
+ dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE);
+ dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
+ dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
+ dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
+ dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE);
+ dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE);
+ dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
+ rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
+ rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
+
+ wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
+ wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
+ wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
+
- #ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
++ wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1);
++ wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3);
++ wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
+
- InRange = gmx_anytrue_4xn_pr(wco_any_SSE);
++#ifdef GMX_SIMD_HAVE_ANYTRUE
++ InRange = gmx_anytrue_pb(wco_any_SSE);
+#else
- #undef GMX_USE_HALF_WIDTH_SIMD_HERE
++ InRange = gmx_anytrue_4xn_pb(wco_any_SSE);
+#endif
+
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
+ }
+ if (!InRange)
+ {
+ cjl--;
+ }
+ }
+
+ if (cjf <= cjl)
+ {
+ for (cj = cjf; cj <= cjl; cj++)
+ {
+ /* Store cj and the interaction mask */
+ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
+ nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj);
+ nbl->ncj++;
+ }
+ /* Increase the closing index in i super-cell list */
+ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+ }
+}
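Stripped of SIMD details, the list-construction strategy above is: trim the candidate j-cluster range from both ends past clusters that cannot interact, then store the whole surviving range, including any untested interior clusters, so the j-list stays contiguous. A scalar sketch, where the predicate stands in for the bounding-box check plus, when needed, the atom-pair distance check:

typedef int (*cluster_in_range_t)(int cj, const void *ctx);

static void prune_cluster_range(int *cjf, int *cjl,
                                cluster_in_range_t in_range, const void *ctx)
{
    while (*cjf <= *cjl && !in_range(*cjf, ctx))
    {
        (*cjf)++;
    }
    while (*cjl > *cjf && !in_range(*cjl, ctx))
    {
        (*cjl)--;
    }
    /* Everything in [*cjf, *cjl] is stored; a few extra interior
     * clusters are cheaper than a fragmented list.
     */
}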
+
+#undef STRIDE_S
++
--- /dev/null
- /* Single precision, with SSE2 or higher available */
- #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROwing Monsters And Cloning Shrimps
+ */
+/* IMPORTANT FOR DEVELOPERS:
+ *
+ * Triclinic pme stuff isn't entirely trivial, and we've experienced
+ * some bugs during development (many of them due to me). To avoid
+ * this in the future, please check the following things if you make
+ * changes in this file:
+ *
+ * 1. You should obtain identical (at least to the PME precision)
+ * energies, forces, and virial for
+ * a rectangular box and a triclinic one where the z (or y) axis is
+ * tilted by a whole box side. For instance you could use these boxes:
+ *
+ * rectangular triclinic
+ * 2 0 0 2 0 0
+ * 0 2 0 0 2 0
+ * 0 0 6 2 2 6
+ *
+ * 2. You should check the energy conservation in a triclinic box.
+ *
+ * It might seem like overkill, but better safe than sorry.
+ * /Erik 001109
+ */
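As background for the test above: the GROMACS box is lower triangular (rows are the box vectors a, b, c), so fractional coordinates follow from simple back-substitution; expanding the result yields exactly the rxx/ryx/rzx-style reciprocal-box terms used in calc_interpolation_idx below. A self-contained sketch (hypothetical helper name):

/* Solve x = sx*a + sy*b + sz*c for the fractions s,
 * with a = box[0], b = box[1], c = box[2] lower triangular.
 */
static void calc_fractional(const double box[3][3], const double x[3],
                            double s[3])
{
    s[2] = x[2]/box[2][2];
    s[1] = (x[1] - s[2]*box[2][1])/box[1][1];
    s[0] = (x[0] - s[1]*box[1][0] - s[2]*box[2][0])/box[0][0];
}

For the rectangular and tilted boxes listed above, equivalent points must map to the same fractions modulo 1, which is what makes the energies directly comparable.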
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/utility/gmxmpi.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include "typedefs.h"
+#include "txtdump.h"
+#include "vec.h"
+#include "gmxcomplex.h"
+#include "smalloc.h"
+#include "futil.h"
+#include "coulomb.h"
+#include "gmx_fatal.h"
+#include "pme.h"
+#include "network.h"
+#include "physics.h"
+#include "nrnb.h"
+#include "gmx_wallcycle.h"
+#include "pdbio.h"
+#include "gmx_cyclecounter.h"
+#include "gmx_omp.h"
+#include "macros.h"
+
- #include "gmx_x86_simd_single.h"
+
- #define PME_SSE
++/* Include the SIMD macro file and then check for support */
++#include "gmx_simd_macros.h"
++#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
++/* Turn on SIMD intrinsics for PME solve */
++#define PME_SIMD
++#endif
+
- #ifdef PME_SSE
++/* SIMD spread+gather only in single precision with SSE2 or higher available.
++ * We might want to switch to use gmx_simd_macros.h, but this is somewhat
++ * complicated, as we use unaligned and/or 4-wide only loads.
++ */
++#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
++#define PME_SSE_SPREAD_GATHER
++#include <emmintrin.h>
+/* Some old AMD processors could have problems with unaligned loads+stores */
+#ifndef GMX_FAHCORE
+#define PME_SSE_UNALIGNED
+#endif
+#endif
+
+#define DFT_TOL 1e-7
+/* #define PRT_FORCE */
+/* conditions for on the fly time-measurement */
+/* #define TAKETIME (step > 1 && timesteps < 10) */
+#define TAKETIME FALSE
+
+/* #define PME_TIME_THREADS */
+
+#ifdef GMX_DOUBLE
+#define mpi_type MPI_DOUBLE
+#else
+#define mpi_type MPI_FLOAT
+#endif
+
+/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+#define GMX_CACHE_SEP 64
+
+/* We only define a maximum to be able to use local arrays without allocation.
+ * An order larger than 12 should never be needed, even for test cases.
+ * If needed it can be changed here.
+ */
+#define PME_ORDER_MAX 12
+
+/* Internal data structures */
+typedef struct {
+ int send_index0;
+ int send_nindex;
+ int recv_index0;
+ int recv_nindex;
+ int recv_size; /* Receive buffer width, used with OpenMP */
+} pme_grid_comm_t;
+
+typedef struct {
+#ifdef GMX_MPI
+ MPI_Comm mpi_comm;
+#endif
+ int nnodes, nodeid;
+ int *s2g0;
+ int *s2g1;
+ int noverlap_nodes;
+ int *send_id, *recv_id;
+ int send_size; /* Send buffer width, used with OpenMP */
+ pme_grid_comm_t *comm_data;
+ real *sendbuf;
+ real *recvbuf;
+} pme_overlap_t;
+
+typedef struct {
+ int *n; /* Cumulative counts of the number of particles per thread */
+ int nalloc; /* Allocation size of i */
+ int *i; /* Particle indices ordered on thread index (n) */
+} thread_plist_t;
+
+typedef struct {
+ int *thread_one;
+ int n;
+ int *ind;
+ splinevec theta;
+ real *ptr_theta_z;
+ splinevec dtheta;
+ real *ptr_dtheta_z;
+} splinedata_t;
+
+typedef struct {
+ int dimind; /* The index of the dimension, 0=x, 1=y */
+ int nslab;
+ int nodeid;
+#ifdef GMX_MPI
+ MPI_Comm mpi_comm;
+#endif
+
+ int *node_dest; /* The nodes to send x and q to with DD */
+ int *node_src; /* The nodes to receive x and q from with DD */
+ int *buf_index; /* Index for commnode into the buffers */
+
+ int maxshift;
+
+ int npd;
+ int pd_nalloc;
+ int *pd;
+ int *count; /* The number of atoms to send to each node */
+ int **count_thread;
+ int *rcount; /* The number of atoms to receive */
+
+ int n;
+ int nalloc;
+ rvec *x;
+ real *q;
+ rvec *f;
+ gmx_bool bSpread; /* These coordinates are used for spreading */
+ int pme_order;
+ ivec *idx;
+ rvec *fractx; /* Fractional coordinate relative to the
+ * lower cell boundary
+ */
+ int nthread;
+ int *thread_idx; /* Which thread should spread which charge */
+ thread_plist_t *thread_plist;
+ splinedata_t *spline;
+} pme_atomcomm_t;
+
+#define FLBS 3
+#define FLBSZ 4
+
+typedef struct {
+ ivec ci; /* The spatial location of this grid */
+ ivec n; /* The used size of *grid, including order-1 */
+ ivec offset; /* The grid offset from the full node grid */
+ int order; /* PME spreading order */
+ ivec s; /* The allocated size of *grid, s >= n */
+ real *grid; /* The grid of the local thread, size n */
+} pmegrid_t;
+
+typedef struct {
+ pmegrid_t grid; /* The full node grid (non thread-local) */
+ int nthread; /* The number of threads operating on this grid */
+ ivec nc; /* The local spatial decomposition over the threads */
+ pmegrid_t *grid_th; /* Array of grids for each thread */
+ real *grid_all; /* Allocated array for the grids in *grid_th */
+ int **g2t; /* The grid to thread index */
+ ivec nthread_comm; /* The number of threads to communicate with */
+} pmegrids_t;
+
+
+typedef struct {
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ /* Masks for SSE aligned spreading and gathering */
+ __m128 mask_SSE0[6], mask_SSE1[6];
+#else
+ int dummy; /* C89 requires that struct has at least one member */
+#endif
+} pme_spline_work_t;
+
+typedef struct {
+ /* work data for solve_pme */
+ int nalloc;
+ real * mhx;
+ real * mhy;
+ real * mhz;
+ real * m2;
+ real * denom;
+ real * tmp1_alloc;
+ real * tmp1;
+ real * eterm;
+ real * m2inv;
+
+ real energy;
+ matrix vir;
+} pme_work_t;
+
+typedef struct gmx_pme {
+ int ndecompdim; /* The number of decomposition dimensions */
+ int nodeid; /* Our nodeid in mpi->mpi_comm */
+ int nodeid_major;
+ int nodeid_minor;
+ int nnodes; /* The number of nodes doing PME */
+ int nnodes_major;
+ int nnodes_minor;
+
+ MPI_Comm mpi_comm;
+ MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
+#ifdef GMX_MPI
+ MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
+#endif
+
+ gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */
+ int nthread; /* The number of threads doing PME on our rank */
+
+ gmx_bool bPPnode; /* Node also does particle-particle forces */
+ gmx_bool bFEP; /* Compute Free energy contribution */
+ int nkx, nky, nkz; /* Grid dimensions */
+ gmx_bool bP3M; /* Do P3M: optimize the influence function */
+ int pme_order;
+ real epsilon_r;
+
+ pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
+ pmegrids_t pmegridB;
+ /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+ int pmegrid_nx, pmegrid_ny, pmegrid_nz;
+ /* pmegrid_nz might be larger than strictly necessary to ensure
+ * memory alignment, pmegrid_nz_base gives the real base size.
+ */
+ int pmegrid_nz_base;
+ /* The local PME grid starting indices */
+ int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
+
+ /* Work data for spreading and gathering */
+ pme_spline_work_t *spline_work;
+
+ real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
+ real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
+ int fftgrid_nx, fftgrid_ny, fftgrid_nz;
+
+ t_complex *cfftgridA; /* Grids for complex FFT data */
+ t_complex *cfftgridB;
+ int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
+
+ gmx_parallel_3dfft_t pfft_setupA;
+ gmx_parallel_3dfft_t pfft_setupB;
+
+ int *nnx, *nny, *nnz;
+ real *fshx, *fshy, *fshz;
+
+ pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
+ matrix recipbox;
+ splinevec bsp_mod;
+
+ pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+
+ pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
+
+ rvec *bufv; /* Communication buffer */
+ real *bufr; /* Communication buffer */
+ int buf_nalloc; /* The communication buffer size */
+
+ /* thread local work data for solve_pme */
+ pme_work_t *work;
+
+ /* Work data for PME_redist */
+ gmx_bool redist_init;
+ int * scounts;
+ int * rcounts;
+ int * sdispls;
+ int * rdispls;
+ int * sidx;
+ int * idxa;
+ real * redist_buf;
+ int redist_buf_nalloc;
+
+ /* Work data for sum_qgrid */
+ real * sum_qgrid_tmp;
+ real * sum_qgrid_dd_tmp;
+} t_gmx_pme;
+
+
+static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
+ int start, int end, int thread)
+{
+ int i;
+ int *idxptr, tix, tiy, tiz;
+ real *xptr, *fptr, tx, ty, tz;
+ real rxx, ryx, ryy, rzx, rzy, rzz;
+ int nx, ny, nz;
+ int start_ix, start_iy, start_iz;
+ int *g2tx, *g2ty, *g2tz;
+ gmx_bool bThreads;
+ int *thread_idx = NULL;
+ thread_plist_t *tpl = NULL;
+ int *tpl_n = NULL;
+ int thread_i;
+
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
+
+ start_ix = pme->pmegrid_start_ix;
+ start_iy = pme->pmegrid_start_iy;
+ start_iz = pme->pmegrid_start_iz;
+
+ rxx = pme->recipbox[XX][XX];
+ ryx = pme->recipbox[YY][XX];
+ ryy = pme->recipbox[YY][YY];
+ rzx = pme->recipbox[ZZ][XX];
+ rzy = pme->recipbox[ZZ][YY];
+ rzz = pme->recipbox[ZZ][ZZ];
+
+ g2tx = pme->pmegridA.g2t[XX];
+ g2ty = pme->pmegridA.g2t[YY];
+ g2tz = pme->pmegridA.g2t[ZZ];
+
+ bThreads = (atc->nthread > 1);
+ if (bThreads)
+ {
+ thread_idx = atc->thread_idx;
+
+ tpl = &atc->thread_plist[thread];
+ tpl_n = tpl->n;
+ for (i = 0; i < atc->nthread; i++)
+ {
+ tpl_n[i] = 0;
+ }
+ }
+
+ for (i = start; i < end; i++)
+ {
+ xptr = atc->x[i];
+ idxptr = atc->idx[i];
+ fptr = atc->fractx[i];
+
+ /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
+ tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
+ ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
+ tz = nz * ( xptr[ZZ] * rzz + 2.0 );
+
+ tix = (int)(tx);
+ tiy = (int)(ty);
+ tiz = (int)(tz);
+
+ /* Because decomposition only occurs in x and y,
+ * we never have a fraction correction in z.
+ */
+ fptr[XX] = tx - tix + pme->fshx[tix];
+ fptr[YY] = ty - tiy + pme->fshy[tiy];
+ fptr[ZZ] = tz - tiz;
+
+ idxptr[XX] = pme->nnx[tix];
+ idxptr[YY] = pme->nny[tiy];
+ idxptr[ZZ] = pme->nnz[tiz];
+
+#ifdef DEBUG
+ range_check(idxptr[XX], 0, pme->pmegrid_nx);
+ range_check(idxptr[YY], 0, pme->pmegrid_ny);
+ range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
+#endif
+
+ if (bThreads)
+ {
+ thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
+ thread_idx[i] = thread_i;
+ tpl_n[thread_i]++;
+ }
+ }
+
+ if (bThreads)
+ {
+ /* Make a list of particle indices sorted on thread */
+
+ /* Get the cumulative count */
+ for (i = 1; i < atc->nthread; i++)
+ {
+ tpl_n[i] += tpl_n[i-1];
+ }
+ /* The current implementation distributes particles equally
+ * over the threads, so we could actually allocate for that
+ * in pme_realloc_atomcomm_things.
+ */
+ if (tpl_n[atc->nthread-1] > tpl->nalloc)
+ {
+ tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
+ srenew(tpl->i, tpl->nalloc);
+ }
+ /* Set tpl_n to the cumulative start */
+ for (i = atc->nthread-1; i >= 1; i--)
+ {
+ tpl_n[i] = tpl_n[i-1];
+ }
+ tpl_n[0] = 0;
+
+ /* Fill our thread local array with indices sorted on thread */
+ for (i = start; i < end; i++)
+ {
+ tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
+ }
+ /* Now tpl_n contains the cumulative count again */
+ }
+}
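The "+ 2.0" shift warrants a worked example: C's (int) cast truncates toward zero, so a slightly negative fractional coordinate would otherwise land in the wrong cell with a negative fraction. Shifting by two box lengths keeps the argument positive; the shift later cancels in the wrapped index tables nnx/nny/nnz. A standalone sketch:

#include <stdio.h>

int main(void)
{
    int    nx   = 10;
    double frac = -0.03;           /* coordinate just outside the box    */
    double tx   = nx*(frac + 2.0); /* 19.7, safely positive              */
    int    tix  = (int)tx;         /* 19; the nnx[] table wraps it to 9  */
    double fx   = tx - tix;        /* 0.7, the correct in-cell fraction  */

    /* Without the shift: tx = -0.3, tix = 0, fx = -0.3: both wrong. */
    printf("tix = %d, fx = %g\n", tix, fx);
    return 0;
}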
+
+static void make_thread_local_ind(pme_atomcomm_t *atc,
+ int thread, splinedata_t *spline)
+{
+ int n, t, i, start, end;
+ thread_plist_t *tpl;
+
+ /* Combine the indices made by each thread into one index */
+
+ n = 0;
+ start = 0;
+ for (t = 0; t < atc->nthread; t++)
+ {
+ tpl = &atc->thread_plist[t];
+ /* Copy our part (start - end) from the list of thread t */
+ if (thread > 0)
+ {
+ start = tpl->n[thread-1];
+ }
+ end = tpl->n[thread];
+ for (i = start; i < end; i++)
+ {
+ spline->ind[n++] = tpl->i[i];
+ }
+ }
+
+ spline->n = n;
+}
+
+
+static void pme_calc_pidx(int start, int end,
+ matrix recipbox, rvec x[],
+ pme_atomcomm_t *atc, int *count)
+{
+ int nslab, i;
+ int si;
+ real *xptr, s;
+ real rxx, ryx, rzx, ryy, rzy;
+ int *pd;
+
+ /* Calculate the PME task index (pidx) for each particle.
+ * Here we always assign equally sized slabs to each node
+ * for load balancing reasons (the PME grid spacing is not used).
+ */
+
+ nslab = atc->nslab;
+ pd = atc->pd;
+
+ /* Reset the count */
+ for (i = 0; i < nslab; i++)
+ {
+ count[i] = 0;
+ }
+
+ if (atc->dimind == 0)
+ {
+ rxx = recipbox[XX][XX];
+ ryx = recipbox[YY][XX];
+ rzx = recipbox[ZZ][XX];
+ /* Calculate the node index in x-dimension */
+ for (i = start; i < end; i++)
+ {
+ xptr = x[i];
+ /* Fractional coordinates along box vectors */
+ s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
+ si = (int)(s + 2*nslab) % nslab;
+ pd[i] = si;
+ count[si]++;
+ }
+ }
+ else
+ {
+ ryy = recipbox[YY][YY];
+ rzy = recipbox[ZZ][YY];
+ /* Calculate the node index in y-dimension */
+ for (i = start; i < end; i++)
+ {
+ xptr = x[i];
+ /* Fractional coordinates along box vectors */
+ s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
+ si = (int)(s + 2*nslab) % nslab;
+ pd[i] = si;
+ count[si]++;
+ }
+ }
+}
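The slab assignment uses an integer variant of the same truncation trick: adding 2*nslab keeps the cast argument positive and the modulo wraps atoms sitting slightly outside the box. In isolation:

/* Sketch: slab index for a fractional slab coordinate s that may be
 * slightly negative or slightly above nslab.
 */
static int slab_index(double s, int nslab)
{
    return (int)(s + 2*nslab) % nslab;  /* s = -0.2, nslab = 4  ->  3 */
}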
+
+static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
+ pme_atomcomm_t *atc)
+{
+ int nthread, thread, slab;
+
+ nthread = atc->nthread;
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for (thread = 0; thread < nthread; thread++)
+ {
+ pme_calc_pidx(natoms* thread /nthread,
+ natoms*(thread+1)/nthread,
+ recipbox, x, atc, atc->count_thread[thread]);
+ }
+ /* Non-parallel reduction, since nslab is small */
+
+ for (thread = 1; thread < nthread; thread++)
+ {
+ for (slab = 0; slab < atc->nslab; slab++)
+ {
+ atc->count_thread[0][slab] += atc->count_thread[thread][slab];
+ }
+ }
+}
+
+static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
+{
+ const int padding = 4;
+ int i;
+
+ srenew(th[XX], nalloc);
+ srenew(th[YY], nalloc);
+ /* In z we add padding; this is only required for the aligned SSE code */
+ srenew(*ptr_z, nalloc+2*padding);
+ th[ZZ] = *ptr_z + padding;
+
+ for (i = 0; i < padding; i++)
+ {
+ (*ptr_z)[ i] = 0;
+ (*ptr_z)[padding+nalloc+i] = 0;
+ }
+}
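The resulting z storage looks as follows; th[ZZ] points past the leading pad, so 4-wide aligned loads and stores touching the first or last spline element stay inside the allocation:

/* Layout of *ptr_z after realloc_splinevec (padding = 4):
 *
 *   [ 0 0 0 0 | th[ZZ][0] ... th[ZZ][nalloc-1] | 0 0 0 0 ]
 *     ^ pad     ^ th[ZZ] points here             ^ pad
 */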
+
+static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
+{
+ int i, d;
+
+ srenew(spline->ind, atc->nalloc);
+ /* Initialize the index to identity so it works without threads */
+ for (i = 0; i < atc->nalloc; i++)
+ {
+ spline->ind[i] = i;
+ }
+
+ realloc_splinevec(spline->theta, &spline->ptr_theta_z,
+ atc->pme_order*atc->nalloc);
+ realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
+ atc->pme_order*atc->nalloc);
+}
+
+static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
+{
+ int nalloc_old, i, j, nalloc_tpl;
+
+ /* We must keep atc->x non-NULL to avoid possible
+ * fatal errors in MPI routines.
+ */
+ if (atc->n > atc->nalloc || atc->nalloc == 0)
+ {
+ nalloc_old = atc->nalloc;
+ atc->nalloc = over_alloc_dd(max(atc->n, 1));
+
+ if (atc->nslab > 1)
+ {
+ srenew(atc->x, atc->nalloc);
+ srenew(atc->q, atc->nalloc);
+ srenew(atc->f, atc->nalloc);
+ for (i = nalloc_old; i < atc->nalloc; i++)
+ {
+ clear_rvec(atc->f[i]);
+ }
+ }
+ if (atc->bSpread)
+ {
+ srenew(atc->fractx, atc->nalloc);
+ srenew(atc->idx, atc->nalloc);
+
+ if (atc->nthread > 1)
+ {
+ srenew(atc->thread_idx, atc->nalloc);
+ }
+
+ for (i = 0; i < atc->nthread; i++)
+ {
+ pme_realloc_splinedata(&atc->spline[i], atc);
+ }
+ }
+ }
+}
+
+static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
+ int n, gmx_bool bXF, rvec *x_f, real *charge,
+ pme_atomcomm_t *atc)
+/* Redistribute particle data for PME calculation */
+/* domain decomposition by x coordinate */
+{
+ int *idxa;
+ int i, ii;
+
+ if (FALSE == pme->redist_init)
+ {
+ snew(pme->scounts, atc->nslab);
+ snew(pme->rcounts, atc->nslab);
+ snew(pme->sdispls, atc->nslab);
+ snew(pme->rdispls, atc->nslab);
+ snew(pme->sidx, atc->nslab);
+ pme->redist_init = TRUE;
+ }
+ if (n > pme->redist_buf_nalloc)
+ {
+ pme->redist_buf_nalloc = over_alloc_dd(n);
+ srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
+ }
+
+ pme->idxa = atc->pd;
+
+#ifdef GMX_MPI
+ if (forw && bXF)
+ {
+ /* forward, redistribution from pp to pme */
+
+ /* Calculate send counts and exchange them with other nodes */
+ for (i = 0; (i < atc->nslab); i++)
+ {
+ pme->scounts[i] = 0;
+ }
+ for (i = 0; (i < n); i++)
+ {
+ pme->scounts[pme->idxa[i]]++;
+ }
+ MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
+
+ /* Calculate send and receive displacements and index into send
+ buffer */
+ pme->sdispls[0] = 0;
+ pme->rdispls[0] = 0;
+ pme->sidx[0] = 0;
+ for (i = 1; i < atc->nslab; i++)
+ {
+ pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
+ pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
+ pme->sidx[i] = pme->sdispls[i];
+ }
+ /* Total # of particles to be received */
+ atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
+
+ pme_realloc_atomcomm_things(atc);
+
+ /* Copy particle coordinates into send buffer and exchange*/
+ for (i = 0; (i < n); i++)
+ {
+ ii = DIM*pme->sidx[pme->idxa[i]];
+ pme->sidx[pme->idxa[i]]++;
+ pme->redist_buf[ii+XX] = x_f[i][XX];
+ pme->redist_buf[ii+YY] = x_f[i][YY];
+ pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
+ }
+ MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
+ pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
+ pme->rvec_mpi, atc->mpi_comm);
+ }
+ if (forw)
+ {
+ /* Copy charge into send buffer and exchange*/
+ for (i = 0; i < atc->nslab; i++)
+ {
+ pme->sidx[i] = pme->sdispls[i];
+ }
+ for (i = 0; (i < n); i++)
+ {
+ ii = pme->sidx[pme->idxa[i]];
+ pme->sidx[pme->idxa[i]]++;
+ pme->redist_buf[ii] = charge[i];
+ }
+ MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
+ atc->q, pme->rcounts, pme->rdispls, mpi_type,
+ atc->mpi_comm);
+ }
+ else /* backward, redistribution from pme to pp */
+ {
+ MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
+ pme->redist_buf, pme->scounts, pme->sdispls,
+ pme->rvec_mpi, atc->mpi_comm);
+
+ /* Copy data from receive buffer */
+ for (i = 0; i < atc->nslab; i++)
+ {
+ pme->sidx[i] = pme->sdispls[i];
+ }
+ for (i = 0; (i < n); i++)
+ {
+ ii = DIM*pme->sidx[pme->idxa[i]];
+ x_f[i][XX] += pme->redist_buf[ii+XX];
+ x_f[i][YY] += pme->redist_buf[ii+YY];
+ x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
+ pme->sidx[pme->idxa[i]]++;
+ }
+ }
+#endif
+}
+
+static void pme_dd_sendrecv(pme_atomcomm_t *atc,
+ gmx_bool bBackward, int shift,
+ void *buf_s, int nbyte_s,
+ void *buf_r, int nbyte_r)
+{
+#ifdef GMX_MPI
+ int dest, src;
+ MPI_Status stat;
+
+ if (bBackward == FALSE)
+ {
+ dest = atc->node_dest[shift];
+ src = atc->node_src[shift];
+ }
+ else
+ {
+ dest = atc->node_src[shift];
+ src = atc->node_dest[shift];
+ }
+
+ if (nbyte_s > 0 && nbyte_r > 0)
+ {
+ MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
+ dest, shift,
+ buf_r, nbyte_r, MPI_BYTE,
+ src, shift,
+ atc->mpi_comm, &stat);
+ }
+ else if (nbyte_s > 0)
+ {
+ MPI_Send(buf_s, nbyte_s, MPI_BYTE,
+ dest, shift,
+ atc->mpi_comm);
+ }
+ else if (nbyte_r > 0)
+ {
+ MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
+ src, shift,
+ atc->mpi_comm, &stat);
+ }
+#endif
+}
+
+static void dd_pmeredist_x_q(gmx_pme_t pme,
+ int n, gmx_bool bX, rvec *x, real *charge,
+ pme_atomcomm_t *atc)
+{
+ int *commnode, *buf_index;
+ int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
+
+ commnode = atc->node_dest;
+ buf_index = atc->buf_index;
+
+ nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
+
+ nsend = 0;
+ for (i = 0; i < nnodes_comm; i++)
+ {
+ buf_index[commnode[i]] = nsend;
+ nsend += atc->count[commnode[i]];
+ }
+ if (bX)
+ {
+ if (atc->count[atc->nodeid] + nsend != n)
+ {
+ gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+ "This usually means that your system is not well equilibrated.",
+ n - (atc->count[atc->nodeid] + nsend),
+ pme->nodeid, 'x'+atc->dimind);
+ }
+
+ if (nsend > pme->buf_nalloc)
+ {
+ pme->buf_nalloc = over_alloc_dd(nsend);
+ srenew(pme->bufv, pme->buf_nalloc);
+ srenew(pme->bufr, pme->buf_nalloc);
+ }
+
+ atc->n = atc->count[atc->nodeid];
+ for (i = 0; i < nnodes_comm; i++)
+ {
+ scount = atc->count[commnode[i]];
+ /* Communicate the count */
+ if (debug)
+ {
+ fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+ atc->dimind, atc->nodeid, commnode[i], scount);
+ }
+ pme_dd_sendrecv(atc, FALSE, i,
+ &scount, sizeof(int),
+ &atc->rcount[i], sizeof(int));
+ atc->n += atc->rcount[i];
+ }
+
+ pme_realloc_atomcomm_things(atc);
+ }
+
+ local_pos = 0;
+ for (i = 0; i < n; i++)
+ {
+ node = atc->pd[i];
+ if (node == atc->nodeid)
+ {
+ /* Copy direct to the receive buffer */
+ if (bX)
+ {
+ copy_rvec(x[i], atc->x[local_pos]);
+ }
+ atc->q[local_pos] = charge[i];
+ local_pos++;
+ }
+ else
+ {
+ /* Copy to the send buffer */
+ if (bX)
+ {
+ copy_rvec(x[i], pme->bufv[buf_index[node]]);
+ }
+ pme->bufr[buf_index[node]] = charge[i];
+ buf_index[node]++;
+ }
+ }
+
+ buf_pos = 0;
+ for (i = 0; i < nnodes_comm; i++)
+ {
+ scount = atc->count[commnode[i]];
+ rcount = atc->rcount[i];
+ if (scount > 0 || rcount > 0)
+ {
+ if (bX)
+ {
+ /* Communicate the coordinates */
+ pme_dd_sendrecv(atc, FALSE, i,
+ pme->bufv[buf_pos], scount*sizeof(rvec),
+ atc->x[local_pos], rcount*sizeof(rvec));
+ }
+ /* Communicate the charges */
+ pme_dd_sendrecv(atc, FALSE, i,
+ pme->bufr+buf_pos, scount*sizeof(real),
+ atc->q+local_pos, rcount*sizeof(real));
+ buf_pos += scount;
+ local_pos += atc->rcount[i];
+ }
+ }
+}
+
+static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
+ int n, rvec *f,
+ gmx_bool bAddF)
+{
+ int *commnode, *buf_index;
+ int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
+
+ commnode = atc->node_dest;
+ buf_index = atc->buf_index;
+
+ nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
+
+ local_pos = atc->count[atc->nodeid];
+ buf_pos = 0;
+ for (i = 0; i < nnodes_comm; i++)
+ {
+ scount = atc->rcount[i];
+ rcount = atc->count[commnode[i]];
+ if (scount > 0 || rcount > 0)
+ {
+ /* Communicate the forces */
+ pme_dd_sendrecv(atc, TRUE, i,
+ atc->f[local_pos], scount*sizeof(rvec),
+ pme->bufv[buf_pos], rcount*sizeof(rvec));
+ local_pos += scount;
+ }
+ buf_index[commnode[i]] = buf_pos;
+ buf_pos += rcount;
+ }
+
+ local_pos = 0;
+ if (bAddF)
+ {
+ for (i = 0; i < n; i++)
+ {
+ node = atc->pd[i];
+ if (node == atc->nodeid)
+ {
+ /* Add from the local force array */
+ rvec_inc(f[i], atc->f[local_pos]);
+ local_pos++;
+ }
+ else
+ {
+ /* Add from the receive buffer */
+ rvec_inc(f[i], pme->bufv[buf_index[node]]);
+ buf_index[node]++;
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < n; i++)
+ {
+ node = atc->pd[i];
+ if (node == atc->nodeid)
+ {
+ /* Copy from the local force array */
+ copy_rvec(atc->f[local_pos], f[i]);
+ local_pos++;
+ }
+ else
+ {
+ /* Copy from the receive buffer */
+ copy_rvec(pme->bufv[buf_index[node]], f[i]);
+ buf_index[node]++;
+ }
+ }
+ }
+}
+
+#ifdef GMX_MPI
+static void
+gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
+{
+ pme_overlap_t *overlap;
+ int send_index0, send_nindex;
+ int recv_index0, recv_nindex;
+ MPI_Status stat;
+ int i, j, k, ix, iy, iz, icnt;
+ int ipulse, send_id, recv_id, datasize;
+ real *p;
+ real *sendptr, *recvptr;
+
+ /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
+ overlap = &pme->overlap[1];
+
+ for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
+ {
+ /* Since we have already (un)wrapped the overlap in the z-dimension,
+ * we only have to communicate 0 to nkz (not pmegrid_nz).
+ */
+ if (direction == GMX_SUM_QGRID_FORWARD)
+ {
+ send_id = overlap->send_id[ipulse];
+ recv_id = overlap->recv_id[ipulse];
+ send_index0 = overlap->comm_data[ipulse].send_index0;
+ send_nindex = overlap->comm_data[ipulse].send_nindex;
+ recv_index0 = overlap->comm_data[ipulse].recv_index0;
+ recv_nindex = overlap->comm_data[ipulse].recv_nindex;
+ }
+ else
+ {
+ send_id = overlap->recv_id[ipulse];
+ recv_id = overlap->send_id[ipulse];
+ send_index0 = overlap->comm_data[ipulse].recv_index0;
+ send_nindex = overlap->comm_data[ipulse].recv_nindex;
+ recv_index0 = overlap->comm_data[ipulse].send_index0;
+ recv_nindex = overlap->comm_data[ipulse].send_nindex;
+ }
+
+ /* Copy data to contiguous send buffer */
+ if (debug)
+ {
+ fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ pme->nodeid, overlap->nodeid, send_id,
+ pme->pmegrid_start_iy,
+ send_index0-pme->pmegrid_start_iy,
+ send_index0-pme->pmegrid_start_iy+send_nindex);
+ }
+ icnt = 0;
+ for (i = 0; i < pme->pmegrid_nx; i++)
+ {
+ ix = i;
+ for (j = 0; j < send_nindex; j++)
+ {
+ iy = j + send_index0 - pme->pmegrid_start_iy;
+ for (k = 0; k < pme->nkz; k++)
+ {
+ iz = k;
+ overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
+ }
+ }
+ }
+
+ datasize = pme->pmegrid_nx * pme->nkz;
+
+ MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
+ send_id, ipulse,
+ overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
+ recv_id, ipulse,
+ overlap->mpi_comm, &stat);
+
+ /* Get data from contiguous recv buffer */
+ if (debug)
+ {
+ fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ pme->nodeid, overlap->nodeid, recv_id,
+ pme->pmegrid_start_iy,
+ recv_index0-pme->pmegrid_start_iy,
+ recv_index0-pme->pmegrid_start_iy+recv_nindex);
+ }
+ icnt = 0;
+ for (i = 0; i < pme->pmegrid_nx; i++)
+ {
+ ix = i;
+ for (j = 0; j < recv_nindex; j++)
+ {
+ iy = j + recv_index0 - pme->pmegrid_start_iy;
+ for (k = 0; k < pme->nkz; k++)
+ {
+ iz = k;
+ if (direction == GMX_SUM_QGRID_FORWARD)
+ {
+ grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
+ }
+ else
+ {
+ grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
+ }
+ }
+ }
+ }
+ }
+
+ /* Major dimension is easier, no copying required,
+ * but we might have to sum to separate array.
+ * Since we don't copy, we have to communicate up to pmegrid_nz,
+ * not nkz as for the minor direction.
+ */
+ overlap = &pme->overlap[0];
+
+ for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
+ {
+ if (direction == GMX_SUM_QGRID_FORWARD)
+ {
+ send_id = overlap->send_id[ipulse];
+ recv_id = overlap->recv_id[ipulse];
+ send_index0 = overlap->comm_data[ipulse].send_index0;
+ send_nindex = overlap->comm_data[ipulse].send_nindex;
+ recv_index0 = overlap->comm_data[ipulse].recv_index0;
+ recv_nindex = overlap->comm_data[ipulse].recv_nindex;
+ recvptr = overlap->recvbuf;
+ }
+ else
+ {
+ send_id = overlap->recv_id[ipulse];
+ recv_id = overlap->send_id[ipulse];
+ send_index0 = overlap->comm_data[ipulse].recv_index0;
+ send_nindex = overlap->comm_data[ipulse].recv_nindex;
+ recv_index0 = overlap->comm_data[ipulse].send_index0;
+ recv_nindex = overlap->comm_data[ipulse].send_nindex;
+ recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
+ }
+
+ sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
+ datasize = pme->pmegrid_ny * pme->pmegrid_nz;
+
+ if (debug)
+ {
+ fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ pme->nodeid, overlap->nodeid, send_id,
+ pme->pmegrid_start_ix,
+ send_index0-pme->pmegrid_start_ix,
+ send_index0-pme->pmegrid_start_ix+send_nindex);
+ fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ pme->nodeid, overlap->nodeid, recv_id,
+ pme->pmegrid_start_ix,
+ recv_index0-pme->pmegrid_start_ix,
+ recv_index0-pme->pmegrid_start_ix+recv_nindex);
+ }
+
+ MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
+ send_id, ipulse,
+ recvptr, recv_nindex*datasize, GMX_MPI_REAL,
+ recv_id, ipulse,
+ overlap->mpi_comm, &stat);
+
+ /* ADD data from contiguous recv buffer */
+ if (direction == GMX_SUM_QGRID_FORWARD)
+ {
+ p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
+ for (i = 0; i < recv_nindex*datasize; i++)
+ {
+ p[i] += overlap->recvbuf[i];
+ }
+ }
+ }
+}
+#endif
+
+
+static int
+copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
+{
+ ivec local_fft_ndata, local_fft_offset, local_fft_size;
+ ivec local_pme_size;
+ int i, ix, iy, iz;
+ int pmeidx, fftidx;
+
+ /* Dimensions should be identical for A/B grid, so we just use A here */
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+
+ local_pme_size[0] = pme->pmegrid_nx;
+ local_pme_size[1] = pme->pmegrid_ny;
+ local_pme_size[2] = pme->pmegrid_nz;
+
+ /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
+ the offset is identical, and the PME grid always has more data (due to overlap)
+ */
+ {
+#ifdef DEBUG_PME
+ FILE *fp, *fp2;
+ char fn[STRLEN], format[STRLEN];
+ real val;
+ sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
+ fp = ffopen(fn, "w");
+ sprintf(fn, "pmegrid%d.txt", pme->nodeid);
+ fp2 = ffopen(fn, "w");
+ sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+#endif
+
+ for (ix = 0; ix < local_fft_ndata[XX]; ix++)
+ {
+ for (iy = 0; iy < local_fft_ndata[YY]; iy++)
+ {
+ for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
+ {
+ pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
+ fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
+ fftgrid[fftidx] = pmegrid[pmeidx];
+#ifdef DEBUG_PME
+ val = 100*pmegrid[pmeidx];
+ if (pmegrid[pmeidx] != 0)
+ {
+ fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
+ 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+ }
+ if (pmegrid[pmeidx] != 0)
+ {
+ fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
+ "qgrid",
+ pme->pmegrid_start_ix + ix,
+ pme->pmegrid_start_iy + iy,
+ pme->pmegrid_start_iz + iz,
+ pmegrid[pmeidx]);
+ }
+#endif
+ }
+ }
+ }
+#ifdef DEBUG_PME
+ ffclose(fp);
+ ffclose(fp2);
+#endif
+ }
+ return 0;
+}
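Both grids are plain C row-major arrays; the (ix, iy, iz) offsets are shared because the fftgrid is justified to the same corner, only the allocated sizes differ. The index arithmetic above is the usual:

/* Row-major index into a 3D grid with allocated dimensions size[3] */
static int index3(const int size[3], int ix, int iy, int iz)
{
    return (ix*size[1] + iy)*size[2] + iz;
}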
+
+
+static gmx_cycles_t omp_cyc_start()
+{
+ return gmx_cycles_read();
+}
+
+static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
+{
+ return gmx_cycles_read() - c;
+}
+
+
+static int
+copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
+ int nthread, int thread)
+{
+ ivec local_fft_ndata, local_fft_offset, local_fft_size;
+ ivec local_pme_size;
+ int ixy0, ixy1, ixy, ix, iy, iz;
+ int pmeidx, fftidx;
+#ifdef PME_TIME_THREADS
+ gmx_cycles_t c1;
+ static double cs1 = 0;
+ static int cnt = 0;
+#endif
+
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_start();
+#endif
+ /* Dimensions should be identical for A/B grid, so we just use A here */
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+
+ local_pme_size[0] = pme->pmegrid_nx;
+ local_pme_size[1] = pme->pmegrid_ny;
+ local_pme_size[2] = pme->pmegrid_nz;
+
+ /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
+ the offset is identical, and the PME grid always has more data (due to overlap)
+ */
+ ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+ ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+
+ for (ixy = ixy0; ixy < ixy1; ixy++)
+ {
+ ix = ixy/local_fft_ndata[YY];
+ iy = ixy - ix*local_fft_ndata[YY];
+
+ pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
+ fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
+ for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
+ {
+ pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
+ }
+ }
+
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_end(c1);
+ cs1 += (double)c1;
+ cnt++;
+ if (cnt % 20 == 0)
+ {
+ printf("copy %.2f\n", cs1*1e-9);
+ }
+#endif
+
+ return 0;
+}
+
+
+static void
+wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+{
+ int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
+
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
+
+ pnx = pme->pmegrid_nx;
+ pny = pme->pmegrid_ny;
+ pnz = pme->pmegrid_nz;
+
+ overlap = pme->pme_order - 1;
+
+ /* Add periodic overlap in z */
+ for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ {
+ for (iy = 0; iy < pme->pmegrid_ny; iy++)
+ {
+ for (iz = 0; iz < overlap; iz++)
+ {
+ pmegrid[(ix*pny+iy)*pnz+iz] +=
+ pmegrid[(ix*pny+iy)*pnz+nz+iz];
+ }
+ }
+ }
+
+ if (pme->nnodes_minor == 1)
+ {
+ for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ {
+ for (iy = 0; iy < overlap; iy++)
+ {
+ for (iz = 0; iz < nz; iz++)
+ {
+ pmegrid[(ix*pny+iy)*pnz+iz] +=
+ pmegrid[(ix*pny+ny+iy)*pnz+iz];
+ }
+ }
+ }
+ }
+
+ if (pme->nnodes_major == 1)
+ {
+ ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
+
+ for (ix = 0; ix < overlap; ix++)
+ {
+ for (iy = 0; iy < ny_x; iy++)
+ {
+ for (iz = 0; iz < nz; iz++)
+ {
+ pmegrid[(ix*pny+iy)*pnz+iz] +=
+ pmegrid[((nx+ix)*pny+iy)*pnz+iz];
+ }
+ }
+ }
+ }
+}
+
+
+static void
+unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+{
+ int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
+
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
+
+ pnx = pme->pmegrid_nx;
+ pny = pme->pmegrid_ny;
+ pnz = pme->pmegrid_nz;
+
+ overlap = pme->pme_order - 1;
+
+ if (pme->nnodes_major == 1)
+ {
+ ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
+
+ for (ix = 0; ix < overlap; ix++)
+ {
+ int iy, iz;
+
+ for (iy = 0; iy < ny_x; iy++)
+ {
+ for (iz = 0; iz < nz; iz++)
+ {
+ pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
+ pmegrid[(ix*pny+iy)*pnz+iz];
+ }
+ }
+ }
+ }
+
+ if (pme->nnodes_minor == 1)
+ {
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ {
+ int iy, iz;
+
+ for (iy = 0; iy < overlap; iy++)
+ {
+ for (iz = 0; iz < nz; iz++)
+ {
+ pmegrid[(ix*pny+ny+iy)*pnz+iz] =
+ pmegrid[(ix*pny+iy)*pnz+iz];
+ }
+ }
+ }
+ }
+
+ /* Copy periodic overlap in z */
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for (ix = 0; ix < pme->pmegrid_nx; ix++)
+ {
+ int iy, iz;
+
+ for (iy = 0; iy < pme->pmegrid_ny; iy++)
+ {
+ for (iz = 0; iz < overlap; iz++)
+ {
+ pmegrid[(ix*pny+iy)*pnz+nz+iz] =
+ pmegrid[(ix*pny+iy)*pnz+iz];
+ }
+ }
+ }
+}
+
+
+/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
+#define DO_BSPLINE(order) \
+ for (ithx = 0; (ithx < order); ithx++) \
+ { \
+ index_x = (i0+ithx)*pny*pnz; \
+ valx = qn*thx[ithx]; \
+ \
+ for (ithy = 0; (ithy < order); ithy++) \
+ { \
+ valxy = valx*thy[ithy]; \
+ index_xy = index_x+(j0+ithy)*pnz; \
+ \
+ for (ithz = 0; (ithz < order); ithz++) \
+ { \
+ index_xyz = index_xy+(k0+ithz); \
+ grid[index_xyz] += valxy*thz[ithz]; \
+ } \
+ } \
+ }
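With the order spelled as a literal, each DO_BSPLINE(4) or DO_BSPLINE(5) expansion has compile-time loop bounds, so all three loops can be fully unrolled; a function taking the order as a runtime argument only gets the same treatment if the compiler inlines and constant-propagates it, which (per the comment above) xlC did not do reliably. For reference, what one order-4 expansion amounts to:

/* Sketch: DO_BSPLINE(4) for a single charge qn at grid offset (i0,j0,k0) */
static void spread_one_charge_order4(real *grid, int pny, int pnz,
                                     int i0, int j0, int k0, real qn,
                                     const real *thx, const real *thy,
                                     const real *thz)
{
    int ithx, ithy, ithz;

    for (ithx = 0; ithx < 4; ithx++)        /* bound known at compile time */
    {
        int  index_x = (i0+ithx)*pny*pnz;
        real valx    = qn*thx[ithx];

        for (ithy = 0; ithy < 4; ithy++)
        {
            int  index_xy = index_x + (j0+ithy)*pnz;
            real valxy    = valx*thy[ithy];

            for (ithz = 0; ithz < 4; ithz++)
            {
                grid[index_xy + k0 + ithz] += valxy*thz[ithz];
            }
        }
    }
}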
+
+
+static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
+ pme_atomcomm_t *atc, splinedata_t *spline,
+ pme_spline_work_t *work)
+{
+
+ /* spread charges from home atoms to local grid */
+ real *grid;
+ pme_overlap_t *ol;
+ int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
+ int * idxptr;
+ int order, norder, index_x, index_xy, index_xyz;
+ real valx, valxy, qn;
+ real *thx, *thy, *thz;
+ int localsize, bndsize;
+ int pnx, pny, pnz, ndatatot;
+ int offx, offy, offz;
+
+ pnx = pmegrid->s[XX];
+ pny = pmegrid->s[YY];
+ pnz = pmegrid->s[ZZ];
+
+ offx = pmegrid->offset[XX];
+ offy = pmegrid->offset[YY];
+ offz = pmegrid->offset[ZZ];
+
+ ndatatot = pnx*pny*pnz;
+ grid = pmegrid->grid;
+ for (i = 0; i < ndatatot; i++)
+ {
+ grid[i] = 0;
+ }
+
+ order = pmegrid->order;
+
+ for (nn = 0; nn < spline->n; nn++)
+ {
+ n = spline->ind[nn];
+ qn = atc->q[n];
+
+ if (qn != 0)
+ {
+ idxptr = atc->idx[n];
+ norder = nn*order;
+
+ i0 = idxptr[XX] - offx;
+ j0 = idxptr[YY] - offy;
+ k0 = idxptr[ZZ] - offz;
+
+ thx = spline->theta[XX] + norder;
+ thy = spline->theta[YY] + norder;
+ thz = spline->theta[ZZ] + norder;
+
+ switch (order)
+ {
+ case 4:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+#ifdef PME_SSE_UNALIGNED
+#define PME_SPREAD_SSE_ORDER4
+#else
+#define PME_SPREAD_SSE_ALIGNED
+#define PME_ORDER 4
+#endif
+#include "pme_sse_single.h"
+#else
+ DO_BSPLINE(4);
+#endif
+ break;
+ case 5:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+#define PME_SPREAD_SSE_ALIGNED
+#define PME_ORDER 5
+#include "pme_sse_single.h"
+#else
+ DO_BSPLINE(5);
+#endif
+ break;
+ default:
+ DO_BSPLINE(order);
+ break;
+ }
+ }
+ }
+}
+
+static void set_grid_alignment(int *pmegrid_nz, int pme_order)
+{
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+ if (pme_order == 5
+#ifndef PME_SSE_UNALIGNED
+ || pme_order == 4
+#endif
+ )
+ {
+ /* Round nz up to a multiple of 4 to ensure alignment */
+ *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
+ }
+#endif
+}
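The rounding expression works for any non-negative n: adding 3 and masking off the two low bits rounds up to the next multiple of 4. A standalone check:

#include <stdio.h>

int main(void)
{
    int n;

    for (n = 9; n <= 13; n++)
    {
        printf("%2d -> %2d\n", n, (n + 3) & ~3); /* 12 12 12 12 16 */
    }
    return 0;
}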
+
+static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
+{
- /* Allocate an aligned pointer for SSE operations, including 3 extra
- * elements at the end since SSE operates on 4 elements at a time.
++#ifdef PME_SSE_SPREAD_GATHER
+#ifndef PME_SSE_UNALIGNED
+ if (pme_order == 4)
+ {
+ /* Add extra elements to ensure that aligned operations do not go
+ * beyond the allocated grid size.
+ * Note that for pme_order=5, the pme grid z-size alignment
+ * ensures that we will not go beyond the grid size.
+ */
+ *gridsize += 4;
+ }
+#endif
+#endif
+}
+
+static void pmegrid_init(pmegrid_t *grid,
+ int cx, int cy, int cz,
+ int x0, int y0, int z0,
+ int x1, int y1, int z1,
+ gmx_bool set_alignment,
+ int pme_order,
+ real *ptr)
+{
+ int nz, gridsize;
+
+ grid->ci[XX] = cx;
+ grid->ci[YY] = cy;
+ grid->ci[ZZ] = cz;
+ grid->offset[XX] = x0;
+ grid->offset[YY] = y0;
+ grid->offset[ZZ] = z0;
+ grid->n[XX] = x1 - x0 + pme_order - 1;
+ grid->n[YY] = y1 - y0 + pme_order - 1;
+ grid->n[ZZ] = z1 - z0 + pme_order - 1;
+ copy_ivec(grid->n, grid->s);
+
+ nz = grid->s[ZZ];
+ set_grid_alignment(&nz, pme_order);
+ if (set_alignment)
+ {
+ grid->s[ZZ] = nz;
+ }
+ else if (nz != grid->s[ZZ])
+ {
+ gmx_incons("pmegrid_init call with an unaligned z size");
+ }
+
+ grid->order = pme_order;
+ if (ptr == NULL)
+ {
+ gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
+ set_gridsize_alignment(&gridsize, pme_order);
+ snew_aligned(grid->grid, gridsize, 16);
+ }
+ else
+ {
+ grid->grid = ptr;
+ }
+}
+
+static int div_round_up(int numerator, int denominator)
+{
+ return (numerator + denominator - 1)/denominator;
+}
+
+static void make_subgrid_division(const ivec n, int ovl, int nthread,
+ ivec nsub)
+{
+ int gsize_opt, gsize;
+ int nsx, nsy, nsz;
+ char *env;
+
+ gsize_opt = -1;
+ for (nsx = 1; nsx <= nthread; nsx++)
+ {
+ if (nthread % nsx == 0)
+ {
+ for (nsy = 1; nsy <= nthread; nsy++)
+ {
+ if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
+ {
+ nsz = nthread/(nsx*nsy);
+
+ /* Determine the number of grid points per thread */
+ gsize =
+ (div_round_up(n[XX], nsx) + ovl)*
+ (div_round_up(n[YY], nsy) + ovl)*
+ (div_round_up(n[ZZ], nsz) + ovl);
+
+ /* Minimize the number of grid points per thread
+ * and, secondarily, the number of cuts in minor dimensions.
+ */
+ if (gsize_opt == -1 ||
+ gsize < gsize_opt ||
+ (gsize == gsize_opt &&
+ (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
+ {
+ nsub[XX] = nsx;
+ nsub[YY] = nsy;
+ nsub[ZZ] = nsz;
+ gsize_opt = gsize;
+ }
+ }
+ }
+ }
+ }
+
+ env = getenv("GMX_PME_THREAD_DIVISION");
+ if (env != NULL)
+ {
+ sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
+ }
+
+ if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
+ {
+ gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
+ }
+}
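A worked example of the search, assuming n = {32, 32, 32}, ovl = 3 and nthread = 4:

/* Grid points per thread, (div_round_up(n,ns) + ovl) per dimension:
 *   1x1x4: 35*35*11 = 13475      1x2x2: 35*19*19 = 12635
 *   2x1x2: 19*35*19 = 12635      2x2x1: 19*19*35 = 12635
 *   4x1x1: 11*35*35 = 13475
 * The three-way tie is broken towards the fewest cuts in z, then in y,
 * giving nsub = {2, 2, 1}. Setting GMX_PME_THREAD_DIVISION="2 1 2" in
 * the environment would override the heuristic.
 */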
+
+static void pmegrids_init(pmegrids_t *grids,
+ int nx, int ny, int nz, int nz_base,
+ int pme_order,
+ gmx_bool bUseThreads,
+ int nthread,
+ int overlap_x,
+ int overlap_y)
+{
+ ivec n, n_base, g0, g1;
+ int t, x, y, z, d, i, tfac;
+ int max_comm_lines = -1;
+
+ n[XX] = nx - (pme_order - 1);
+ n[YY] = ny - (pme_order - 1);
+ n[ZZ] = nz - (pme_order - 1);
+
+ copy_ivec(n, n_base);
+ n_base[ZZ] = nz_base;
+
+ pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
+ NULL);
+
+ grids->nthread = nthread;
+
+ make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
+
+ if (bUseThreads)
+ {
+ ivec nst;
+ int gridsize;
+
+ for (d = 0; d < DIM; d++)
+ {
+ nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
+ }
+ set_grid_alignment(&nst[ZZ], pme_order);
+
+ if (debug)
+ {
+ fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
+ grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
+ fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
+ nx, ny, nz,
+ nst[XX], nst[YY], nst[ZZ]);
+ }
+
+ snew(grids->grid_th, grids->nthread);
+ t = 0;
+ gridsize = nst[XX]*nst[YY]*nst[ZZ];
+ set_gridsize_alignment(&gridsize, pme_order);
+ snew_aligned(grids->grid_all,
+ grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
+ 16);
+
+ for (x = 0; x < grids->nc[XX]; x++)
+ {
+ for (y = 0; y < grids->nc[YY]; y++)
+ {
+ for (z = 0; z < grids->nc[ZZ]; z++)
+ {
+ pmegrid_init(&grids->grid_th[t],
+ x, y, z,
+ (n[XX]*(x ))/grids->nc[XX],
+ (n[YY]*(y ))/grids->nc[YY],
+ (n[ZZ]*(z ))/grids->nc[ZZ],
+ (n[XX]*(x+1))/grids->nc[XX],
+ (n[YY]*(y+1))/grids->nc[YY],
+ (n[ZZ]*(z+1))/grids->nc[ZZ],
+ TRUE,
+ pme_order,
+ grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
+ t++;
+ }
+ }
+ }
+ }
+ else
+ {
+ grids->grid_th = NULL;
+ }
+
+ snew(grids->g2t, DIM);
+ tfac = 1;
+ for (d = DIM-1; d >= 0; d--)
+ {
+ snew(grids->g2t[d], n[d]);
+ t = 0;
+ for (i = 0; i < n[d]; i++)
+ {
+ /* The second check should match the parameters
+ * of the pmegrid_init call above.
+ */
+ while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
+ {
+ t++;
+ }
+ grids->g2t[d][i] = t*tfac;
+ }
+
+ tfac *= grids->nc[d];
+
+ switch (d)
+ {
+ case XX: max_comm_lines = overlap_x; break;
+ case YY: max_comm_lines = overlap_y; break;
+ case ZZ: max_comm_lines = pme_order - 1; break;
+ }
+ grids->nthread_comm[d] = 0;
+ while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
+ grids->nthread_comm[d] < grids->nc[d])
+ {
+ grids->nthread_comm[d]++;
+ }
+ if (debug != NULL)
+ {
+ fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
+ 'x'+d, grids->nthread_comm[d]);
+ }
+ /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
+ * work, but this is not a problematic restriction.
+ */
+ if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
+ {
+ gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
+ }
+ }
+}
+
+
+static void pmegrids_destroy(pmegrids_t *grids)
+{
+ int t;
+
+ if (grids->grid.grid != NULL)
+ {
+ sfree(grids->grid.grid);
+
+ if (grids->nthread > 0)
+ {
+ for (t = 0; t < grids->nthread; t++)
+ {
+ sfree(grids->grid_th[t].grid);
+ }
+ sfree(grids->grid_th);
+ }
+ }
+}
+
+
+static void realloc_work(pme_work_t *work, int nkx)
+{
+ if (nkx > work->nalloc)
+ {
+ work->nalloc = nkx;
+ srenew(work->mhx, work->nalloc);
+ srenew(work->mhy, work->nalloc);
+ srenew(work->mhz, work->nalloc);
+ srenew(work->m2, work->nalloc);
- snew_aligned(work->denom, work->nalloc+3, 16);
- snew_aligned(work->tmp1, work->nalloc+3, 16);
- snew_aligned(work->eterm, work->nalloc+3, 16);
++ /* Allocate an aligned pointer for SIMD operations, including extra
++ * elements at the end for padding.
+ */
++#ifdef PME_SIMD
++#define ALIGN_HERE GMX_SIMD_WIDTH_HERE
++#else
++/* We can use any alignment, apart from 0, so we use 4 */
++#define ALIGN_HERE 4
++#endif
+ sfree_aligned(work->denom);
+ sfree_aligned(work->tmp1);
+ sfree_aligned(work->eterm);
- #ifdef PME_SSE
- /* Calculate exponentials through SSE in float precision */
- inline static void calc_exponentials(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
++ snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
++ snew_aligned(work->tmp1, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
++ snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+ srenew(work->m2inv, work->nalloc);
+ }
+}
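Concretely, with 256-bit AVX in single precision (GMX_SIMD_WIDTH_HERE = 8) the allocation above becomes, for example:

/* snew_aligned(work->denom, nkx + 8, 8*sizeof(real))
 * gives a 32-byte aligned array with 8 reals of padding, so a loop
 * stepping kx += 8 may safely overrun the last partial vector.
 */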
+
+
+static void free_work(pme_work_t *work)
+{
+ sfree(work->mhx);
+ sfree(work->mhy);
+ sfree(work->mhz);
+ sfree(work->m2);
+ sfree_aligned(work->denom);
+ sfree_aligned(work->tmp1);
+ sfree_aligned(work->eterm);
+ sfree(work->m2inv);
+}
+
+
- const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
- __m128 f_sse;
- __m128 lu;
- __m128 tmp_d1, d_inv, tmp_r, tmp_e;
++#ifdef PME_SIMD
++/* Calculate exponentials through SIMD */
++inline static void calc_exponentials(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+{
+ {
- f_sse = _mm_load1_ps(&f);
- for (kx = 0; kx < end; kx += 4)
++ gmx_mm_pr f_simd;
++ gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
+ int kx;
- tmp_d1 = _mm_load_ps(d_aligned+kx);
- lu = _mm_rcp_ps(tmp_d1);
- d_inv = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
- tmp_r = _mm_load_ps(r_aligned+kx);
- tmp_r = gmx_mm_exp_ps(tmp_r);
- tmp_e = _mm_mul_ps(f_sse, d_inv);
- tmp_e = _mm_mul_ps(tmp_e, tmp_r);
- _mm_store_ps(e_aligned+kx, tmp_e);
++ f_simd = gmx_load1_pr(&f);
++ for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+ {
- #ifdef PME_SSE
++ tmp_d1 = gmx_load_pr(d_aligned+kx);
++ d_inv = gmx_inv_pr(tmp_d1);
++ tmp_r = gmx_load_pr(r_aligned+kx);
++ tmp_r = gmx_exp_pr(tmp_r);
++ tmp_e = gmx_mul_pr(f_simd, d_inv);
++ tmp_e = gmx_mul_pr(tmp_e, tmp_r);
++ gmx_store_pr(e_aligned+kx, tmp_e);
+ }
+ }
+}
+#else
+inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+{
+ int kx;
+ for (kx = start; kx < end; kx++)
+ {
+ d[kx] = 1.0/d[kx];
+ }
+ for (kx = start; kx < end; kx++)
+ {
+ r[kx] = exp(r[kx]);
+ }
+ for (kx = start; kx < end; kx++)
+ {
+ e[kx] = f*r[kx]*d[kx];
+ }
+}
+#endif
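gmx_inv_pr hides what the removed SSE implementation shown above spelled out: an approximate hardware reciprocal refined by one Newton-Raphson step, x1 = x0*(2 - d*x0). For reference, a self-contained SSE sketch of that refinement:

#include <xmmintrin.h>

/* One Newton-Raphson refinement of _mm_rcp_ps, as in the removed code */
static __m128 rcp_refined_ps(__m128 d)
{
    const __m128 two = _mm_set1_ps(2.0f);
    __m128       x0  = _mm_rcp_ps(d);   /* ~12-bit accurate estimate */

    return _mm_mul_ps(x0, _mm_sub_ps(two, _mm_mul_ps(x0, d)));
}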
+
+
+static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
+ real ewaldcoeff, real vol,
+ gmx_bool bEnerVir,
+ int nthread, int thread)
+{
+ /* do recip sum over local cells in grid */
+ /* y major, z middle, x minor or continuous */
+ t_complex *p0;
+ int kx, ky, kz, maxkx, maxky, maxkz;
+ int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
+ real mx, my, mz;
+ real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
+ real ets2, struct2, vfactor, ets2vf;
+ real d1, d2, energy = 0;
+ real by, bz;
+ real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
+ real rxx, ryx, ryy, rzx, rzy, rzz;
+ pme_work_t *work;
+ real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
+ real mhxk, mhyk, mhzk, m2k;
+ real corner_fac;
+ ivec complex_order;
+ ivec local_ndata, local_offset, local_size;
+ real elfac;
+
+ elfac = ONE_4PI_EPS0/pme->epsilon_r;
+
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
+
+ /* Dimensions should be identical for A/B grid, so we just use A here */
+ gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
+ complex_order,
+ local_ndata,
+ local_offset,
+ local_size);
+
+ rxx = pme->recipbox[XX][XX];
+ ryx = pme->recipbox[YY][XX];
+ ryy = pme->recipbox[YY][YY];
+ rzx = pme->recipbox[ZZ][XX];
+ rzy = pme->recipbox[ZZ][YY];
+ rzz = pme->recipbox[ZZ][ZZ];
+
+ maxkx = (nx+1)/2;
+ maxky = (ny+1)/2;
+ maxkz = nz/2+1;
+
+ work = &pme->work[thread];
+ mhx = work->mhx;
+ mhy = work->mhy;
+ mhz = work->mhz;
+ m2 = work->m2;
+ denom = work->denom;
+ tmp1 = work->tmp1;
+ eterm = work->eterm;
+ m2inv = work->m2inv;
+
+ iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
+ iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
+
+ for (iyz = iyz0; iyz < iyz1; iyz++)
+ {
+ iy = iyz/local_ndata[ZZ];
+ iz = iyz - iy*local_ndata[ZZ];
+
+ ky = iy + local_offset[YY];
+
+ if (ky < maxky)
+ {
+ my = ky;
+ }
+ else
+ {
+ my = (ky - ny);
+ }
+
+ by = M_PI*vol*pme->bsp_mod[YY][ky];
+
+ kz = iz + local_offset[ZZ];
+
+ mz = kz;
+
+ bz = pme->bsp_mod[ZZ][kz];
+
+ /* 0.5 correction for corner points */
+ corner_fac = 1;
+ if (kz == 0 || kz == (nz+1)/2)
+ {
+ corner_fac = 0.5;
+ }
+
+ p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+
+ /* We should skip the k-space point (0,0,0) */
+ if (local_offset[XX] > 0 || ky > 0 || kz > 0)
+ {
+ kxstart = local_offset[XX];
+ }
+ else
+ {
+ kxstart = local_offset[XX] + 1;
+ p0++;
+ }
+ kxend = local_offset[XX] + local_ndata[XX];
+
+ if (bEnerVir)
+ {
+ /* More expensive inner loop, especially because of the storage
+             * of the mh elements in arrays.
+ * Because x is the minor grid index, all mh elements
+ * depend on kx for triclinic unit cells.
+ */
+
+ /* Two explicit loops to avoid a conditional inside the loop */
+ for (kx = kxstart; kx < maxkx; kx++)
+ {
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ }
+
+ for (kx = maxkx; kx < kxend; kx++)
+ {
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ }
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ m2inv[kx] = 1.0/m2[kx];
+ }
+
+ calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ p0->re = d1*eterm[kx];
+ p0->im = d2*eterm[kx];
+
+ struct2 = 2.0*(d1*d1+d2*d2);
+
+ tmp1[kx] = eterm[kx]*struct2;
+ }
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ ets2 = corner_fac*tmp1[kx];
+ vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
+ energy += ets2;
+
+ ets2vf = ets2*vfactor;
+ virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
+ virxy += ets2vf*mhx[kx]*mhy[kx];
+ virxz += ets2vf*mhx[kx]*mhz[kx];
+ viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
+ viryz += ets2vf*mhy[kx]*mhz[kx];
+ virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
+ }
+ }
+ else
+ {
+ /* We don't need to calculate the energy and the virial.
+ * In this case the triclinic overhead is small.
+ */
+
+ /* Two explicit loops to avoid a conditional inside the loop */
+
+ for (kx = kxstart; kx < maxkx; kx++)
+ {
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ }
+
+ for (kx = maxkx; kx < kxend; kx++)
+ {
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ }
+
+ calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ p0->re = d1*eterm[kx];
+ p0->im = d2*eterm[kx];
+ }
+ }
+ }
+
+ if (bEnerVir)
+ {
+ /* Update virial with local values.
+ * The virial is symmetric by definition.
+ * this virial seems ok for isotropic scaling, but I'm
+     * experiencing problems on semi-isotropic membranes.
+ * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
+ */
+ work->vir[XX][XX] = 0.25*virxx;
+ work->vir[YY][YY] = 0.25*viryy;
+ work->vir[ZZ][ZZ] = 0.25*virzz;
+ work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
+ work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
+ work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+
+ /* This energy should be corrected for a charged system */
+ work->energy = 0.5*energy;
+ }
+
+ /* Return the loop count */
+ return local_ndata[YY]*local_ndata[XX];
+}
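+
+/* In continuum notation, the loop above evaluates the standard
+ * smooth-PME reciprocal-space sum (Essmann et al., J. Chem. Phys.
+ * 103, 8577 (1995)), roughly
+ *   E_rec = 1/(2 pi V) sum_{m != 0} exp(-pi^2 m^2/beta^2)/m^2 * B(m) |S(m)|^2
+ * with denom[] folding in pi*V*m^2 and the B-spline moduli,
+ * eterm[kx] = elfac*exp(tmp1[kx])/denom[kx] carrying the full
+ * m-dependent prefactor, and struct2 = 2 |S(m)|^2.
+ */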
+
+static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
+ real *mesh_energy, matrix vir)
+{
+ /* This function sums output over threads
+ * and should therefore only be called after thread synchronization.
+ */
+ int thread;
+
+ *mesh_energy = pme->work[0].energy;
+ copy_mat(pme->work[0].vir, vir);
+
+ for (thread = 1; thread < nthread; thread++)
+ {
+ *mesh_energy += pme->work[thread].energy;
+ m_add(vir, pme->work[thread].vir, vir);
+ }
+}
+
+#define DO_FSPLINE(order) \
+ for (ithx = 0; (ithx < order); ithx++) \
+ { \
+ index_x = (i0+ithx)*pny*pnz; \
+ tx = thx[ithx]; \
+ dx = dthx[ithx]; \
+ \
+ for (ithy = 0; (ithy < order); ithy++) \
+ { \
+ index_xy = index_x+(j0+ithy)*pnz; \
+ ty = thy[ithy]; \
+ dy = dthy[ithy]; \
+ fxy1 = fz1 = 0; \
+ \
+ for (ithz = 0; (ithz < order); ithz++) \
+ { \
+ gval = grid[index_xy+(k0+ithz)]; \
+ fxy1 += thz[ithz]*gval; \
+ fz1 += dthz[ithz]*gval; \
+ } \
+ fx += dx*ty*fxy1; \
+ fy += tx*dy*fxy1; \
+ fz += tx*ty*fz1; \
+ } \
+ }
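+
+/* DO_FSPLINE accumulates the gradient of the interpolated potential
+ * in fractional grid coordinates:
+ *   fx = sum_{x,y,z} dthx*thy*thz*grid  (and cyclically for fy, fz);
+ * gather_f_bsplines below converts these to Cartesian forces through
+ * the reciprocal box vectors and the factor -q.
+ */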
+
+
+static void gather_f_bsplines(gmx_pme_t pme, real *grid,
+ gmx_bool bClearF, pme_atomcomm_t *atc,
+ splinedata_t *spline,
+ real scale)
+{
+ /* sum forces for local particles */
+ int nn, n, ithx, ithy, ithz, i0, j0, k0;
+ int index_x, index_xy;
+ int nx, ny, nz, pnx, pny, pnz;
+ int * idxptr;
+ real tx, ty, dx, dy, qn;
+ real fx, fy, fz, gval;
+ real fxy1, fz1;
+ real *thx, *thy, *thz, *dthx, *dthy, *dthz;
+ int norder;
+ real rxx, ryx, ryy, rzx, rzy, rzz;
+ int order;
+
+ pme_spline_work_t *work;
+
+ work = pme->spline_work;
+
+ order = pme->pme_order;
+ thx = spline->theta[XX];
+ thy = spline->theta[YY];
+ thz = spline->theta[ZZ];
+ dthx = spline->dtheta[XX];
+ dthy = spline->dtheta[YY];
+ dthz = spline->dtheta[ZZ];
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
+ pnx = pme->pmegrid_nx;
+ pny = pme->pmegrid_ny;
+ pnz = pme->pmegrid_nz;
+
+ rxx = pme->recipbox[XX][XX];
+ ryx = pme->recipbox[YY][XX];
+ ryy = pme->recipbox[YY][YY];
+ rzx = pme->recipbox[ZZ][XX];
+ rzy = pme->recipbox[ZZ][YY];
+ rzz = pme->recipbox[ZZ][ZZ];
+
+ for (nn = 0; nn < spline->n; nn++)
+ {
+ n = spline->ind[nn];
+ qn = scale*atc->q[n];
+
+ if (bClearF)
+ {
+ atc->f[n][XX] = 0;
+ atc->f[n][YY] = 0;
+ atc->f[n][ZZ] = 0;
+ }
+ if (qn != 0)
+ {
+ fx = 0;
+ fy = 0;
+ fz = 0;
+ idxptr = atc->idx[n];
+ norder = nn*order;
+
+ i0 = idxptr[XX];
+ j0 = idxptr[YY];
+ k0 = idxptr[ZZ];
+
+ /* Pointer arithmetic alert, next six statements */
+ thx = spline->theta[XX] + norder;
+ thy = spline->theta[YY] + norder;
+ thz = spline->theta[ZZ] + norder;
+ dthx = spline->dtheta[XX] + norder;
+ dthy = spline->dtheta[YY] + norder;
+ dthz = spline->dtheta[ZZ] + norder;
+
+ switch (order)
+ {
+ case 4:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+#ifdef PME_SSE_UNALIGNED
+#define PME_GATHER_F_SSE_ORDER4
+#else
+#define PME_GATHER_F_SSE_ALIGNED
+#define PME_ORDER 4
+#endif
+#include "pme_sse_single.h"
+#else
+ DO_FSPLINE(4);
+#endif
+ break;
+ case 5:
- #ifdef PME_SSE
++#ifdef PME_SSE_SPREAD_GATHER
+#define PME_GATHER_F_SSE_ALIGNED
+#define PME_ORDER 5
+#include "pme_sse_single.h"
+#else
+ DO_FSPLINE(5);
+#endif
+ break;
+ default:
+ DO_FSPLINE(order);
+ break;
+ }
+
+ atc->f[n][XX] += -qn*( fx*nx*rxx );
+ atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
+ atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+ }
+ }
+    /* Since the energy and not the forces are interpolated,
+     * the net force might not be exactly zero.
+     * This can be solved by also interpolating F, but
+     * that comes at a cost.
+     * A better hack is to remove the net force every
+     * step, but that must be done at a higher level
+     * since this routine doesn't see all atoms if running
+     * in parallel. How important this is remains unclear. EL 990726
+     */
+}
+
+
+static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
+ pme_atomcomm_t *atc)
+{
+ splinedata_t *spline;
+ int n, ithx, ithy, ithz, i0, j0, k0;
+ int index_x, index_xy;
+ int * idxptr;
+ real energy, pot, tx, ty, qn, gval;
+ real *thx, *thy, *thz;
+ int norder;
+ int order;
+
+ spline = &atc->spline[0];
+
+ order = pme->pme_order;
+
+ energy = 0;
+ for (n = 0; (n < atc->n); n++)
+ {
+ qn = atc->q[n];
+
+ if (qn != 0)
+ {
+ idxptr = atc->idx[n];
+ norder = n*order;
+
+ i0 = idxptr[XX];
+ j0 = idxptr[YY];
+ k0 = idxptr[ZZ];
+
+ /* Pointer arithmetic alert, next three statements */
+ thx = spline->theta[XX] + norder;
+ thy = spline->theta[YY] + norder;
+ thz = spline->theta[ZZ] + norder;
+
+ pot = 0;
+ for (ithx = 0; (ithx < order); ithx++)
+ {
+ index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
+ tx = thx[ithx];
+
+ for (ithy = 0; (ithy < order); ithy++)
+ {
+ index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
+ ty = thy[ithy];
+
+ for (ithz = 0; (ithz < order); ithz++)
+ {
+ gval = grid[index_xy+(k0+ithz)];
+ pot += tx*ty*thz[ithz]*gval;
+ }
+
+ }
+ }
+
+ energy += pot*qn;
+ }
+ }
+
+ return energy;
+}
+
+/* Macro to force loop unrolling by fixing order.
+ * This gives a significant performance gain.
+ */
+#define CALC_SPLINE(order) \
+ { \
+ int j, k, l; \
+ real dr, div; \
+ real data[PME_ORDER_MAX]; \
+ real ddata[PME_ORDER_MAX]; \
+ \
+ for (j = 0; (j < DIM); j++) \
+ { \
+ dr = xptr[j]; \
+ \
+ /* dr is relative offset from lower cell limit */ \
+ data[order-1] = 0; \
+ data[1] = dr; \
+ data[0] = 1 - dr; \
+ \
+ for (k = 3; (k < order); k++) \
+ { \
+ div = 1.0/(k - 1.0); \
+ data[k-1] = div*dr*data[k-2]; \
+ for (l = 1; (l < (k-1)); l++) \
+ { \
+ data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
+ data[k-l-1]); \
+ } \
+ data[0] = div*(1-dr)*data[0]; \
+ } \
+ /* differentiate */ \
+ ddata[0] = -data[0]; \
+ for (k = 1; (k < order); k++) \
+ { \
+ ddata[k] = data[k-1] - data[k]; \
+ } \
+ \
+ div = 1.0/(order - 1); \
+ data[order-1] = div*dr*data[order-2]; \
+ for (l = 1; (l < (order-1)); l++) \
+ { \
+ data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
+ (order-l-dr)*data[order-l-1]); \
+ } \
+ data[0] = div*(1 - dr)*data[0]; \
+ \
+ for (k = 0; k < order; k++) \
+ { \
+ theta[j][i*order+k] = data[k]; \
+ dtheta[j][i*order+k] = ddata[k]; \
+ } \
+ } \
+ }
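+
+/* CALC_SPLINE implements the standard cardinal B-spline recursion
+ * (with M_k the order-k B-spline evaluated at offset dr):
+ *   M_k(u)  = u/(k-1) * M_{k-1}(u) + (k-u)/(k-1) * M_{k-1}(u-1)
+ *   M_k'(u) = M_{k-1}(u) - M_{k-1}(u-1)
+ * data[] holds the spline values and ddata[] the derivatives; the
+ * last recursion step is written out separately so that the
+ * derivative is taken from the order-1 values.
+ */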
+
+void make_bsplines(splinevec theta, splinevec dtheta, int order,
+ rvec fractx[], int nr, int ind[], real charge[],
+ gmx_bool bFreeEnergy)
+{
+ /* construct splines for local atoms */
+ int i, ii;
+ real *xptr;
+
+ for (i = 0; i < nr; i++)
+ {
+ /* With free energy we do not use the charge check.
+ * In most cases this will be more efficient than calling make_bsplines
+ * twice, since usually more than half the particles have charges.
+ */
+ ii = ind[i];
+ if (bFreeEnergy || charge[ii] != 0.0)
+ {
+ xptr = fractx[ii];
+ switch (order)
+ {
+ case 4: CALC_SPLINE(4); break;
+ case 5: CALC_SPLINE(5); break;
+ default: CALC_SPLINE(order); break;
+ }
+ }
+ }
+}
+
+
+void make_dft_mod(real *mod, real *data, int ndata)
+{
+ int i, j;
+ real sc, ss, arg;
+
+ for (i = 0; i < ndata; i++)
+ {
+ sc = ss = 0;
+ for (j = 0; j < ndata; j++)
+ {
+ arg = (2.0*M_PI*i*j)/ndata;
+ sc += data[j]*cos(arg);
+ ss += data[j]*sin(arg);
+ }
+ mod[i] = sc*sc+ss*ss;
+ }
+ for (i = 0; i < ndata; i++)
+ {
+ if (mod[i] < 1e-7)
+ {
+ mod[i] = (mod[i-1]+mod[i+1])*0.5;
+ }
+ }
+}
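+
+/* mod[i] computed above is the squared modulus of the DFT of data:
+ *   mod[i] = | sum_j data[j] exp(2 pi I i j / ndata) |^2 .
+ * The final loop patches near-zero moduli (which would blow up when
+ * used as denominators) with the average of their neighbours; note
+ * that it implicitly assumes such values never occur at i == 0 or
+ * i == ndata-1, where mod[i-1] or mod[i+1] would be out of bounds.
+ */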
+
+
+static void make_bspline_moduli(splinevec bsp_mod,
+ int nx, int ny, int nz, int order)
+{
+ int nmax = max(nx, max(ny, nz));
+ real *data, *ddata, *bsp_data;
+ int i, k, l;
+ real div;
+
+ snew(data, order);
+ snew(ddata, order);
+ snew(bsp_data, nmax);
+
+ data[order-1] = 0;
+ data[1] = 0;
+ data[0] = 1;
+
+ for (k = 3; k < order; k++)
+ {
+ div = 1.0/(k-1.0);
+ data[k-1] = 0;
+ for (l = 1; l < (k-1); l++)
+ {
+ data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
+ }
+ data[0] = div*data[0];
+ }
+ /* differentiate */
+ ddata[0] = -data[0];
+ for (k = 1; k < order; k++)
+ {
+ ddata[k] = data[k-1]-data[k];
+ }
+ div = 1.0/(order-1);
+ data[order-1] = 0;
+ for (l = 1; l < (order-1); l++)
+ {
+ data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
+ }
+ data[0] = div*data[0];
+
+ for (i = 0; i < nmax; i++)
+ {
+ bsp_data[i] = 0;
+ }
+ for (i = 1; i <= order; i++)
+ {
+ bsp_data[i] = data[i-1];
+ }
+
+ make_dft_mod(bsp_mod[XX], bsp_data, nx);
+ make_dft_mod(bsp_mod[YY], bsp_data, ny);
+ make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
+
+ sfree(data);
+ sfree(ddata);
+ sfree(bsp_data);
+}
+
+
+/* Return the P3M optimal influence function */
+static double do_p3m_influence(double z, int order)
+{
+ double z2, z4;
+
+ z2 = z*z;
+ z4 = z2*z2;
+
+ /* The formula and most constants can be found in:
+ * Ballenegger et al., JCTC 8, 936 (2012)
+ */
+ switch (order)
+ {
+ case 2:
+ return 1.0 - 2.0*z2/3.0;
+ break;
+ case 3:
+ return 1.0 - z2 + 2.0*z4/15.0;
+ break;
+ case 4:
+ return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
+ break;
+ case 5:
+ return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
+ break;
+ case 6:
+ return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
+ break;
+ case 7:
+ return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
+ case 8:
+ return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
+ break;
+ }
+
+ return 0.0;
+}
+
+/* Calculate the P3M B-spline moduli for one dimension */
+static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
+{
+ double zarg, zai, sinzai, infl;
+ int maxk, i;
+
+ if (order > 8)
+ {
+ gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
+ }
+
+ zarg = M_PI/n;
+
+ maxk = (n + 1)/2;
+
+ for (i = -maxk; i < 0; i++)
+ {
+ zai = zarg*i;
+ sinzai = sin(zai);
+ infl = do_p3m_influence(sinzai, order);
+ bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
+ }
+ bsp_mod[0] = 1.0;
+ for (i = 1; i < maxk; i++)
+ {
+ zai = zarg*i;
+ sinzai = sin(zai);
+ infl = do_p3m_influence(sinzai, order);
+ bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
+ }
+}
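+
+/* For wave number i this yields the P3M analogue of the B-spline
+ * modulus:
+ *   bsp_mod[i] = infl(z)^2 * (z/sin z)^(2*order),  z = pi*i/n,
+ * i.e. the influence function combined with the inverse of the
+ * squared B-spline Fourier transform, which is (sin z / z)^order
+ * up to a phase factor.
+ */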
+
+/* Calculate the P3M B-spline moduli */
+static void make_p3m_bspline_moduli(splinevec bsp_mod,
+ int nx, int ny, int nz, int order)
+{
+ make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
+ make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
+ make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
+}
+
+
+static void setup_coordinate_communication(pme_atomcomm_t *atc)
+{
+ int nslab, n, i;
+ int fw, bw;
+
+ nslab = atc->nslab;
+
+ n = 0;
+ for (i = 1; i <= nslab/2; i++)
+ {
+ fw = (atc->nodeid + i) % nslab;
+ bw = (atc->nodeid - i + nslab) % nslab;
+ if (n < nslab - 1)
+ {
+ atc->node_dest[n] = fw;
+ atc->node_src[n] = bw;
+ n++;
+ }
+ if (n < nslab - 1)
+ {
+ atc->node_dest[n] = bw;
+ atc->node_src[n] = fw;
+ n++;
+ }
+ }
+}
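+
+/* The loop above orders communication partners by increasing slab
+ * distance, alternating forward and backward. For example, with
+ * nslab = 4 and nodeid = 0 this gives
+ *   node_dest = {1, 3, 2},  node_src = {3, 1, 2}.
+ */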
+
+int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
+{
+ int thread;
+
+ if (NULL != log)
+ {
+ fprintf(log, "Destroying PME data structures.\n");
+ }
+
+ sfree((*pmedata)->nnx);
+ sfree((*pmedata)->nny);
+ sfree((*pmedata)->nnz);
+
+ pmegrids_destroy(&(*pmedata)->pmegridA);
+
+ sfree((*pmedata)->fftgridA);
+ sfree((*pmedata)->cfftgridA);
+ gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
+
+ if ((*pmedata)->pmegridB.grid.grid != NULL)
+ {
+ pmegrids_destroy(&(*pmedata)->pmegridB);
+ sfree((*pmedata)->fftgridB);
+ sfree((*pmedata)->cfftgridB);
+ gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
+ }
+ for (thread = 0; thread < (*pmedata)->nthread; thread++)
+ {
+ free_work(&(*pmedata)->work[thread]);
+ }
+ sfree((*pmedata)->work);
+
+ sfree(*pmedata);
+ *pmedata = NULL;
+
+ return 0;
+}
+
+static int mult_up(int n, int f)
+{
+ return ((n + f - 1)/f)*f;
+}
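+
+/* mult_up(n, f) rounds n up to the nearest multiple of f, e.g.
+ * mult_up(10, 4) == 12; below it measures the padded grid sizes the
+ * parallel FFT decomposition effectively operates on.
+ */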
+
+
+static double pme_load_imbalance(gmx_pme_t pme)
+{
+ int nma, nmi;
+ double n1, n2, n3;
+
+ nma = pme->nnodes_major;
+ nmi = pme->nnodes_minor;
+
+ n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
+ n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
+ n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
+
+ /* pme_solve is roughly double the cost of an fft */
+
+ return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
+}
+
+static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
+ int dimind, gmx_bool bSpread)
+{
+ int nk, k, s, thread;
+
+ atc->dimind = dimind;
+ atc->nslab = 1;
+ atc->nodeid = 0;
+ atc->pd_nalloc = 0;
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ atc->mpi_comm = pme->mpi_comm_d[dimind];
+ MPI_Comm_size(atc->mpi_comm, &atc->nslab);
+ MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
+ }
+ if (debug)
+ {
+ fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
+ }
+#endif
+
+ atc->bSpread = bSpread;
+ atc->pme_order = pme->pme_order;
+
+ if (atc->nslab > 1)
+ {
+ /* These three allocations are not required for particle decomp. */
+ snew(atc->node_dest, atc->nslab);
+ snew(atc->node_src, atc->nslab);
+ setup_coordinate_communication(atc);
+
+ snew(atc->count_thread, pme->nthread);
+ for (thread = 0; thread < pme->nthread; thread++)
+ {
+ snew(atc->count_thread[thread], atc->nslab);
+ }
+ atc->count = atc->count_thread[0];
+ snew(atc->rcount, atc->nslab);
+ snew(atc->buf_index, atc->nslab);
+ }
+
+ atc->nthread = pme->nthread;
+ if (atc->nthread > 1)
+ {
+ snew(atc->thread_plist, atc->nthread);
+ }
+ snew(atc->spline, atc->nthread);
+ for (thread = 0; thread < atc->nthread; thread++)
+ {
+ if (atc->nthread > 1)
+ {
+ snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
+ atc->thread_plist[thread].n += GMX_CACHE_SEP;
+ }
+ snew(atc->spline[thread].thread_one, pme->nthread);
+ atc->spline[thread].thread_one[thread] = 1;
+ }
+}
+
+static void
+init_overlap_comm(pme_overlap_t * ol,
+ int norder,
+#ifdef GMX_MPI
+ MPI_Comm comm,
+#endif
+ int nnodes,
+ int nodeid,
+ int ndata,
+ int commplainsize)
+{
+ int lbnd, rbnd, maxlr, b, i;
+ int exten;
+ int nn, nk;
+ pme_grid_comm_t *pgc;
+ gmx_bool bCont;
+ int fft_start, fft_end, send_index1, recv_index1;
+#ifdef GMX_MPI
+ MPI_Status stat;
+
+ ol->mpi_comm = comm;
+#endif
+
+ ol->nnodes = nnodes;
+ ol->nodeid = nodeid;
+
+ /* Linear translation of the PME grid won't affect reciprocal space
+ * calculations, so to optimize we only interpolate "upwards",
+ * which also means we only have to consider overlap in one direction.
+ * I.e., particles on this node might also be spread to grid indices
+ * that belong to higher nodes (modulo nnodes)
+ */
+
+ snew(ol->s2g0, ol->nnodes+1);
+ snew(ol->s2g1, ol->nnodes);
+ if (debug)
+ {
+ fprintf(debug, "PME slab boundaries:");
+ }
+ for (i = 0; i < nnodes; i++)
+ {
+ /* s2g0 the local interpolation grid start.
+ * s2g1 the local interpolation grid end.
+ * Because grid overlap communication only goes forward,
+         * the grid slabs for the FFTs should be rounded down.
+ */
+ ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
+ ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
+
+ if (debug)
+ {
+ fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
+ }
+ }
+ ol->s2g0[nnodes] = ndata;
+ if (debug)
+ {
+ fprintf(debug, "\n");
+ }
+
+ /* Determine with how many nodes we need to communicate the grid overlap */
+ b = 0;
+ do
+ {
+ b++;
+ bCont = FALSE;
+ for (i = 0; i < nnodes; i++)
+ {
+ if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
+ (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
+ {
+ bCont = TRUE;
+ }
+ }
+ }
+ while (bCont && b < nnodes);
+ ol->noverlap_nodes = b - 1;
+
+ snew(ol->send_id, ol->noverlap_nodes);
+ snew(ol->recv_id, ol->noverlap_nodes);
+ for (b = 0; b < ol->noverlap_nodes; b++)
+ {
+ ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
+ ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
+ }
+ snew(ol->comm_data, ol->noverlap_nodes);
+
+ ol->send_size = 0;
+ for (b = 0; b < ol->noverlap_nodes; b++)
+ {
+ pgc = &ol->comm_data[b];
+ /* Send */
+ fft_start = ol->s2g0[ol->send_id[b]];
+ fft_end = ol->s2g0[ol->send_id[b]+1];
+ if (ol->send_id[b] < nodeid)
+ {
+ fft_start += ndata;
+ fft_end += ndata;
+ }
+ send_index1 = ol->s2g1[nodeid];
+ send_index1 = min(send_index1, fft_end);
+ pgc->send_index0 = fft_start;
+ pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
+ ol->send_size += pgc->send_nindex;
+
+ /* We always start receiving to the first index of our slab */
+ fft_start = ol->s2g0[ol->nodeid];
+ fft_end = ol->s2g0[ol->nodeid+1];
+ recv_index1 = ol->s2g1[ol->recv_id[b]];
+ if (ol->recv_id[b] > nodeid)
+ {
+ recv_index1 -= ndata;
+ }
+ recv_index1 = min(recv_index1, fft_end);
+ pgc->recv_index0 = fft_start;
+ pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
+ }
+
+#ifdef GMX_MPI
+ /* Communicate the buffer sizes to receive */
+ for (b = 0; b < ol->noverlap_nodes; b++)
+ {
+ MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
+ &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
+ ol->mpi_comm, &stat);
+ }
+#endif
+
+    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
+ snew(ol->sendbuf, norder*commplainsize);
+ snew(ol->recvbuf, norder*commplainsize);
+}
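+
+/* A concrete example of the slab bookkeeping above: with ndata = 15,
+ * nnodes = 4 and norder = 4 we get s2g0 = {0, 3, 7, 11, 15} and
+ * s2g1 = {7, 11, 15, 18}. Slab 0 then owns FFT grid lines [0,3) but
+ * spreads onto interpolation lines [0,7); the surplus is what the
+ * forward overlap communication set up here transfers.
+ */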
+
+static void
+make_gridindex5_to_localindex(int n, int local_start, int local_range,
+ int **global_to_local,
+ real **fraction_shift)
+{
+ int i;
+ int * gtl;
+ real * fsh;
+
+ snew(gtl, 5*n);
+ snew(fsh, 5*n);
+ for (i = 0; (i < 5*n); i++)
+ {
+ /* Determine the global to local grid index */
+ gtl[i] = (i - local_start + n) % n;
+ /* For coordinates that fall within the local grid the fraction
+ * is correct, we don't need to shift it.
+ */
+ fsh[i] = 0;
+ if (local_range < n)
+ {
+ /* Due to rounding issues i could be 1 beyond the lower or
+ * upper boundary of the local grid. Correct the index for this.
+ * If we shift the index, we need to shift the fraction by
+ * the same amount in the other direction to not affect
+ * the weights.
+ * Note that due to this shifting the weights at the end of
+ * the spline might change, but that will only involve values
+ * between zero and values close to the precision of a real,
+ * which is anyhow the accuracy of the whole mesh calculation.
+ */
+ /* With local_range=0 we should not change i=local_start */
+ if (i % n != local_start)
+ {
+ if (gtl[i] == n-1)
+ {
+ gtl[i] = 0;
+ fsh[i] = -1;
+ }
+ else if (gtl[i] == local_range)
+ {
+ gtl[i] = local_range - 1;
+ fsh[i] = 1;
+ }
+ }
+ }
+ }
+
+ *global_to_local = gtl;
+ *fraction_shift = fsh;
+}
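+
+/* The tables span 5*n entries so that callers can index them with
+ * shifted global grid indices without taking a modulo per atom. In
+ * the trivial single-node case, e.g. n = 10, local_start = 0 and
+ * local_range = 10, this reduces to gtl[i] = i % 10 and fsh[i] = 0.
+ */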
+
+static pme_spline_work_t *make_pme_spline_work(int order)
+{
+ pme_spline_work_t *work;
+
++#ifdef PME_SSE_SPREAD_GATHER
+ float tmp[8];
+ __m128 zero_SSE;
+ int of, i;
+
+ snew_aligned(work, 1, 16);
+
+ zero_SSE = _mm_setzero_ps();
+
+ /* Generate bit masks to mask out the unused grid entries,
+     * as we only operate on order of the 8 grid entries that are
+     * loaded into two SSE float registers.
+ */
+ for (of = 0; of < 8-(order-1); of++)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ tmp[i] = (i >= of && i < of+order ? 1 : 0);
+ }
+ work->mask_SSE0[of] = _mm_loadu_ps(tmp);
+ work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
+ work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
+ work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
+ }
+#else
+ work = NULL;
+#endif
+
+ return work;
+}
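+
+/* Example of the masks built above for order = 4: of runs over the 5
+ * possible offsets, and for of = 1 the pre-compare pattern is
+ * tmp = {0,1,1,1, 1,0,0,0}, i.e. mask_SSE0 selects grid entries 1-3
+ * and mask_SSE1 selects entry 4.
+ */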
+
+void gmx_pme_check_restrictions(int pme_order,
+ int nkx, int nky, int nkz,
+ int nnodes_major,
+ int nnodes_minor,
+ gmx_bool bUseThreads,
+ gmx_bool bFatal,
+ gmx_bool *bValidSettings)
+{
+ if (pme_order > PME_ORDER_MAX)
+ {
+ if (!bFatal)
+ {
+ *bValidSettings = FALSE;
+ return;
+ }
+ gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+ pme_order, PME_ORDER_MAX);
+ }
+
+ if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
+ nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
+ nkz <= pme_order)
+ {
+ if (!bFatal)
+ {
+ *bValidSettings = FALSE;
+ return;
+ }
+ gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
+ pme_order);
+ }
+
+ /* Check for a limitation of the (current) sum_fftgrid_dd code.
+ * We only allow multiple communication pulses in dim 1, not in dim 0.
+ */
+ if (bUseThreads && (nkx < nnodes_major*pme_order &&
+ nkx != nnodes_major*(pme_order - 1)))
+ {
+ if (!bFatal)
+ {
+ *bValidSettings = FALSE;
+ return;
+ }
+        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+                  nkx/(double)nnodes_major, pme_order);
+ }
+
+ if (bValidSettings != NULL)
+ {
+ *bValidSettings = TRUE;
+ }
+
+ return;
+}
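+
+/* A typical non-fatal use, e.g. when scanning candidate grids during
+ * PME tuning, would be:
+ *
+ *   gmx_bool bValid;
+ *   gmx_pme_check_restrictions(pme_order, nkx, nky, nkz,
+ *                              nnodes_major, nnodes_minor,
+ *                              bUseThreads, FALSE, &bValid);
+ *
+ * With bFatal == FALSE the function reports through bValid instead of
+ * calling gmx_fatal.
+ */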
+
+int gmx_pme_init(gmx_pme_t * pmedata,
+ t_commrec * cr,
+ int nnodes_major,
+ int nnodes_minor,
+ t_inputrec * ir,
+ int homenr,
+ gmx_bool bFreeEnergy,
+ gmx_bool bReproducible,
+ int nthread)
+{
+ gmx_pme_t pme = NULL;
+
+ int use_threads, sum_use_threads;
+ ivec ndata;
+
+ if (debug)
+ {
+ fprintf(debug, "Creating PME data structures.\n");
+ }
+ snew(pme, 1);
+
+ pme->redist_init = FALSE;
+ pme->sum_qgrid_tmp = NULL;
+ pme->sum_qgrid_dd_tmp = NULL;
+ pme->buf_nalloc = 0;
+ pme->redist_buf_nalloc = 0;
+
+ pme->nnodes = 1;
+ pme->bPPnode = TRUE;
+
+ pme->nnodes_major = nnodes_major;
+ pme->nnodes_minor = nnodes_minor;
+
+#ifdef GMX_MPI
+ if (nnodes_major*nnodes_minor > 1)
+ {
+ pme->mpi_comm = cr->mpi_comm_mygroup;
+
+ MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
+ MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
+ if (pme->nnodes != nnodes_major*nnodes_minor)
+ {
+ gmx_incons("PME node count mismatch");
+ }
+ }
+ else
+ {
+ pme->mpi_comm = MPI_COMM_NULL;
+ }
+#endif
+
+ if (pme->nnodes == 1)
+ {
+#ifdef GMX_MPI
+ pme->mpi_comm_d[0] = MPI_COMM_NULL;
+ pme->mpi_comm_d[1] = MPI_COMM_NULL;
+#endif
+ pme->ndecompdim = 0;
+ pme->nodeid_major = 0;
+ pme->nodeid_minor = 0;
+#ifdef GMX_MPI
+ pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
+#endif
+ }
+ else
+ {
+ if (nnodes_minor == 1)
+ {
+#ifdef GMX_MPI
+ pme->mpi_comm_d[0] = pme->mpi_comm;
+ pme->mpi_comm_d[1] = MPI_COMM_NULL;
+#endif
+ pme->ndecompdim = 1;
+ pme->nodeid_major = pme->nodeid;
+ pme->nodeid_minor = 0;
+
+ }
+ else if (nnodes_major == 1)
+ {
+#ifdef GMX_MPI
+ pme->mpi_comm_d[0] = MPI_COMM_NULL;
+ pme->mpi_comm_d[1] = pme->mpi_comm;
+#endif
+ pme->ndecompdim = 1;
+ pme->nodeid_major = 0;
+ pme->nodeid_minor = pme->nodeid;
+ }
+ else
+ {
+ if (pme->nnodes % nnodes_major != 0)
+ {
+ gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+ }
+ pme->ndecompdim = 2;
+
+#ifdef GMX_MPI
+ MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
+ pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
+ MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
+ pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
+
+ MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
+ MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
+ MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
+ MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
+#endif
+ }
+ pme->bPPnode = (cr->duty & DUTY_PP);
+ }
+
+ pme->nthread = nthread;
+
+ /* Check if any of the PME MPI ranks uses threads */
+ use_threads = (pme->nthread > 1 ? 1 : 0);
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
+ MPI_SUM, pme->mpi_comm);
+ }
+ else
+#endif
+ {
+ sum_use_threads = use_threads;
+ }
+ pme->bUseThreads = (sum_use_threads > 0);
+
+ if (ir->ePBC == epbcSCREW)
+ {
+ gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
+ }
+
+ pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
+ pme->nkx = ir->nkx;
+ pme->nky = ir->nky;
+ pme->nkz = ir->nkz;
+ pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
+ pme->pme_order = ir->pme_order;
+ pme->epsilon_r = ir->epsilon_r;
+
+ /* If we violate restrictions, generate a fatal error here */
+ gmx_pme_check_restrictions(pme->pme_order,
+ pme->nkx, pme->nky, pme->nkz,
+ pme->nnodes_major,
+ pme->nnodes_minor,
+ pme->bUseThreads,
+ TRUE,
+ NULL);
+
+ if (pme->nnodes > 1)
+ {
+ double imbal;
+
+#ifdef GMX_MPI
+ MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
+ MPI_Type_commit(&(pme->rvec_mpi));
+#endif
+
+ /* Note that the charge spreading and force gathering, which usually
+ * takes about the same amount of time as FFT+solve_pme,
+ * is always fully load balanced
+ * (unless the charge distribution is inhomogeneous).
+ */
+
+ imbal = pme_load_imbalance(pme);
+ if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
+ {
+ fprintf(stderr,
+ "\n"
+ "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
+ " For optimal PME load balancing\n"
+ " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
+ " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+ "\n",
+ (int)((imbal-1)*100 + 0.5),
+ pme->nkx, pme->nky, pme->nnodes_major,
+ pme->nky, pme->nkz, pme->nnodes_minor);
+ }
+ }
+
+    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
+ /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
+ * y is always copied through a buffer: we don't need padding in z,
+ * but we do need the overlap in x because of the communication order.
+ */
+ init_overlap_comm(&pme->overlap[0], pme->pme_order,
+#ifdef GMX_MPI
+ pme->mpi_comm_d[0],
+#endif
+ pme->nnodes_major, pme->nodeid_major,
+ pme->nkx,
+ (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
+
+ /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
+ * We do this with an offset buffer of equal size, so we need to allocate
+ * extra for the offset. That's what the (+1)*pme->nkz is for.
+ */
+ init_overlap_comm(&pme->overlap[1], pme->pme_order,
+#ifdef GMX_MPI
+ pme->mpi_comm_d[1],
+#endif
+ pme->nnodes_minor, pme->nodeid_minor,
+ pme->nky,
+ (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
+
+ /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
+ * Note that gmx_pme_check_restrictions checked for this already.
+ */
+ if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
+ {
+ gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
+ }
+
+ snew(pme->bsp_mod[XX], pme->nkx);
+ snew(pme->bsp_mod[YY], pme->nky);
+ snew(pme->bsp_mod[ZZ], pme->nkz);
+
+ /* The required size of the interpolation grid, including overlap.
+ * The allocated size (pmegrid_n?) might be slightly larger.
+ */
+ pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
+ pme->overlap[0].s2g0[pme->nodeid_major];
+ pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
+ pme->overlap[1].s2g0[pme->nodeid_minor];
+ pme->pmegrid_nz_base = pme->nkz;
+ pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
+ set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
+
+ pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
+ pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
+ pme->pmegrid_start_iz = 0;
+
+ make_gridindex5_to_localindex(pme->nkx,
+ pme->pmegrid_start_ix,
+ pme->pmegrid_nx - (pme->pme_order-1),
+ &pme->nnx, &pme->fshx);
+ make_gridindex5_to_localindex(pme->nky,
+ pme->pmegrid_start_iy,
+ pme->pmegrid_ny - (pme->pme_order-1),
+ &pme->nny, &pme->fshy);
+ make_gridindex5_to_localindex(pme->nkz,
+ pme->pmegrid_start_iz,
+ pme->pmegrid_nz_base,
+ &pme->nnz, &pme->fshz);
+
+ pmegrids_init(&pme->pmegridA,
+ pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+ pme->pmegrid_nz_base,
+ pme->pme_order,
+ pme->bUseThreads,
+ pme->nthread,
+ pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+ pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+
+ pme->spline_work = make_pme_spline_work(pme->pme_order);
+
+ ndata[0] = pme->nkx;
+ ndata[1] = pme->nky;
+ ndata[2] = pme->nkz;
+
+ /* This routine will allocate the grid data to fit the FFTs */
+ gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
+ &pme->fftgridA, &pme->cfftgridA,
+ pme->mpi_comm_d,
+ bReproducible, pme->nthread);
+
+ if (bFreeEnergy)
+ {
+ pmegrids_init(&pme->pmegridB,
+ pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+ pme->pmegrid_nz_base,
+ pme->pme_order,
+ pme->bUseThreads,
+ pme->nthread,
+ pme->nkx % pme->nnodes_major != 0,
+ pme->nky % pme->nnodes_minor != 0);
+
+ gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
+ &pme->fftgridB, &pme->cfftgridB,
+ pme->mpi_comm_d,
+ bReproducible, pme->nthread);
+ }
+ else
+ {
+ pme->pmegridB.grid.grid = NULL;
+ pme->fftgridB = NULL;
+ pme->cfftgridB = NULL;
+ }
+
+ if (!pme->bP3M)
+ {
+ /* Use plain SPME B-spline interpolation */
+ make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
+ }
+ else
+ {
+ /* Use the P3M grid-optimized influence function */
+ make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
+ }
+
+ /* Use atc[0] for spreading */
+ init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
+ if (pme->ndecompdim >= 2)
+ {
+ init_atomcomm(pme, &pme->atc[1], 1, FALSE);
+ }
+
+ if (pme->nnodes == 1)
+ {
+ pme->atc[0].n = homenr;
+ pme_realloc_atomcomm_things(&pme->atc[0]);
+ }
+
+ {
+ int thread;
+
+ /* Use fft5d, order after FFT is y major, z, x minor */
+
+ snew(pme->work, pme->nthread);
+ for (thread = 0; thread < pme->nthread; thread++)
+ {
+ realloc_work(&pme->work[thread], pme->nkx);
+ }
+ }
+
+ *pmedata = pme;
+
+ return 0;
+}
+
+static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
+{
+ int d, t;
+
+ for (d = 0; d < DIM; d++)
+ {
+ if (new->grid.n[d] > old->grid.n[d])
+ {
+ return;
+ }
+ }
+
+ sfree_aligned(new->grid.grid);
+ new->grid.grid = old->grid.grid;
+
+ if (new->grid_th != NULL && new->nthread == old->nthread)
+ {
+ sfree_aligned(new->grid_all);
+ for (t = 0; t < new->nthread; t++)
+ {
+ new->grid_th[t].grid = old->grid_th[t].grid;
+ }
+ }
+}
+
+int gmx_pme_reinit(gmx_pme_t * pmedata,
+ t_commrec * cr,
+ gmx_pme_t pme_src,
+ const t_inputrec * ir,
+ ivec grid_size)
+{
+ t_inputrec irc;
+ int homenr;
+ int ret;
+
+ irc = *ir;
+ irc.nkx = grid_size[XX];
+ irc.nky = grid_size[YY];
+ irc.nkz = grid_size[ZZ];
+
+ if (pme_src->nnodes == 1)
+ {
+ homenr = pme_src->atc[0].n;
+ }
+ else
+ {
+ homenr = -1;
+ }
+
+ ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
+ &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
+
+ if (ret == 0)
+ {
+ /* We can easily reuse the allocated pme grids in pme_src */
+ reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
+ /* We would like to reuse the fft grids, but that's harder */
+ }
+
+ return ret;
+}
+
+
+static void copy_local_grid(gmx_pme_t pme,
+ pmegrids_t *pmegrids, int thread, real *fftgrid)
+{
+ ivec local_fft_ndata, local_fft_offset, local_fft_size;
+ int fft_my, fft_mz;
+ int nsx, nsy, nsz;
+ ivec nf;
+ int offx, offy, offz, x, y, z, i0, i0t;
+ int d;
+ pmegrid_t *pmegrid;
+ real *grid_th;
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+ fft_my = local_fft_size[YY];
+ fft_mz = local_fft_size[ZZ];
+
+ pmegrid = &pmegrids->grid_th[thread];
+
+ nsx = pmegrid->s[XX];
+ nsy = pmegrid->s[YY];
+ nsz = pmegrid->s[ZZ];
+
+ for (d = 0; d < DIM; d++)
+ {
+ nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
+ local_fft_ndata[d] - pmegrid->offset[d]);
+ }
+
+ offx = pmegrid->offset[XX];
+ offy = pmegrid->offset[YY];
+ offz = pmegrid->offset[ZZ];
+
+ /* Directly copy the non-overlapping parts of the local grids.
+ * This also initializes the full grid.
+ */
+ grid_th = pmegrid->grid;
+ for (x = 0; x < nf[XX]; x++)
+ {
+ for (y = 0; y < nf[YY]; y++)
+ {
+ i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
+ i0t = (x*nsy + y)*nsz;
+ for (z = 0; z < nf[ZZ]; z++)
+ {
+ fftgrid[i0+z] = grid_th[i0t+z];
+ }
+ }
+ }
+}
+
+static void
+reduce_threadgrid_overlap(gmx_pme_t pme,
+ const pmegrids_t *pmegrids, int thread,
+ real *fftgrid, real *commbuf_x, real *commbuf_y)
+{
+ ivec local_fft_ndata, local_fft_offset, local_fft_size;
+ int fft_nx, fft_ny, fft_nz;
+ int fft_my, fft_mz;
+ int buf_my = -1;
+ int nsx, nsy, nsz;
+ ivec ne;
+ int offx, offy, offz, x, y, z, i0, i0t;
+ int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
+ gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
+ gmx_bool bCommX, bCommY;
+ int d;
+ int thread_f;
+ const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
+ const real *grid_th;
+ real *commbuf = NULL;
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+ fft_nx = local_fft_ndata[XX];
+ fft_ny = local_fft_ndata[YY];
+ fft_nz = local_fft_ndata[ZZ];
+
+ fft_my = local_fft_size[YY];
+ fft_mz = local_fft_size[ZZ];
+
+    /* This routine is called when all threads have finished spreading.
+     * Here each thread sums grid contributions calculated by other threads
+     * into the thread local grid volume.
+     * To minimize the number of grid copying operations,
+     * this routine sums directly from the pmegrid to the fftgrid.
+     */
+
+ /* Determine which part of the full node grid we should operate on,
+ * this is our thread local part of the full grid.
+ */
+ pmegrid = &pmegrids->grid_th[thread];
+
+ for (d = 0; d < DIM; d++)
+ {
+ ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ local_fft_ndata[d]);
+ }
+
+ offx = pmegrid->offset[XX];
+ offy = pmegrid->offset[YY];
+ offz = pmegrid->offset[ZZ];
+
+
+ bClearBufX = TRUE;
+ bClearBufY = TRUE;
+ bClearBufXY = TRUE;
+
+ /* Now loop over all the thread data blocks that contribute
+ * to the grid region we (our thread) are operating on.
+ */
+    /* Note that fft_nx/fft_ny is equal to the number of grid points
+     * between the first point of our node grid and the one of the next node.
+     */
+ for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
+ {
+ fx = pmegrid->ci[XX] + sx;
+ ox = 0;
+ bCommX = FALSE;
+ if (fx < 0)
+ {
+ fx += pmegrids->nc[XX];
+ ox -= fft_nx;
+ bCommX = (pme->nnodes_major > 1);
+ }
+ pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
+ ox += pmegrid_g->offset[XX];
+ if (!bCommX)
+ {
+ tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+ }
+ else
+ {
+ tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
+ }
+
+ for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
+ {
+ fy = pmegrid->ci[YY] + sy;
+ oy = 0;
+ bCommY = FALSE;
+ if (fy < 0)
+ {
+ fy += pmegrids->nc[YY];
+ oy -= fft_ny;
+ bCommY = (pme->nnodes_minor > 1);
+ }
+ pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
+ oy += pmegrid_g->offset[YY];
+ if (!bCommY)
+ {
+ ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+ }
+ else
+ {
+ ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
+ }
+
+ for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
+ {
+ fz = pmegrid->ci[ZZ] + sz;
+ oz = 0;
+ if (fz < 0)
+ {
+ fz += pmegrids->nc[ZZ];
+ oz -= fft_nz;
+ }
+ pmegrid_g = &pmegrids->grid_th[fz];
+ oz += pmegrid_g->offset[ZZ];
+ tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+
+ if (sx == 0 && sy == 0 && sz == 0)
+ {
+ /* We have already added our local contribution
+ * before calling this routine, so skip it here.
+ */
+ continue;
+ }
+
+ thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
+
+ pmegrid_f = &pmegrids->grid_th[thread_f];
+
+ grid_th = pmegrid_f->grid;
+
+ nsx = pmegrid_f->s[XX];
+ nsy = pmegrid_f->s[YY];
+ nsz = pmegrid_f->s[ZZ];
+
+#ifdef DEBUG_PME_REDUCE
+ printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
+ pme->nodeid, thread, thread_f,
+ pme->pmegrid_start_ix,
+ pme->pmegrid_start_iy,
+ pme->pmegrid_start_iz,
+ sx, sy, sz,
+ offx-ox, tx1-ox, offx, tx1,
+ offy-oy, ty1-oy, offy, ty1,
+ offz-oz, tz1-oz, offz, tz1);
+#endif
+
+ if (!(bCommX || bCommY))
+ {
+ /* Copy from the thread local grid to the node grid */
+ for (x = offx; x < tx1; x++)
+ {
+ for (y = offy; y < ty1; y++)
+ {
+ i0 = (x*fft_my + y)*fft_mz;
+ i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+ for (z = offz; z < tz1; z++)
+ {
+ fftgrid[i0+z] += grid_th[i0t+z];
+ }
+ }
+ }
+ }
+ else
+ {
+ /* The order of this conditional decides
+ * where the corner volume gets stored with x+y decomp.
+ */
+ if (bCommY)
+ {
+ commbuf = commbuf_y;
+ buf_my = ty1 - offy;
+ if (bCommX)
+ {
+ /* We index commbuf modulo the local grid size */
+ commbuf += buf_my*fft_nx*fft_nz;
+
+ bClearBuf = bClearBufXY;
+ bClearBufXY = FALSE;
+ }
+ else
+ {
+ bClearBuf = bClearBufY;
+ bClearBufY = FALSE;
+ }
+ }
+ else
+ {
+ commbuf = commbuf_x;
+ buf_my = fft_ny;
+ bClearBuf = bClearBufX;
+ bClearBufX = FALSE;
+ }
+
+ /* Copy to the communication buffer */
+ for (x = offx; x < tx1; x++)
+ {
+ for (y = offy; y < ty1; y++)
+ {
+ i0 = (x*buf_my + y)*fft_nz;
+ i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+
+ if (bClearBuf)
+ {
+ /* First access of commbuf, initialize it */
+ for (z = offz; z < tz1; z++)
+ {
+ commbuf[i0+z] = grid_th[i0t+z];
+ }
+ }
+ else
+ {
+ for (z = offz; z < tz1; z++)
+ {
+ commbuf[i0+z] += grid_th[i0t+z];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+{
+ ivec local_fft_ndata, local_fft_offset, local_fft_size;
+ pme_overlap_t *overlap;
+ int send_index0, send_nindex;
+ int recv_nindex;
+#ifdef GMX_MPI
+ MPI_Status stat;
+#endif
+ int send_size_y, recv_size_y;
+ int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
+ real *sendptr, *recvptr;
+ int x, y, z, indg, indb;
+
+ /* Note that this routine is only used for forward communication.
+ * Since the force gathering, unlike the charge spreading,
+ * can be trivially parallelized over the particles,
+ * the backwards process is much simpler and can use the "old"
+ * communication setup.
+ */
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+
+ if (pme->nnodes_minor > 1)
+ {
+        /* Minor dimension */
+ overlap = &pme->overlap[1];
+
+ if (pme->nnodes_major > 1)
+ {
+ size_yx = pme->overlap[0].comm_data[0].send_nindex;
+ }
+ else
+ {
+ size_yx = 0;
+ }
+ datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
+
+ send_size_y = overlap->send_size;
+
+ for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
+ {
+ send_id = overlap->send_id[ipulse];
+ recv_id = overlap->recv_id[ipulse];
+ send_index0 =
+ overlap->comm_data[ipulse].send_index0 -
+ overlap->comm_data[0].send_index0;
+ send_nindex = overlap->comm_data[ipulse].send_nindex;
+ /* We don't use recv_index0, as we always receive starting at 0 */
+ recv_nindex = overlap->comm_data[ipulse].recv_nindex;
+ recv_size_y = overlap->comm_data[ipulse].recv_size;
+
+ sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
+ recvptr = overlap->recvbuf;
+
+#ifdef GMX_MPI
+ MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
+ send_id, ipulse,
+ recvptr, recv_size_y*datasize, GMX_MPI_REAL,
+ recv_id, ipulse,
+ overlap->mpi_comm, &stat);
+#endif
+
+ for (x = 0; x < local_fft_ndata[XX]; x++)
+ {
+ for (y = 0; y < recv_nindex; y++)
+ {
+ indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
+ indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
+ for (z = 0; z < local_fft_ndata[ZZ]; z++)
+ {
+ fftgrid[indg+z] += recvptr[indb+z];
+ }
+ }
+ }
+
+ if (pme->nnodes_major > 1)
+ {
+ /* Copy from the received buffer to the send buffer for dim 0 */
+ sendptr = pme->overlap[0].sendbuf;
+ for (x = 0; x < size_yx; x++)
+ {
+ for (y = 0; y < recv_nindex; y++)
+ {
+ indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+ indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
+ for (z = 0; z < local_fft_ndata[ZZ]; z++)
+ {
+ sendptr[indg+z] += recvptr[indb+z];
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* We only support a single pulse here.
+ * This is not a severe limitation, as this code is only used
+ * with OpenMP and with OpenMP the (PME) domains can be larger.
+ */
+ if (pme->nnodes_major > 1)
+ {
+ /* Major dimension */
+ overlap = &pme->overlap[0];
+
+ datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
+ gridsize = local_fft_size[YY] *local_fft_size[ZZ];
+
+ ipulse = 0;
+
+ send_id = overlap->send_id[ipulse];
+ recv_id = overlap->recv_id[ipulse];
+ send_nindex = overlap->comm_data[ipulse].send_nindex;
+ /* We don't use recv_index0, as we always receive starting at 0 */
+ recv_nindex = overlap->comm_data[ipulse].recv_nindex;
+
+ sendptr = overlap->sendbuf;
+ recvptr = overlap->recvbuf;
+
+ if (debug != NULL)
+ {
+ fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+ send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
+ }
+
+#ifdef GMX_MPI
+ MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
+ send_id, ipulse,
+ recvptr, recv_nindex*datasize, GMX_MPI_REAL,
+ recv_id, ipulse,
+ overlap->mpi_comm, &stat);
+#endif
+
+ for (x = 0; x < recv_nindex; x++)
+ {
+ for (y = 0; y < local_fft_ndata[YY]; y++)
+ {
+ indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
+ indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+ for (z = 0; z < local_fft_ndata[ZZ]; z++)
+ {
+ fftgrid[indg+z] += recvptr[indb+z];
+ }
+ }
+ }
+ }
+}
+
+
+static void spread_on_grid(gmx_pme_t pme,
+ pme_atomcomm_t *atc, pmegrids_t *grids,
+ gmx_bool bCalcSplines, gmx_bool bSpread,
+ real *fftgrid)
+{
+ int nthread, thread;
+#ifdef PME_TIME_THREADS
+ gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
+ static double cs1 = 0, cs2 = 0, cs3 = 0;
+ static double cs1a[6] = {0, 0, 0, 0, 0, 0};
+ static int cnt = 0;
+#endif
+
+ nthread = pme->nthread;
+ assert(nthread > 0);
+
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_start();
+#endif
+ if (bCalcSplines)
+ {
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for (thread = 0; thread < nthread; thread++)
+ {
+ int start, end;
+
+ start = atc->n* thread /nthread;
+ end = atc->n*(thread+1)/nthread;
+
+            /* Compute the fftgrid index for all atoms,
+             * with the help of some extra variables.
+             */
+ calc_interpolation_idx(pme, atc, start, end, thread);
+ }
+ }
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_end(c1);
+ cs1 += (double)c1;
+#endif
+
+#ifdef PME_TIME_THREADS
+ c2 = omp_cyc_start();
+#endif
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for (thread = 0; thread < nthread; thread++)
+ {
+ splinedata_t *spline;
+ pmegrid_t *grid = NULL;
+
+ /* make local bsplines */
+ if (grids == NULL || !pme->bUseThreads)
+ {
+ spline = &atc->spline[0];
+
+ spline->n = atc->n;
+
+ if (bSpread)
+ {
+ grid = &grids->grid;
+ }
+ }
+ else
+ {
+ spline = &atc->spline[thread];
+
+ if (grids->nthread == 1)
+ {
+ /* One thread, we operate on all charges */
+ spline->n = atc->n;
+ }
+ else
+ {
+ /* Get the indices our thread should operate on */
+ make_thread_local_ind(atc, thread, spline);
+ }
+
+ grid = &grids->grid_th[thread];
+ }
+
+ if (bCalcSplines)
+ {
+ make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
+ atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
+ }
+
+ if (bSpread)
+ {
+ /* put local atoms on grid. */
+#ifdef PME_TIME_SPREAD
+ ct1a = omp_cyc_start();
+#endif
+ spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+
+ if (pme->bUseThreads)
+ {
+ copy_local_grid(pme, grids, thread, fftgrid);
+ }
+#ifdef PME_TIME_SPREAD
+ ct1a = omp_cyc_end(ct1a);
+ cs1a[thread] += (double)ct1a;
+#endif
+ }
+ }
+#ifdef PME_TIME_THREADS
+ c2 = omp_cyc_end(c2);
+ cs2 += (double)c2;
+#endif
+
+ if (bSpread && pme->bUseThreads)
+ {
+#ifdef PME_TIME_THREADS
+ c3 = omp_cyc_start();
+#endif
+#pragma omp parallel for num_threads(grids->nthread) schedule(static)
+ for (thread = 0; thread < grids->nthread; thread++)
+ {
+ reduce_threadgrid_overlap(pme, grids, thread,
+ fftgrid,
+ pme->overlap[0].sendbuf,
+ pme->overlap[1].sendbuf);
+ }
+#ifdef PME_TIME_THREADS
+ c3 = omp_cyc_end(c3);
+ cs3 += (double)c3;
+#endif
+
+ if (pme->nnodes > 1)
+ {
+ /* Communicate the overlapping part of the fftgrid.
+ * For this communication call we need to check pme->bUseThreads
+ * to have all ranks communicate here, regardless of pme->nthread.
+ */
+ sum_fftgrid_dd(pme, fftgrid);
+ }
+ }
+
+#ifdef PME_TIME_THREADS
+ cnt++;
+ if (cnt % 20 == 0)
+ {
+ printf("idx %.2f spread %.2f red %.2f",
+ cs1*1e-9, cs2*1e-9, cs3*1e-9);
+#ifdef PME_TIME_SPREAD
+ for (thread = 0; thread < nthread; thread++)
+ {
+ printf(" %.2f", cs1a[thread]*1e-9);
+ }
+#endif
+ printf("\n");
+ }
+#endif
+}
+
+
+static void dump_grid(FILE *fp,
+ int sx, int sy, int sz, int nx, int ny, int nz,
+ int my, int mz, const real *g)
+{
+ int x, y, z;
+
+ for (x = 0; x < nx; x++)
+ {
+ for (y = 0; y < ny; y++)
+ {
+ for (z = 0; z < nz; z++)
+ {
+ fprintf(fp, "%2d %2d %2d %6.3f\n",
+ sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
+ }
+ }
+ }
+}
+
+static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
+{
+ ivec local_fft_ndata, local_fft_offset, local_fft_size;
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+
+ dump_grid(stderr,
+ pme->pmegrid_start_ix,
+ pme->pmegrid_start_iy,
+ pme->pmegrid_start_iz,
+ pme->pmegrid_nx-pme->pme_order+1,
+ pme->pmegrid_ny-pme->pme_order+1,
+ pme->pmegrid_nz-pme->pme_order+1,
+ local_fft_size[YY],
+ local_fft_size[ZZ],
+ fftgrid);
+}
+
+
+void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
+{
+ pme_atomcomm_t *atc;
+ pmegrids_t *grid;
+
+ if (pme->nnodes > 1)
+ {
+ gmx_incons("gmx_pme_calc_energy called in parallel");
+ }
+    if (pme->bFEP)
+ {
+ gmx_incons("gmx_pme_calc_energy with free energy");
+ }
+
+ atc = &pme->atc_energy;
+ atc->nthread = 1;
+ if (atc->spline == NULL)
+ {
+ snew(atc->spline, atc->nthread);
+ }
+ atc->nslab = 1;
+ atc->bSpread = TRUE;
+ atc->pme_order = pme->pme_order;
+ atc->n = n;
+ pme_realloc_atomcomm_things(atc);
+ atc->x = x;
+ atc->q = q;
+
+ /* We only use the A-charges grid */
+ grid = &pme->pmegridA;
+
+ /* Only calculate the spline coefficients, don't actually spread */
+ spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
+
+ *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
+}
+
+
+static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+ t_nrnb *nrnb, t_inputrec *ir,
+ gmx_large_int_t step)
+{
+ /* Reset all the counters related to performance over the run */
+ wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_reset_all(wcycle);
+ init_nrnb(nrnb);
+ if (ir->nsteps >= 0)
+ {
+ /* ir->nsteps is not used here, but we update it for consistency */
+ ir->nsteps -= step - ir->init_step;
+ }
+ ir->init_step = step;
+ wallcycle_start(wcycle, ewcRUN);
+}
+
+
+static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
+ ivec grid_size,
+ t_commrec *cr, t_inputrec *ir,
+ gmx_pme_t *pme_ret)
+{
+ int ind;
+ gmx_pme_t pme = NULL;
+
+ ind = 0;
+ while (ind < *npmedata)
+ {
+ pme = (*pmedata)[ind];
+ if (pme->nkx == grid_size[XX] &&
+ pme->nky == grid_size[YY] &&
+ pme->nkz == grid_size[ZZ])
+ {
+ *pme_ret = pme;
+
+ return;
+ }
+
+ ind++;
+ }
+
+ (*npmedata)++;
+ srenew(*pmedata, *npmedata);
+
+ /* Generate a new PME data structure, copying part of the old pointers */
+ gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
+
+ *pme_ret = (*pmedata)[ind];
+}
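+
+/* gmx_pmeonly_switch keeps a growing cache of PME structures, one per
+ * grid size seen so far, so that repeated grid switches during PME
+ * tuning reuse an existing setup instead of re-initializing from
+ * scratch.
+ */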
+
+
+int gmx_pmeonly(gmx_pme_t pme,
+ t_commrec *cr, t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle,
+ real ewaldcoeff,
+ t_inputrec *ir)
+{
+ int npmedata;
+ gmx_pme_t *pmedata;
+ gmx_pme_pp_t pme_pp;
+ int ret;
+ int natoms;
+ matrix box;
+ rvec *x_pp = NULL, *f_pp = NULL;
+ real *chargeA = NULL, *chargeB = NULL;
+ real lambda = 0;
+ int maxshift_x = 0, maxshift_y = 0;
+ real energy, dvdlambda;
+ matrix vir;
+ float cycles;
+ int count;
+ gmx_bool bEnerVir;
+ gmx_large_int_t step, step_rel;
+ ivec grid_switch;
+
+    /* This data is only used with PME tuning, i.e. switching PME grids */
+ npmedata = 1;
+ snew(pmedata, npmedata);
+ pmedata[0] = pme;
+
+ pme_pp = gmx_pme_pp_init(cr);
+
+ init_nrnb(nrnb);
+
+ count = 0;
+ do /****** this is a quasi-loop over time steps! */
+ {
+ /* The reason for having a loop here is PME grid tuning/switching */
+ do
+ {
+ /* Domain decomposition */
+ ret = gmx_pme_recv_q_x(pme_pp,
+ &natoms,
+ &chargeA, &chargeB, box, &x_pp, &f_pp,
+ &maxshift_x, &maxshift_y,
+ &pme->bFEP, &lambda,
+ &bEnerVir,
+ &step,
+ grid_switch, &ewaldcoeff);
+
+ if (ret == pmerecvqxSWITCHGRID)
+ {
+ /* Switch the PME grid to grid_switch */
+ gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
+ }
+
+ if (ret == pmerecvqxRESETCOUNTERS)
+ {
+ /* Reset the cycle and flop counters */
+ reset_pmeonly_counters(wcycle, nrnb, ir, step);
+ }
+ }
+ while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
+
+ if (ret == pmerecvqxFINISH)
+ {
+ /* We should stop: break out of the loop */
+ break;
+ }
+
+ step_rel = step - ir->init_step;
+
+ if (count == 0)
+ {
+ wallcycle_start(wcycle, ewcRUN);
+ }
+
+ wallcycle_start(wcycle, ewcPMEMESH);
+
+ dvdlambda = 0;
+ clear_mat(vir);
+ gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
+ cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
+ &energy, lambda, &dvdlambda,
+ GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
+
+ cycles = wallcycle_stop(wcycle, ewcPMEMESH);
+
+ gmx_pme_send_force_vir_ener(pme_pp,
+ f_pp, vir, energy, dvdlambda,
+ cycles);
+
+ count++;
+ } /***** end of quasi-loop, we stop with the break above */
+ while (TRUE);
+
+ return 0;
+}
+
+int gmx_pme_do(gmx_pme_t pme,
+ int start, int homenr,
+ rvec x[], rvec f[],
+ real *chargeA, real *chargeB,
+ matrix box, t_commrec *cr,
+ int maxshift_x, int maxshift_y,
+ t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ matrix vir, real ewaldcoeff,
+ real *energy, real lambda,
+ real *dvdlambda, int flags)
+{
+ int q, d, i, j, ntot, npme;
+ int nx, ny, nz;
+ int n_d, local_ny;
+ pme_atomcomm_t *atc = NULL;
+ pmegrids_t *pmegrid = NULL;
+ real *grid = NULL;
+ real *ptr;
+ rvec *x_d, *f_d;
+ real *charge = NULL, *q_d;
+ real energy_AB[2];
+ matrix vir_AB[2];
+ gmx_bool bClearF;
+ gmx_parallel_3dfft_t pfft_setup;
+ real * fftgrid;
+ t_complex * cfftgrid;
+ int thread;
+ const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
+ const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
+
+ assert(pme->nnodes > 0);
+ assert(pme->nnodes == 1 || pme->ndecompdim > 0);
+
+ if (pme->nnodes > 1)
+ {
+ atc = &pme->atc[0];
+ atc->npd = homenr;
+ if (atc->npd > atc->pd_nalloc)
+ {
+ atc->pd_nalloc = over_alloc_dd(atc->npd);
+ srenew(atc->pd, atc->pd_nalloc);
+ }
+ atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ }
+ else
+ {
+ /* This could be necessary for TPI */
+ pme->atc[0].n = homenr;
+ }
+
+ for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
+ {
+ if (q == 0)
+ {
+ pmegrid = &pme->pmegridA;
+ fftgrid = pme->fftgridA;
+ cfftgrid = pme->cfftgridA;
+ pfft_setup = pme->pfft_setupA;
+ charge = chargeA+start;
+ }
+ else
+ {
+ pmegrid = &pme->pmegridB;
+ fftgrid = pme->fftgridB;
+ cfftgrid = pme->cfftgridB;
+ pfft_setup = pme->pfft_setupB;
+ charge = chargeB+start;
+ }
+ grid = pmegrid->grid.grid;
+ /* Unpack structure */
+ if (debug)
+ {
+ fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+ cr->nnodes, cr->nodeid);
+ fprintf(debug, "Grid = %p\n", (void*)grid);
+ if (grid == NULL)
+ {
+ gmx_fatal(FARGS, "No grid!");
+ }
+ }
+ where();
+
+ m_inv_ur0(box, pme->recipbox);
+
+ if (pme->nnodes == 1)
+ {
+ atc = &pme->atc[0];
+ if (DOMAINDECOMP(cr))
+ {
+ atc->n = homenr;
+ pme_realloc_atomcomm_things(atc);
+ }
+ atc->x = x;
+ atc->q = charge;
+ atc->f = f;
+ }
+ else
+ {
+ wallcycle_start(wcycle, ewcPME_REDISTXF);
+ for (d = pme->ndecompdim-1; d >= 0; d--)
+ {
+ if (d == pme->ndecompdim-1)
+ {
+ n_d = homenr;
+ x_d = x + start;
+ q_d = charge;
+ }
+ else
+ {
+ n_d = pme->atc[d+1].n;
+ x_d = atc->x;
+ q_d = atc->q;
+ }
+ atc = &pme->atc[d];
+ atc->npd = n_d;
+ if (atc->npd > atc->pd_nalloc)
+ {
+ atc->pd_nalloc = over_alloc_dd(atc->npd);
+ srenew(atc->pd, atc->pd_nalloc);
+ }
+ atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
+ where();
+
+ /* Redistribute x (only once) and qA or qB */
+ if (DOMAINDECOMP(cr))
+ {
+ dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
+ }
+ else
+ {
+ pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
+ }
+ }
+ where();
+
+ wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+ cr->nodeid, atc->n);
+ }
+
+ if (flags & GMX_PME_SPREAD_Q)
+ {
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+
+ /* Spread the charges on a grid */
+ spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
+
+ if (q == 0)
+ {
+ inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+ }
+ inc_nrnb(nrnb, eNR_SPREADQBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+
+ if (!pme->bUseThreads)
+ {
+ wrap_periodic_pmegrid(pme, grid);
+
+ /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+ where();
+ }
+#endif
+
+ copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
+ }
+
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+
+ /*
+ dump_local_fftgrid(pme,fftgrid);
+ exit(0);
+ */
+ }
+
+ /* Here we start a large thread parallel region */
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ thread = gmx_omp_get_thread_num();
+ if (flags & GMX_PME_SOLVE)
+ {
+ int loop_count;
+
+ /* do 3d-fft */
+ if (thread == 0)
+ {
+ wallcycle_start(wcycle, ewcPME_FFT);
+ }
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+ }
+ where();
+
+ /* solve in k-space for our local cells */
+ if (thread == 0)
+ {
+ wallcycle_start(wcycle, ewcPME_SOLVE);
+ }
+ loop_count =
+ solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_SOLVE);
+ where();
+ inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+ }
+ }
+
+ if (bCalcF)
+ {
+ /* do 3d-invfft */
+ if (thread == 0)
+ {
+ where();
+ wallcycle_start(wcycle, ewcPME_FFT);
+ }
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+
+ where();
+
+ if (pme->nodeid == 0)
+ {
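+ /* Rough FFT operation-count estimate: N*log2(N) per 3D grid of
+ * N = nkx*nky*nkz points; the factor 2 below accounts for the
+ * forward and the back transform.
+ */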
+ ntot = pme->nkx*pme->nky*pme->nkz;
+ npme = ntot*log((real)ntot)/log(2.0);
+ inc_nrnb(nrnb, eNR_FFT, 2*npme);
+ }
+
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ }
+
+ copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
+ }
+ }
+ /* End of thread parallel section.
+ * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
+ */
+
+ if (bCalcF)
+ {
+ /* distribute local grid to all nodes */
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+ }
+#endif
+ where();
+
+ unwrap_periodic_pmegrid(pme, grid);
+
+ /* interpolate forces for our local atoms */
+
+ where();
+
+ /* If we are running without parallelization,
+ * atc->f is the actual force array, not a buffer,
+ * therefore we should not clear it.
+ */
+ bClearF = (q == 0 && PAR(cr));
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for (thread = 0; thread < pme->nthread; thread++)
+ {
+ gather_f_bsplines(pme, grid, bClearF, atc,
+ &atc->spline[thread],
+ pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
+ }
+
+ where();
+
+ inc_nrnb(nrnb, eNR_GATHERFBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+ }
+
+ if (bCalcEnerVir)
+ {
+ /* This should only be called on the master thread
+ * and after the threads have synchronized.
+ */
+ get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+ }
+ } /* of q-loop */
+
+ if (bCalcF && pme->nnodes > 1)
+ {
+ wallcycle_start(wcycle, ewcPME_REDISTXF);
+ for (d = 0; d < pme->ndecompdim; d++)
+ {
+ atc = &pme->atc[d];
+ if (d == pme->ndecompdim - 1)
+ {
+ n_d = homenr;
+ f_d = f + start;
+ }
+ else
+ {
+ n_d = pme->atc[d+1].n;
+ f_d = pme->atc[d+1].f;
+ }
+ if (DOMAINDECOMP(cr))
+ {
+ dd_pmeredist_f(pme, atc, n_d, f_d,
+ d == pme->ndecompdim-1 && pme->bPPnode);
+ }
+ else
+ {
+ pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
+ }
+ }
+
+ wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ }
+ where();
+
+ if (bCalcEnerVir)
+ {
+ if (!pme->bFEP)
+ {
+ *energy = energy_AB[0];
+ m_add(vir, vir_AB[0], vir);
+ }
+ else
+ {
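+ /* Linear interpolation between the A and B charge grids:
+ * E(lambda) = (1-lambda)*E_A + lambda*E_B, so dE/dlambda is
+ * simply E_B - E_A; the virial is mixed with the same weights.
+ */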
+ *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
+ *dvdlambda += energy_AB[1] - energy_AB[0];
+ for (i = 0; i < DIM; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
+ lambda*vir_AB[1][i][j];
+ }
+ }
+ }
+ }
+ else
+ {
+ *energy = 0;
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "PME mesh energy: %g\n", *energy);
+ }
+
+ return 0;
+}
--- /dev/null
- /* Do the actual neighbour searching and if twin range electrostatics
- * also do the calculation of long range forces and energies.
- */
- for (i = 0; i < efptNR; i++)
- {
- dvdlambda[i] = 0;
- }
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROwing Monsters And Cloning Shrimps
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef GMX_CRAY_XT3
+#include <catamount/dclock.h>
+#endif
+
+
+#include <stdio.h>
+#include <time.h>
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#include <math.h>
+#include "typedefs.h"
+#include "string2.h"
+#include "gmxfio.h"
+#include "smalloc.h"
+#include "names.h"
+#include "confio.h"
+#include "mvdata.h"
+#include "txtdump.h"
+#include "pbc.h"
+#include "chargegroup.h"
+#include "vec.h"
+#include "nrnb.h"
+#include "mshift.h"
+#include "mdrun.h"
+#include "sim_util.h"
+#include "update.h"
+#include "physics.h"
+#include "main.h"
+#include "mdatoms.h"
+#include "force.h"
+#include "bondf.h"
+#include "pme.h"
+#include "disre.h"
+#include "orires.h"
+#include "network.h"
+#include "calcmu.h"
+#include "constr.h"
+#include "xvgr.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "copyrite.h"
+#include "pull_rotation.h"
+#include "gmx_random.h"
+#include "domdec.h"
+#include "partdec.h"
+#include "gmx_wallcycle.h"
+#include "genborn.h"
+#include "nbnxn_atomdata.h"
+#include "nbnxn_search.h"
+#include "nbnxn_kernels/nbnxn_kernel_ref.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
+#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+
+#include "gromacs/utility/gmxmpi.h"
+
+#include "adress.h"
+#include "qmmm.h"
+
+#include "nbnxn_cuda_data_mgmt.h"
+#include "nbnxn_cuda/nbnxn_cuda.h"
+
+double
+gmx_gettime()
+{
+#ifdef HAVE_GETTIMEOFDAY
+ struct timeval t;
+ double seconds;
+
+ gettimeofday(&t, NULL);
+
+ seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
+
+ return seconds;
+#else
+ double seconds;
+
+ seconds = time(NULL);
+
+ return seconds;
+#endif
+}
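+
+/* Minimal usage sketch (hypothetical caller), timing a code region with
+ * gmx_gettime(); resolution is ~1 us with gettimeofday() and only ~1 s
+ * with the time() fallback:
+ *
+ *     double t0 = gmx_gettime();
+ *     do_work();
+ *     double elapsed = gmx_gettime() - t0;
+ */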
+
+
+#define difftime(end, start) ((double)(end)-(double)(start))
+
+void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
+ t_inputrec *ir, t_commrec gmx_unused *cr)
+{
+ time_t finish;
+ char timebuf[STRLEN];
+ double dt;
+ char buf[48];
+
+#ifndef GMX_THREAD_MPI
+ if (!PAR(cr))
+#endif
+ {
+ fprintf(out, "\r");
+ }
+ fprintf(out, "step %s", gmx_step_str(step, buf));
+ if ((step >= ir->nstlist))
+ {
+ runtime->last = gmx_gettime();
+ dt = difftime(runtime->last, runtime->real);
+ runtime->time_per_step = dt/(step - ir->init_step + 1);
+
+ dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
+
+ if (ir->nsteps >= 0)
+ {
+ if (dt >= 300)
+ {
+ finish = (time_t) (runtime->last + dt);
+ gmx_ctime_r(&finish, timebuf, STRLEN);
+ sprintf(buf, "%s", timebuf);
+ buf[strlen(buf)-1] = '\0';
+ fprintf(out, ", will finish %s", buf);
+ }
+ else
+ {
+ fprintf(out, ", remaining runtime: %5d s ", (int)dt);
+ }
+ }
+ else
+ {
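+ /* delta_t is in ps, so delta_t/1000 is ns per step; multiplied
+ * by 24*60*60 s/day and divided by the measured seconds per step
+ * this gives the ns/day rate printed below.
+ */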
+ fprintf(out, " performance: %.1f ns/day ",
+ ir->delta_t/1000*24*60*60/runtime->time_per_step);
+ }
+ }
+#ifndef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ fprintf(out, "\n");
+ }
+#endif
+
+ fflush(out);
+}
+
+#ifdef NO_CLOCK
+#define clock() -1
+#endif
+
+static double set_proctime(gmx_runtime_t *runtime)
+{
+ double diff;
+#ifdef GMX_CRAY_XT3
+ double prev;
+
+ prev = runtime->proc;
+ runtime->proc = dclock();
+
+ diff = runtime->proc - prev;
+#else
+ clock_t prev;
+
+ prev = runtime->proc;
+ runtime->proc = clock();
+
+ diff = (double)(runtime->proc - prev)/(double)CLOCKS_PER_SEC;
+#endif
+ if (diff < 0)
+ {
+ /* The counter has probably looped, ignore this data */
+ diff = 0;
+ }
+
+ return diff;
+}
+
+void runtime_start(gmx_runtime_t *runtime)
+{
+ runtime->real = gmx_gettime();
+ runtime->proc = 0;
+ set_proctime(runtime);
+ runtime->realtime = 0;
+ runtime->proctime = 0;
+ runtime->last = 0;
+ runtime->time_per_step = 0;
+}
+
+void runtime_end(gmx_runtime_t *runtime)
+{
+ double now;
+
+ now = gmx_gettime();
+
+ runtime->proctime += set_proctime(runtime);
+ runtime->realtime = now - runtime->real;
+ runtime->real = now;
+}
+
+void runtime_upd_proc(gmx_runtime_t *runtime)
+{
+ runtime->proctime += set_proctime(runtime);
+}
+
+void print_date_and_time(FILE *fplog, int nodeid, const char *title,
+ const gmx_runtime_t *runtime)
+{
+ int i;
+ char timebuf[STRLEN];
+ char time_string[STRLEN];
+ time_t tmptime;
+
+ if (fplog)
+ {
+ if (runtime != NULL)
+ {
+ tmptime = (time_t) runtime->real;
+ gmx_ctime_r(&tmptime, timebuf, STRLEN);
+ }
+ else
+ {
+ tmptime = (time_t) gmx_gettime();
+ gmx_ctime_r(&tmptime, timebuf, STRLEN);
+ }
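+ /* Copy the printable prefix of the ctime-style string, stopping
+ * at the trailing newline (or any other control character).
+ */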
+ for (i = 0; timebuf[i] >= ' '; i++)
+ {
+ time_string[i] = timebuf[i];
+ }
+ time_string[i] = '\0';
+
+ fprintf(fplog, "%s on node %d %s\n", title, nodeid, time_string);
+ }
+}
+
+static void sum_forces(int start, int end, rvec f[], rvec flr[])
+{
+ int i;
+
+ if (gmx_debug_at)
+ {
+ pr_rvecs(debug, 0, "fsr", f+start, end-start);
+ pr_rvecs(debug, 0, "flr", flr+start, end-start);
+ }
+ for (i = start; (i < end); i++)
+ {
+ rvec_inc(f[i], flr[i]);
+ }
+}
+
+/*
+ * calc_f_el calculates forces due to an electric field.
+ *
+ * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
+ *
+ * Et[] contains the parameters for the time dependent
+ * part of the field (not yet used).
+ * Ex[] contains the parameters for
+ * the spatial dependent part of the field. You can have cool periodic
+ * fields in principle, but only a constant field is supported
+ * now.
+ * The function should return the energy due to the electric field
+ * (if any) but for now returns 0.
+ *
+ * WARNING:
+ * There can be problems with the virial.
+ * Since the field is not self-consistent this is unavoidable.
+ * For neutral molecules the virial is correct within this approximation.
+ * For neutral systems with many charged molecules the error is small.
+ * But for systems with a net charge or a few charged molecules
+ * the error can be significant when the field is high.
+ * Solution: implement a self-consistent electric field into PME.
+ */
+static void calc_f_el(FILE *fp, int start, int homenr,
+ real charge[], rvec f[],
+ t_cosines Ex[], t_cosines Et[], double t)
+{
+ rvec Ext;
+ real t0;
+ int i, m;
+
+ for (m = 0; (m < DIM); m++)
+ {
+ if (Et[m].n > 0)
+ {
+ if (Et[m].n == 3)
+ {
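+ /* Gaussian-enveloped pulse: a[0] is the angular frequency,
+ * a[1] the peak time t0 and a[2] the width sigma, i.e.
+ * E(t) = cos(a[0]*(t-t0)) * exp(-(t-t0)^2/(2*sigma^2)).
+ */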
+ t0 = Et[m].a[1];
+ Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
+ }
+ else
+ {
+ Ext[m] = cos(Et[m].a[0]*t);
+ }
+ }
+ else
+ {
+ Ext[m] = 1.0;
+ }
+ if (Ex[m].n > 0)
+ {
+ /* Convert the field strength from V/nm to MD-units */
+ Ext[m] *= Ex[m].a[0]*FIELDFAC;
+ for (i = start; (i < start+homenr); i++)
+ {
+ f[i][m] += charge[i]*Ext[m];
+ }
+ }
+ else
+ {
+ Ext[m] = 0;
+ }
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp, "%10g %10g %10g %10g #FIELD\n", t,
+ Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
+ }
+}
+
+static void calc_virial(int start, int homenr, rvec x[], rvec f[],
+ tensor vir_part, t_graph *graph, matrix box,
+ t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
+{
+ int i;
+
+ /* The short-range virial from surrounding boxes */
+ clear_mat(vir_part);
+ calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
+
+ /* Calculate partial virial, for local atoms only, based on short range.
+ * Total virial is computed in global_stat, called from do_md
+ */
+ f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+
+ /* Add position restraint contribution */
+ for (i = 0; i < DIM; i++)
+ {
+ vir_part[i][i] += fr->vir_diag_posres[i];
+ }
+
+ /* Add wall contribution */
+ for (i = 0; i < DIM; i++)
+ {
+ vir_part[i][ZZ] += fr->vir_wall_z[i];
+ }
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+ }
+}
+
+static void posres_wrapper(FILE *fplog,
+ int flags,
+ gmx_bool bSepDVDL,
+ t_inputrec *ir,
+ t_nrnb *nrnb,
+ gmx_localtop_t *top,
+ matrix box, rvec x[],
+ gmx_enerdata_t *enerd,
+ real *lambda,
+ t_forcerec *fr)
+{
+ t_pbc pbc;
+ real v, dvdl;
+ int i;
+
+ /* Position restraints always require full pbc */
+ set_pbc(&pbc, ir->ePBC, box);
+ dvdl = 0;
+ v = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+ top->idef.iparams_posres,
+ (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+ ir->ePBC == epbcNONE ? NULL : &pbc,
+ lambda[efptRESTRAINT], &dvdl,
+ fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+ if (bSepDVDL)
+ {
+ gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
+ }
+ enerd->term[F_POSRES] += v;
+ /* If just the reference position changes, the FEP term is linear,
+ * but if the force constant k changes, it is not.
+ */
+ enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
+ inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
+
+ if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
+ {
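+ /* Also evaluate the restraint energy at each foreign lambda
+ * (i == 0 is the current lambda) and accumulate it in
+ * enerpart_lambda, e.g. for BAR-style free-energy estimates.
+ */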
+ for (i = 0; i < enerd->n_lambda; i++)
+ {
+ real dvdl_dum, lambda_dum;
+
+ lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
+ v = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+ top->idef.iparams_posres,
+ (const rvec*)x, NULL, NULL,
+ ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl,
+ fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+ enerd->enerpart_lambda[i] += v;
+ }
+ }
+}
+
+static void pull_potential_wrapper(FILE *fplog,
+ gmx_bool bSepDVDL,
+ t_commrec *cr,
+ t_inputrec *ir,
+ matrix box, rvec x[],
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd,
+ real *lambda,
+ double t)
+{
+ t_pbc pbc;
+ real dvdl;
+
+ /* Calculate the center of mass forces, this requires communication,
+ * which is why pull_potential is called close to other communication.
+ * The virial contribution is calculated directly,
+ * which is why we call pull_potential after calc_virial.
+ */
+ set_pbc(&pbc, ir->ePBC, box);
+ dvdl = 0;
+ enerd->term[F_COM_PULL] +=
+ pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
+ cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
+ if (bSepDVDL)
+ {
+ gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
+ }
+ enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+}
+
+static void pme_receive_force_ener(FILE *fplog,
+ gmx_bool bSepDVDL,
+ t_commrec *cr,
+ gmx_wallcycle_t wcycle,
+ gmx_enerdata_t *enerd,
+ t_forcerec *fr)
+{
+ real e, v, dvdl;
+ float cycles_ppdpme, cycles_seppme;
+
+ cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
+ dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
+
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
+ dvdl = 0;
+ gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e, &dvdl,
+ &cycles_seppme);
+ if (bSepDVDL)
+ {
+ gmx_print_sepdvdl(fplog, "PME mesh", e, dvdl);
+ }
+ enerd->term[F_COUL_RECIP] += e;
+ enerd->dvdl_lin[efptCOUL] += dvdl;
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
+ }
+ wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
+}
+
+static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
+ gmx_large_int_t step, real pforce, rvec *x, rvec *f)
+{
+ int i;
+ real pf2, fn2;
+ char buf[STEPSTRSIZE];
+
+ pf2 = sqr(pforce);
+ for (i = md->start; i < md->start+md->homenr; i++)
+ {
+ fn2 = norm2(f[i]);
+ /* We also catch NAN, if the compiler does not optimize this away. */
+ if (fn2 >= pf2 || fn2 != fn2)
+ {
+ fprintf(fp, "step %s atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+ gmx_step_str(step, buf),
+ ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], sqrt(fn2));
+ }
+ }
+}
+
+static void post_process_forces(t_commrec *cr,
+ gmx_large_int_t step,
+ t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ matrix box, rvec x[],
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ t_graph *graph,
+ t_forcerec *fr, gmx_vsite_t *vsite,
+ int flags)
+{
+ if (fr->bF_NoVirSum)
+ {
+ if (vsite)
+ {
+ /* Spread the mesh force on virtual sites to the other particles...
+ * This is parallelized. MPI communication is performed
+ * if the constructing atoms aren't local.
+ */
+ wallcycle_start(wcycle, ewcVSITESPREAD);
+ spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
+ (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
+ nrnb,
+ &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ wallcycle_stop(wcycle, ewcVSITESPREAD);
+ }
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ /* Now add the forces, this is local */
+ if (fr->bDomDec)
+ {
+ sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
+ }
+ else
+ {
+ sum_forces(mdatoms->start, mdatoms->start+mdatoms->homenr,
+ f, fr->f_novirsum);
+ }
+ if (EEL_FULL(fr->eeltype))
+ {
+ /* Add the mesh contribution to the virial */
+ m_add(vir_force, fr->vir_el_recip, vir_force);
+ }
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+ }
+ }
+ }
+
+ if (fr->print_force >= 0)
+ {
+ print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+ }
+}
+
+static void do_nb_verlet(t_forcerec *fr,
+ interaction_const_t *ic,
+ gmx_enerdata_t *enerd,
+ int flags, int ilocality,
+ int clearF,
+ t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle)
+{
+ int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
+ nonbonded_verlet_group_t *nbvg;
+ gmx_bool bCUDA;
+
+ if (!(flags & GMX_FORCE_NONBONDED))
+ {
+ /* skip non-bonded calculation */
+ return;
+ }
+
+ nbvg = &fr->nbv->grp[ilocality];
+
+ /* CUDA kernel launch overhead is already timed separately */
+ if (fr->cutoff_scheme != ecutsVERLET)
+ {
+ gmx_incons("Invalid cut-off scheme passed!");
+ }
+
+ bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
+
+ if (!bCUDA)
+ {
+ wallcycle_sub_start(wcycle, ewcsNONBONDED);
+ }
+ switch (nbvg->kernel_type)
+ {
+ case nbnxnk4x4_PlainC:
+ nbnxn_kernel_ref(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ case nbnxnk4xN_SIMD_4xN:
+ nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+ case nbnxnk4xN_SIMD_2xNN:
+ nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ case nbnxnk8x8x8_CUDA:
+ nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+ break;
+
+ case nbnxnk8x8x8_PlainC:
+ nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ nbvg->nbat->out[0].f,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ default:
+ gmx_incons("Invalid nonbonded kernel type passed!");
+
+ }
+ if (!bCUDA)
+ {
+ wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+ }
+
+ if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ {
+ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
+ }
+ else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
+ (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
+ {
+ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
+ }
+ else
+ {
+ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
+ }
+ enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
+ if (flags & GMX_FORCE_ENERGY)
+ {
+ /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
+ enr_nbnxn_kernel_ljc += 1;
+ enr_nbnxn_kernel_lj += 1;
+ }
+
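+ /* natpair_ljq, natpair_lj and natpair_q count the atom pairs that
+ * received LJ+Coulomb, LJ-only and Coulomb-only interactions,
+ * respectively, in the current pair lists.
+ */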
+ inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
+ nbvg->nbl_lists.natpair_ljq);
+ inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
+ nbvg->nbl_lists.natpair_lj);
+ inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
+ nbvg->nbl_lists.natpair_q);
+}
+
+void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_groups_t gmx_unused *groups,
+ matrix box, rvec x[], history_t *hist,
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd, t_fcdata *fcd,
+ real *lambda, t_graph *graph,
+ t_forcerec *fr, interaction_const_t *ic,
+ gmx_vsite_t *vsite, rvec mu_tot,
+ double t, FILE *field, gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags)
+{
+ int cg0, cg1, i, j;
+ int start, homenr;
+ double mu[2*DIM];
+ gmx_bool bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ gmx_bool bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
+ gmx_bool bDiffKernels = FALSE;
+ matrix boxs;
+ rvec vzero, box_diag;
+ real e, v, dvdl;
+ float cycles_pme, cycles_force;
+ nonbonded_verlet_t *nbv;
+
+ cycles_force = 0;
+ nbv = fr->nbv;
+
+ start = mdatoms->start;
+ homenr = mdatoms->homenr;
+
+ bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+
+ clear_mat(vir_force);
+
+ cg0 = 0;
+ if (DOMAINDECOMP(cr))
+ {
+ cg1 = cr->dd->ncg_tot;
+ }
+ else
+ {
+ cg1 = top->cgs.nr;
+ }
+ if (fr->n_tpi > 0)
+ {
+ cg1--;
+ }
+
+ bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+ bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ bFillGrid = (bNS && bStateChanged);
+ bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
+ bDoLongRange = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
+ bDoForces = (flags & GMX_FORCE_FORCES);
+ bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
+ bUseGPU = fr->nbv->bUseGPU;
+ bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
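+ /* GPU emulation mode runs the plain-C kernel that uses the GPU
+ * (8x8x8) pair-list layout on the CPU, so the GPU-style code
+ * paths below are exercised without CUDA.
+ */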
+
+ if (bStateChanged)
+ {
+ update_forcerec(fr, box);
+
+ if (NEED_MUTOT(*inputrec))
+ {
+ /* Calculate total (local) dipole moment in a temporary common array.
+ * This makes it possible to sum them over nodes faster.
+ */
+ calc_mu(start, homenr,
+ x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+ mu, mu+DIM);
+ }
+ }
+
+ if (fr->ePBC != epbcNONE)
+ {
+ /* Compute shift vectors every step,
+ * because of pressure coupling or box deformation!
+ */
+ if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ {
+ calc_shifts(box, fr->shift_vec);
+ }
+
+ if (bCalcCGCM)
+ {
+ put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
+ inc_nrnb(nrnb, eNR_SHIFTX, homenr);
+ }
+ else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+ {
+ unshift_self(graph, box, x);
+ }
+ }
+
+ nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
+ fr->shift_vec, nbv->grp[0].nbat);
+
+#ifdef GMX_MPI
+ if (!(cr->duty & DUTY_PME))
+ {
+ /* Send particle coordinates to the pme nodes.
+ * Since this is only implemented for domain decomposition
+ * and domain decomposition does not use the graph,
+ * we do not need to worry about shifting.
+ */
+
+ wallcycle_start(wcycle, ewcPP_PMESENDX);
+
+ bBS = (inputrec->nwall == 2);
+ if (bBS)
+ {
+ copy_mat(box, boxs);
+ svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+ }
+
+ gmx_pme_send_x(cr, bBS ? boxs : box, x,
+ mdatoms->nChargePerturbed, lambda[efptCOUL],
+ (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+
+ wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ }
+#endif /* GMX_MPI */
+
+ /* do gridding for pair search */
+ if (bNS)
+ {
+ if (graph && bStateChanged)
+ {
+ /* Calculate intramolecular shift vectors to make molecules whole */
+ mk_mshift(fplog, graph, fr->ePBC, box, x);
+ }
+
+ clear_rvec(vzero);
+ box_diag[XX] = box[XX][XX];
+ box_diag[YY] = box[YY][YY];
+ box_diag[ZZ] = box[ZZ][ZZ];
+
+ wallcycle_start(wcycle, ewcNS);
+ if (!fr->bDomDec)
+ {
+ wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
+ nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
+ 0, vzero, box_diag,
+ 0, mdatoms->homenr, -1, fr->cginfo, x,
+ 0, NULL,
+ nbv->grp[eintLocal].kernel_type,
+ nbv->grp[eintLocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
+ }
+ else
+ {
+ wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
+ nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
+ fr->cginfo, x,
+ nbv->grp[eintNonlocal].kernel_type,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
+ }
+
+ if (nbv->ngrp == 1 ||
+ nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
+ {
+ nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
+ nbv->nbs, mdatoms, fr->cginfo);
+ }
+ else
+ {
+ nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
+ nbv->nbs, mdatoms, fr->cginfo);
+ nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
+ nbv->nbs, mdatoms, fr->cginfo);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+
+ /* initialize the GPU atom data and copy shift vector */
+ if (bUseGPU)
+ {
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ /* do local pair search */
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle, ewcNS);
+ wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
+ nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
+ &top->excls,
+ ic->rlist,
+ nbv->min_ci_balanced,
+ &nbv->grp[eintLocal].nbl_lists,
+ eintLocal,
+ nbv->grp[eintLocal].kernel_type,
+ nrnb);
+ wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
+
+ if (bUseGPU)
+ {
+ /* initialize local pair-list on the GPU */
+ nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ nbv->grp[eintLocal].nbl_lists.nbl[0],
+ eintLocal);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+ else
+ {
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
+ nbv->grp[eintLocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ /* launch local nonbonded F on GPU */
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ nrnb, wcycle);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ /* Communicate coordinates and sum dipole if necessary +
+ do non-local pair search */
+ if (DOMAINDECOMP(cr))
+ {
+ bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
+ nbv->grp[eintLocal].kernel_type);
+
+ if (bDiffKernels)
+ {
+ /* With GPU+CPU non-bonded calculations we need to copy
+ * the local coordinates to the non-local nbat struct
+ * (in CPU format) as the non-local kernel call also
+ * calculates the local - non-local interactions.
+ */
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle, ewcNS);
+ wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+
+ if (bDiffKernels)
+ {
+ nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
+ }
+
+ nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
+ &top->excls,
+ ic->rlist,
+ nbv->min_ci_balanced,
+ &nbv->grp[eintNonlocal].nbl_lists,
+ eintNonlocal,
+ nbv->grp[eintNonlocal].kernel_type,
+ nrnb);
+
+ wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+
+ if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
+ {
+ /* initialize non-local pair-list on the GPU */
+ nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+ eintNonlocal);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+ else
+ {
+ wallcycle_start(wcycle, ewcMOVEX);
+ dd_move_x(cr->dd, box, x);
+
+ /* When we don't need the total dipole we sum it in global_stat */
+ if (bStateChanged && NEED_MUTOT(*inputrec))
+ {
+ gmx_sumd(2*DIM, mu, cr);
+ }
+ wallcycle_stop(wcycle, ewcMOVEX);
+
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bUseGPU && !bDiffKernels)
+ {
+ wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ /* launch non-local nonbonded F on GPU */
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ nrnb, wcycle);
+ cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+ }
+
+ if (bUseGPU)
+ {
+ /* launch D2H copy-back F */
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ if (DOMAINDECOMP(cr) && !bDiffKernels)
+ {
+ nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal);
+ }
+ nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
+ flags, eatLocal);
+ cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ if (bStateChanged && NEED_MUTOT(*inputrec))
+ {
+ if (PAR(cr))
+ {
+ gmx_sumd(2*DIM, mu, cr);
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ fr->mu_tot[i][j] = mu[i*DIM + j];
+ }
+ }
+ }
+ if (fr->efep == efepNO)
+ {
+ copy_rvec(fr->mu_tot[0], mu_tot);
+ }
+ else
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ mu_tot[j] =
+ (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
+ lambda[efptCOUL]*fr->mu_tot[1][j];
+ }
+ }
+
+ /* Reset energies */
+ reset_enerdata(fr, bNS, enerd, MASTER(cr));
+ clear_rvecs(SHIFTS, fr->fshift);
+
+ if (DOMAINDECOMP(cr))
+ {
+ if (!(cr->duty & DUTY_PME))
+ {
+ wallcycle_start(wcycle, ewcPPDURINGPME);
+ dd_force_flop_start(cr->dd, nrnb);
+ }
+ }
+
+ if (inputrec->bRot)
+ {
+ /* Enforced rotation has its own cycle counter that starts after the collective
+ * coordinates have been communicated. It is added to ddCyclF to allow
+ * for proper load-balancing */
+ wallcycle_start(wcycle, ewcROT);
+ do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ wallcycle_stop(wcycle, ewcROT);
+ }
+
+ /* Start the force cycle counter.
+ * This counter is stopped in do_force_lowlevel.
+ * No parallel communication should occur while this counter is running,
+ * since that will interfere with the dynamic load balancing.
+ */
+ wallcycle_start(wcycle, ewcFORCE);
+ if (bDoForces)
+ {
+ /* Reset forces for which the virial is calculated separately:
+ * PME/Ewald forces if necessary */
+ if (fr->bF_NoVirSum)
+ {
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ fr->f_novirsum = fr->f_novirsum_alloc;
+ if (fr->bDomDec)
+ {
+ clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+ }
+ else
+ {
+ clear_rvecs(homenr, fr->f_novirsum+start);
+ }
+ }
+ else
+ {
+ /* We are not calculating the pressure so we do not need
+ * a separate array for forces that do not contribute
+ * to the pressure.
+ */
+ fr->f_novirsum = f;
+ }
+ }
+
+ /* Clear the short- and long-range forces */
+ clear_rvecs(fr->natoms_force_constr, f);
+ if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+ {
+ clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+ }
+
+ clear_rvec(fr->vir_diag_posres);
+ }
+
+ if (inputrec->ePull == epullCONSTRAINT)
+ {
+ clear_pull_forces(inputrec->pull);
+ }
+
+ /* We calculate the non-bonded forces, when done on the CPU, here.
+ * We do this before calling do_force_lowlevel, as in there bondeds
+ * forces are calculated before PME, which does communication.
+ * With this order, non-bonded and bonded force calculation imbalance
+ * can be balanced out by the domain decomposition load balancing.
+ */
+
+ if (!bUseOrEmulGPU)
+ {
+ /* Maybe we should move this into do_force_lowlevel */
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ nrnb, wcycle);
+ }
+
+ if (!bUseOrEmulGPU || bDiffKernels)
+ {
+ int aloc;
+
+ if (DOMAINDECOMP(cr))
+ {
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
+ bDiffKernels ? enbvClearFYes : enbvClearFNo,
+ nrnb, wcycle);
+ }
+
+ if (!bUseOrEmulGPU)
+ {
+ aloc = eintLocal;
+ }
+ else
+ {
+ aloc = eintNonlocal;
+ }
+
+ /* Add all the non-bonded force to the normal force array.
+ * This can be split into a local and a non-local part when overlapping
+ * communication with calculation under domain decomposition.
+ */
+ cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+
+ /* if there are multiple fshift output buffers reduce them */
+ if ((flags & GMX_FORCE_VIRIAL) &&
+ nbv->grp[aloc].nbl_lists.nnbl > 1)
+ {
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+ fr->fshift);
+ }
+ }
+
+ /* update QMMMrec, if necessary */
+ if (fr->bQMMM)
+ {
+ update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ }
+
+ if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ {
+ posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ enerd, lambda, fr);
+ }
+
+ /* Compute the bonded and non-bonded energies and optionally forces */
+ do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ cr, nrnb, wcycle, mdatoms,
+ x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
+ &(top->atomtypes), bBornRadii, box,
+ inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
+ flags, &cycles_pme);
+
+ if (bSepLRF)
+ {
+ if (do_per_step(step, inputrec->nstcalclr))
+ {
+ /* Add the long range forces to the short range forces */
+ for (i = 0; i < fr->natoms_force_constr; i++)
+ {
+ rvec_add(fr->f_twin[i], f[i], f[i]);
+ }
+ }
+ }
+
+ cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+
+ if (ed)
+ {
+ do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ }
+
+ if (bUseOrEmulGPU && !bDiffKernels)
+ {
+ /* wait for non-local forces (or calculate in emulation mode) */
+ if (DOMAINDECOMP(cr))
+ {
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
+ nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ nrnb, wcycle);
+ cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ }
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ /* skip the reduction if there was no non-local work to do */
+ if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
+ {
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
+ nbv->grp[eintNonlocal].nbat, f);
+ }
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+ }
+
+ if (bDoForces)
+ {
+ /* Communicate the forces */
+ if (PAR(cr))
+ {
+ wallcycle_start(wcycle, ewcMOVEF);
+ if (DOMAINDECOMP(cr))
+ {
+ dd_move_f(cr->dd, f, fr->fshift);
+ /* Do we need to communicate the separate force array
+ * for terms that do not contribute to the single sum virial?
+ * Position restraints and electric fields do not introduce
+ * inter-cg forces, only full electrostatics methods do.
+ * When we do not calculate the virial, fr->f_novirsum = f,
+ * so we have already communicated these forces.
+ */
+ if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ (flags & GMX_FORCE_VIRIAL))
+ {
+ dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ }
+ if (bSepLRF)
+ {
+ /* We should not update the shift forces here,
+ * since f_twin is already included in f.
+ */
+ dd_move_f(cr->dd, fr->f_twin, NULL);
+ }
+ }
+ wallcycle_stop(wcycle, ewcMOVEF);
+ }
+ }
+
+ if (bUseOrEmulGPU)
+ {
+ /* wait for local forces (or calculate in emulation mode) */
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ nbv->grp[eintLocal].nbat,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ /* now clear the GPU outputs while we finish the step on the CPU */
+
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+ nrnb, wcycle);
+ wallcycle_stop(wcycle, ewcFORCE);
+ }
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ {
+ /* skip the reduction if there was no local work to do */
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->grp[eintLocal].nbat, f);
+ }
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ dd_force_flop_stop(cr->dd, nrnb);
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ }
+ }
+
+ if (bDoForces)
+ {
+ if (IR_ELEC_FIELD(*inputrec))
+ {
+ /* Compute forces due to electric field */
+ calc_f_el(MASTER(cr) ? field : NULL,
+ start, homenr, mdatoms->chargeA, fr->f_novirsum,
+ inputrec->ex, inputrec->et, t);
+ }
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we sum fr->f_novirsum = f later.
+ */
+ if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ {
+ wallcycle_start(wcycle, ewcVSITESPREAD);
+ spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ wallcycle_stop(wcycle, ewcVSITESPREAD);
+
+ if (bSepLRF)
+ {
+ wallcycle_start(wcycle, ewcVSITESPREAD);
+ spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
+ nrnb,
+ &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ wallcycle_stop(wcycle, ewcVSITESPREAD);
+ }
+ }
+
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(mdatoms->start, mdatoms->homenr, x, f,
+ vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ }
+ }
+
+ if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ {
+ pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ f, vir_force, mdatoms, enerd, lambda, t);
+ }
+
+ /* Add the forces from enforced rotation potentials (if any) */
+ if (inputrec->bRot)
+ {
+ wallcycle_start(wcycle, ewcROTadd);
+ enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ wallcycle_stop(wcycle, ewcROTadd);
+ }
+
+ if (PAR(cr) && !(cr->duty & DUTY_PME))
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ }
+
+ if (bDoForces)
+ {
+ post_process_forces(cr, step, nrnb, wcycle,
+ top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ flags);
+ }
+
+ /* Sum the potential energy terms from group contributions */
+ sum_epot(&(enerd->grpp), enerd->term);
+}
+
+void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_groups_t *groups,
+ matrix box, rvec x[], history_t *hist,
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd, t_fcdata *fcd,
+ real *lambda, t_graph *graph,
+ t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
+ double t, FILE *field, gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags)
+{
+ int cg0, cg1, i, j;
+ int start, homenr;
+ double mu[2*DIM];
+ gmx_bool bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ gmx_bool bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
+ gmx_bool bDoAdressWF;
+ matrix boxs;
+ rvec vzero, box_diag;
+ real e, v, dvdlambda[efptNR];
+ t_pbc pbc;
+ float cycles_pme, cycles_force;
+
+ start = mdatoms->start;
+ homenr = mdatoms->homenr;
+
+ bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+
+ clear_mat(vir_force);
+
+ if (PARTDECOMP(cr))
+ {
+ pd_cg_range(cr, &cg0, &cg1);
+ }
+ else
+ {
+ cg0 = 0;
+ if (DOMAINDECOMP(cr))
+ {
+ cg1 = cr->dd->ncg_tot;
+ }
+ else
+ {
+ cg1 = top->cgs.nr;
+ }
+ if (fr->n_tpi > 0)
+ {
+ cg1--;
+ }
+ }
+
+ bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+ bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ /* Should we update the long-range neighborlists at this step? */
+ bDoLongRangeNS = fr->bTwinRange && bNS;
+ /* Should we put the charge groups on the neighbor-search grid at this step? */
+ bFillGrid = (bNS && bStateChanged);
+ bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
+ bDoForces = (flags & GMX_FORCE_FORCES);
+ bDoPotential = (flags & GMX_FORCE_ENERGY);
+ bSepLRF = ((inputrec->nstcalclr > 1) && bDoForces &&
+ (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
+
+ /* should probably move this to the forcerec since it doesn't change */
+ bDoAdressWF = ((fr->adress_type != eAdressOff));
+
+ if (bStateChanged)
+ {
+ update_forcerec(fr, box);
+
+ if (NEED_MUTOT(*inputrec))
+ {
+ /* Calculate total (local) dipole moment in a temporary common array.
+ * This makes it possible to sum them over nodes faster.
+ */
+ calc_mu(start, homenr,
+ x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+ mu, mu+DIM);
+ }
+ }
+
+ if (fr->ePBC != epbcNONE)
+ {
+ /* Compute shift vectors every step,
+ * because of pressure coupling or box deformation!
+ */
+ if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ {
+ calc_shifts(box, fr->shift_vec);
+ }
+
+ if (bCalcCGCM)
+ {
+ put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
+ &(top->cgs), x, fr->cg_cm);
+ inc_nrnb(nrnb, eNR_CGCM, homenr);
+ inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
+ }
+ else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+ {
+ unshift_self(graph, box, x);
+ }
+ }
+ else if (bCalcCGCM)
+ {
+ calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
+ inc_nrnb(nrnb, eNR_CGCM, homenr);
+ }
+
+ if (bCalcCGCM)
+ {
+ if (PAR(cr))
+ {
+ move_cgcm(fplog, cr, fr->cg_cm);
+ }
+ if (gmx_debug_at)
+ {
+ pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
+ }
+ }
+
+#ifdef GMX_MPI
+ if (!(cr->duty & DUTY_PME))
+ {
+ /* Send particle coordinates to the pme nodes.
+ * Since this is only implemented for domain decomposition
+ * and domain decomposition does not use the graph,
+ * we do not need to worry about shifting.
+ */
+
+ wallcycle_start(wcycle, ewcPP_PMESENDX);
+
+ bBS = (inputrec->nwall == 2);
+ if (bBS)
+ {
+ copy_mat(box, boxs);
+ svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+ }
+
+ gmx_pme_send_x(cr, bBS ? boxs : box, x,
+ mdatoms->nChargePerturbed, lambda[efptCOUL],
+ (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+
+ wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ }
+#endif /* GMX_MPI */
+
+ /* Communicate coordinates and sum dipole if necessary */
+ if (PAR(cr))
+ {
+ wallcycle_start(wcycle, ewcMOVEX);
+ if (DOMAINDECOMP(cr))
+ {
+ dd_move_x(cr->dd, box, x);
+ }
+ else
+ {
+ move_x(cr, x, nrnb);
+ }
+ wallcycle_stop(wcycle, ewcMOVEX);
+ }
+
+ /* update adress weight beforehand */
+ if (bStateChanged && bDoAdressWF)
+ {
+ /* need pbc for adress weight calculation with pbc_dx */
+ set_pbc(&pbc, inputrec->ePBC, box);
+ if (fr->adress_site == eAdressSITEcog)
+ {
+ update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
+ inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ }
+ else if (fr->adress_site == eAdressSITEcom)
+ {
+ update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ }
+ else if (fr->adress_site == eAdressSITEatomatom)
+ {
+ update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ }
+ else
+ {
+ update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ }
+ }
+
+ if (NEED_MUTOT(*inputrec))
+ {
+
+ if (bStateChanged)
+ {
+ if (PAR(cr))
+ {
+ gmx_sumd(2*DIM, mu, cr);
+ }
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ fr->mu_tot[i][j] = mu[i*DIM + j];
+ }
+ }
+ }
+ if (fr->efep == efepNO)
+ {
+ copy_rvec(fr->mu_tot[0], mu_tot);
+ }
+ else
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ mu_tot[j] =
+ (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+ }
+ }
+ }
+
+ /* Reset energies */
+ reset_enerdata(fr, bNS, enerd, MASTER(cr));
+ clear_rvecs(SHIFTS, fr->fshift);
+
+ if (bNS)
+ {
+ wallcycle_start(wcycle, ewcNS);
+
+ if (graph && bStateChanged)
+ {
+ /* Calculate intramolecular shift vectors to make molecules whole */
+ mk_mshift(fplog, graph, fr->ePBC, box, x);
+ }
+
+ /* Do the actual neighbour searching */
+ ns(fplog, fr, box,
+ groups, top, mdatoms,
+ cr, nrnb, bFillGrid,
+ bDoLongRangeNS);
+
+ wallcycle_stop(wcycle, ewcNS);
+ }
+
+ if (inputrec->implicit_solvent && bNS)
+ {
+ make_gb_nblist(cr, inputrec->gb_algorithm,
+ x, box, fr, &top->idef, graph, fr->born);
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ if (!(cr->duty & DUTY_PME))
+ {
+ wallcycle_start(wcycle, ewcPPDURINGPME);
+ dd_force_flop_start(cr->dd, nrnb);
+ }
+ }
+
+ if (inputrec->bRot)
+ {
+ /* Enforced rotation has its own cycle counter that starts after the collective
+ * coordinates have been communicated. It is added to ddCyclF to allow
+ * for proper load-balancing */
+ wallcycle_start(wcycle, ewcROT);
+ do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ wallcycle_stop(wcycle, ewcROT);
+ }
+
+ /* Start the force cycle counter.
+ * This counter is stopped in do_force_lowlevel.
+ * No parallel communication should occur while this counter is running,
+ * since that will interfere with the dynamic load balancing.
+ */
+ wallcycle_start(wcycle, ewcFORCE);
+
+ if (bDoForces)
+ {
+ /* Reset forces for which the virial is calculated separately:
+ * PME/Ewald forces if necessary */
+ if (fr->bF_NoVirSum)
+ {
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ fr->f_novirsum = fr->f_novirsum_alloc;
+ if (fr->bDomDec)
+ {
+ clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+ }
+ else
+ {
+ clear_rvecs(homenr, fr->f_novirsum+start);
+ }
+ }
+ else
+ {
+ /* We are not calculating the pressure so we do not need
+ * a separate array for forces that do not contribute
+ * to the pressure.
+ */
+ fr->f_novirsum = f;
+ }
+ }
+
+ /* Clear the short- and long-range forces */
+ clear_rvecs(fr->natoms_force_constr, f);
+ if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+ {
+ clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+ }
+
+ clear_rvec(fr->vir_diag_posres);
+ }
+ if (inputrec->ePull == epullCONSTRAINT)
+ {
+ clear_pull_forces(inputrec->pull);
+ }
+
+ /* update QMMMrec, if necessary */
+ if (fr->bQMMM)
+ {
+ update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ }
+
+ if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ {
+ posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ enerd, lambda, fr);
+ }
+
+ if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+ {
+ /* Flat-bottomed position restraints always require full pbc */
+ if (!(bStateChanged && bDoAdressWF))
+ {
+ set_pbc(&pbc, inputrec->ePBC, box);
+ }
+ v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
+ top->idef.iparams_fbposres,
+ (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+ inputrec->ePBC == epbcNONE ? NULL : &pbc,
+ fr->rc_scaling, fr->ePBC, fr->posres_com);
+ enerd->term[F_FBPOSRES] += v;
+ inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
+ }
+
+ /* Compute the bonded and non-bonded energies and optionally forces */
+ do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ cr, nrnb, wcycle, mdatoms,
+ x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
+ &(top->atomtypes), bBornRadii, box,
+ inputrec->fepvals, lambda,
+ graph, &(top->excls), fr->mu_tot,
+ flags,
+ &cycles_pme);
+
+ if (bSepLRF)
+ {
+ if (do_per_step(step, inputrec->nstcalclr))
+ {
+ /* Add the long range forces to the short range forces */
+ for (i = 0; i < fr->natoms_force_constr; i++)
+ {
+ rvec_add(fr->f_twin[i], f[i], f[i]);
+ }
+ }
+ }
+
+ cycles_force = wallcycle_stop(wcycle, ewcFORCE);
+
+ if (ed)
+ {
+ do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ dd_force_flop_stop(cr->dd, nrnb);
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ }
+ }
+
+ if (bDoForces)
+ {
+ if (IR_ELEC_FIELD(*inputrec))
+ {
+ /* Compute forces due to electric field */
+ calc_f_el(MASTER(cr) ? field : NULL,
+ start, homenr, mdatoms->chargeA, fr->f_novirsum,
+ inputrec->ex, inputrec->et, t);
+ }
+
+ if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
+ {
+ /* Compute thermodynamic force in hybrid AdResS region */
+ adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
+ inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ }
+
+ /* Communicate the forces */
+ if (PAR(cr))
+ {
+ wallcycle_start(wcycle, ewcMOVEF);
+ if (DOMAINDECOMP(cr))
+ {
+ dd_move_f(cr->dd, f, fr->fshift);
+ /* Do we need to communicate the separate force array
+ * for terms that do not contribute to the single sum virial?
+ * Position restraints and electric fields do not introduce
+ * inter-cg forces, only full electrostatics methods do.
+ * When we do not calculate the virial, fr->f_novirsum = f,
+ * so we have already communicated these forces.
+ */
+ if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ (flags & GMX_FORCE_VIRIAL))
+ {
+ dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ }
+ if (bSepLRF)
+ {
+ /* We should not update the shift forces here,
+ * since f_twin is already included in f.
+ */
+ dd_move_f(cr->dd, fr->f_twin, NULL);
+ }
+ }
+ else
+ {
+ pd_move_f(cr, f, nrnb);
+ if (bSepLRF)
+ {
+ pd_move_f(cr, fr->f_twin, nrnb);
+ }
+ }
+ wallcycle_stop(wcycle, ewcMOVEF);
+ }
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we sum fr->f_novirsum = f later.
+ */
+ if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ {
+ wallcycle_start(wcycle, ewcVSITESPREAD);
+ spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ wallcycle_stop(wcycle, ewcVSITESPREAD);
+
+ if (bSepLRF)
+ {
+ wallcycle_start(wcycle, ewcVSITESPREAD);
+ spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
+ nrnb,
+ &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ wallcycle_stop(wcycle, ewcVSITESPREAD);
+ }
+ }
+
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(mdatoms->start, mdatoms->homenr, x, f,
+ vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ }
+ }
+
+ if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ {
+ pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ f, vir_force, mdatoms, enerd, lambda, t);
+ }
+
+ /* Add the forces from enforced rotation potentials (if any) */
+ if (inputrec->bRot)
+ {
+ wallcycle_start(wcycle, ewcROTadd);
+ enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ wallcycle_stop(wcycle, ewcROTadd);
+ }
+
+ if (PAR(cr) && !(cr->duty & DUTY_PME))
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ }
+
+ if (bDoForces)
+ {
+ post_process_forces(cr, step, nrnb, wcycle,
+ top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ flags);
+ }
+
+ /* Sum the potential energy terms from group contributions */
+ sum_epot(&(enerd->grpp), enerd->term);
+}
+
+void do_force(FILE *fplog, t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_groups_t *groups,
+ matrix box, rvec x[], history_t *hist,
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd, t_fcdata *fcd,
+ real *lambda, t_graph *graph,
+ t_forcerec *fr,
+ gmx_vsite_t *vsite, rvec mu_tot,
+ double t, FILE *field, gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags)
+{
+ /* modify force flag if not doing nonbonded */
+ if (!fr->bNonbonded)
+ {
+ flags &= ~GMX_FORCE_NONBONDED;
+ }
+
+ switch (inputrec->cutoff_scheme)
+ {
+ case ecutsVERLET:
+ do_force_cutsVERLET(fplog, cr, inputrec,
+ step, nrnb, wcycle,
+ top,
+ groups,
+ box, x, hist,
+ f, vir_force,
+ mdatoms,
+ enerd, fcd,
+ lambda, graph,
+ fr, fr->ic,
+ vsite, mu_tot,
+ t, field, ed,
+ bBornRadii,
+ flags);
+ break;
+ case ecutsGROUP:
+ do_force_cutsGROUP(fplog, cr, inputrec,
+ step, nrnb, wcycle,
+ top,
+ groups,
+ box, x, hist,
+ f, vir_force,
+ mdatoms,
+ enerd, fcd,
+ lambda, graph,
+ fr, vsite, mu_tot,
+ t, field, ed,
+ bBornRadii,
+ flags);
+ break;
+ default:
+ gmx_incons("Invalid cut-off scheme passed!");
+ }
+}
+
+
+void do_constrain_first(FILE *fplog, gmx_constr_t constr,
+ t_inputrec *ir, t_mdatoms *md,
+ t_state *state, t_commrec *cr, t_nrnb *nrnb,
+ t_forcerec *fr, gmx_localtop_t *top)
+{
+ int i, m, start, end;
+ gmx_large_int_t step;
+ real dt = ir->delta_t;
+ real dvdl_dum;
+ rvec *savex;
+
+ snew(savex, state->natoms);
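+ /* savex buffers the positions propagated backwards to t = -dt,
+ * which are constrained below for integrators that need
+ * velocities at t - dt/2.
+ */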
+
+ start = md->start;
+ end = md->homenr + start;
+
+ if (debug)
+ {
+ fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
+ start, md->homenr, end);
+ }
+ /* Do a first constrain to reset particles... */
+ step = ir->init_step;
+ if (fplog)
+ {
+ char buf[STEPSTRSIZE];
+ fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
+ gmx_step_str(step, buf));
+ }
+ dvdl_dum = 0;
+
+ /* constrain the current position */
+ constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ ir, NULL, cr, step, 0, md,
+ state->x, state->x, NULL,
+ fr->bMolPBC, state->box,
+ state->lambda[efptBONDED], &dvdl_dum,
+ NULL, NULL, nrnb, econqCoord,
+ ir->epc == epcMTTK, state->veta, state->veta);
+ if (EI_VV(ir->eI))
+ {
+ /* constrain the initial velocity, and save it */
+ /* also may be useful if we need the ekin from the halfstep for velocity verlet */
+ /* might not yet treat veta correctly */
+ constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ ir, NULL, cr, step, 0, md,
+ state->x, state->v, state->v,
+ fr->bMolPBC, state->box,
+ state->lambda[efptBONDED], &dvdl_dum,
+ NULL, NULL, nrnb, econqVeloc,
+ ir->epc == epcMTTK, state->veta, state->veta);
+ }
+ /* constrain the initial velocities at t-dt/2 */
+ if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
+ {
+ for (i = start; (i < end); i++)
+ {
+ for (m = 0; (m < DIM); m++)
+ {
+ /* Reverse the velocity */
+ state->v[i][m] = -state->v[i][m];
+ /* Store the position at t-dt in buf */
+ savex[i][m] = state->x[i][m] + dt*state->v[i][m];
+ }
+ }
+ /* Shake the positions at t=-dt with the positions at t=0
+ * as reference coordinates.
+ */
+ if (fplog)
+ {
+ char buf[STEPSTRSIZE];
+ fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
+ gmx_step_str(step, buf));
+ }
+ dvdl_dum = 0;
+ constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ ir, NULL, cr, step, -1, md,
+ state->x, savex, NULL,
+ fr->bMolPBC, state->box,
+ state->lambda[efptBONDED], &dvdl_dum,
+ state->v, NULL, nrnb, econqCoord,
+ ir->epc == epcMTTK, state->veta, state->veta);
+
+ for (i = start; i < end; i++)
+ {
+ for (m = 0; m < DIM; m++)
+ {
+ /* Re-reverse the velocities */
+ state->v[i][m] = -state->v[i][m];
+ }
+ }
+ }
+ sfree(savex);
+}
+
+void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
+{
+ double eners[2], virs[2], enersum, virsum, y0, f, g, h;
+ double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
+ double invscale, invscale2, invscale3;
+ int ri0, ri1, ri, i, offstart, offset;
+ real scale, *vdwtab, tabfactor, tmp;
+
+ fr->enershiftsix = 0;
+ fr->enershifttwelve = 0;
+ fr->enerdiffsix = 0;
+ fr->enerdifftwelve = 0;
+ fr->virdiffsix = 0;
+ fr->virdifftwelve = 0;
+
+ if (eDispCorr != edispcNO)
+ {
+ for (i = 0; i < 2; i++)
+ {
+ eners[i] = 0;
+ virs[i] = 0;
+ }
+ if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
+ {
+ if (fr->rvdw_switch == 0)
+ {
+ gmx_fatal(FARGS,
+ "With dispersion correction rvdw-switch can not be zero "
+ "for vdw-type = %s", evdw_names[fr->vdwtype]);
+ }
+
+ scale = fr->nblists[0].table_elec_vdw.scale;
+ vdwtab = fr->nblists[0].table_vdw.data;
+
+ /* Round the cut-offs to exact table values for precision */
+ ri0 = floor(fr->rvdw_switch*scale);
+ ri1 = ceil(fr->rvdw*scale);
+ r0 = ri0/scale;
+ r1 = ri1/scale;
+ rc3 = r0*r0*r0;
+ rc9 = rc3*rc3*rc3;
+
+ if (fr->vdwtype == evdwSHIFT)
+ {
+ /* Determine the constant energy shift below rvdw_switch.
+ * The table carries a scale factor: it was scaled down to compensate
+ * for the c6/c12 parameters being scaled up by the derivative factors,
+ * which saves flops in the analytical kernels.
+ */
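+ /* The cubic spline table is packed as (Y,F,G,H) quadruplets
+ * per point, dispersion first and repulsion second, so the
+ * stride is 8 and the repulsion entries start at offset 4.
+ */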
+ fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
+ fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
+ }
+ /* Add the constant part from 0 to rvdw_switch.
+ * This integration from 0 to rvdw_switch overcounts the number
+ * of interactions by 1, as it also counts the self interaction.
+ * We will correct for this later.
+ */
+ eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
+ eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
+
+ invscale = 1.0/(scale);
+ invscale2 = invscale*invscale;
+ invscale3 = invscale*invscale2;
+
+ /* following summation derived from cubic spline definition,
+ Numerical Recipes in C, second edition, p. 113-116. Exact
+ for the cubic spline. We first calculate the negative of
+ the energy from rvdw to rvdw_switch, assuming that g(r)=1,
+ and then add the more standard, abrupt cutoff correction to
+ that result, yielding the long-range correction for a
+ switched function. We perform both the pressure and energy
+ loops at the same time for simplicity, as the computational
+ cost is low. */
+
+ for (i = 0; i < 2; i++)
+ {
+ enersum = 0.0; virsum = 0.0;
+ if (i == 0)
+ {
+ offstart = 0;
+ /* Since the dispersion table has been scaled down by a factor of 6.0
+ * and the repulsion by a factor of 12.0 to compensate for the c6/c12
+ * parameters inside nbfp[] being scaled up (to save flops in kernels),
+ * we need to correct for this.
+ */
+ tabfactor = 6.0;
+ }
+ else
+ {
+ offstart = 4;
+ tabfactor = 12.0;
+ }
+ for (ri = ri0; ri < ri1; ri++)
+ {
+ r = ri*invscale;
+ ea = invscale3;
+ eb = 2.0*invscale2*r;
+ ec = invscale*r*r;
+
+ pa = invscale3;
+ pb = 3.0*invscale2*r;
+ pc = 3.0*invscale*r*r;
+ pd = r*r*r;
+
+ /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
+ offset = 8*ri + offstart;
+ y0 = vdwtab[offset];
+ f = vdwtab[offset+1];
+ g = vdwtab[offset+2];
+ h = vdwtab[offset+3];
+
+ enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
+ virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
+ }
+
+ enersum *= 4.0*M_PI*tabfactor;
+ virsum *= 4.0*M_PI*tabfactor;
+ eners[i] -= enersum;
+ virs[i] -= virsum;
+ }
+
+ /* now add the correction for rvdw_switch to infinity */
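+ /* With g(r) = 1 these tail integrals are analytic:
+ * int_rc^inf 4 pi r^2 r^-6 dr = 4 pi/(3 rc^3)
+ * int_rc^inf 4 pi r^2 r^-12 dr = 4 pi/(9 rc^9)
+ * where the dispersion term enters with a minus sign.
+ */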
+ eners[0] += -4.0*M_PI/(3.0*rc3);
+ eners[1] += 4.0*M_PI/(9.0*rc9);
+ virs[0] += 8.0*M_PI/rc3;
+ virs[1] += -16.0*M_PI/(3.0*rc9);
+ }
+ else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
+ {
+ if (fr->vdwtype == evdwUSER && fplog)
+ {
+ fprintf(fplog,
+ "WARNING: using dispersion correction with user tables\n");
+ }
+ rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
+ rc9 = rc3*rc3*rc3;
+ /* Contribution beyond the cut-off */
+ eners[0] += -4.0*M_PI/(3.0*rc3);
+ eners[1] += 4.0*M_PI/(9.0*rc9);
+ if (fr->vdw_modifier == eintmodPOTSHIFT)
+ {
+ /* Contribution within the cut-off */
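+ /* The simulation used U(r) - U(rc) inside the cut-off, so
+ * subtract the integral of that constant shift over the
+ * cut-off sphere.
+ */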
+ eners[0] += -4.0*M_PI/(3.0*rc3);
+ eners[1] += 4.0*M_PI/(3.0*rc9);
+ }
+ /* Contribution beyond the cut-off */
+ virs[0] += 8.0*M_PI/rc3;
+ virs[1] += -16.0*M_PI/(3.0*rc9);
+ }
+ else
+ {
+ gmx_fatal(FARGS,
+ "Dispersion correction is not implemented for vdw-type = %s",
+ evdw_names[fr->vdwtype]);
+ }
+ fr->enerdiffsix = eners[0];
+ fr->enerdifftwelve = eners[1];
+ /* The 0.5 is due to the Gromacs definition of the virial */
+ fr->virdiffsix = 0.5*virs[0];
+ fr->virdifftwelve = 0.5*virs[1];
+ }
+}
+
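+ /* Apply the precomputed dispersion corrections to the energy,
+ * pressure, virial and, with free energy, dV/dlambda for the
+ * current box volume.
+ */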
+void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
+ gmx_large_int_t step, int natoms,
+ matrix box, real lambda, tensor pres, tensor virial,
+ real *prescorr, real *enercorr, real *dvdlcorr)
+{
+ gmx_bool bCorrAll, bCorrPres;
+ real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
+ int m;
+
+ *prescorr = 0;
+ *enercorr = 0;
+ *dvdlcorr = 0;
+
+ clear_mat(virial);
+ clear_mat(pres);
+
+ if (ir->eDispCorr != edispcNO)
+ {
+ bCorrAll = (ir->eDispCorr == edispcAllEner ||
+ ir->eDispCorr == edispcAllEnerPres);
+ bCorrPres = (ir->eDispCorr == edispcEnerPres ||
+ ir->eDispCorr == edispcAllEnerPres);
+
+ invvol = 1/det(box);
+ if (fr->n_tpi)
+ {
+ /* Only correct for the interactions with the inserted molecule */
+ dens = (natoms - fr->n_tpi)*invvol;
+ ninter = fr->n_tpi;
+ }
+ else
+ {
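+ /* All atoms contribute; the factor 0.5 avoids double-counting pairs */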
+ dens = natoms*invvol;
+ ninter = 0.5*natoms;
+ }
+
+ if (ir->efep == efepNO)
+ {
+ avcsix = fr->avcsix[0];
+ avctwelve = fr->avctwelve[0];
+ }
+ else
+ {
+ avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
+ avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
+ }
+
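+ /* dens*enerdiffsix is the tail correction per particle; subtracting
+ * enershiftsix removes the self-interaction that the 0..rvdw_switch
+ * shift integral overcounted (see calc_enervirdiff).
+ */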
+ enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
+ *enercorr += avcsix*enerdiff;
+ dvdlambda = 0.0;
+ if (ir->efep != efepNO)
+ {
+ dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
+ }
+ if (bCorrAll)
+ {
+ enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
+ *enercorr += avctwelve*enerdiff;
+ if (fr->efep != efepNO)
+ {
+ dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
+ }
+ }
+
+ if (bCorrPres)
+ {
+ svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
+ if (ir->eDispCorr == edispcAllEnerPres)
+ {
+ svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
+ }
+ /* The factor 2 is because of the Gromacs virial definition */
+ spres = -2.0*invvol*svir*PRESFAC;
+
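+ /* The correction is isotropic: spread it over the diagonal
+ * of the virial and pressure tensors.
+ */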
+ for (m = 0; m < DIM; m++)
+ {
+ virial[m][m] += svir;
+ pres[m][m] += spres;
+ }
+ *prescorr += spres;
+ }
+
+ /* Can't currently control when this prints; for now, just print when debugging */
+ if (debug)
+ {
+ if (bCorrAll)
+ {
+ fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+ avcsix, avctwelve);
+ }
+ if (bCorrPres)
+ {
+ fprintf(debug,
+ "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
+ *enercorr, spres, svir);
+ }
+ else
+ {
+ fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
+ }
+ }
+
+ if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
+ {
+ gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
+ }
+ if (fr->efep != efepNO)
+ {
+ *dvdlcorr += dvdlambda;
+ }
+ }
+}
+
+void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
+ t_graph *graph, rvec x[])
+{
+ if (fplog)
+ {
+ fprintf(fplog, "Removing pbc first time\n");
+ }
+ calc_shifts(box, fr->shift_vec);
+ if (graph)
+ {
+ mk_mshift(fplog, graph, fr->ePBC, box, x);
+ if (gmx_debug_at)
+ {
+ p_graph(debug, "do_pbc_first 1", graph);
+ }
+ shift_self(graph, box, x);
+ /* By doing an extra mk_mshift the molecules that are broken
+ * because they were e.g. imported from another software
+ * will be made whole again. Such are the healing powers
+ * of GROMACS.
+ */
+ mk_mshift(fplog, graph, fr->ePBC, box, x);
+ if (gmx_debug_at)
+ {
+ p_graph(debug, "do_pbc_first 2", graph);
+ }
+ }
+ if (fplog)
+ {
+ fprintf(fplog, "Done rmpbc\n");
+ }
+}
+
+static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+ gmx_mtop_t *mtop, rvec x[],
+ gmx_bool bFirst)
+{
+ t_graph *graph;
+ int mb, as, mol;
+ gmx_molblock_t *molb;
+
+ if (bFirst && fplog)
+ {
+ fprintf(fplog, "Removing pbc first time\n");
+ }
+
+ snew(graph, 1);
+ as = 0;
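+ /* as tracks the global index of the first atom in the current molecule */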
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ molb = &mtop->molblock[mb];
+ if (molb->natoms_mol == 1 ||
+ (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
+ {
+ /* Just one atom or charge group in the molecule, no PBC required */
+ as += molb->nmol*molb->natoms_mol;
+ }
+ else
+ {
+ /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
+ mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
+ 0, molb->natoms_mol, FALSE, FALSE, graph);
+
+ for (mol = 0; mol < molb->nmol; mol++)
+ {
+ mk_mshift(fplog, graph, ePBC, box, x+as);
+
+ shift_self(graph, box, x+as);
+ /* The molecule is whole now.
+ * We don't need the second mk_mshift call as in do_pbc_first,
+ * since we no longer need this graph.
+ */
+
+ as += molb->natoms_mol;
+ }
+ done_graph(graph);
+ }
+ }
+ sfree(graph);
+}
+
+void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
+ gmx_mtop_t *mtop, rvec x[])
+{
+ low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
+}
+
+void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+ gmx_mtop_t *mtop, rvec x[])
+{
+ low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
+}
+
+void finish_run(FILE *fplog, t_commrec *cr,
+ t_inputrec *inputrec,
+ t_nrnb nrnb[], gmx_wallcycle_t wcycle,
+ gmx_runtime_t *runtime,
+ wallclock_gpu_t *gputimes,
+ gmx_bool bWriteStat)
+{
+ int i, j;
+ t_nrnb *nrnb_tot = NULL;
+ real delta_t;
+ double nbfs, mflop;
+
+ wallcycle_sum(cr, wcycle);
+
+ if (cr->nnodes > 1)
+ {
+ snew(nrnb_tot, 1);
+#ifdef GMX_MPI
+ MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
+ cr->mpi_comm_mysim);
+#endif
+ }
+ else
+ {
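+ /* Serial run: use the local counters directly */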
+ nrnb_tot = nrnb;
+ }
+
+#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
+ if (cr->nnodes > 1)
+ {
+ /* reduce nodetime over all MPI processes in the current simulation */
+ double sum;
+ MPI_Allreduce(&runtime->proctime, &sum, 1, MPI_DOUBLE, MPI_SUM,
+ cr->mpi_comm_mysim);
+ runtime->proctime = sum;
+ }
+#endif
+
+ if (SIMMASTER(cr))
+ {
+ print_flop(fplog, nrnb_tot, &nbfs, &mflop);
+ }
+ if (cr->nnodes > 1)
+ {
+ sfree(nrnb_tot);
+ }
+
+ if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+ {
+ print_dd_statistics(cr, inputrec, fplog);
+ }
+
+#ifdef GMX_MPI
+ if (PARTDECOMP(cr))
+ {
+ if (MASTER(cr))
+ {
+ t_nrnb *nrnb_all;
+ int s;
+ MPI_Status stat;
+
+ snew(nrnb_all, cr->nnodes);
+ nrnb_all[0] = *nrnb;
+ for (s = 1; s < cr->nnodes; s++)
+ {
+ MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
+ cr->mpi_comm_mysim, &stat);
+ }
+ pr_load(fplog, cr, nrnb_all);
+ sfree(nrnb_all);
+ }
+ else
+ {
+ MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
+ cr->mpi_comm_mysim);
+ }
+ }
+#endif
+
+ if (SIMMASTER(cr))
+ {
+ wallcycle_print(fplog, cr->nnodes, cr->npmenodes, runtime->realtime,
+ wcycle, gputimes);
+
+ if (EI_DYNAMICS(inputrec->eI))
+ {
+ delta_t = inputrec->delta_t;
+ }
+ else
+ {
+ delta_t = 0;
+ }
+
+ if (fplog)
+ {
+ print_perf(fplog, runtime->proctime, runtime->realtime,
+ runtime->nsteps_done, delta_t, nbfs, mflop);
+ }
+ if (bWriteStat)
+ {
+ print_perf(stderr, runtime->proctime, runtime->realtime,
+ runtime->nsteps_done, delta_t, nbfs, mflop);
+ }
+ }
+}
+
+extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
+{
+ /* this function works, but could probably use a logic rewrite to keep all the different
+ types of efep straight. */
+
+ int i;
+ t_lambda *fep = ir->fepvals;
+
+ if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+ {
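+ /* Neither free energy nor simulated tempering is active:
+ * all lambda components stay zero.
+ */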
+ for (i = 0; i < efptNR; i++)
+ {
+ lambda[i] = 0.0;
+ if (lam0)
+ {
+ lam0[i] = 0.0;
+ }
+ }
+ return;
+ }
+ else
+ {
+ *fep_state = fep->init_fep_state; /* this might overwrite the value read
+ from a checkpoint if one is set -- a
+ kludge is in place for now to prevent
+ this. */
+ for (i = 0; i < efptNR; i++)
+ {
+ /* overwrite lambda state with init_lambda for now for backwards compatibility */
+ if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
+ {
+ lambda[i] = fep->init_lambda;
+ if (lam0)
+ {
+ lam0[i] = lambda[i];
+ }
+ }
+ else
+ {
+ lambda[i] = fep->all_lambda[i][*fep_state];
+ if (lam0)
+ {
+ lam0[i] = lambda[i];
+ }
+ }
+ }
+ if (ir->bSimTemp)
+ {
+ /* need to rescale control temperatures to match current state */
+ for (i = 0; i < ir->opts.ngtc; i++)
+ {
+ if (ir->opts.ref_t[i] > 0)
+ {
+ ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
+ }
+ }
+ }
+ }
+
+ /* Report the initial lambda components to the log */
+ if (fplog != NULL)
+ {
+ fprintf(fplog, "Initial vector of lambda components:[ ");
+ for (i = 0; i < efptNR; i++)
+ {
+ fprintf(fplog, "%10.4f ", lambda[i]);
+ }
+ fprintf(fplog, "]\n");
+ }
+ return;
+}
+
+
+void init_md(FILE *fplog,
+ t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
+ double *t, double *t0,
+ real *lambda, int *fep_state, double *lam0,
+ t_nrnb *nrnb, gmx_mtop_t *mtop,
+ gmx_update_t *upd,
+ int nfile, const t_filenm fnm[],
+ gmx_mdoutf_t **outf, t_mdebin **mdebin,
+ tensor force_vir, tensor shake_vir, rvec mu_tot,
+ gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
+{
+ int i, j, n;
+ real tmpt, mod;
+
+ /* Initial values */
+ *t = *t0 = ir->init_t;
+
+ *bSimAnn = FALSE;
+ for (i = 0; i < ir->opts.ngtc; i++)
+ {
+ /* set bSimAnn if any group is being annealed */
+ if (ir->opts.annealing[i] != eannNO)
+ {
+ *bSimAnn = TRUE;
+ }
+ }
+ if (*bSimAnn)
+ {
+ update_annealing_target_temp(&(ir->opts), ir->init_t);
+ }
+
+ /* Initialize lambda variables */
+ initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
+
+ if (upd)
+ {
+ *upd = init_update(ir);
+ }
+
+ if (vcm != NULL)
+ {
+ *vcm = init_vcm(fplog, &mtop->groups, ir);
+ }
+
+ if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
+ {
+ if (ir->etc == etcBERENDSEN)
+ {
+ please_cite(fplog, "Berendsen84a");
+ }
+ if (ir->etc == etcVRESCALE)
+ {
+ please_cite(fplog, "Bussi2007a");
+ }
+ }
+
+ init_nrnb(nrnb);
+
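+ /* A negative nfile signals that no output files should be set up */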
+ if (nfile != -1)
+ {
+ *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
+
+ *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
+ mtop, ir, (*outf)->fp_dhdl);
+ }
+
+ if (ir->bAdress)
+ {
+ please_cite(fplog, "Fritsch12");
+ please_cite(fplog, "Junghans10");
+ }
+ /* Initialize variables */
+ clear_mat(force_vir);
+ clear_mat(shake_vir);
+ clear_rvec(mu_tot);
+
+ debug_gmx();
+}