project(Gromacs)
include(Dart)
mark_as_advanced(DART_ROOT)
+
# PROJECT_VERSION should have the following structure:
-# VERSION[-dev-SUFFIX] where the VERSION can have any form and the suffix
-set(PROJECT_VERSION "4.5.1")
+# VERSION-dev[-SUFFIX] where the VERSION should have the form: vMajor.vMinor.vPatch
+#
+# The "-dev" suffix is important to keep because it makes it possible to distinguish
+# between a build from an official release and a build from a git release branch on a
+# machine with no git.
+#
+# NOTE: when releasing the "-dev" suffix needs to be stripped off!
+set(PROJECT_VERSION "4.5.2-dev")
set(CUSTOM_VERSION_STRING ""
CACHE STRING "Custom version string (if empty, use hard-coded default)")
mark_as_advanced(CUSTOM_VERSION_STRING)
# It is a bit irritating, but this has to be set separately for now!
SET(CPACK_PACKAGE_VERSION_MAJOR "4")
SET(CPACK_PACKAGE_VERSION_MINOR "5")
-SET(CPACK_PACKAGE_VERSION_PATCH "1")
+SET(CPACK_PACKAGE_VERSION_PATCH "2")
# Cmake modules/macros are in a subdirectory to keep this file cleaner
option(GMX_THREADS "Build a parallel (thread-based) version of GROMACS (cannot be combined with MPI yet)" ON)
option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
mark_as_advanced(GMX_SOFTWARE_INVSQRT)
+option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" ON)
+mark_as_advanced(GMX_POWERPC_INVSQRT)
option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
mark_as_advanced(GMX_FAHCORE)
option(GMX_OPENMM "Accelerated execution on GPUs through the OpenMM library (rerun cmake after changing to see relevant options)" OFF)
option(GMX_DEFAULT_SUFFIX "Use default suffixes for GROMACS binaries and libs (_d for double, _mpi for MPI; rerun cmake after changing to see relevant options)" ON)
+# Advanced option, offered only on non-Apple UNIX platforms, that asks the
+# library search to prefer ".a" static archives.
+if(UNIX AND NOT APPLE)
+    option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer \".a\" static archives (NOTE: this is enabled only for UNIX (excluding APPLE) platforms but it might not always work!)" OFF)
+    mark_as_advanced(GMX_PREFER_STATIC_LIBS)
+endif()
+
########################################################################
# Set up binary and library suffixing
########################################################################
if(GMX_SOFTWARE_INVSQRT)
set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
endif(GMX_SOFTWARE_INVSQRT)
+if(GMX_POWERPC_INVSQRT)
+ set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
+endif(GMX_POWERPC_INVSQRT)
########################################################################
#Process MPI settings
########################################################################
# Find external packages #
########################################################################
+# When static libraries are preferred, put ".a" first in the library suffix
+# search list and switch BUILD_SHARED_LIBS off (with a warning) so the
+# GROMACS libraries are built static as well.
+if(UNIX AND NOT APPLE)
+    if(GMX_PREFER_STATIC_LIBS)
+        SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+        if(BUILD_SHARED_LIBS)
+            message(WARNING "Static libraries requested, the GROMACS libraries will also be built static (BUILD_SHARED_LIBS=OFF)")
+            set(BUILD_SHARED_LIBS OFF CACHE BOOL "Enable shared libraries (can be problematic with MPI, Windows)" FORCE)
+        endif()
+    endif()
+endif()
find_package(LibXml2)
set(PKG_XML "")
endif()
endif (${GMX_ACCELERATION} STREQUAL "auto" AND NOT GMX_OPENMM)
-
include(gmxTestXDR)
gmx_test_xdr(GMX_SYSTEM_XDR)
if(NOT GMX_SYSTEM_XDR)
set(GMX_IA32_ASM 0)
set(GMX_GMX_X86_64_ASM 0)
elseif(${GMX_ACCELERATION} STREQUAL "BLUEGENE")
+# GMX_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
+ message(STATUS "Configuring for BlueGene")
set(GMX_BLUEGENE 1)
+ if (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
+ set(SHARED_LIBS_DEFAULT OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
+ set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
+ endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
+ set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
+ set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
+ set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
+ set(GMX_THREADS OFF CACHE BOOL "Threads not compatible with BlueGene, disabled!" FORCE)
+ set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
+ set(GMX_EXTERNAL_BLAS TRUE CACHE BOOL "Use MASSV for BLAS on BlueGene" FORCE)
+ set(GMX_EXTERNAL_LAPACK TRUE CACHE BOOL "Use MASSV for LAPACK on BlueGene" FORCE)
+ list(APPEND GMX_EXTRA_LIBRARIES massv)
elseif(${GMX_ACCELERATION} STREQUAL "POWER6")
set(GMX_POWER6 1)
+ set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
+ set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
elseif(${GMX_ACCELERATION} STREQUAL "IA64")
set(GMX_IA64_ASM 1)
set(DISABLE_WATERWATER_NLIST 1)
# Microsoft HPC SDK is automatically added to the system path
# Argonne National Labs MPICH2 sets a registry key that we can use.
+TRY_COMPILE(MPI_FOUND ${CMAKE_BINARY_DIR}
+ "${CMAKE_SOURCE_DIR}/cmake/TestMPI.c"
+ COMPILE_DEFINITIONS )
+
+if(MPI_FOUND)
+ return()
+endif()
+
set(_MPI_PACKAGE_DIR
mpi
mpich
--- /dev/null
+#include <mpi.h>
+
+int main(int argc, char **argv)
+{
+ MPI_Init(&argc,&argv);
+}
--- /dev/null
+# derived from http://cmake.org/Wiki/CmakeBlueGene
+
+# the name of the target operating system
+set(CMAKE_SYSTEM_NAME BlueGeneL CACHE STRING "Cross-compiling for BlueGene/L")
+
+# adjust to suit your machine's versions
+# /bgl/BlueLight/V1R3M2_140_2007-070424/ppc/bglsys
+set(BLRTS_PATH /bgl/BlueLight/V1R3M4_300_2008-080728/ppc/bglsys CACHE STRING "Path to the BlueGene/L system libraries and includes")
+
+# set the compiler
+set(CMAKE_C_COMPILER /opt/ibmcmp/vac/bg/8.0/bin/blrts_xlc)
+set(CMAKE_C_FLAGS "-O3 -qbgl -qarch=auto -qtune=auto -qnoautoconfig -qfloat=norngchk -qhot")
+set(CMAKE_EXE_LINKER_FLAGS "-L${BLRTS_PATH}/lib")
+set(CMAKE_CXX_COMPILER /opt/ibmcmp/vacpp/bg/8.0/bin/blrts_xlC)
+
+set(MPI_LIBRARY mpich.rts CACHE STRING "MPI library for BlueGene" FORCE)
+set(MPI_EXTRA_LIBRARY msglayer.rts devices.rts rts.rts devices.rts CACHE STRING "Extra MPI libraries for BlueGene" FORCE)
+set(MPI_INCLUDE_PATH ${BLRTS_PATH}/include CACHE STRING "MPI include path for BlueGene" FORCE)
+
+# This adds directories that find commands should specifically ignore for cross compiles.
+# Most of these directories are the include and lib directories for the frontend on BG/P systems.
+# Not ignoring these can cause things like FindX11 to find a frontend PPC version mistakenly.
+# We use this on BG instead of re-rooting because backend libraries are typically strewn about
+# the filesystem, and we can't re-root ALL backend libraries to a single place.
+
+set(CMAKE_SYSTEM_IGNORE_PATH
+ /lib /lib64 /include
+ /usr/lib /usr/lib64 /usr/include
+ /usr/local/lib /usr/local/lib64 /usr/local/include
+ /usr/X11/lib /usr/X11/lib64 /usr/X11/include
+ /usr/lib/X11 /usr/lib64/X11 /usr/include/X11
+ /usr/X11R6/lib /usr/X11R6/lib64 /usr/X11R6/include
+ /usr/X11R7/lib /usr/X11R7/lib64 /usr/X11R7/include
+)
+
+# set the search path for the environment coming with the compiler
+# and a directory where you can install your own compiled software
+set(CMAKE_FIND_ROOT_PATH
+ /bgl/BlueLight/ppcfloor/
+ ${BLRTS_PATH}
+ /opt/ibmcmp/xlmass/bg
+)
+
+# adjust the default behaviour of the FIND_XXX() commands:
+# search headers and libraries in the target environment, search
+# programs in the host environment
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+set(GMX_ACCELERATION "BlueGene" CACHE STRING "Forcing BlueGene acceleration when using BlueGene toolchain")
#######################################################################
AC_PREREQ(2.50)
-AC_INIT(gromacs, 4.5.1, [gmx-users@gromacs.org])
+
+# The "-dev" suffix is important to keep because it makes it possible to distinguish
+# between a build from an official release and a build from a git release branch on a
+# machine with no git.
+#
+# NOTE: when releasing the "-dev" suffix needs to be stripped off!
+AC_INIT(gromacs, 4.5.2-dev, [gmx-users@gromacs.org])
AC_CONFIG_SRCDIR(src/gmxlib/3dview.c)
AC_CONFIG_AUX_DIR(config)
AC_CANONICAL_HOST
# IBM Power6-specific optimization
AC_ARG_ENABLE(power6,
[AC_HELP_STRING([--enable-power6],
- [Use IBM Pwr6/PPC440/PPC450-specific F77 kernels])],,enable_power6=no)
+ [Use IBM Power6-specific F77 kernels])],,enable_power6=no)
if test "$enable_power6" = "yes"; then
- AC_DEFINE(GMX_POWER6,,[Enable IBM Pwr6/PPC440/PPC450-specific F77 kernels])
+ AC_DEFINE(GMX_POWER6,,[Enable IBM Power6-specific F77 kernels])
fi
AC_ARG_ENABLE(bluegene,
[AC_HELP_STRING([--disable-software-invsqrt],
[No software 1/sqrt (disabled on sgi,ibm,ia64)])],,
[case "${host_cpu}-${host_os}" in
- mips*-irix* | rs6000*-aix* | powerpc*-aix | ia64*-*) enable_software_invsqrt=no ;;
+ mips*-irix* | rs6000*-aix* | powerpc*-aix | powerpc*-none | ia64*-*) enable_software_invsqrt=no ;;
*) enable_software_invsqrt=yes ;;
esac])
if test "$enable_software_invsqrt" = "yes"; then
- AC_DEFINE(GMX_SOFTWARE_INVSQRT,,[Use the GROMACS sGMX_INTERNAL_XDRsqrt(x)])
+ AC_DEFINE(GMX_SOFTWARE_INVSQRT,,[Use the GROMACS software 1/sqrt(x)])
PKG_CFLAGS="$PKG_CFLAGS -DGMX_SOFTWARE_INVSQRT"
fi
AM_CONDITIONAL([GMX_SOFTWARE_INVSQRT],[test "$enable_software_invsqrt" = "yes"])
if test "$enable_bluegene" = "yes"; then
AC_DEFINE(GMX_BLUEGENE,,[Use assembly intrinsics kernels for BlueGene])
+ AC_DEFINE_UNQUOTED(GMX_POWERPC_INVSQRT,,[Use the PowerPC hardware 1/sqrt(x)])
+ PKG_CFLAGS="$PKG_CFLAGS -DGMX_POWERPC_INVSQRT"
fi
if test "$enable_fortran" = "yes"; then
# includes: Nothing to build, just installation
install(DIRECTORY . DESTINATION ${INCL_INSTALL_DIR}/gromacs
+ COMPONENT development
PATTERN "Makefile*" EXCLUDE
PATTERN "CMake*" EXCLUDE
PATTERN "cmake*" EXCLUDE
};
#endif
- const __m128d signmask = _mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
+ const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
const __m128d tabscale = _mm_set1_pd(32.0/M_PI);
const __m128d invtabscale = _mm_set1_pd(M_PI/32.0);
const __m128d one = _mm_set1_pd(1.0);
cswapsign = _mm_shuffle_epi32(cswapsign,_MM_SHUFFLE(1,1,0,0));
minusone = _mm_sub_pd(_mm_setzero_pd(),one);
- ssign = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(sswapsign),minusone ),
- _mm_andnot_pd( _mm_castsi128_pd(sswapsign),one ));
- csign = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(cswapsign),minusone ),
- _mm_andnot_pd( _mm_castsi128_pd(cswapsign),one ));
+ ssign = _mm_or_pd(_mm_and_pd( gmx_mm_castsi128_pd(sswapsign),minusone ),
+ _mm_andnot_pd( gmx_mm_castsi128_pd(sswapsign),one ));
+ csign = _mm_or_pd(_mm_and_pd( gmx_mm_castsi128_pd(cswapsign),minusone ),
+ _mm_andnot_pd( gmx_mm_castsi128_pd(cswapsign),one ));
/* First lookup into table */
#ifdef _MSC_VER
int nrtp, t_restp rtp[],
t_restp *restp, t_hackblock *hb,
int nterpairs, t_hackblock **ntdb, t_hackblock **ctdb,
- int *rn, int *rc, gmx_bool bAllowMissing,
+ gmx_bool bAllowMissing,
gmx_bool bVsites, gmx_bool bVsiteAromatics,
const char *ff, const char *ffdir,
real mHmult,
t_commrec *cr, rvec *x, matrix box, gmx_mtop_t *mtop, const output_env_t oenv,
gmx_bool bVerbose, unsigned long Flags);
-extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot,t_mdatoms *md);
+
+/*! \brief Make a selection of the home atoms for all enforced rotation groups.
+ *
+ * This routine is similar to dd_make_local_pull_groups, but works only with
+ * domain decomposition. It should be called at every domain decomposition.
+ *
+ * \param dd Structure containing domain decomposition data.
+ * \param rot Pointer to all the enforced rotation data.
+ */
+extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot);
+
/*! \brief Calculation of the enforced rotation potential.
*
gmx_large_int_t str_to_large_int_t(const char *str, char **endptr);
+#if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
+#define snprintf _snprintf
+#endif
+
#ifdef __cplusplus
}
#endif
} t_nblists;
/* macros for the cginfo data in forcerec */
-/* The maximum cg size is 255, because we only have space for 8 bits in cginfo,
+/* The maximum cg size in cginfo is 255,
+ * because we only have space for 8 bits in cginfo,
* this cg size entry is actually only read with domain decomposition.
+ * But there is a smaller limit due to the t_excl data structure
+ * which is defined in nblist.h.
*/
-#define MAX_CHARGEGROUP_SIZE 256
#define SET_CGINFO_GID(cgi,gid) (cgi) = (((cgi) & ~65535) | (gid) )
#define GET_CGINFO_GID(cgi) ( (cgi) & 65535)
#define SET_CGINFO_EXCL_INTRA(cgi) (cgi) = ((cgi) | (1<<16))
typedef unsigned long t_excl;
+/* The maximum charge group size, limited by the fact that t_excl
+ * may be as small as 32 bits.
+ */
+#define MAX_CHARGEGROUP_SIZE 32
+
/* The maximum charge group size for CG-CG nblists.
* The excl entry in t_nblist uses blocks of this size.
*/
static gmx_inline void mvmul_ur0(matrix a,const rvec src,rvec dest)
{
dest[ZZ]=a[ZZ][XX]*src[XX]+a[ZZ][YY]*src[YY]+a[ZZ][ZZ]*src[ZZ];
- dest[YY]=a[YY][XX]*src[XX]+a[YY][YY];
+ dest[YY]=a[YY][XX]*src[XX]+a[YY][YY]*src[YY];
dest[XX]=a[XX][XX]*src[XX];
}
# Man pages: Nothing to build, just installation
install(DIRECTORY . DESTINATION ${MAN_INSTALL_DIR}
+ COMPONENT data
PATTERN "Makefile*" EXCLUDE
PATTERN "CMake*" EXCLUDE
PATTERN "cmake*" EXCLUDE
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/GMXRC.csh.cmakein ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.csh @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/GMXRC.zsh.cmakein ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.zsh @ONLY)
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.bash DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.zsh DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.csh DESTINATION ${BIN_INSTALL_DIR})
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.bash DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.zsh DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.csh DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
file(GLOB EXTRA_SCRIPTS completion.*)
-install(FILES ${EXTRA_SCRIPTS} DESTINATION ${BIN_INSTALL_DIR})
+install(FILES ${EXTRA_SCRIPTS} DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
-install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/demux.pl DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/xplor2gmx.pl DESTINATION ${BIN_INSTALL_DIR})
+install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/demux.pl DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/xplor2gmx.pl DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
# Data: Nothing to build, just installation
install(DIRECTORY . DESTINATION ${DATA_INSTALL_DIR}
+ COMPONENT data
PATTERN "Makefile*" EXCLUDE
PATTERN "CMake*" EXCLUDE
PATTERN "cmake*" EXCLUDE
PATTERN "*~" EXCLUDE
)
-install(FILES template/CMakeLists.txt.template DESTINATION ${DATA_INSTALL_DIR} RENAME template/CMakeLists.txt)
+install(FILES template/CMakeLists.txt.template
+ DESTINATION ${DATA_INSTALL_DIR}
+ RENAME template/CMakeLists.txt
+ COMPONENT data)
When set to <b>2</b> pressure coupling and Ewald summation can be used
(it is usually best to use semiisotropic pressure coupling with
the x/y compressibility set to 0, as otherwise the surface area will change).
+Walls interact with the rest of the system through an optional <tt>wall_atomtype</tt>.
Energy groups <tt>wall0</tt> and <tt>wall1</tt> (for <b>nwall=2</b>) are
added automatically to monitor the interaction of energy groups
with each wall.
The <A HREF="#run">center of mass motion removal</A> will be turned
off in the z-direction.</dd>
+<dt><b>wall_atomtype:</b></dt>
+<dd>the atom type name in the force field for each wall.
+By (for example) defining a special wall atom type in the topology with its
+own combination rules, this allows for independent tuning of the interaction
+of each atomtype with the walls.</dd>
<dt><b>wall_type:</b></dt>
<dl>
<dt><b>9-3</b></dt>
When the value is ≤0 (<0 for <b>wall_type=table</b>),
a fatal error is generated when atoms are beyond a wall.
</dd>
-<dt><b>wall_atomtype:</b></dt>
-<dd>the atom type name in the force field for each wall, this allows
-for independent tuning of the interaction of each atomtype with the walls</dd>
<dt><b>wall_density: [nm<sup>-3</sup>/nm<sup>-2</sup>]</b></dt>
<dd>the number density of the atoms for each wall for wall types
<b>9-3</b> and <b>10-4</b>
LYSN LSN
ASPH ASPP
GLUH GLUP
+HEM HEME
HP 0.1 1 1 0.125 0.85 ; H
NY 0.155 1 1.028 0.17063 0.79 ; N
CPT 0.172 0.012 1.554 0.1875 0.72 ; C
- MNH3 0 0 0 0 0 ; vsite (rigid tetrahedrical NH3 group)
- MNH2 0 0 0 0 0 ; vsite
- MCH3 0 0 0 0 0 ; vsite (rigid CH3 group)
\ No newline at end of file
+ MNH3 0 0 0 0 0 ; dummy mass
+ MNH2 0 0 0 0 0 ; dummy mass
+ MCH3 0 0 0 0 0 ; dummy mass
+ MCH3S 0 0 0 0 0 ; dummy mass
HISE HISB
LYS LYSH
LYSN LYS
+HEM HEME
[ atomtypes ]
; type mass charge ptype c6 c12
- OMET 15.999 -0.69 A 2.6169e-3 2.5231e-6
+ OMet 15.999 -0.69 A 2.6169e-3 2.5231e-6
OW 15.999 -0.82 A 2.6170e-3 2.6330e-6
- CMET 15.035 0.29 A 8.8758e-3 17.8426e-6
+ CMet 15.035 0.29 A 8.8758e-3 17.8426e-6
H 1.008 0.4 A 0.0 0.0
HW 1.008 0.41 A 0.0 0.0
#endif
[ atoms ]
; nr type resnr residu atom cgnr charge mass
#ifdef _FF_GROMOS96
-1 CMET 1 MeOH Me1 1 0.176 15.035
-2 OMET 1 MeOH O2 1 -0.574 15.999
+1 CMet 1 MeOH Me1 1 0.176 15.035
+2 OMet 1 MeOH O2 1 -0.574 15.999
3 H 1 MeOH H3 1 0.398 1.008
#else
-1 CMET 1 MeOH Me1 1 0.29 15.035
-2 OMET 1 MeOH O2 1 -0.69 15.999
+1 CMet 1 MeOH Me1 1 0.29 15.035
+2 OMet 1 MeOH O2 1 -0.69 15.999
3 H 1 MeOH H3 1 0.40 1.008
#endif
HISE HISB
LYS LYSH
LYSN LYS
+HEM HEME
HISE HISB
LYS LYSH
LYSN LYS
+HEM HEME
HISE HISB
LYS LYSH
LYSN LYS
+HEM HEME
HISE HISB
LYS LYSH
LYSN LYS
+HEM HEME
/* Use the GROMACS software 1/sqrt(x) */
#cmakedefine GMX_SOFTWARE_INVSQRT
+/* Use the PowerPC hardware 1/sqrt(x) */
+#cmakedefine GMX_POWERPC_INVSQRT
+
/* Compile with dlopen */
#cmakedefine GMX_DLOPEN
file(GLOB FORTRAN_SOURCES nonbonded/nb_kernel_power6/*.[cF])
endif(GMX_POWER6)
+if(GMX_BLUEGENE)
+ file(GLOB GMX_BLUEGENE_C_SRC nonbonded/nb_kernel_bluegene/*.c)
+endif(GMX_BLUEGENE)
+
if(NOT GMX_EXTERNAL_BLAS)
file(GLOB BLAS_SOURCES gmx_blas/*.c)
endif(NOT GMX_EXTERNAL_BLAS)
endif(GMX_ASM_USEASM-NASM)
endif(NOT GMX_OPENMM)
-add_library(gmx ${GMXLIB_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES} ${GMX_SSEKERNEL_C_SRC} ${GMX_SSEKERNEL_ASM_SRC} ${FORTRAN_SOURCES} ${THREAD_MPI_SRC})
+add_library(gmx ${GMXLIB_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES} ${GMX_SSEKERNEL_C_SRC} ${GMX_SSEKERNEL_ASM_SRC} ${FORTRAN_SOURCES} ${GMX_BLUEGENE_C_SRC} ${THREAD_MPI_SRC})
target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_LIB})
-add_dependencies(gmx gmx_version)
+if(USE_VERSION_H)
+ add_dependencies(gmx gmx_version)
+endif()
set_target_properties(gmx PROPERTIES OUTPUT_NAME "gmx${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
-install(TARGETS gmx DESTINATION ${LIB_INSTALL_DIR})
+install(TARGETS gmx DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmx.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmx.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmx.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig RENAME "libgmx${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmx.pc
+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+ RENAME "libgmx${GMX_LIBS_SUFFIX}.pc"
+ COMPONENT development)
int slen;
char buf[128];
- sprintf(buf,":-) %s (-:",s);
+ snprintf(buf,128,":-) %s (-:",s);
slen=strlen(buf);
space(out,(80-slen)/2);
fprintf(out,"%s\n",buf);
fprintf(out,"\n");
- sprintf(buf,"%s",Program());
+    /* Bound the copy by the real size of buf (declared char buf[128]), not a larger hard-coded value */
+    snprintf(buf,sizeof(buf),"%s",Program());
#ifdef GMX_DOUBLE
strcat(buf," (double precision)");
#endif
}
break;
default:
- gmx_incons("Unknown block type");
+ gmx_incons("Unknown block type: this file is corrupted or from the future");
}
}
}
}
+/* Report a problem found in an energy file.
+ *
+ * If the environment variable GMX_ENX_NO_FATAL is set, msg is only issued
+ * as a warning and the function returns; otherwise msg is passed to
+ * gmx_fatal together with a hint about that environment variable.
+ */
+static void enx_warning(const char *msg)
+{
+    if (getenv("GMX_ENX_NO_FATAL") != NULL)
+    {
+        /* msg is data, not a format string: route it through "%s" */
+        gmx_warning("%s",msg);
+    }
+    else
+    {
+        gmx_fatal(FARGS,"%s\n%s",
+                  msg,
+                  "If you want to use the correct frames before the corrupted frame and avoid this fatal error set the env.var. GMX_ENX_NO_FATAL");
+    }
+}
static void edr_strings(XDR *xdr,gmx_bool bRead,int file_version,
int n,gmx_enxnm_t **nms)
int nre_test,gmx_bool *bWrongPrecision,gmx_bool *bOK)
{
int magic=-7777777;
- real r;
+ real first_real_to_check;
int b,i,zero=0,dum=0;
gmx_bool bRead = gmx_fio_getread(ef->fio);
int tempfix_nr=0;
* (which is the case for for instance the block sizes for variable
* number of blocks, where this number is read before).
*/
- r = -2e10;
- if (!gmx_fio_do_real(ef->fio, r))
+ first_real_to_check = -2e10;
+ if (!gmx_fio_do_real(ef->fio, first_real_to_check))
{
return FALSE;
}
- if (r > -1e10)
+ if (first_real_to_check > -1e10)
{
/* Assume we are reading an old format */
*file_version = 1;
- fr->t = r;
+ fr->t = first_real_to_check;
if (!gmx_fio_do_int(ef->fio, dum)) *bOK = FALSE;
fr->step = dum;
}
if (!gmx_fio_do_int(ef->fio, magic)) *bOK = FALSE;
if (magic != -7777777)
{
- gmx_fatal(FARGS,"Energy header magic number mismatch, this is not a GROMACS edr file");
+ enx_warning("Energy header magic number mismatch, this is not a GROMACS edr file");
+ *bOK=FALSE;
+ return FALSE;
}
*file_version = enx_version;
if (!gmx_fio_do_int(ef->fio, *file_version)) *bOK = FALSE;
}
if (!gmx_fio_do_int(ef->fio, fr->nblock)) *bOK = FALSE;
+ if (fr->nblock < 0) *bOK=FALSE;
if (ndisre!=0)
{
if (*file_version >= 4)
- gmx_incons("Distance restraint blocks in old style in new style file");
+ {
+ enx_warning("Distance restraint blocks in old style in new style file");
+ *bOK=FALSE;
+ return FALSE;
+ }
fr->nblock+=1;
}
return *bOK;
}
+ /* we now know what these should be, or we've already bailed out because
+ of wrong precision */
+ if ( *file_version==1 && (fr->t < 0 || fr->t > 1e20 || fr->step < 0 ) )
+ {
+ enx_warning("edr file with negative step number or unreasonable time (and without version number).");
+ *bOK=FALSE;
+ return FALSE;
+ }
+
+
if (*bOK && bRead)
+ {
add_blocks_enxframe(fr, fr->nblock);
+ }
startb=0;
if (ndisre>0)
else
{
if (fr->block[b].nsub != 1)
+ {
gmx_incons("Writing an old version .edr file with too many subblocks");
+ }
if (fr->block[b].sub[0].type != dtreal)
{
gmx_incons("Writing an old version .edr file the wrong subblock type");
gmx_enxnm_t *nms=NULL;
int file_version=-1;
t_enxframe *fr;
- gmx_bool bWrongPrecision,bDum=TRUE;
+ gmx_bool bWrongPrecision,bOK=TRUE;
struct ener_file *ef;
snew(ef,1);
gmx_fio_setprecision(ef->fio,FALSE);
do_enxnms(ef,&nre,&nms);
snew(fr,1);
- do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bDum);
- if(!bDum)
+ do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bOK);
+ if(!bOK)
{
gmx_file("Cannot read energy file header. Corrupt file?");
}
gmx_fio_checktype(ef->fio);
gmx_fio_setprecision(ef->fio,TRUE);
do_enxnms(ef,&nre,&nms);
- do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bDum);
- if(!bDum)
+ do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bOK);
+ if(!bOK)
{
gmx_file("Cannot write energy file header; maybe you are out of quota?");
}
bOK1=gmx_fio_ndo_string(ef->fio, sub->sval, sub->nr);
break;
default:
- gmx_incons("Reading unknown block type");
+ gmx_incons("Reading unknown block data type: this file is corrupted or from the future");
}
bOK = bOK && bOK1;
}
}
else
{
-#if (defined( _WIN32 ) || defined( _WIN64 ) )
- /* windows doesn't do standard C */
-#define snprintf sprintf_s
-#endif
- snprintf(buf, GMX_FIO_BUFLEN, " ; %s %s",
- fio->comment ? fio->comment : "", desc);
+ snprintf(buf, GMX_FIO_BUFLEN, " ; %s %s", fio->comment ? fio->comment : "", desc);
}
return buf;
}
*/
-/* The optimized version of converts2ints is disabled
+/* The optimized version of converts2ints is disabled on BG/P
* because of issues on BG/P reported in bugzilla 429
*/
-/* #if (defined __IBMC__ || defined __IBMCPP__) */
-#if (0)
+#if defined __blrts__
#define convert2ints(x,xi,conv,i1,i2) \
xi = __fpctiwz(x); \
int * inneriter,
double * work)
{
- int nri,nthreads;
- int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
- int jnrA,jnrB;
- int j3A,j3B;
+ int nri,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int jnrA,jnrB;
+ int j3A,j3B;
gmx_gbdata_t *gbdata;
double * gpol;
nri = *p_nri;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
-
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
ix = _mm_set1_pd(shX+pos[ii3+0]);
iy = _mm_set1_pd(shY+pos[ii3+1]);
iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
+
iq = _mm_load1_pd(charge+ii);
iq = _mm_mul_pd(iq,facel);
-
+
isai = _mm_load1_pd(invsqrta+ii);
vctot = _mm_setzero_pd();
j3A = jnrA * 3;
j3B = jnrB * 3;
-
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
dx = _mm_sub_pd(ix,jx);
dy = _mm_sub_pd(iy,jy);
dz = _mm_sub_pd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_pd(rinv,rinv);
-
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
qq = _mm_mul_pd(iq,jq);
vcoul = _mm_mul_pd(qq,rinv);
fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
+ vctot = _mm_add_pd(vctot,vcoul);
/* Polarization interaction */
qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-
- fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
-
- /***********************************/
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
{
jnrA = jjnr[k];
j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
dx = _mm_sub_sd(ix,jx);
dy = _mm_sub_sd(iy,jy);
dz = _mm_sub_sd(iz,jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_sd(rinv,rinv);
-
+
+ /* The reason for zeroing these variables here is to fix bug 585.
+ * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+ * and r1=0, but it should be r1=a[1].
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead
+ * Note that the only variables that get affected are the energies since
+ * the total sum needs to be correct
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-
+
isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
+ qq = _mm_mul_sd(jq,iq);
vcoul = _mm_mul_sd(qq,rinv);
fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
-
- /* Polarization interaction */
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_sd(rsq,rinv);
rtab = _mm_mul_sd(r,gbscale);
-
+
n0 = _mm_cvttpd_epi32(rtab);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
- fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
-
- /***********************************/
+ fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
- ggid = gid[n];
-
- gmx_mm_update_1pot_pd(vctot,vc+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
- }
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+ ggid = gid[n];
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ }
+
*outeriter = nri;
- *inneriter = nj1;
+ *inneriter = nj1;
}
int * inneriter,
double * work)
{
- int nri,ntype,nthreads;
- int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
int offset,nti;
- int jnrA,jnrB;
- int j3A,j3B;
+ int jnrA,jnrB;
+ int j3A,j3B;
int tjA,tjB;
gmx_gbdata_t *gbdata;
double * gpol;
nri = *p_nri;
ntype = *p_ntype;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
-
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
- c6 = _mm_setzero_pd();
- c12 = _mm_setzero_pd();
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
ix = _mm_set1_pd(shX+pos[ii3+0]);
iy = _mm_set1_pd(shY+pos[ii3+1]);
iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
+
iq = _mm_load1_pd(charge+ii);
iq = _mm_mul_pd(iq,facel);
-
+
isai = _mm_load1_pd(invsqrta+ii);
nti = 2*ntype*type[ii];
j3A = jnrA * 3;
j3B = jnrB * 3;
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
dx = _mm_sub_pd(ix,jx);
dy = _mm_sub_pd(iy,jy);
dz = _mm_sub_pd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_pd(rinv,rinv);
-
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
tjB = nti+2*type[jnrB];
-
- GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
isaprod = _mm_mul_pd(isai,isaj);
qq = _mm_mul_pd(iq,jq);
vcoul = _mm_mul_pd(qq,rinv);
fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
+ vctot = _mm_add_pd(vctot,vcoul);
- /* Polarization interaction */
+ /* Polarization interaction */
qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_pd(rsq,rinv);
rtab = _mm_mul_pd(r,gbscale);
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
rinvsix = _mm_mul_pd(rinvsq,rinvsq);
rinvsix = _mm_mul_pd(rinvsix,rinvsq);
vvdw12 = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
- fscal = _mm_sub_pd(_mm_mul_pd(rinvsq,
- _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
- _mm_mul_pd(six,vvdw6))),
- _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
-
- /***********************************/
+ fscal = _mm_sub_pd(_mm_mul_pd(rinvsq,
+ _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+ _mm_mul_pd(six,vvdw6))),
+ _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
jnrA = jjnr[k];
j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
dx = _mm_sub_sd(ix,jx);
dy = _mm_sub_sd(iy,jy);
dz = _mm_sub_sd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_sd(rinv,rinv);
-
+
+ /* The reason for zeroing these variables here is to fix bug 585.
+ * __m128d _mm_add_sd(a,b) should give back r0=a[0]+b[0] and r1=a[1],
+ * but what we actually observed was r1=0.
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+ * Note that the only variables affected are the energies, since
+ * their total sum needs to be correct.
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+ vvdw6 = _mm_setzero_pd();
+ vvdw12 = _mm_setzero_pd();
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
-
- GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
+ qq = _mm_mul_sd(jq,iq);
vcoul = _mm_mul_sd(qq,rinv);
fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
-
- /* Polarization interaction */
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_sd(rsq,rinv);
rtab = _mm_mul_sd(r,gbscale);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
rinvsix = _mm_mul_sd(rinvsq,rinvsq);
rinvsix = _mm_mul_sd(rinvsix,rinvsq);
vvdw6 = _mm_mul_sd(c6,rinvsix);
vvdw12 = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
- vvdwtot = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
-
- fscal = _mm_sub_sd(_mm_mul_sd(rinvsq,
- _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
- _mm_mul_sd(six,vvdw6))),
- _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
-
- /***********************************/
+ vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6)); /* (**) */
+
+ fscal = _mm_sub_sd(_mm_mul_sd(rinvsq,
+ _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+ _mm_mul_sd(six,vvdw6))),
+ _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
- ggid = gid[n];
-
- gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+
}
-
+
*outeriter = nri;
- *inneriter = nj1;
+ *inneriter = nj1;
}
int * inneriter,
double * work)
{
- int nri,ntype,nthreads;
- int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
int offset,nti;
- int jnrA,jnrB;
- int j3A,j3B;
+ int jnrA,jnrB;
+ int j3A,j3B;
int tjA,tjB;
gmx_gbdata_t *gbdata;
double * gpol;
__m128d vcoul,fscal,gbscale,c6,c12;
__m128d rinvsq,r,rtab;
__m128d eps,Y,F,G,H;
- __m128d VV,FF,Fp;
+ __m128d VV,FF,Fp;
__m128d vgb,fijGB,dvdatmp;
__m128d rinvsix,vvdw6,vvdw12,vvdwtmp;
__m128d facel,gbtabscale,dvdaj;
- __m128d fijD,fijR;
- __m128d xmm1,tabscale,eps2;
+ __m128d fijD,fijR;
+ __m128d xmm1,tabscale,eps2;
__m128i n0, nnn;
nri = *p_nri;
ntype = *p_ntype;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
- tabscale = _mm_load1_pd(p_tabscale);
-
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
- c6 = _mm_setzero_pd();
- c12 = _mm_setzero_pd();
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+ tabscale = _mm_load1_pd(p_tabscale);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
ix = _mm_set1_pd(shX+pos[ii3+0]);
iy = _mm_set1_pd(shY+pos[ii3+1]);
iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
+
iq = _mm_load1_pd(charge+ii);
iq = _mm_mul_pd(iq,facel);
-
+
isai = _mm_load1_pd(invsqrta+ii);
-
+
nti = 2*ntype*type[ii];
vctot = _mm_setzero_pd();
j3A = jnrA * 3;
j3B = jnrB * 3;
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
dx = _mm_sub_pd(ix,jx);
dy = _mm_sub_pd(iy,jy);
dz = _mm_sub_pd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_pd(rinv,rinv);
-
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
tjB = nti+2*type[jnrB];
-
- GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
isaprod = _mm_mul_pd(isai,isaj);
qq = _mm_mul_pd(iq,jq);
vcoul = _mm_mul_pd(qq,rinv);
fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
-
- /* Polarization interaction */
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_pd(rsq,rinv);
rtab = _mm_mul_pd(r,gbscale);
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
- /* Calculate VDW table index */
+ /* Calculate VDW table index */
rtab = _mm_mul_pd(r,tabscale);
n0 = _mm_cvttpd_epi32(rtab);
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_pd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
vvdw6 = _mm_mul_pd(c6,VV);
fijD = _mm_mul_pd(c6,FF);
-
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
- F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
- H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
+
+ /* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
-
+
xmm1 = _mm_add_pd(fijD,fijR);
xmm1 = _mm_mul_pd(xmm1,tabscale);
xmm1 = _mm_add_pd(xmm1,fijGB);
xmm1 = _mm_sub_pd(xmm1,fscal);
fscal = _mm_mul_pd(xmm1,neg);
fscal = _mm_mul_pd(fscal,rinv);
-
- /***********************************/
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
{
jnrA = jjnr[k];
j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
dx = _mm_sub_sd(ix,jx);
dy = _mm_sub_sd(iy,jy);
dz = _mm_sub_sd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_sd(rinv,rinv);
-
- /***********************************/
+
+ /* The reason for zeroing these variables here is to fix bug 585.
+ * __m128d _mm_add_sd(a,b) should give back r0=a[0]+b[0] and r1=a[1],
+ * but what we actually observed was r1=0.
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+ * Note that the only variables affected are the energies, since
+ * their total sum needs to be correct.
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+ vvdw6 = _mm_setzero_pd();
+ vvdw12 = _mm_setzero_pd();
+
+ /***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
-
- GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
+ qq = _mm_mul_sd(jq,iq);
vcoul = _mm_mul_sd(qq,rinv);
fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
-
- /* Polarization interaction */
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_sd(rsq,rinv);
rtab = _mm_mul_sd(r,gbscale);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
- /* Calculate VDW table index */
+ /* Calculate VDW table index */
rtab = _mm_mul_sd(r,tabscale);
n0 = _mm_cvttpd_epi32(rtab);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_sd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
vvdw6 = _mm_mul_sd(c6,VV);
fijD = _mm_mul_sd(c6,FF);
-
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
+
+ /* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
fijR = _mm_mul_sd(c12,FF);
vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
- vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
+ vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */
xmm1 = _mm_add_sd(fijD,fijR);
xmm1 = _mm_mul_sd(xmm1,tabscale);
fscal = _mm_mul_sd(xmm1,neg);
fscal = _mm_mul_sd(fscal,rinv);
- /***********************************/
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
- ggid = gid[n];
-
- gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
- }
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+
+ }
+
*outeriter = nri;
- *inneriter = nj1;
+ *inneriter = nj1;
}
pmask1 = prologue_mask[i+1];
emask0 = epilogue_mask[i];
emask1 = epilogue_mask[i+1];
- imask_SSE0 = _mm_load1_pd((double *)(imask+i));
+ imask_SSE0 = _mm_load1_pd((double *)(imask+2*i));
imask_SSE1 = _mm_load1_pd((double *)(imask+2*i+2));
for(j=nj0; j<nj1; j+=UNROLLJ)
#include<math.h>
#include<vec.h>
-
#include <xmmintrin.h>
#include <emmintrin.h>
#include "../nb_kerneltype.h"
-
void nb_kernel400_sse2_double(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ double * shiftvec,
+ double * fshift,
+ int * gid,
+ double * pos,
+ double * faction,
+ double * charge,
+ double * p_facel,
+ double * p_krf,
+ double * p_crf,
+ double * vc,
+ int * type,
+ int * p_ntype,
+ double * vdwparam,
+ double * vvdw,
+ double * p_tabscale,
+ double * VFtab,
+ double * invsqrta,
+ double * dvda,
+ double * p_gbtabscale,
+ double * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ double * work)
{
- int nri,ntype,nthreads,offset;
- int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
- double facel,krf,crf,tabscl,gbtabscl,vct,vgbt;
- double shX,shY,shZ,isai_d,dva;
+ int nri,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int jnrA,jnrB;
+ int j3A,j3B;
gmx_gbdata_t *gbdata;
- float * gpol;
-
- __m128d ix,iy,iz,jx,jy,jz;
- __m128d dx,dy,dz,t1,t2,t3;
- __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
- __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
- __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
- __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
- __m128d fac,tabscale,gbtabscale;
- __m128i n0,nnn;
+ double * gpol;
+
+ __m128d iq,qq,jq,isai;
+ __m128d ix,iy,iz;
+ __m128d jx,jy,jz;
+ __m128d dx,dy,dz;
+ __m128d vctot,vgbtot,dvdasum,gbfactor;
+ __m128d fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128d rinv,isaj,isaprod;
+ __m128d vcoul,fscal,gbscale;
+ __m128d rinvsq,r,rtab;
+ __m128d eps,Y,F,G,H;
+ __m128d vgb,fijGB,dvdatmp;
+ __m128d facel,gbtabscale,dvdaj;
+ __m128i n0, nnn;
- const __m128d neg = {-1.0,-1.0};
- const __m128d zero = {0.0,0.0};
- const __m128d half = {0.5,0.5};
- const __m128d two = {2.0,2.0};
- const __m128d three = {3.0,3.0};
+ const __m128d neg = _mm_set1_pd(-1.0);
+ const __m128d zero = _mm_set1_pd(0.0);
+ const __m128d minushalf = _mm_set1_pd(-0.5);
+ const __m128d two = _mm_set1_pd(2.0);
gbdata = (gmx_gbdata_t *)work;
gpol = gbdata->gpol;
-
+
nri = *p_nri;
- ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));
- krf = *p_krf;
- crf = *p_crf;
- tabscl = *p_tabscale;
- gbtabscl = *p_gbtabscale;
- nj1 = 0;
-
- /* Splat variables */
- fac = _mm_load1_pd(&facel);
- tabscale = _mm_load1_pd(&tabscl);
- gbtabscale = _mm_load1_pd(&gbtabscl);
-
- /* Keep compiler happy */
- dvdatmp = _mm_setzero_pd();
- vgb = _mm_setzero_pd();
- dvdaj = _mm_setzero_pd();
- isaj = _mm_setzero_pd();
- vcoul = _mm_setzero_pd();
- t1 = _mm_setzero_pd();
- t2 = _mm_setzero_pd();
- t3 = _mm_setzero_pd();
-
- jnr1=jnr2=0;
- j13=j23=0;
+
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
-
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- offset = (nj1-nj0)%2;
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
- ii = iinr[n];
- ii3 = ii*3;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shX+pos[ii3+1]);
- iz = _mm_set1_pd(shX+pos[ii3+2]);
- q = _mm_set1_pd(charge[ii]);
-
- iq = _mm_mul_pd(fac,q);
- isai_d = invsqrta[ii];
- isai = _mm_load1_pd(&isai_d);
-
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- vctot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
-
- for(k=nj0;k<nj1-offset; k+=2)
+ ix = _mm_set1_pd(shX+pos[ii3+0]);
+ iy = _mm_set1_pd(shY+pos[ii3+1]);
+ iz = _mm_set1_pd(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_pd(charge+ii);
+ iq = _mm_mul_pd(iq,facel);
+
+ isai = _mm_load1_pd(invsqrta+ii);
+
+ vctot = _mm_setzero_pd();
+ vgbtot = _mm_setzero_pd();
+ dvdasum = _mm_setzero_pd();
+ fix = _mm_setzero_pd();
+ fiy = _mm_setzero_pd();
+ fiz = _mm_setzero_pd();
+
+ for(k=nj0;k<nj1-1; k+=2)
{
- jnr1 = jjnr[k];
- jnr2 = jjnr[k+1];
-
- j13 = jnr1 * 3;
- j23 = jnr2 * 3;
-
- /* Load coordinates */
- xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */
- xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */
-
- xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */
- xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */
-
- /* transpose */
- jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
-
- /* distances */
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- /* Load invsqrta */
- isaj = _mm_loadl_pd(isaj,invsqrta+jnr1);
- isaj = _mm_loadh_pd(isaj,invsqrta+jnr2);
- isaprod = _mm_mul_pd(isai,isaj);
-
- /* Load charges */
- q = _mm_loadl_pd(q,charge+jnr1);
- q = _mm_loadh_pd(q,charge+jnr2);
- qq = _mm_mul_pd(iq,q);
-
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- qq = _mm_mul_pd(isaprod,qq);
- qq = _mm_mul_pd(qq,neg);
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
- /* Load dvdaj */
- dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
- dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);
-
- r = _mm_mul_pd(rsq11,rinv);
- rt = _mm_mul_pd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
- eps2 = _mm_mul_pd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- H = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,H);
- vgb = _mm_mul_pd(qq,VV);
- fijC = _mm_mul_pd(qq,FF);
- fijC = _mm_mul_pd(fijC,gbscale);
-
- dvdatmp = _mm_mul_pd(fijC,r);
- dvdatmp = _mm_add_pd(vgb,dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp,neg);
- dvdatmp = _mm_mul_pd(dvdatmp,half);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_pd(dvdatmp,isaj);
- xmm1 = _mm_mul_pd(xmm1,isaj);
- dvdaj = _mm_add_pd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
- _mm_storeh_pd(dvda+jnr2,dvdaj);
-
- vctot = _mm_add_pd(vctot,vcoul);
- vgbtot = _mm_add_pd(vgbtot,vgb);
-
- fscal = _mm_sub_pd(fijC,fscal);
- fscal = _mm_mul_pd(fscal,neg);
- fscal = _mm_mul_pd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_pd(fscal,dx);
- t2 = _mm_mul_pd(fscal,dy);
- t3 = _mm_mul_pd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
- xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-
- xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
- xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-
- /* transpose */
- xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
- xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
- xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_pd(xmm5,t1);
- xmm6 = _mm_sub_pd(xmm6,t2);
- xmm7 = _mm_sub_pd(xmm7,t3);
-
- xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
- xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* store fx and fy */
- _mm_storeu_pd(faction+j13,xmm1);
- _mm_storeu_pd(faction+j23,xmm2);
-
- /* .. then fz */
- _mm_storel_pd(faction+j13+2,xmm7);
- _mm_storeh_pd(faction+j23+2,xmm7);
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+
+ j3A = jnrA * 3;
+ j3B = jnrB * 3;
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ dx = _mm_sub_pd(ix,jx);
+ dy = _mm_sub_pd(iy,jy);
+ dz = _mm_sub_pd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_pd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+
+ isaprod = _mm_mul_pd(isai,isaj);
+ qq = _mm_mul_pd(iq,jq);
+ vcoul = _mm_mul_pd(qq,rinv);
+ fscal = _mm_mul_pd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+ gbscale = _mm_mul_pd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_pd(rsq,rinv);
+ rtab = _mm_mul_pd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
/* In double precision, offset can only be either 0 or 1 */
- if(offset!=0)
+ if(k<nj1)
{
- jnr1 = jjnr[k];
- j13 = jnr1*3;
-
- jx = _mm_load_sd(pos+j13);
- jy = _mm_load_sd(pos+j13+1);
- jz = _mm_load_sd(pos+j13+2);
-
- isaj = _mm_load_sd(invsqrta+jnr1);
- isaprod = _mm_mul_sd(isai,isaj);
- dvdaj = _mm_load_sd(dvda+jnr1);
- q = _mm_load_sd(charge+jnr1);
- qq = _mm_mul_sd(iq,q);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- qq = _mm_mul_sd(isaprod,qq);
- qq = _mm_mul_sd(qq,neg);
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
- r = _mm_mul_sd(rsq11,rinv);
- rt = _mm_mul_sd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
- eps2 = _mm_mul_sd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2);
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2);
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- H = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,H);
- vgb = _mm_mul_sd(qq,VV);
- fijC = _mm_mul_sd(qq,FF);
- fijC = _mm_mul_sd(fijC,gbscale);
-
- dvdatmp = _mm_mul_sd(fijC,r);
- dvdatmp = _mm_add_sd(vgb,dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp,neg);
- dvdatmp = _mm_mul_sd(dvdatmp,half);
- dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_sd(dvdatmp,isaj);
- xmm1 = _mm_mul_sd(xmm1,isaj);
- dvdaj = _mm_add_sd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
-
- vctot = _mm_add_sd(vctot,vcoul);
- vgbtot = _mm_add_sd(vgbtot,vgb);
-
- fscal = _mm_sub_sd(fijC,fscal);
- fscal = _mm_mul_sd(fscal,neg);
- fscal = _mm_mul_sd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_sd(fscal,dx);
- t2 = _mm_mul_sd(fscal,dy);
- t3 = _mm_mul_sd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_sd(fix,t1);
- fiy = _mm_add_sd(fiy,t2);
- fiz = _mm_add_sd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm5 = _mm_load_sd(faction+j13); /* fx */
- xmm6 = _mm_load_sd(faction+j13+1); /* fy */
- xmm7 = _mm_load_sd(faction+j13+2); /* fz */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_sd(xmm5,t1);
- xmm6 = _mm_sub_sd(xmm6,t2);
- xmm7 = _mm_sub_sd(xmm7,t3);
-
- /* store forces */
- _mm_store_sd(faction+j13,xmm5);
- _mm_store_sd(faction+j13+1,xmm6);
- _mm_store_sd(faction+j13+2,xmm7);
+ jnrA = jjnr[k];
+ j3A = jnrA * 3;
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ dx = _mm_sub_sd(ix,jx);
+ dy = _mm_sub_sd(iy,jy);
+ dz = _mm_sub_sd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_sd(rinv,rinv);
+
+ /* The reason for zeroing these variables here is to fix bug 585.
+ * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0]
+ * and r1=0, whereas it should give r1=a[1].
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+ * Note that the only variables affected are the energies, since
+ * the total sum needs to be correct.
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+
+ isaprod = _mm_mul_sd(isai,isaj);
+ qq = _mm_mul_sd(jq,iq);
+ vcoul = _mm_mul_sd(qq,rinv);
+ fscal = _mm_mul_sd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
+ qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+ gbscale = _mm_mul_sd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_sd(rsq,rinv);
+ rtab = _mm_mul_sd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+
+ fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- /* fix/fiy/fiz now contain four partial terms, that all should be
- * added to the i particle forces
- */
- t1 = _mm_unpacklo_pd(t1,fix);
- t2 = _mm_unpacklo_pd(t2,fiy);
- t3 = _mm_unpacklo_pd(t3,fiz);
-
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
- fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
- fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-
- /* Load i forces from memory */
- xmm1 = _mm_load_sd(faction+ii3);
- xmm2 = _mm_load_sd(faction+ii3+1);
- xmm3 = _mm_load_sd(faction+ii3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i forces to memory */
- _mm_store_sd(faction+ii3,fix);
- _mm_store_sd(faction+ii3+1,fiy);
- _mm_store_sd(faction+ii3+2,fiz);
-
- /* now do dvda */
- dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
- _mm_storeh_pd(&dva,dvdasum);
- dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-
- ggid = gid[n];
-
- /* Coulomb potential */
- vcoul = _mm_unpacklo_pd(vcoul,vctot);
- vctot = _mm_add_pd(vctot,vcoul);
- _mm_storeh_pd(&vct,vctot);
- Vc[ggid] = Vc[ggid] + vct;
-
- /* GB potential */
- vgb = _mm_unpacklo_pd(vgb,vgbtot);
- vgbtot = _mm_add_pd(vgbtot,vgb);
- _mm_storeh_pd(&vgbt,vgbtot);
- gpol[ggid] = gpol[ggid] + vgbt;
- }
-
- *outeriter = nri;
- *inneriter = nj1;
-}
-
-
-/*
- * Gromacs nonbonded kernel nb_kernel400nf
- * Coulomb interaction: Generalized-Born
- * VdW interaction: Not calculated
- * water optimization: No
- * Calculate forces: no
- */
-void nb_kernel400nf_sse2_double(
- int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
-{
- int nri,ntype,nthreads;
- double facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
- int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
- double shX,shY,shZ;
- double iq;
- double qq,vcoul,vctot;
- double r,rt,eps,eps2;
- int n0,nnn;
- double Y,F,Geps,Heps2,Fp,VV;
- double isai,isaj,isaprod,gbscale;
- double ix1,iy1,iz1;
- double jx1,jy1,jz1;
- double dx11,dy11,dz11,rsq11,rinv11;
- const int fractshift = 12;
- const int fractmask = 8388607;
- const int expshift = 23;
- const int expmask = 2139095040;
- const int explsb = 8388608;
- double lu;
- int iexp,addr;
- union { unsigned int bval; double fval; } bitpattern,result;
-
- nri = *p_nri;
- ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = *p_facel;
- krf = *p_krf;
- crf = *p_crf;
- tabscale = *p_tabscale;
- gbtabscale = *p_gbtabscale;
- nj1 = 0;
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
- for(n=0; (n<nri); n++)
- {
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
- ix1 = shX + pos[ii3+0];
- iy1 = shY + pos[ii3+1];
- iz1 = shZ + pos[ii3+2];
- iq = facel*charge[ii];
- isai = invsqrta[ii];
- vctot = 0;
-
- for(k=nj0; (k<nj1); k++)
- {
- jnr = jjnr[k];
- j3 = 3*jnr;
- jx1 = pos[j3+0];
- jy1 = pos[j3+1];
- jz1 = pos[j3+2];
- dx11 = ix1 - jx1;
- dy11 = iy1 - jy1;
- dz11 = iz1 - jz1;
- rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
- bitpattern.fval = rsq11;
- iexp = (((bitpattern.bval)&expmask)>>expshift);
- addr = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
- result.bval = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
- lu = result.fval;
- rinv11 = (0.5*lu*(3.0-((rsq11*lu)*lu)));
- isaj = invsqrta[jnr];
- isaprod = isai*isaj;
- qq = iq*charge[jnr];
- vcoul = qq*rinv11;
- qq = isaprod*(-qq);
- gbscale = isaprod*gbtabscale;
- r = rsq11*rinv11;
- rt = r*gbscale;
- n0 = rt;
- eps = rt-n0;
- eps2 = eps*eps;
- nnn = 4*n0;
- Y = GBtab[nnn];
- F = GBtab[nnn+1];
- Geps = eps*GBtab[nnn+2];
- Heps2 = eps2*GBtab[nnn+3];
- Fp = F+Geps+Heps2;
- VV = Y+eps*Fp;
- vgb = qq*VV;
- vctot = vctot + vcoul;
- }
-
- ggid = gid[n];
- Vc[ggid] = Vc[ggid] + vctot;
- }
+ ggid = gid[n];
- *outeriter = nri;
- *inneriter = nj1;
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ }
+
+ *outeriter = nri;
+ *inneriter = nj1;
}
-
-
#include "../nb_kerneltype.h"
+
void nb_kernel410_sse2_double(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ double * shiftvec,
+ double * fshift,
+ int * gid,
+ double * pos,
+ double * faction,
+ double * charge,
+ double * p_facel,
+ double * p_krf,
+ double * p_crf,
+ double * vc,
+ int * type,
+ int * p_ntype,
+ double * vdwparam,
+ double * vvdw,
+ double * p_tabscale,
+ double * VFtab,
+ double * invsqrta,
+ double * dvda,
+ double * p_gbtabscale,
+ double * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ double * work)
{
- int nri,ntype,nthreads,offset,tj,tj2,nti;
- int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
- double facel,krf,crf,tabscl,gbtabscl,vct,vdwt,nt1,nt2;
- double shX,shY,shZ,isai_d,dva,vgbt;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int offset,nti;
+ int jnrA,jnrB;
+ int j3A,j3B;
+ int tjA,tjB;
gmx_gbdata_t *gbdata;
- float * gpol;
-
- __m128d ix,iy,iz,jx,jy,jz;
- __m128d dx,dy,dz,t1,t2,t3;
- __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
- __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
- __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
- __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
- __m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,rinvsq,rinvsix;
- __m128d fac,tabscale,gbtabscale;
- __m128i n0,nnn;
+ double * gpol;
+
+ __m128d iq,qq,jq,isai;
+ __m128d ix,iy,iz;
+ __m128d jx,jy,jz;
+ __m128d dx,dy,dz;
+ __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+ __m128d fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128d rinv,isaj,isaprod;
+ __m128d vcoul,fscal,gbscale,c6,c12;
+ __m128d rinvsq,r,rtab;
+ __m128d eps,Y,F,G,H;
+ __m128d vgb,fijGB,dvdatmp;
+ __m128d rinvsix,vvdw6,vvdw12;
+ __m128d facel,gbtabscale,dvdaj;
+ __m128i n0, nnn;
- const __m128d neg = {-1.0,-1.0};
- const __m128d zero = {0.0,0.0};
- const __m128d half = {0.5,0.5};
- const __m128d two = {2.0,2.0};
- const __m128d three = {3.0,3.0};
- const __m128d six = {6.0,6.0};
- const __m128d twelwe = {12.0,12.0};
+ const __m128d neg = _mm_set1_pd(-1.0);
+ const __m128d zero = _mm_set1_pd(0.0);
+ const __m128d minushalf = _mm_set1_pd(-0.5);
+ const __m128d two = _mm_set1_pd(2.0);
+ const __m128d six = _mm_set1_pd(6.0);
+ const __m128d twelve = _mm_set1_pd(12.0);
gbdata = (gmx_gbdata_t *)work;
gpol = gbdata->gpol;
-
+
nri = *p_nri;
ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));
- krf = *p_krf;
- crf = *p_crf;
- tabscl = *p_tabscale;
- gbtabscl = *p_gbtabscale;
- nj1 = 0;
-
- /* Splat variables */
- fac = _mm_load1_pd(&facel);
- tabscale = _mm_load1_pd(&tabscl);
- gbtabscale = _mm_load1_pd(&gbtabscl);
-
- /* Keep compiler happy */
- Vvdwtmp = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
- dvdatmp = _mm_setzero_pd();
- dvdaj = _mm_setzero_pd();
- isaj = _mm_setzero_pd();
- vcoul = _mm_setzero_pd();
- vgb = _mm_setzero_pd();
- t1 = _mm_setzero_pd();
- t2 = _mm_setzero_pd();
- t3 = _mm_setzero_pd();
- xmm1 = _mm_setzero_pd();
- xmm2 = _mm_setzero_pd();
- xmm3 = _mm_setzero_pd();
- xmm4 = _mm_setzero_pd();
- jnr1 = jnr2 = 0;
- j13 = j23 = 0;
+
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
-
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- offset = (nj1-nj0)%2;
-
- ii = iinr[n];
- ii3 = ii*3;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shX+pos[ii3+1]);
- iz = _mm_set1_pd(shX+pos[ii3+2]);
- q = _mm_set1_pd(charge[ii]);
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
- iq = _mm_mul_pd(fac,q);
- isai_d = invsqrta[ii];
- isai = _mm_load1_pd(&isai_d);
-
- nti = 2*ntype*type[ii];
-
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- vctot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
+ ix = _mm_set1_pd(shX+pos[ii3+0]);
+ iy = _mm_set1_pd(shY+pos[ii3+1]);
+ iz = _mm_set1_pd(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_pd(charge+ii);
+ iq = _mm_mul_pd(iq,facel);
+
+ isai = _mm_load1_pd(invsqrta+ii);
+
+ nti = 2*ntype*type[ii];
- for(k=nj0;k<nj1-offset; k+=2)
+ vctot = _mm_setzero_pd();
+ vvdwtot = _mm_setzero_pd();
+ vgbtot = _mm_setzero_pd();
+ dvdasum = _mm_setzero_pd();
+ fix = _mm_setzero_pd();
+ fiy = _mm_setzero_pd();
+ fiz = _mm_setzero_pd();
+
+ for(k=nj0;k<nj1-1; k+=2)
{
- jnr1 = jjnr[k];
- jnr2 = jjnr[k+1];
-
- j13 = jnr1 * 3;
- j23 = jnr2 * 3;
-
- /* Load coordinates */
- xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */
- xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */
-
- xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */
- xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */
-
- /* transpose */
- jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
-
- /* distances */
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- /* Load invsqrta */
- isaj = _mm_loadl_pd(isaj,invsqrta+jnr1);
- isaj = _mm_loadh_pd(isaj,invsqrta+jnr2);
- isaprod = _mm_mul_pd(isai,isaj);
-
- /* Load charges */
- q = _mm_loadl_pd(q,charge+jnr1);
- q = _mm_loadh_pd(q,charge+jnr2);
- qq = _mm_mul_pd(iq,q);
-
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- qq = _mm_mul_pd(isaprod,qq);
- qq = _mm_mul_pd(qq,neg);
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
- tj2 = nti+2*type[jnr2];
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
- xmm1 = _mm_loadu_pd(vdwparam+tj);
- xmm2 = _mm_loadu_pd(vdwparam+tj2);
- c6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-
- rinvsq = _mm_mul_pd(rinv,rinv);
-
- /* Load dvdaj */
- dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
- dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);
-
- r = _mm_mul_pd(rsq11,rinv);
- rt = _mm_mul_pd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
- eps2 = _mm_mul_pd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- H = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,H);
- vgb = _mm_mul_pd(qq,VV);
- fijC = _mm_mul_pd(qq,FF);
- fijC = _mm_mul_pd(fijC,gbscale);
-
- dvdatmp = _mm_mul_pd(fijC,r);
- dvdatmp = _mm_add_pd(vgb,dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp,neg);
- dvdatmp = _mm_mul_pd(dvdatmp,half);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_pd(dvdatmp,isaj);
- xmm1 = _mm_mul_pd(xmm1,isaj);
- dvdaj = _mm_add_pd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
- _mm_storeh_pd(dvda+jnr2,dvdaj);
-
- vctot = _mm_add_pd(vctot,vcoul);
- vgbtot = _mm_add_pd(vgbtot,vgb);
-
- /* VdW interaction */
- rinvsix = _mm_mul_pd(rinvsq,rinvsq);
- rinvsix = _mm_mul_pd(rinvsix,rinvsq);
-
- Vvdw6 = _mm_mul_pd(c6,rinvsix);
- Vvdw12 = _mm_mul_pd(c12,rinvsix);
- Vvdw12 = _mm_mul_pd(Vvdw12,rinvsix);
- Vvdwtmp = _mm_sub_pd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-
- xmm1 = _mm_mul_pd(twelwe,Vvdw12);
- xmm2 = _mm_mul_pd(six,Vvdw6);
- xmm1 = _mm_sub_pd(xmm1,xmm2);
- xmm1 = _mm_mul_pd(xmm1,rinvsq);
-
- /* Scalar force */
- fscal = _mm_sub_pd(fijC,fscal);
- fscal = _mm_mul_pd(fscal,rinv);
- fscal = _mm_sub_pd(xmm1,fscal);
-
- /* calculate partial force terms */
- t1 = _mm_mul_pd(fscal,dx);
- t2 = _mm_mul_pd(fscal,dy);
- t3 = _mm_mul_pd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
- xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-
- xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
- xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-
- /* transpose */
- xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
- xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
- xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_pd(xmm5,t1);
- xmm6 = _mm_sub_pd(xmm6,t2);
- xmm7 = _mm_sub_pd(xmm7,t3);
-
- xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
- xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* store fx and fy */
- _mm_storeu_pd(faction+j13,xmm1);
- _mm_storeu_pd(faction+j23,xmm2);
-
- /* .. then fz */
- _mm_storel_pd(faction+j13+2,xmm7);
- _mm_storeh_pd(faction+j23+2,xmm7);
+ j3A = jnrA * 3;
+ j3B = jnrB * 3;
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ dx = _mm_sub_pd(ix,jx);
+ dy = _mm_sub_pd(iy,jy);
+ dz = _mm_sub_pd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_pd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+ tjB = nti+2*type[jnrB];
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ isaprod = _mm_mul_pd(isai,isaj);
+ qq = _mm_mul_pd(iq,jq);
+ vcoul = _mm_mul_pd(qq,rinv);
+ fscal = _mm_mul_pd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+ gbscale = _mm_mul_pd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_pd(rsq,rinv);
+ rtab = _mm_mul_pd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ rinvsix = _mm_mul_pd(rinvsq,rinvsq);
+ rinvsix = _mm_mul_pd(rinvsix,rinvsq);
+
+ vvdw6 = _mm_mul_pd(c6,rinvsix);
+ vvdw12 = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
+ vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
+
+ fscal = _mm_sub_pd(_mm_mul_pd(rinvsq,
+ _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+ _mm_mul_pd(six,vvdw6))),
+ _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
/* In double precision, offset can only be either 0 or 1 */
- if(offset!=0)
+ if(k<nj1)
{
- jnr1 = jjnr[k];
- j13 = jnr1*3;
-
- jx = _mm_load_sd(pos+j13);
- jy = _mm_load_sd(pos+j13+1);
- jz = _mm_load_sd(pos+j13+2);
-
- isaj = _mm_load_sd(invsqrta+jnr1);
- isaprod = _mm_mul_sd(isai,isaj);
- dvdaj = _mm_load_sd(dvda+jnr1);
- q = _mm_load_sd(charge+jnr1);
- qq = _mm_mul_sd(iq,q);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- qq = _mm_mul_sd(isaprod,qq);
- qq = _mm_mul_sd(qq,neg);
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
-
- c6 = _mm_load_sd(vdwparam+tj);
- c12 = _mm_load_sd(vdwparam+tj+1);
-
- rinvsq = _mm_mul_sd(rinv,rinv);
-
- r = _mm_mul_sd(rsq11,rinv);
- rt = _mm_mul_sd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
- eps2 = _mm_mul_sd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2);
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2);
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- H = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,H);
- vgb = _mm_mul_sd(qq,VV);
- fijC = _mm_mul_sd(qq,FF);
- fijC = _mm_mul_sd(fijC,gbscale);
-
- dvdatmp = _mm_mul_sd(fijC,r);
- dvdatmp = _mm_add_sd(vgb,dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp,neg);
- dvdatmp = _mm_mul_sd(dvdatmp,half);
- dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_sd(dvdatmp,isaj);
- xmm1 = _mm_mul_sd(xmm1,isaj);
- dvdaj = _mm_add_sd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
-
- vctot = _mm_add_sd(vctot,vcoul);
- vgbtot = _mm_add_sd(vgbtot,vgb);
-
- /* VdW interaction */
- rinvsix = _mm_mul_sd(rinvsq,rinvsq);
- rinvsix = _mm_mul_sd(rinvsix,rinvsq);
-
- Vvdw6 = _mm_mul_sd(c6,rinvsix);
- Vvdw12 = _mm_mul_sd(c12,rinvsix);
- Vvdw12 = _mm_mul_sd(Vvdw12,rinvsix);
- Vvdwtmp = _mm_sub_sd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-
- xmm1 = _mm_mul_sd(twelwe,Vvdw12);
- xmm2 = _mm_mul_sd(six,Vvdw6);
- xmm1 = _mm_sub_sd(xmm1,xmm2);
- xmm1 = _mm_mul_sd(xmm1,rinvsq);
-
- /* Scalar force */
- fscal = _mm_sub_sd(fijC,fscal);
- fscal = _mm_mul_sd(fscal,rinv);
- fscal = _mm_sub_sd(xmm1,fscal);
-
- /* calculate partial force terms */
- t1 = _mm_mul_sd(fscal,dx);
- t2 = _mm_mul_sd(fscal,dy);
- t3 = _mm_mul_sd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_sd(fix,t1);
- fiy = _mm_add_sd(fiy,t2);
- fiz = _mm_add_sd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm5 = _mm_load_sd(faction+j13); /* fx */
- xmm6 = _mm_load_sd(faction+j13+1); /* fy */
- xmm7 = _mm_load_sd(faction+j13+2); /* fz */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_sd(xmm5,t1);
- xmm6 = _mm_sub_sd(xmm6,t2);
- xmm7 = _mm_sub_sd(xmm7,t3);
-
- /* store forces */
- _mm_store_sd(faction+j13,xmm5);
- _mm_store_sd(faction+j13+1,xmm6);
- _mm_store_sd(faction+j13+2,xmm7);
+ jnrA = jjnr[k];
+
+ j3A = jnrA * 3;
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ dx = _mm_sub_sd(ix,jx);
+ dy = _mm_sub_sd(iy,jy);
+ dz = _mm_sub_sd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_sd(rinv,rinv);
+
+ /* The reason for zeroing these variables here is to fix bug 585.
+ * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+ * and r1=0, but it should be r1=a[1].
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead
+ * Note that the only variables that get affected are the energies since
+ * the total sum needs to be correct
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+ vvdw6 = _mm_setzero_pd();
+ vvdw12 = _mm_setzero_pd();
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+
+ isaprod = _mm_mul_sd(isai,isaj);
+ qq = _mm_mul_sd(jq,iq);
+ vcoul = _mm_mul_sd(qq,rinv);
+ fscal = _mm_mul_sd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
+ qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+ gbscale = _mm_mul_sd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_sd(rsq,rinv);
+ rtab = _mm_mul_sd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+
+ rinvsix = _mm_mul_sd(rinvsq,rinvsq);
+ rinvsix = _mm_mul_sd(rinvsix,rinvsq);
+
+ vvdw6 = _mm_mul_sd(c6,rinvsix);
+ vvdw12 = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
+ vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6)); /* (**) */
+
+ fscal = _mm_sub_sd(_mm_mul_sd(rinvsq,
+ _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+ _mm_mul_sd(six,vvdw6))),
+ _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- /* fix/fiy/fiz now contain four partial terms, that all should be
- * added to the i particle forces
- */
- t1 = _mm_unpacklo_pd(t1,fix);
- t2 = _mm_unpacklo_pd(t2,fiy);
- t3 = _mm_unpacklo_pd(t3,fiz);
-
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
- fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
- fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-
- /* Load i forces from memory */
- xmm1 = _mm_load_sd(faction+ii3);
- xmm2 = _mm_load_sd(faction+ii3+1);
- xmm3 = _mm_load_sd(faction+ii3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i forces to memory */
- _mm_store_sd(faction+ii3,fix);
- _mm_store_sd(faction+ii3+1,fiy);
- _mm_store_sd(faction+ii3+2,fiz);
-
- /* now do dvda */
- dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
- _mm_storeh_pd(&dva,dvdasum);
- dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-
- ggid = gid[n];
-
- /* Coulomb potential */
- vcoul = _mm_unpacklo_pd(vcoul,vctot);
- vctot = _mm_add_pd(vctot,vcoul);
- _mm_storeh_pd(&vct,vctot);
- Vc[ggid] = Vc[ggid] + vct;
-
- /* VdW potential */
- Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
- _mm_storeh_pd(&vdwt,Vvdwtot);
- Vvdw[ggid] = Vvdw[ggid] + vdwt;
-
- /* GB potential */
- vgb = _mm_unpacklo_pd(vgb,vgbtot);
- vgbtot = _mm_add_pd(vgbtot,vgb);
- _mm_storeh_pd(&vgbt,vgbtot);
- gpol[ggid] = gpol[ggid] + vgbt;
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+
}
-
+
*outeriter = nri;
- *inneriter = nj1;
+ *inneriter = nj1;
}
-
-
-
-/*
- * Gromacs nonbonded kernel nb_kernel410nf
- * Coulomb interaction: Generalized-Born
- * VdW interaction: Lennard-Jones
- * water optimization: No
- * Calculate forces: no
- */
-void nb_kernel410nf_sse2_double(
- int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
-{
- int nri,ntype,nthreads;
- double facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
- int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
- double shX,shY,shZ;
- double rinvsq;
- double iq;
- double qq,vcoul,vctot;
- int nti;
- int tj;
- double rinvsix;
- double Vvdw6,Vvdwtot;
- double Vvdw12;
- double r,rt,eps,eps2;
- int n0,nnn;
- double Y,F,Geps,Heps2,Fp,VV;
- double isai,isaj,isaprod,gbscale;
- double ix1,iy1,iz1;
- double jx1,jy1,jz1;
- double dx11,dy11,dz11,rsq11,rinv11;
- double c6,c12;
- const int fractshift = 12;
- const int fractmask = 8388607;
- const int expshift = 23;
- const int expmask = 2139095040;
- const int explsb = 8388608;
- double lu;
- int iexp,addr;
- union { unsigned int bval; double fval; } bitpattern,result;
-
- nri = *p_nri;
- ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = *p_facel;
- krf = *p_krf;
- crf = *p_crf;
- tabscale = *p_tabscale;
- gbtabscale = *p_gbtabscale;
- nj1 = 0;
-
- for(n=0; (n<nri); n++)
- {
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
- ix1 = shX + pos[ii3+0];
- iy1 = shY + pos[ii3+1];
- iz1 = shZ + pos[ii3+2];
- iq = facel*charge[ii];
- isai = invsqrta[ii];
- nti = 2*ntype*type[ii];
- vctot = 0;
- Vvdwtot = 0;
-
- for(k=nj0; (k<nj1); k++)
- {
- jnr = jjnr[k];
- j3 = 3*jnr;
- jx1 = pos[j3+0];
- jy1 = pos[j3+1];
- jz1 = pos[j3+2];
- dx11 = ix1 - jx1;
- dy11 = iy1 - jy1;
- dz11 = iz1 - jz1;
- rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
- bitpattern.fval = rsq11;
- iexp = (((bitpattern.bval)&expmask)>>expshift);
- addr = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
- result.bval = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
- lu = result.fval;
- rinv11 = (0.5*lu*(3.0-((rsq11*lu)*lu)));
- isaj = invsqrta[jnr];
- isaprod = isai*isaj;
- qq = iq*charge[jnr];
- vcoul = qq*rinv11;
- qq = isaprod*(-qq);
- gbscale = isaprod*gbtabscale;
- tj = nti+2*type[jnr];
- c6 = vdwparam[tj];
- c12 = vdwparam[tj+1];
- rinvsq = rinv11*rinv11;
- r = rsq11*rinv11;
- rt = r*gbscale;
- n0 = rt;
- eps = rt-n0;
- eps2 = eps*eps;
- nnn = 4*n0;
- Y = GBtab[nnn];
- F = GBtab[nnn+1];
- Geps = eps*GBtab[nnn+2];
- Heps2 = eps2*GBtab[nnn+3];
- Fp = F+Geps+Heps2;
- VV = Y+eps*Fp;
- vgb = qq*VV;
- vctot = vctot + vcoul;
- rinvsix = rinvsq*rinvsq*rinvsq;
- Vvdw6 = c6*rinvsix;
- Vvdw12 = c12*rinvsix*rinvsix;
- Vvdwtot = Vvdwtot+Vvdw12-Vvdw6;
- }
-
- ggid = gid[n];
- Vc[ggid] = Vc[ggid] + vctot;
- Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
- }
-
- *outeriter = nri;
- *inneriter = nj1;
-}
-
-
#include<math.h>
#include<vec.h>
-
#include <xmmintrin.h>
#include <emmintrin.h>
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-
+#include "nb_kernel430_x86_64_sse2.h"
void nb_kernel430_sse2_double(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ double * shiftvec,
+ double * fshift,
+ int * gid,
+ double * pos,
+ double * faction,
+ double * charge,
+ double * p_facel,
+ double * p_krf,
+ double * p_crf,
+ double * vc,
+ int * type,
+ int * p_ntype,
+ double * vdwparam,
+ double * vvdw,
+ double * p_tabscale,
+ double * VFtab,
+ double * invsqrta,
+ double * dvda,
+ double * p_gbtabscale,
+ double * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ double * work)
{
- int nri,ntype,nthreads,offset,tj,tj2,nti;
- int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
- double facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
- double shX,shY,shZ,isai_d,dva;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int offset,nti;
+ int jnrA,jnrB;
+ int j3A,j3B;
+ int tjA,tjB;
gmx_gbdata_t *gbdata;
- float * gpol;
-
- __m128d ix,iy,iz,jx,jy,jz;
- __m128d dx,dy,dz,t1,t2,t3;
- __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
- __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
- __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
- __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
- __m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
- __m128d fac,tabscale,gbtabscale;
- __m128i n0,nnn;
-
- const __m128d neg = {-1.0,-1.0};
- const __m128d zero = {0.0,0.0};
- const __m128d half = {0.5,0.5};
- const __m128d two = {2.0,2.0};
- const __m128d three = {3.0,3.0};
- const __m128d six = {6.0,6.0};
- const __m128d twelwe = {12.0,12.0};
+ double * gpol;
+
+ __m128d iq,qq,jq,isai;
+ __m128d ix,iy,iz;
+ __m128d jx,jy,jz;
+ __m128d dx,dy,dz;
+ __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+ __m128d fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128d rinv,isaj,isaprod;
+ __m128d vcoul,fscal,gbscale,c6,c12;
+ __m128d rinvsq,r,rtab;
+ __m128d eps,Y,F,G,H;
+ __m128d VV,FF,Fp;
+ __m128d vgb,fijGB,dvdatmp;
+ __m128d rinvsix,vvdw6,vvdw12,vvdwtmp;
+ __m128d facel,gbtabscale,dvdaj;
+ __m128d fijD,fijR;
+ __m128d xmm1,tabscale,eps2;
+ __m128i n0, nnn;
+
- const __m128i four = _mm_set_epi32(4,4,4,4);
+ const __m128d neg = _mm_set1_pd(-1.0);
+ const __m128d zero = _mm_set1_pd(0.0);
+ const __m128d minushalf = _mm_set1_pd(-0.5);
+ const __m128d two = _mm_set1_pd(2.0);
gbdata = (gmx_gbdata_t *)work;
gpol = gbdata->gpol;
-
+
nri = *p_nri;
ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));
- krf = *p_krf;
- crf = *p_crf;
- tabscl = *p_tabscale;
- gbtabscl = *p_gbtabscale;
- nj1 = 0;
-
- /* Splat variables */
- fac = _mm_load1_pd(&facel);
- tabscale = _mm_load1_pd(&tabscl);
- gbtabscale = _mm_load1_pd(&gbtabscl);
-
- /* Keep compiler happy */
- Vvdwtmp = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
- dvdatmp = _mm_setzero_pd();
- dvdaj = _mm_setzero_pd();
- isaj = _mm_setzero_pd();
- vcoul = _mm_setzero_pd();
- vgb = _mm_setzero_pd();
- t1 = _mm_setzero_pd();
- t2 = _mm_setzero_pd();
- t3 = _mm_setzero_pd();
- xmm1 = _mm_setzero_pd();
- xmm2 = _mm_setzero_pd();
- xmm3 = _mm_setzero_pd();
- xmm4 = _mm_setzero_pd();
- jnr1 = jnr2 = 0;
- j13 = j23 = 0;
+
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+ tabscale = _mm_load1_pd(p_tabscale);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
-
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- offset = (nj1-nj0)%2;
-
- ii = iinr[n];
- ii3 = ii*3;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shX+pos[ii3+1]);
- iz = _mm_set1_pd(shX+pos[ii3+2]);
- q = _mm_set1_pd(charge[ii]);
-
- iq = _mm_mul_pd(fac,q);
- isai_d = invsqrta[ii];
- isai = _mm_load1_pd(&isai_d);
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
- nti = 2*ntype*type[ii];
-
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- vctot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
+ ix = _mm_set1_pd(shX+pos[ii3+0]);
+ iy = _mm_set1_pd(shY+pos[ii3+1]);
+ iz = _mm_set1_pd(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_pd(charge+ii);
+ iq = _mm_mul_pd(iq,facel);
+
+ isai = _mm_load1_pd(invsqrta+ii);
+
+ nti = 2*ntype*type[ii];
- for(k=nj0;k<nj1-offset; k+=2)
+ vctot = _mm_setzero_pd();
+ vvdwtot = _mm_setzero_pd();
+ vgbtot = _mm_setzero_pd();
+ dvdasum = _mm_setzero_pd();
+ fix = _mm_setzero_pd();
+ fiy = _mm_setzero_pd();
+ fiz = _mm_setzero_pd();
+
+ for(k=nj0;k<nj1-1; k+=2)
{
- jnr1 = jjnr[k];
- jnr2 = jjnr[k+1];
-
- j13 = jnr1 * 3;
- j23 = jnr2 * 3;
-
- /* Load coordinates */
- xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */
- xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */
-
- xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */
- xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */
-
- /* transpose */
- jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
-
- /* distances */
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- /* Load invsqrta */
- isaj = _mm_loadl_pd(isaj,invsqrta+jnr1);
- isaj = _mm_loadh_pd(isaj,invsqrta+jnr2);
- isaprod = _mm_mul_pd(isai,isaj);
-
- /* Load charges */
- q = _mm_loadl_pd(q,charge+jnr1);
- q = _mm_loadh_pd(q,charge+jnr2);
- qq = _mm_mul_pd(iq,q);
-
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- qq = _mm_mul_pd(isaprod,qq);
- qq = _mm_mul_pd(qq,neg);
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
- tj2 = nti+2*type[jnr2];
-
- xmm1 = _mm_loadu_pd(vdwparam+tj);
- xmm2 = _mm_loadu_pd(vdwparam+tj2);
- c6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-
- /* Load dvdaj */
- dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
- dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);
-
- /* Calculate GB table index */
- r = _mm_mul_pd(rsq11,rinv);
- rt = _mm_mul_pd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
- eps2 = _mm_mul_pd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- H = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,H);
- vgb = _mm_mul_pd(qq,VV);
- fijC = _mm_mul_pd(qq,FF);
- fijC = _mm_mul_pd(fijC,gbscale);
-
- dvdatmp = _mm_mul_pd(fijC,r);
- dvdatmp = _mm_add_pd(vgb,dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp,neg);
- dvdatmp = _mm_mul_pd(dvdatmp,half);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_pd(dvdatmp,isaj);
- xmm1 = _mm_mul_pd(xmm1,isaj);
- dvdaj = _mm_add_pd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
- _mm_storeh_pd(dvda+jnr2,dvdaj);
-
- vctot = _mm_add_pd(vctot,vcoul);
- vgbtot = _mm_add_pd(vgbtot,vgb);
-
- /* Calculate VDW table index */
- rt = _mm_mul_pd(r,tabscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+
+ j3A = jnrA * 3;
+ j3B = jnrB * 3;
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ dx = _mm_sub_pd(ix,jx);
+ dy = _mm_sub_pd(iy,jy);
+ dz = _mm_sub_pd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_pd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+ tjB = nti+2*type[jnrB];
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ isaprod = _mm_mul_pd(isai,isaj);
+ qq = _mm_mul_pd(iq,jq);
+ vcoul = _mm_mul_pd(qq,rinv);
+ fscal = _mm_mul_pd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+ gbscale = _mm_mul_pd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_pd(rsq,rinv);
+ rtab = _mm_mul_pd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ /* Calculate VDW table index */
+ rtab = _mm_mul_pd(r,tabscale);
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_pd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Tabulated VdW interaction - dispersion */
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
FF = _mm_add_pd(Fp,G);
FF = _mm_add_pd(FF,xmm1);
- Vvdw6 = _mm_mul_pd(c6,VV);
+ vvdw6 = _mm_mul_pd(c6,VV);
fijD = _mm_mul_pd(c6,FF);
-
- /* Tabulated VdW interaction - repulsion */
- nnn = _mm_add_epi32(nnn,four);
-
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
+
+ /* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
FF = _mm_add_pd(Fp,G);
FF = _mm_add_pd(FF,xmm1);
- Vvdw12 = _mm_mul_pd(c12,VV);
+ vvdw12 = _mm_mul_pd(c12,VV);
fijR = _mm_mul_pd(c12,FF);
- Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-
+ vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
+ vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
+
xmm1 = _mm_add_pd(fijD,fijR);
xmm1 = _mm_mul_pd(xmm1,tabscale);
- xmm1 = _mm_add_pd(xmm1,fijC);
+ xmm1 = _mm_add_pd(xmm1,fijGB);
xmm1 = _mm_sub_pd(xmm1,fscal);
fscal = _mm_mul_pd(xmm1,neg);
fscal = _mm_mul_pd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_pd(fscal,dx);
- t2 = _mm_mul_pd(fscal,dy);
- t3 = _mm_mul_pd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
- xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-
- xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
- xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-
- /* transpose */
- xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
- xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
- xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_pd(xmm5,t1);
- xmm6 = _mm_sub_pd(xmm6,t2);
- xmm7 = _mm_sub_pd(xmm7,t3);
-
- xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
- xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* store fx and fy */
- _mm_storeu_pd(faction+j13,xmm1);
- _mm_storeu_pd(faction+j23,xmm2);
-
- /* .. then fz */
- _mm_storel_pd(faction+j13+2,xmm7);
- _mm_storel_pd(faction+j23+2,xmm7);
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
/* In double precision, offset can only be either 0 or 1 */
- if(offset!=0)
+ if(k<nj1)
{
- jnr1 = jjnr[k];
- j13 = jnr1*3;
-
- jx = _mm_load_sd(pos+j13);
- jy = _mm_load_sd(pos+j13+1);
- jz = _mm_load_sd(pos+j13+2);
-
- isaj = _mm_load_sd(invsqrta+jnr1);
- isaprod = _mm_mul_sd(isai,isaj);
- dvdaj = _mm_load_sd(dvda+jnr1);
- q = _mm_load_sd(charge+jnr1);
- qq = _mm_mul_sd(iq,q);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- qq = _mm_mul_sd(isaprod,qq);
- qq = _mm_mul_sd(qq,neg);
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
-
- c6 = _mm_load_sd(vdwparam+tj);
- c12 = _mm_load_sd(vdwparam+tj+1);
-
- /* Calculate GB table index */
- r = _mm_mul_sd(rsq11,rinv);
- rt = _mm_mul_sd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
- eps2 = _mm_mul_sd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2);
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2);
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- H = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,H);
- vgb = _mm_mul_sd(qq,VV);
- fijC = _mm_mul_sd(qq,FF);
- fijC = _mm_mul_sd(fijC,gbscale);
-
- dvdatmp = _mm_mul_sd(fijC,r);
- dvdatmp = _mm_add_sd(vgb,dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp,neg);
- dvdatmp = _mm_mul_sd(dvdatmp,half);
- dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_sd(dvdatmp,isaj);
- xmm1 = _mm_mul_sd(xmm1,isaj);
- dvdaj = _mm_add_sd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
-
- vctot = _mm_add_sd(vctot,vcoul);
- vgbtot = _mm_add_sd(vgbtot,vgb);
-
- /* Calculate VDW table index */
- rt = _mm_mul_sd(r,tabscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
+ jnrA = jjnr[k];
+ j3A = jnrA * 3;
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ dx = _mm_sub_sd(ix,jx);
+ dy = _mm_sub_sd(iy,jy);
+ dz = _mm_sub_sd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_sd(rinv,rinv);
+
+			/* The reason for zeroing these variables here is to fix bug 585.
+ * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+ * and r1=0, but it should be r1=a[1].
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead
+ * Note that the only variables that get affected are the energies since
+ * the total sum needs to be correct
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+ vvdw6 = _mm_setzero_pd();
+ vvdw12 = _mm_setzero_pd();
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+
+ isaprod = _mm_mul_sd(isai,isaj);
+ qq = _mm_mul_sd(jq,iq);
+ vcoul = _mm_mul_sd(qq,rinv);
+ fscal = _mm_mul_sd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
+ qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+ gbscale = _mm_mul_sd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_sd(rsq,rinv);
+ rtab = _mm_mul_sd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+
+ /* Calculate VDW table index */
+ rtab = _mm_mul_sd(r,tabscale);
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_sd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Tabulated VdW interaction - dispersion */
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_sd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
FF = _mm_add_sd(Fp,G);
FF = _mm_add_sd(FF,xmm1);
- Vvdw6 = _mm_mul_sd(c6,VV);
+ vvdw6 = _mm_mul_sd(c6,VV);
fijD = _mm_mul_sd(c6,FF);
-
- /* Tabulated VdW interaction - repulsion */
- nnn = _mm_add_epi32(nnn,four);
-
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_sd(G,eps);
+
+			/* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
FF = _mm_add_sd(Fp,G);
FF = _mm_add_sd(FF,xmm1);
- Vvdw12 = _mm_mul_sd(c12,VV);
+ vvdw12 = _mm_mul_sd(c12,VV);
fijR = _mm_mul_sd(c12,FF);
- Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-
+ vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
+ vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */
+
xmm1 = _mm_add_sd(fijD,fijR);
xmm1 = _mm_mul_sd(xmm1,tabscale);
- xmm1 = _mm_add_sd(xmm1,fijC);
+ xmm1 = _mm_add_sd(xmm1,fijGB);
xmm1 = _mm_sub_sd(xmm1,fscal);
fscal = _mm_mul_sd(xmm1,neg);
fscal = _mm_mul_sd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_sd(fscal,dx);
- t2 = _mm_mul_sd(fscal,dy);
- t3 = _mm_mul_sd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_sd(fix,t1);
- fiy = _mm_add_sd(fiy,t2);
- fiz = _mm_add_sd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm5 = _mm_load_sd(faction+j13); /* fx */
- xmm6 = _mm_load_sd(faction+j13+1); /* fy */
- xmm7 = _mm_load_sd(faction+j13+2); /* fz */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_sd(xmm5,t1);
- xmm6 = _mm_sub_sd(xmm6,t2);
- xmm7 = _mm_sub_sd(xmm7,t3);
-
- /* store forces */
- _mm_store_sd(faction+j13,xmm5);
- _mm_store_sd(faction+j13+1,xmm6);
- _mm_store_sd(faction+j13+2,xmm7);
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- /* fix/fiy/fiz now contain four partial terms, that all should be
- * added to the i particle forces
- */
- t1 = _mm_unpacklo_pd(t1,fix);
- t2 = _mm_unpacklo_pd(t2,fiy);
- t3 = _mm_unpacklo_pd(t3,fiz);
-
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
- fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
- fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-
- /* Load i forces from memory */
- xmm1 = _mm_load_sd(faction+ii3);
- xmm2 = _mm_load_sd(faction+ii3+1);
- xmm3 = _mm_load_sd(faction+ii3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i forces to memory */
- _mm_store_sd(faction+ii3,fix);
- _mm_store_sd(faction+ii3+1,fiy);
- _mm_store_sd(faction+ii3+2,fiz);
-
- /* now do dvda */
- dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
- _mm_storeh_pd(&dva,dvdasum);
- dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-
- ggid = gid[n];
-
- /* Coulomb potential */
- vcoul = _mm_unpacklo_pd(vcoul,vctot);
- vctot = _mm_add_pd(vctot,vcoul);
- _mm_storeh_pd(&vct,vctot);
- Vc[ggid] = Vc[ggid] + vct;
-
- /* VdW potential */
- Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
- _mm_storeh_pd(&vdwt,Vvdwtot);
- Vvdw[ggid] = Vvdw[ggid] + vdwt;
-
- /* GB potential */
- vgb = _mm_unpacklo_pd(vgb,vgbtot);
- vgbtot = _mm_add_pd(vgbtot,vgb);
- _mm_storeh_pd(&vgbt,vgbtot);
- gpol[ggid] = gpol[ggid] + vgbt;
- }
-
- *outeriter = nri;
- *inneriter = nj1;
-}
-
-
-/*
- * Gromacs nonbonded kernel nb_kernel430nf
- * Coulomb interaction: Generalized-Born
- * VdW interaction: Tabulated
- * water optimization: No
- * Calculate forces: no
- */
-void nb_kernel430nf_sse2_double(
- int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
-{
- int nri,ntype,nthreads;
- double facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
- int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
- double shX,shY,shZ;
- double iq;
- double qq,vcoul,vctot;
- int nti;
- int tj;
- double Vvdw6,Vvdwtot;
- double Vvdw12;
- double r,rt,eps,eps2;
- int n0,nnn;
- double Y,F,Geps,Heps2,Fp,VV;
- double isai,isaj,isaprod,gbscale;
- double ix1,iy1,iz1;
- double jx1,jy1,jz1;
- double dx11,dy11,dz11,rsq11,rinv11;
- double c6,c12;
-
- nri = *p_nri;
- ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = *p_facel;
- krf = *p_krf;
- crf = *p_crf;
- tabscale = *p_tabscale;
- gbtabscale = *p_gbtabscale;
- nj1 = 0;
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
- for(n=0; (n<nri); n++)
- {
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
- ix1 = shX + pos[ii3+0];
- iy1 = shY + pos[ii3+1];
- iz1 = shZ + pos[ii3+2];
- iq = facel*charge[ii];
- isai = invsqrta[ii];
- nti = 2*ntype*type[ii];
- vctot = 0;
- Vvdwtot = 0;
-
- for(k=nj0; (k<nj1); k++)
- {
- jnr = jjnr[k];
- j3 = 3*jnr;
- jx1 = pos[j3+0];
- jy1 = pos[j3+1];
- jz1 = pos[j3+2];
- dx11 = ix1 - jx1;
- dy11 = iy1 - jy1;
- dz11 = iz1 - jz1;
- rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
- rinv11 = gmx_invsqrt(rsq11);
- isaj = invsqrta[jnr];
- isaprod = isai*isaj;
- qq = iq*charge[jnr];
- vcoul = qq*rinv11;
- qq = isaprod*(-qq);
- gbscale = isaprod*gbtabscale;
- tj = nti+2*type[jnr];
- c6 = vdwparam[tj];
- c12 = vdwparam[tj+1];
- r = rsq11*rinv11;
- rt = r*gbscale;
- n0 = rt;
- eps = rt-n0;
- eps2 = eps*eps;
- nnn = 4*n0;
- Y = GBtab[nnn];
- F = GBtab[nnn+1];
- Geps = eps*GBtab[nnn+2];
- Heps2 = eps2*GBtab[nnn+3];
- Fp = F+Geps+Heps2;
- VV = Y+eps*Fp;
- vgb = qq*VV;
- vctot = vctot + vcoul;
- r = rsq11*rinv11;
- rt = r*tabscale;
- n0 = rt;
- eps = rt-n0;
- eps2 = eps*eps;
- nnn = 8*n0;
- Y = VFtab[nnn];
- F = VFtab[nnn+1];
- Geps = eps*VFtab[nnn+2];
- Heps2 = eps2*VFtab[nnn+3];
- Fp = F+Geps+Heps2;
- VV = Y+eps*Fp;
- Vvdw6 = c6*VV;
- nnn = nnn+4;
- Y = VFtab[nnn];
- F = VFtab[nnn+1];
- Geps = eps*VFtab[nnn+2];
- Heps2 = eps2*VFtab[nnn+3];
- Fp = F+Geps+Heps2;
- VV = Y+eps*Fp;
- Vvdw12 = c12*VV;
- Vvdwtot = Vvdwtot+ Vvdw6 + Vvdw12;
- }
-
- ggid = gid[n];
- Vc[ggid] = Vc[ggid] + vctot;
- Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
- }
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
- *outeriter = nri;
- *inneriter = nj1;
+ }
+
+ *outeriter = nri;
+ *inneriter = nj1;
}
-
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-#include "nb_kernel400_sse2_single.h"
-
-
void nb_kernel400_sse2_single(int * p_nri,
int * iinr,
fix = _mm_setzero_ps();
fiy = _mm_setzero_ps();
fiz = _mm_setzero_ps();
-
+
for(k=nj0; k<nj1-7; k+=8)
{
jnrA = jjnr[k];
rB = _mm_mul_ps(rsqB,rinvB);
rtab = _mm_mul_ps(r,gbscale);
rtabB = _mm_mul_ps(rB,gbscaleB);
-
+
n0 = _mm_cvttps_epi32(rtab);
n0B = _mm_cvttps_epi32(rtabB);
eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
FB = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
vgbB = _mm_mul_ps(YB, qqB);
fijGBB = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
-
-
+
dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
/* Calculate GB table index */
r = _mm_mul_ps(rsq,rinv);
rtab = _mm_mul_ps(r,gbscale);
-
+
n0 = _mm_cvttps_epi32(rtab);
eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
nnn = _mm_slli_epi32(n0,2);
F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
vgb = _mm_mul_ps(Y, qq);
fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
-
+
dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
vgbtot = _mm_add_ps(vgbtot, vgb);
/* Calculate GB table index */
r = _mm_mul_ps(rsq,rinv);
rtab = _mm_mul_ps(r,gbscale);
-
+
n0 = _mm_cvttps_epi32(rtab);
eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
nnn = _mm_slli_epi32(n0,2);
F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
vgb = _mm_mul_ps(Y, qq);
fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
-
+
dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
vgbtot = _mm_add_ps(vgbtot, vgb);
* water optimization: No
* Calculate forces: no
*/
-void nb_kernel400nf_sse2_single(
+void nb_kernel400nf_x86_64_sse(
int * p_nri,
int * iinr,
int * jindex,
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-#include "nb_kernel410_sse2_single.h"
-
-
void nb_kernel410_sse2_single(int * p_nri,
int * iinr,
__m128i n0, nnn;
__m128i n0B, nnnB;
- const __m128 neg = {-1.0f,-1.0f,-1.0f,-1.0f};
- const __m128 zero = {0.0f,0.0f,0.0f,0.0f};
- const __m128 minushalf = {-0.5f,-0.5f,-0.5f,-0.5f};
- const __m128 two = {2.0f,2.0f,2.0f,2.0f};
- const __m128 six = {6.0f,6.0f,6.0f,6.0f};
- const __m128 twelve = {12.0f,12.0f,12.0f,12.0f};
+ const __m128 neg = _mm_set1_ps(-1.0f);
+ const __m128 zero = _mm_set1_ps(0.0f);
+ const __m128 minushalf = _mm_set1_ps(-0.5f);
+ const __m128 two = _mm_set1_ps(2.0f);
+ const __m128 six = _mm_set1_ps(6.0f);
+ const __m128 twelve = _mm_set1_ps(12.0f);
gbdata = (gmx_gbdata_t *)work;
gpol = gbdata->gpol;
* water optimization: No
* Calculate forces: no
*/
-void nb_kernel410nf_sse2_single(
+void nb_kernel410nf_x86_64_sse(
int * p_nri,
int * iinr,
int * jindex,
#include <xmmintrin.h>
#include <emmintrin.h>
+
#include <gmx_sse2_single.h>
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-#include "nb_kernel430_sse2_single.h"
-
-/* to extract single integers from a __m128i datatype */
-#define _mm_extract_epi32(x, imm) \
-_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
void nb_kernel430_sse2_single(int * p_nri,
int * iinr,
__m128 fac_sse,tabscale_sse,gbtabscale_sse;
__m128i n0, nnn;
- const __m128 neg = {-1.0f,-1.0f,-1.0f,-1.0f};
- const __m128 zero = {0.0f,0.0f,0.0f,0.0f};
- const __m128 half = {0.5f,0.5f,0.5f,0.5f};
- const __m128 two = {2.0f,2.0f,2.0f,2.0f};
- const __m128 three = {3.0f,3.0f,3.0f,3.0f};
- const __m128 six = {6.0f,6.0f,6.0f,6.0f};
- const __m128 twelwe = {12.0f,12.0f,12.0f,12.0f};
+ const __m128 neg = _mm_set1_ps(-1.0f);
+ const __m128 zero = _mm_set1_ps(0.0f);
+ const __m128 half = _mm_set1_ps(0.5f);
+ const __m128 two = _mm_set1_ps(2.0f);
+ const __m128 three = _mm_set1_ps(3.0f);
+ const __m128 six = _mm_set1_ps(6.0f);
+ const __m128 twelwe = _mm_set1_ps(12.0f);
- __m128i four = _mm_set_epi32(4,4,4,4);
+ __m128i four = _mm_set1_epi32(4);
__m128i maski = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);
__m128i mask = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);
gpol = gbdata->gpol;
nri = *p_nri;
- ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));
- krf = *p_krf;
- crf = *p_crf;
- tabscale = *p_tabscale;
- gbtabscale = *p_gbtabscale;
- nj1 = 0;
+ ntype = *p_ntype;
+ nthreads = *p_nthreads;
+ facel = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));
+ krf = *p_krf;
+ crf = *p_crf;
+ tabscale = *p_tabscale;
+ gbtabscale = *p_gbtabscale;
+ nj1 = 0;
/* Splat variables */
fac_sse = _mm_load1_ps(&facel);
nnn = _mm_slli_epi32(n0,2);
/* the tables are 16-byte aligned, so we can use _mm_load_ps */
- xmm1 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
- xmm2 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
- xmm3 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
- xmm4 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+ xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
/* transpose 4*4 */
xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
nnn = _mm_slli_epi32(n0,3);
/* Tabulated VdW interaction - disperion */
- xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
- xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
- xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
- xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
/* transpose 4*4 */
xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
/* Tabulated VdW interaction - repulsion */
nnn = _mm_add_epi32(nnn,four);
- xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
- xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
- xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
- xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
/* transpose 4*4 */
xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
nnn = _mm_slli_epi32(n0,2);
/* the tables are 16-byte aligned, so we can use _mm_load_ps */
- xmm1 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
- xmm2 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
- xmm3 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
- xmm4 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+ xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
/* transpose 4*4 */
xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
nnn = _mm_slli_epi32(n0,3);
/* Tabulated VdW interaction - disperion */
- xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
- xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
- xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
- xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
/* transpose 4*4 */
xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
/* Tabulated VdW interaction - repulsion */
nnn = _mm_add_epi32(nnn,four);
- xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
- xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
- xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
- xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
/* transpose 4*4 */
xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
* water optimization: No
* Calculate forces: no
*/
-void nb_kernel430nf_sse2_single(
+void nb_kernel430nf_x86_64_sse(
int * p_nri,
int * iinr,
int * jindex,
dy11 = iy1 - jy1;
dz11 = iz1 - jz1;
rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
- rinv11 = gmx_mm_invsqrt(rsq11);
+ rinv11 = gmx_invsqrt(rsq11);
isaj = invsqrta[jnr];
isaprod = isai*isaj;
qq = iq*charge[jnr];
int * inneriter,
double * work)
{
- int nri,nthreads;
- int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
- int jnrA,jnrB;
- int j3A,j3B;
+ int nri,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int jnrA,jnrB;
+ int j3A,j3B;
gmx_gbdata_t *gbdata;
double * gpol;
nri = *p_nri;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
-
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
ix = _mm_set1_pd(shX+pos[ii3+0]);
iy = _mm_set1_pd(shY+pos[ii3+1]);
iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
+
iq = _mm_load1_pd(charge+ii);
iq = _mm_mul_pd(iq,facel);
-
+
isai = _mm_load1_pd(invsqrta+ii);
vctot = _mm_setzero_pd();
j3A = jnrA * 3;
j3B = jnrB * 3;
-
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
dx = _mm_sub_pd(ix,jx);
dy = _mm_sub_pd(iy,jy);
dz = _mm_sub_pd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_pd(rinv,rinv);
-
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
qq = _mm_mul_pd(iq,jq);
vcoul = _mm_mul_pd(qq,rinv);
fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
+ vctot = _mm_add_pd(vctot,vcoul);
/* Polarization interaction */
qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-
- fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
-
- /***********************************/
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
{
jnrA = jjnr[k];
j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
dx = _mm_sub_sd(ix,jx);
dy = _mm_sub_sd(iy,jy);
dz = _mm_sub_sd(iz,jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_sd(rinv,rinv);
-
+
+			/* The reason for zeroing these variables here is to fix bug 585.
+ * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+ * and r1=0, but it should be r1=a[1].
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead
+ * Note that the only variables that get affected are the energies since
+ * the total sum needs to be correct
+ */
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+ vgb = _mm_setzero_pd();
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-
+
isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
-
- /* Polarization interaction */
+            /* Since we need _mm_add_pd below, the order here of jq,iq becomes important */
+ qq = _mm_mul_sd(jq,iq);
+ vcoul = _mm_mul_sd(qq,rinv);
+ fscal = _mm_mul_sd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_sd(rsq,rinv);
rtab = _mm_mul_sd(r,gbscale);
-
+
n0 = _mm_cvttpd_epi32(rtab);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
- fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
-
- /***********************************/
+ fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
- ggid = gid[n];
-
- gmx_mm_update_1pot_pd(vctot,vc+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
- }
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ }
+
*outeriter = nri;
- *inneriter = nj1;
+ *inneriter = nj1;
}
int * inneriter,
double * work)
{
- int nri,ntype,nthreads;
- int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
int offset,nti;
- int jnrA,jnrB;
- int j3A,j3B;
+ int jnrA,jnrB;
+ int j3A,j3B;
int tjA,tjB;
gmx_gbdata_t *gbdata;
double * gpol;
-
+
__m128d iq,qq,jq,isai;
__m128d ix,iy,iz;
__m128d jx,jy,jz;
nri = *p_nri;
ntype = *p_ntype;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
-
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
- c6 = _mm_setzero_pd();
- c12 = _mm_setzero_pd();
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
ix = _mm_set1_pd(shX+pos[ii3+0]);
iy = _mm_set1_pd(shY+pos[ii3+1]);
iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
+
iq = _mm_load1_pd(charge+ii);
iq = _mm_mul_pd(iq,facel);
-
+
isai = _mm_load1_pd(invsqrta+ii);
nti = 2*ntype*type[ii];
j3A = jnrA * 3;
j3B = jnrB * 3;
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
dx = _mm_sub_pd(ix,jx);
dy = _mm_sub_pd(iy,jy);
dz = _mm_sub_pd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_pd(rinv,rinv);
-
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
tjB = nti+2*type[jnrB];
-
- GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
isaprod = _mm_mul_pd(isai,isaj);
qq = _mm_mul_pd(iq,jq);
vcoul = _mm_mul_pd(qq,rinv);
fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
+ vctot = _mm_add_pd(vctot,vcoul);
- /* Polarization interaction */
+ /* Polarization interaction */
qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_pd(rsq,rinv);
rtab = _mm_mul_pd(r,gbscale);
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
rinvsix = _mm_mul_pd(rinvsq,rinvsq);
rinvsix = _mm_mul_pd(rinvsix,rinvsq);
vvdw12 = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
- fscal = _mm_sub_pd(_mm_mul_pd(rinvsq,
- _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
- _mm_mul_pd(six,vvdw6))),
- _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
-
- /***********************************/
+ fscal = _mm_sub_pd(_mm_mul_pd(rinvsq,
+ _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+ _mm_mul_pd(six,vvdw6))),
+ _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
jnrA = jjnr[k];
j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
dx = _mm_sub_sd(ix,jx);
dy = _mm_sub_sd(iy,jy);
dz = _mm_sub_sd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_sd(rinv,rinv);
-
+
+ /* The reason for zeroing these variables here is to fix bug 585.
+ * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0]
+ * and r1=0, but it should give r1=a[1].
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+ * Note that the only variables affected are the energies, since
+ * the total sum needs to be correct.
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+ vvdw6 = _mm_setzero_pd();
+ vvdw12 = _mm_setzero_pd();
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
-
- GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
+ qq = _mm_mul_sd(jq,iq);
vcoul = _mm_mul_sd(qq,rinv);
fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
-
- /* Polarization interaction */
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_sd(rsq,rinv);
rtab = _mm_mul_sd(r,gbscale);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
rinvsix = _mm_mul_sd(rinvsq,rinvsq);
rinvsix = _mm_mul_sd(rinvsix,rinvsq);
vvdw6 = _mm_mul_sd(c6,rinvsix);
vvdw12 = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
- vvdwtot = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
-
- fscal = _mm_sub_sd(_mm_mul_sd(rinvsq,
- _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
- _mm_mul_sd(six,vvdw6))),
- _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
-
- /***********************************/
+ vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6)); /* (**) */
+
+ fscal = _mm_sub_sd(_mm_mul_sd(rinvsq,
+ _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+ _mm_mul_sd(six,vvdw6))),
+ _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
- ggid = gid[n];
-
- gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+
}
-
+
*outeriter = nri;
- *inneriter = nj1;
+ *inneriter = nj1;
}
int * inneriter,
double * work)
{
- int nri,ntype,nthreads;
- int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
int offset,nti;
- int jnrA,jnrB;
- int j3A,j3B;
+ int jnrA,jnrB;
+ int j3A,j3B;
int tjA,tjB;
gmx_gbdata_t *gbdata;
double * gpol;
__m128d vcoul,fscal,gbscale,c6,c12;
__m128d rinvsq,r,rtab;
__m128d eps,Y,F,G,H;
- __m128d VV,FF,Fp;
+ __m128d VV,FF,Fp;
__m128d vgb,fijGB,dvdatmp;
__m128d rinvsix,vvdw6,vvdw12,vvdwtmp;
__m128d facel,gbtabscale,dvdaj;
- __m128d fijD,fijR;
- __m128d xmm1,tabscale,eps2;
+ __m128d fijD,fijR;
+ __m128d xmm1,tabscale,eps2;
__m128i n0, nnn;
nri = *p_nri;
ntype = *p_ntype;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
- tabscale = _mm_load1_pd(p_tabscale);
-
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
- c6 = _mm_setzero_pd();
- c12 = _mm_setzero_pd();
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+ tabscale = _mm_load1_pd(p_tabscale);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- ii = iinr[n];
- ii3 = 3*ii;
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
ix = _mm_set1_pd(shX+pos[ii3+0]);
iy = _mm_set1_pd(shY+pos[ii3+1]);
iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
+
iq = _mm_load1_pd(charge+ii);
iq = _mm_mul_pd(iq,facel);
-
+
isai = _mm_load1_pd(invsqrta+ii);
-
+
nti = 2*ntype*type[ii];
vctot = _mm_setzero_pd();
j3A = jnrA * 3;
j3B = jnrB * 3;
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
dx = _mm_sub_pd(ix,jx);
dy = _mm_sub_pd(iy,jy);
dz = _mm_sub_pd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_pd(rinv,rinv);
-
+
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
tjB = nti+2*type[jnrB];
-
- GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
isaprod = _mm_mul_pd(isai,isaj);
qq = _mm_mul_pd(iq,jq);
vcoul = _mm_mul_pd(qq,rinv);
fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
-
- /* Polarization interaction */
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_pd(rsq,rinv);
rtab = _mm_mul_pd(r,gbscale);
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
- /* Calculate VDW table index */
+ /* Calculate VDW table index */
rtab = _mm_mul_pd(r,tabscale);
n0 = _mm_cvttpd_epi32(rtab);
eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_pd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
vvdw6 = _mm_mul_pd(c6,VV);
fijD = _mm_mul_pd(c6,FF);
-
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
- F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
- H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
+
+ /* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
-
+
xmm1 = _mm_add_pd(fijD,fijR);
xmm1 = _mm_mul_pd(xmm1,tabscale);
xmm1 = _mm_add_pd(xmm1,fijGB);
xmm1 = _mm_sub_pd(xmm1,fscal);
fscal = _mm_mul_pd(xmm1,neg);
fscal = _mm_mul_pd(fscal,rinv);
-
- /***********************************/
+
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
{
jnrA = jjnr[k];
j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
dx = _mm_sub_sd(ix,jx);
dy = _mm_sub_sd(iy,jy);
dz = _mm_sub_sd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
rinvsq = _mm_mul_sd(rinv,rinv);
-
- /***********************************/
+
+ /* The reason for zeroing these variables here is to fix bug 585.
+ * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0]
+ * and r1=0, but it should give r1=a[1].
+ * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+ * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+ * Note that the only variables affected are the energies, since
+ * the total sum needs to be correct.
+ */
+ vgb = _mm_setzero_pd();
+ vcoul = _mm_setzero_pd();
+ dvdatmp = _mm_setzero_pd();
+ vvdw6 = _mm_setzero_pd();
+ vvdw12 = _mm_setzero_pd();
+
+ /***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
-
- GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
+ qq = _mm_mul_sd(jq,iq);
vcoul = _mm_mul_sd(qq,rinv);
fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
-
- /* Polarization interaction */
+ vctot = _mm_add_pd(vctot,vcoul); /* (**) */
+
+ /* Polarization interaction */
qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
+
/* Calculate GB table index */
r = _mm_mul_sd(rsq,rinv);
rtab = _mm_mul_sd(r,gbscale);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
- /* Calculate VDW table index */
+ /* Calculate VDW table index */
rtab = _mm_mul_sd(r,tabscale);
n0 = _mm_cvttpd_epi32(rtab);
eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_sd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
vvdw6 = _mm_mul_sd(c6,VV);
fijD = _mm_mul_sd(c6,FF);
-
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
+
+ /* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
fijR = _mm_mul_sd(c12,FF);
vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
- vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
+ vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */
xmm1 = _mm_add_sd(fijD,fijR);
xmm1 = _mm_mul_sd(xmm1,tabscale);
fscal = _mm_mul_sd(xmm1,neg);
fscal = _mm_mul_sd(fscal,rinv);
- /***********************************/
+ /***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- /* Store j forces back */
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
- ggid = gid[n];
-
- gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
- }
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+ gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+ gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+
+ }
+
*outeriter = nri;
- *inneriter = nj1;
+ *inneriter = nj1;
}
pmask1 = prologue_mask[i+1];
emask0 = epilogue_mask[i];
emask1 = epilogue_mask[i+1];
- imask_SSE0 = _mm_load1_pd((double *)(imask+i));
+ imask_SSE0 = _mm_load1_pd((double *)(imask+2*i));
imask_SSE1 = _mm_load1_pd((double *)(imask+2*i+2));
for(j=nj0; j<nj1; j+=UNROLLJ)
add_library(gmxpreprocess ${GMXPREPROCESS_SOURCES})
target_link_libraries(gmxpreprocess md)
set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
-set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
if(GMX_OPENMM)
add_subdirectory(gmx_gpu_utils)
include_directories(./gmx_gpu_utils ${OpenMM_INCLUDE_DIR})
link_directories(${OpenMM_LIBRARY_DIR})
- # only define if this is a local build not a release
- # we assume that the auto-generated version is not used &&
- # version string does not contain "-dev" => it's a release build
- if(NOT USE_VERSION_H AND NOT PROJECT_VERSION MATCHES ".*-dev.*")
- add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" )
- else()
- add_definitions( -DOPENMM_PLUGIN_DIR="" )
- endif()
+ # with this define the OPENMM_PLUGIN_DIR env. var. is not needed
+ # if the same OpenMM installation is used for running and building
+ add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" )
+ file(TO_CMAKE_PATH ${OpenMM_PLUGIN_DIR} _path)
add_library(openmm_api_wrapper STATIC openmm_wrapper.cpp)
target_link_libraries(openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})
set(GMX_OPENMM_LIBRARIES openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})
endif()
+install(TARGETS gmxpreprocess DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
+install(TARGETS mdrun DESTINATION ${BIN_INSTALL_DIR} COMPONENT mdrun)
install(TARGETS
grompp
tpbconv
gmxdump
g_x2top
gmxcheck
- mdrun
- gmxpreprocess DESTINATION ${LIB_INSTALL_DIR}
+ COMPONENT runtime
RUNTIME DESTINATION ${BIN_INSTALL_DIR})
-
-# if we build shared gromacs libs, when installing throught the install-mdrun target
-# these libs need to be installed as well
-if(BUILD_SHARED_LIBS)
- # in MDRUN_LIBS we store the libraries MDRUN links against (NOTE: hardcoded!!!)
- set(MDRUN_LIBS gmxpreprocess md gmx)
-
- # generate install-libXXX custom target for each shared lib that mdrun links against
- foreach(_lib ${MDRUN_LIBS})
- # double-check that the type is SHARED
- get_target_property(_type ${_lib} TYPE)
- if(NOT ${_type} STREQUAL "SHARED_LIBRARY")
- message(FATAL_ERROR " Internal error: library ${_lib} is not shared so it's not supposed to be processed for installing")
- endif()
-
- # figure out the path and filename under which the lib will be installed
- # (libname with pre- and suffix)
- get_target_property(_lib_path ${_lib} LOCATION)
- string(REGEX REPLACE "/" ";" _lib_fname ${_lib_path})
- list(REVERSE _lib_fname)
- list(GET _lib_fname 0 _lib_fname)
-
- # create custom target for copying each library to the install location
- # TODO: need to fix this to have the .so.6 form
- add_custom_target(install-${_lib}
- COMMAND ${CMAKE_COMMAND} -E copy
- "${_lib_path}" "${LIB_INSTALL_DIR}/${_lib_fname}.${SOVERSION}"
- COMMAND ${CMAKE_COMMAND} -E create_symlink
- "${_lib_fname}.${SOVERSION}" "${LIB_INSTALL_DIR}/${_lib_fname}"
- COMMENT "Installing library ${_lib}")
- add_dependencies(install-${_lib} ${_lib})
-
- # gather the custom target names in a string
- # set(_lib_install_targets "${_lib_install_targets} install-lib${_lib}")
- list(APPEND _lib_install_targets "install-${_lib}")
- endforeach(_lib)
-endif(BUILD_SHARED_LIBS)
-
-get_target_property(_mdrun_path mdrun LOCATION)
-add_custom_target(install-mdrun
- COMMAND ${CMAKE_COMMAND} -E copy "${_mdrun_path}"
- "${BIN_INSTALL_DIR}/${_mdrun_exec_name}"
- COMMENT "Installing mdrun")
-add_dependencies(install-mdrun mdrun ${_lib_install_targets})
+# Create the custom install-mdrun target
+if (BUILD_SHARED_LIBS)
+ # If shared libraries are used, we need to install the libraries in
+ # addition to the mdrun binary.
+ add_custom_target(install-mdrun
+ COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries
+ -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+ COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
+ -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+ COMMENT "Installing mdrun")
+else (BUILD_SHARED_LIBS)
+ add_custom_target(install-mdrun
+ COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
+ -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+ COMMENT "Installing mdrun")
+endif (BUILD_SHARED_LIBS)
+add_dependencies(install-mdrun mdrun)
endif(GMX_FAHCORE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmxpreprocess.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
- RENAME "libgmxpreprocess${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc
+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+ RENAME "libgmxpreprocess${GMX_LIBS_SUFFIX}.pc"
+ COMPONENT development)
{
maxsize = max(maxsize,cgs->index[cg+1]-cgs->index[cg]);
}
-
- if (maxsize > 10)
+
+ if (maxsize > MAX_CHARGEGROUP_SIZE)
+ {
+ gmx_fatal(FARGS,"The largest charge group contains %d atoms. The maximum is %d.",maxsize,MAX_CHARGEGROUP_SIZE);
+ }
+ else if (maxsize > 10)
{
set_warning_line(wi,topfn,-1);
sprintf(warn_buf,
return count;
}
+static void check_gbsa_params_charged(gmx_mtop_t *sys, gpp_atomtype_t atype)
+{ /* Walks every atom of every molecule type in sys. For each atom with a
+ * non-zero charge whose atom type has a zero radius, volume, surface
+ * tension, GB radius or S_hct entry in atype, a diagnostic is printed to
+ * stderr. If at least one such atom was found, gmx_fatal is called.
+ */
+ int i,nmiss,natoms,mt;
+ real q;
+ const t_atoms *atoms;
+ /* nmiss is incremented once per offending atom */
+ nmiss = 0;
+ for(mt=0;mt<sys->nmoltype;mt++)
+ {
+ atoms = &sys->moltype[mt].atoms;
+ natoms = atoms->nr;
+ /* inspect each atom of this molecule type */
+ for(i=0;i<natoms;i++)
+ {
+ q = atoms->atom[i].q;
+ if ((get_atomtype_radius(atoms->atom[i].type,atype) == 0 ||
+ get_atomtype_vol(atoms->atom[i].type,atype) == 0 ||
+ get_atomtype_surftens(atoms->atom[i].type,atype) == 0 ||
+ get_atomtype_gb_radius(atoms->atom[i].type,atype) == 0 ||
+ get_atomtype_S_hct(atoms->atom[i].type,atype) == 0) &&
+ q != 0)
+ {
+ fprintf(stderr,"\nGB parameter(s) zero for atom type '%s' while charge is %g\n",
+ get_atomtype_name(atoms->atom[i].type,atype),q);
+ nmiss++;
+ }
+ }
+ }
+ /* NOTE(review): the message below reports nmiss as a number of atomtypes, but nmiss counts atoms, so a type shared by several charged atoms is counted more than once -- confirm intent */
+ if (nmiss > 0)
+ {
+ gmx_fatal(FARGS,"Can't do GB electrostatics; the implicit_genborn_params section of the forcefield has parameters with value zero for %d atomtypes that occur as charged atoms.",nmiss);
+ }
+}
+
+
static void check_gbsa_params(t_inputrec *ir,gpp_atomtype_t atype)
{
int nmiss,i;
get_atomtype_gb_radius(i,atype) < 0 ||
get_atomtype_S_hct(i,atype) < 0)
{
- fprintf(stderr,"GB parameter(s) missing or negative for atom type '%s'\n",
+ fprintf(stderr,"\nGB parameter(s) missing or negative for atom type '%s'\n",
get_atomtype_name(i,atype));
nmiss++;
}
if (nmiss > 0)
{
- gmx_fatal(FARGS,"Can't do GB electrostatics; the forcefield is missing %d values for\n"
- "atomtype radii, or they might be negative\n.",nmiss);
+ gmx_fatal(FARGS,"Can't do GB electrostatics; the implicit_genborn_params section of the forcefield is missing parameters for %d atomtypes or they might be negative.",nmiss);
}
}
{
/* Now we have renumbered the atom types, we can check the GBSA params */
check_gbsa_params(ir,atype);
+
+ /* Check that all atoms that carry a charge also have
+ * non-zero GB-parameters
+ */
+ check_gbsa_params_charged(sys,atype);
}
/* PELA: Copy the atomtype data to the topology atomtype list */
}
/* macro set at build time */
-#ifdef OpenMM_PLUGIN_DIR
+#ifdef OPENMM_PLUGIN_DIR
if (!hasLoadedPlugins)
{
loadedPlugins = Platform::loadPluginsFromDirectory(OPENMM_PLUGIN_DIR);
int nid_used;
int this_chainstart;
int prev_chainstart;
+ gmx_bool bMerged;
gmx_atomprop_t aps;
this_chainstart = 0;
pdb_ch=NULL;
+ bMerged = FALSE;
for (i=0; (i<natom); i++)
{
ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
prev_resnum = this_resnum;
prev_chainid = this_chainid;
prev_chainnumber = this_chainnumber;
- prev_chainstart = this_chainstart;
+ if (!bMerged)
+ {
+ prev_chainstart = this_chainstart;
+ }
this_atomname = *pdba_all.atomname[i];
this_atomnum = (pdba_all.pdbinfo != NULL) ? pdba_all.pdbinfo[i].atomnr : i+1;
select[0] = 'n';
}
- if (select[0] == 'y')
+ bMerged = (select[0] == 'y');
+ if (bMerged)
{
pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
- pdba_all.atom[i].resind - prev_chainstart;
+ pdba_all.atom[i].resind - prev_chainstart;
pdb_ch[nch-1].nterpairs++;
srenew(pdb_ch[nch-1].chainstart,pdb_ch[nch-1].nterpairs+1);
}
bHisMan,bArgMan,bGlnMan,angle,distance,&symtab,
nrtprename,rtprename);
- for(i=0; i<cc->nterpairs; i++) {
-
- cc->chainstart[cc->nterpairs] = pdba->nres;
-
- find_nc_ter(pdba,cc->chainstart[i],cc->chainstart[i+1],
- &(cc->r_start[i]),&(cc->r_end[i]),rt);
+ cc->chainstart[cc->nterpairs] = pdba->nres;
+ j = 0;
+ for(i=0; i<cc->nterpairs; i++)
+ {
+ find_nc_ter(pdba,cc->chainstart[i],cc->chainstart[i+1],
+ &(cc->r_start[j]),&(cc->r_end[j]),rt);
-
- if ( (cc->r_start[i]<0) || (cc->r_end[i]<0) ) {
- printf("Problem with chain definition, or missing terminal residues.\n"
- "This chain does not appear to contain a recognized chain molecule.\n"
- "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
-
- cc->nterpairs = i;
- break;
- }
- }
+ if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
+ {
+ j++;
+ }
+ }
+ cc->nterpairs = j;
+ if (cc->nterpairs == 0)
+ {
+ printf("Problem with chain definition, or missing terminal residues.\n"
+ "This chain does not appear to contain a recognized chain molecule.\n"
+ "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
+ }
/* Check for disulfides and other special bonds */
nssbonds = mk_specbonds(pdba,x,bCysMan,&ssbonds,bVerbose);
{
if(bTerMan && ntdblist>1)
{
- cc->ntdb[i] = choose_ter(ntdblist,tdblist,"Select start terminus type");
+ sprintf(select,"Select start terminus type for %s-%d",
+ *pdba->resinfo[cc->r_start[i]].name,
+ pdba->resinfo[cc->r_start[i]].nr);
+ cc->ntdb[i] = choose_ter(ntdblist,tdblist,select);
}
else
{
cc->ntdb[i] = tdblist[0];
}
- printf("Start terminus: %s\n",(cc->ntdb[i])->name);
+ printf("Start terminus %s-%d: %s\n",
+ *pdba->resinfo[cc->r_start[i]].name,
+ pdba->resinfo[cc->r_start[i]].nr,
+ (cc->ntdb[i])->name);
sfree(tdblist);
}
}
{
if(bTerMan && ntdblist>1)
{
- cc->ctdb[i] = choose_ter(ntdblist,tdblist,"Select end terminus type");
+ sprintf(select,"Select end terminus type for %s-%d",
+ *pdba->resinfo[cc->r_end[i]].name,
+ pdba->resinfo[cc->r_end[i]].nr);
+ cc->ctdb[i] = choose_ter(ntdblist,tdblist,select);
}
else
{
cc->ctdb[i] = tdblist[0];
}
- printf("End terminus: %s\n",(cc->ctdb[i])->name);
+ printf("End terminus %s-%d: %s\n",
+ *pdba->resinfo[cc->r_end[i]].name,
+ pdba->resinfo[cc->r_end[i]].nr,
+ (cc->ctdb[i])->name);
sfree(tdblist);
}
}
pdb2top(top_file2,posre_fn,molname,pdba,&x,atype,&symtab,
nrtp,restp,
restp_chain,hb_chain,
- cc->nterpairs,cc->ntdb,cc->ctdb,cc->r_start,cc->r_end,bAllowMissing,
+ cc->nterpairs,cc->ntdb,cc->ctdb,bAllowMissing,
bVsites,bVsiteAromatics,forcefield,ffdir,
mHmult,nssbonds,ssbonds,
long_bond_dist,short_bond_dist,bDeuterate,bChargeGroups,bCmap,
int nrtp, t_restp rtp[],
t_restp *restp, t_hackblock *hb,
int nterpairs,t_hackblock **ntdb, t_hackblock **ctdb,
- int *rn, int *rc, gmx_bool bAllowMissing,
+ gmx_bool bAllowMissing,
gmx_bool bVsites, gmx_bool bVsiteAromatics,
const char *ff, const char *ffdir,
real mHmult,
add_library(md ${MDLIB_SOURCES})
target_link_libraries(md gmx ${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${XML_LIBRARIES})
-set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION})
+set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
-install(TARGETS md DESTINATION ${LIB_INSTALL_DIR})
+install(TARGETS md DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libmd.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig RENAME "libmd${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc
+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+ RENAME "libmd${GMX_LIBS_SUFFIX}.pc"
+ COMPONENT development)
{
cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc_null,
&ncons_loc,&p_ssd,&p_max,&p_imax);
- lincsd->rmsd_data[0] = ncons_loc;
/* Check if we are doing the second part of SD */
if (ir->eI == eiSD2 && v == NULL)
{
lincsd->rmsd_data[0] = ncons_loc;
lincsd->rmsd_data[i] = p_ssd;
}
+ else
+ {
+ lincsd->rmsd_data[0] = 0;
+ lincsd->rmsd_data[1] = 0;
+ lincsd->rmsd_data[2] = 0;
+ }
if (bLog && fplog && lincsd->nc > 0)
{
fprintf(fplog,
static real vrescale_gamdev(int ia, gmx_rng_t rng)
/* Gamma distribution, adapted from numerical recipes */
{
- int j;
- real am,e,s,v1,v2,x,y;
-
- if (ia < 6) {
- x = 1.0;
- for(j=1; j<=ia; j++) {
- x *= gmx_rng_uniform_real(rng);
+ int j;
+ real am,e,s,v1,v2,x,y;
+
+ if (ia < 6)
+ {
+ /* Redraw while the product of the uniform numbers is exactly zero,
+ * because its logarithm is taken right after this loop.
+ */
+ do
+ {
+ x = 1.0;
+ for(j=1; j<=ia; j++)
+ {
+ x *= gmx_rng_uniform_real(rng);
+ }
+ }
+ while (x == 0);
+ x = -log(x);
+ }
+ else
+ {
+ do
+ {
+ do
+ {
+ do
+ {
+ v1 = gmx_rng_uniform_real(rng);
+ v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
+ }
+ while (v1*v1 + v2*v2 > 1.0 ||
+ v1*v1*GMX_REAL_MAX < 3.0*ia);
+ /* The last check above ensures that both x (3.0 > 2.0 in s)
+ * and the pre-factor for e do not go out of range.
+ */
+ y = v2/v1;
+ am = ia - 1;
+ s = sqrt(2.0*am + 1.0);
+ x = s*y + am;
+ }
+ while (x <= 0.0);
+ e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
+ }
+ while (gmx_rng_uniform_real(rng) > e);
}
- x = -log(x);
- } else {
- do {
- do {
- do {
- v1 = gmx_rng_uniform_real(rng);
- v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
- } while (v1*v1 + v2*v2 > 1.0);
- y = v2/v1;
- am = ia - 1;
- s = sqrt(2.0*am + 1.0);
- x = s*y + am;
- } while (x <= 0.0);
- e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
- } while (gmx_rng_uniform_real(rng) > e);
- }
- return x;
+ return x;
}
static real vrescale_sumnoises(int nn,gmx_rng_t rng)
if (ir->bRot)
{
/* Update the local rotation groups */
- dd_make_local_rotation_groups(dd,ir->rot,mdatoms);
+ dd_make_local_rotation_groups(dd,ir->rot);
}
char buf[30];
rc = 0;
-
+
if (index < 0)
{
gmx_fatal(FARGS,"Invalid index in pr_ebin: %d",index);
{
nener = index + nener;
}
- for(i=index; (i<nener) && rc>=0; ) {
- if (bPrHead)
+ for(i=index; (i<nener) && rc>=0; )
+ {
+ if (bPrHead)
{
- i0=i;
- for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
+ i0=i;
+ for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
{
if (strncmp(eb->enm[i].name,"Pres",4) == 0)
{
rc = fprintf(fp,"%15s",eb->enm[i].name);
}
}
-
- if (rc >= 0)
+
+ if (rc >= 0)
{
- rc = fprintf(fp,"\n");
+ rc = fprintf(fp,"\n");
}
-
- i=i0;
- }
- for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
+
+ i=i0;
+ }
+ for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
{
switch (prmode) {
- case eprNORMAL: ee = eb->e[i].e; break;
- case eprAVER: ee = eb->e_sim[i].esum/eb->nsum_sim; break;
- default: gmx_fatal(FARGS,"Invalid print mode %d in pr_ebin",prmode);
+ case eprNORMAL: ee = eb->e[i].e; break;
+ case eprAVER: ee = eb->e_sim[i].esum/eb->nsum_sim; break;
+ default: gmx_fatal(FARGS,"Invalid print mode %d in pr_ebin",
+ prmode);
}
-
- rc = fprintf(fp," %12.5e",ee);
- }
- if (rc >= 0)
+
+ rc = fprintf(fp," %12.5e",ee);
+ }
+ if (rc >= 0)
{
- rc = fprintf(fp,"\n");
+ rc = fprintf(fp,"\n");
}
- }
- if (rc < 0)
- {
- gmx_fatal(FARGS,"Cannot write to logfile; maybe you are out of quota?");
- }
+ }
+ if (rc < 0)
+ {
+ gmx_fatal(FARGS,"Cannot write to logfile; maybe you are out of quota?");
+ }
}
#ifdef DEBUGEBIN
return 0;
}
-typedef union {
- real numlog;
- int exp;
-} u_table;
-
-void fill_log_table(const int n, real *table)
-{
- u_table log_table;
- real logfactor;
- int i;
-
- int incr = 1 << (23-n);
- int p=pow(2,n);
-
- logfactor = 1.0/log(2.0);
-
- log_table.exp = 0x3F800000;
-
- for(i=0;i<p;++i)
- {
- /* log2(numlog)=log(numlog)/log(2.0) */
- table[i]=log(log_table.numlog)*logfactor;
- log_table.exp+=incr;
- }
-}
-
-
-real table_log(real val, const real *table, const int n)
-{
- int *const exp_ptr = ((int*)&val);
- int x = *exp_ptr;
- const int log_2 = ((x>>23) & 255) - 127;
- x &= 0x7FFFFF;
- x = x >> (23-n);
- val = table[x];
- return ((val+log_2)*0.69314718);
-}
void gb_pd_send(t_commrec *cr, real *send_data, int nr)
{
return 0;
}
-
-
-#define LOG_TABLE_ACCURACY 15 /* Accuracy of the table logarithm */
-
-
/* Initialize all GB datastructs and compute polarization energies */
int init_gb(gmx_genborn_t **p_born,
const t_commrec *cr, t_forcerec *fr, const t_inputrec *ir,
snew(born,1);
*p_born = born;
- born->nr = fr->natoms_force;
born->nr = natoms;
snew(born->drobc, natoms);
}
}
- /* Init the logarithm table */
- p=pow(2,LOG_TABLE_ACCURACY);
- snew(born->log_table, p);
-
- fill_log_table(LOG_TABLE_ACCURACY, born->log_table);
-
/* Allocate memory for work arrays for temporary use */
snew(born->work,natoms+4);
snew(born->count,natoms);
{
ai = nl->iinr[i];
- nj0 = nl->jindex[ai];
- nj1 = nl->jindex[ai+1];
+ nj0 = nl->jindex[i];
+ nj1 = nl->jindex[i+1];
/* Load shifts for this list */
shift = nl->shift[i];
sk2_rinv = sk2*rinv;
prod = 0.25*sk2_rinv;
- /* log_term = table_log(uij*lij_inv,born->log_table,
- LOG_TABLE_ACCURACY); */
log_term = log(uij*lij_inv);
tmp = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term +
log_term = log(uij*lij_inv);
- /* log_term = table_log(uij*lij_inv,born->log_table,LOG_TABLE_ACCURACY); */
tmp = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
if(rai < sk-dr)
n = 0;
rb = born->work;
-
- n0 = md->start;
- n1 = md->start+md->homenr+1+natoms/2;
-
+ n0 = 0;
+ n1 = natoms;
+
if(gb_algorithm==egbSTILL)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
+ rbi = born->bRad[i];
+ rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
}
}
else if(gb_algorithm==egbHCT)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = rbi * rbi * dvda[k];
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * dvda[i];
}
}
else if(gb_algorithm==egbOBC)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
}
}
#if ( defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || (defined(GMX_DOUBLE) && defined(GMX_SSE2)) )
if(fr->UseOptimizedKernels)
{
- calc_gb_chainrule_sse2_double(born->nr, &(fr->gblist), fr->dadx, fr->dvda,
+ calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
x[0], f[0], fr->fshift[0], fr->shift_vec[0],
gb_algorithm, born, md);
}
else
{
- calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda,
+ calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
}
#else
- calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda,
+ calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
#endif
/* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
if(fr->UseOptimizedKernels)
{
- calc_gb_chainrule_sse2_single(born->nr, &(fr->gblist), fr->dadx, fr->dvda,
+ calc_gb_chainrule_sse2_single(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
x[0], f[0], fr->fshift[0], fr->shift_vec[0],
gb_algorithm, born, md);
}
else
{
- calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda,
+ calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
}
#else
/* Calculate the forces due to chain rule terms with non sse code */
- calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda,
+ calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
#endif
#endif
natoms = mdatoms->nr;
ni0 = mdatoms->start;
- ni1 = mdatoms->homenr;
+ ni1 = mdatoms->start+mdatoms->homenr;
factor = 0.5*ONE_4PI_EPS0;
n = 0;
natoms = mdatoms->nr;
ni0 = mdatoms->start;
- ni1 = mdatoms->homenr;
+ ni1 = mdatoms->start+mdatoms->homenr;
n = 0;
prod = 0;
natoms = mdatoms->nr;
ni0 = mdatoms->start;
- ni1 = mdatoms->homenr;
+ ni1 = mdatoms->start+mdatoms->homenr;
dadx = fr->dadx;
aadata = (gmx_allvsallgb2_data_t *)work;
rb = born->work;
- jjnr = nl->jjnr;
-
+ jjnr = nl->jjnr;
+
/* Loop to get the proper form for the Born radius term, sse style */
- n0 = md->start;
- n1 = md->start+md->homenr+1+natoms/2;
+ n0 = 0;
+ n1 = natoms;
if(gb_algorithm==egbSTILL)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
+ rbi = born->bRad[i];
+ rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
}
}
else if(gb_algorithm==egbHCT)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = rbi * rbi * dvda[k];
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * dvda[i];
}
}
else if(gb_algorithm==egbOBC)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
+ /* Index with i, as in the Still and HCT loops: k is no longer
+ * assigned in this loop, so born->bRad[k] would read a stale index.
+ */
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
}
}
}
- jz = _mm_setzero_pd();
-
- n = j3A = j3B = 0;
-
+ jz = _mm_setzero_pd();
+
+ n = j3A = j3B = 0;
+
for(i=0;i<nl->nri;i++)
{
- ii = nl->iinr[i];
+ ii = nl->iinr[i];
ii3 = ii*3;
- is3 = 3*nl->shift[i];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = nl->jindex[i];
- nj1 = nl->jindex[i+1];
-
- ix = _mm_set1_pd(shX+x[ii3+0]);
+ is3 = 3*nl->shift[i];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = nl->jindex[i];
+ nj1 = nl->jindex[i+1];
+
+ ix = _mm_set1_pd(shX+x[ii3+0]);
iy = _mm_set1_pd(shY+x[ii3+1]);
iz = _mm_set1_pd(shZ+x[ii3+2]);
-
+
rbai = _mm_load1_pd(rb+ii);
fix = _mm_setzero_pd();
fiy = _mm_setzero_pd();
fiz = _mm_setzero_pd();
+
-
- for(k=nj0;k<nj1-1;k+=2)
+ for(k=nj0;k<nj1-1;k+=2)
{
jnrA = jjnr[k];
jnrB = jjnr[k+1];
-
- j3A = 3*jnrA;
+
+ j3A = 3*jnrA;
j3B = 3*jnrB;
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
-
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
+
dx = _mm_sub_pd(ix,jx);
dy = _mm_sub_pd(iy,jy);
dz = _mm_sub_pd(iz,jz);
-
- GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
-
+
+ GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
+
/* load chain rule terms for j1-4 */
f_gb = _mm_load_pd(dadx);
dadx += 2;
f_gb_ai = _mm_load_pd(dadx);
dadx += 2;
- /* calculate scalar force */
- f_gb = _mm_mul_pd(f_gb,rbai);
- f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
- f_gb = _mm_add_pd(f_gb,f_gb_ai);
-
- tx = _mm_mul_pd(f_gb,dx);
- ty = _mm_mul_pd(f_gb,dy);
- tz = _mm_mul_pd(f_gb,dz);
-
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
+ /* calculate scalar force */
+ f_gb = _mm_mul_pd(f_gb,rbai);
+ f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
+ f_gb = _mm_add_pd(f_gb,f_gb_ai);
+
+ tx = _mm_mul_pd(f_gb,dx);
+ ty = _mm_mul_pd(f_gb,dy);
+ tz = _mm_mul_pd(f_gb,dz);
+
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
}
-
+
/*deal with odd elements */
if(k<nj1)
{
- jnrA = jjnr[k];
- j3A = 3*jnrA;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
-
- /* load chain rule terms */
- f_gb = _mm_load_pd(dadx);
- dadx += 2;
- f_gb_ai = _mm_load_pd(dadx);
- dadx += 2;
-
- /* calculate scalar force */
- f_gb = _mm_mul_sd(f_gb,rbai);
- f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
- f_gb = _mm_add_sd(f_gb,f_gb_ai);
-
- tx = _mm_mul_sd(f_gb,dx);
- ty = _mm_mul_sd(f_gb,dy);
- tz = _mm_mul_sd(f_gb,dz);
-
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
+ jnrA = jjnr[k];
+ j3A = 3*jnrA;
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
+
+ dx = _mm_sub_sd(ix,jx);
+ dy = _mm_sub_sd(iy,jy);
+ dz = _mm_sub_sd(iz,jz);
+
+ GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
+
+ /* load chain rule terms */
+ f_gb = _mm_load_pd(dadx);
+ dadx += 2;
+ f_gb_ai = _mm_load_pd(dadx);
+ dadx += 2;
+
+ /* calculate scalar force */
+ f_gb = _mm_mul_sd(f_gb,rbai);
+ f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
+ f_gb = _mm_add_sd(f_gb,f_gb_ai);
+
+ tx = _mm_mul_sd(f_gb,dx);
+ ty = _mm_mul_sd(f_gb,dy);
+ tz = _mm_mul_sd(f_gb,dz);
+
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
}
-
+
/* fix/fiy/fiz now contain four partial force terms, that all should be
- * added to the i particle forces and shift forces.
- */
+ * added to the i particle forces and shift forces.
+ */
gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
}
-
+
return 0;
}
/* Loop to get the proper form for the Born radius term, sse style */
offset=natoms%4;
- n0 = md->start;
- n1 = md->start+md->homenr+1+natoms/2;
-
+ n0 = 0;
+ n1 = natoms;
+
if(gb_algorithm==egbSTILL)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
+ rbi = born->bRad[i];
+ rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
}
}
else if(gb_algorithm==egbHCT)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = rbi * rbi * dvda[k];
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * dvda[i];
}
}
else if(gb_algorithm==egbOBC)
{
for(i=n0;i<n1;i++)
{
- k = i % natoms;
- rbi = born->bRad[k];
- rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
}
}
#else
int
-gmx_fft_fftw2_empty;
-#endif /* GMX_FFT_FFTW2 */
+gmx_fft_fftw3_empty;
+#endif /* GMX_FFT_FFTW3 */
}
+/* Write one row of the cycle accounting table to fplog.
+ * Nothing is printed unless c is positive. The count column shows n when
+ * n is positive and is left blank otherwise; the last column is c as a
+ * percentage of tot.
+ */
static void print_cycles(FILE *fplog, double c2t, const char *name, int nnodes,
- int n, gmx_cycles_t c, gmx_cycles_t tot)
+ int n, double c, double tot)
{
- char num[11];
+ char num[11];
- if (c > 0) {
- if (n > 0)
- sprintf(num,"%10d",n);
- else
- sprintf(num," ");
- fprintf(fplog," %-19s %4d %10s %12.3f %10.1f %5.1f\n",
- name,nnodes,num,c*1e-9,c*c2t,100*(double)c/(double)tot);
- }
+ if (c > 0)
+ {
+ if (n > 0)
+ {
+ sprintf(num,"%10d",n);
+ }
+ else
+ {
+ sprintf(num," ");
+ }
+ fprintf(fplog," %-19s %4d %10s %12.3f %10.1f %5.1f\n",
+ name,nnodes,num,c*1e-9,c*c2t,100*c/tot);
+ }
}
static gmx_bool subdivision(int ewc)
}
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
- gmx_wallcycle_t wc, double cycles[])
+ gmx_wallcycle_t wc, double cycles[])
{
double c2t,tot,sum;
int i,j,npp;
static const char *boxs_nm[] = { "Box-X", "Box-Y", "Box-Z" };
-static const char *tricl_boxs_nm[] = { "Box-XX", "Box-YX", "Box-YY",
- "Box-ZX", "Box-ZY", "Box-ZZ" };
+/* Names of the six triclinic box entries: the three diagonal elements
+ * first, followed by YX, ZX and ZY.
+ */
+static const char *tricl_boxs_nm[] = {
+ "Box-XX", "Box-YY", "Box-ZZ",
+ "Box-YX", "Box-ZX", "Box-ZY"
+};
static const char *vol_nm[] = { "Volume" };
}
if (md->bDynBox)
{
- md->ib = get_ebin_space(md->ebin, md->bTricl ? NTRICLBOXS :
- NBOXS, md->bTricl ? tricl_boxs_nm : boxs_nm,
+ md->ib = get_ebin_space(md->ebin,
+ md->bTricl ? NTRICLBOXS : NBOXS,
+ md->bTricl ? tricl_boxs_nm : boxs_nm,
unit_length);
md->ivol = get_ebin_space(md->ebin, 1, vol_nm, unit_volume);
md->idens = get_ebin_space(md->ebin, 1, dens_nm, unit_density_SI);
}
if (md->bDynBox)
{
+ int nboxs;
if(md->bTricl)
{
bs[0] = box[XX][XX];
- bs[1] = box[YY][XX];
- bs[2] = box[YY][YY];
- bs[3] = box[ZZ][XX];
- bs[4] = box[ZZ][YY];
- bs[5] = box[ZZ][ZZ];
+ bs[1] = box[YY][YY];
+ bs[2] = box[ZZ][ZZ];
+ bs[3] = box[YY][XX];
+ bs[4] = box[ZZ][XX];
+ bs[5] = box[ZZ][YY];
+ nboxs=NTRICLBOXS;
}
else
{
bs[0] = box[XX][XX];
bs[1] = box[YY][YY];
bs[2] = box[ZZ][ZZ];
+ nboxs=NBOXS;
}
vol = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
dens = (tmass*AMU)/(vol*NANO*NANO*NANO);
}
}
}
- add_ebin(md->ebin,md->ib ,NBOXS,bs ,bSum);
+
+ add_ebin(md->ebin,md->ib ,nboxs,bs ,bSum);
add_ebin(md->ebin,md->ivol ,1 ,&vol ,bSum);
add_ebin(md->ebin,md->idens,1 ,&dens,bSum);
add_ebin(md->ebin,md->ipv ,1 ,&pv ,bSum);
order = pme->pme_order;
- thx = atc->theta[XX];
- thy = atc->theta[YY];
- thz = atc->theta[ZZ];
energy = 0;
for(n=0; (n<atc->n); n++) {
/* We only use the A-charges grid */
grid = pme->pmegridA;
- spread_on_grid(pme,atc,grid,TRUE,FALSE);
+ spread_on_grid(pme,atc,NULL,TRUE,FALSE);
*V = gather_energy_bsplines(pme,grid,atc);
}
}
atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
}
+ else
+ {
+ /* This could be necessary for TPI */
+ pme->atc[0].n = homenr;
+ }
for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
if (q == 0) {
}
#endif
where();
+
+ unwrap_periodic_pmegrid(pme,grid);
}
if (flags & GMX_PME_CALC_F)
{
- unwrap_periodic_pmegrid(pme,grid);
-
/* interpolate forces for our local atoms */
GMX_BARRIER(cr->mpi_comm_mygroup);
GMX_MPE_LOG(ev_gather_f_bsplines_start);
#include "gmxfio.h"
#include "mpelogging.h"
#include "groupcoord.h"
+#include "pull_rotation.h"
#include "gmx_sort.h"
}
-extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot,t_mdatoms *md)
+extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot)
{
gmx_ga2la_t ga2la;
int g;
endforeach(PROG)
install(TARGETS ${NGMX_PROGRAMS}
+ COMPONENT ngmx
RUNTIME DESTINATION ${BIN_INSTALL_DIR})
endif(X11_FOUND)
g_helixorient g_principal g_dipoles g_disre g_dist
g_dyndom g_enemat g_energy g_lie g_filter g_gyrate
g_h2order g_hbond g_helix g_mindist g_msd g_morph g_nmeig
- g_nmens g_order g_polystat g_potential g_rama g_rdf g_rms
+ g_nmens g_order g_kinetics g_polystat g_potential g_rama g_rdf g_rms
g_rmsf g_rotacf g_saltbr g_sas g_select g_sgangle g_sham g_sorient
g_spol g_spatial g_tcaf g_traj g_tune_pme g_vanhove
g_velacc g_clustsize g_mdmat g_wham g_sigeps g_bar
endforeach(TOOL ${GMX_TOOLS_PROGRAMS})
+install(TARGETS gmxana DESTINATION ${LIB_INSTALL_DIR} COMPONENT runtime)
install(TARGETS ${GMX_TOOLS_PROGRAMS}
- gmxana DESTINATION ${LIB_INSTALL_DIR}
- RUNTIME DESTINATION ${BIN_INSTALL_DIR})
+ DESTINATION ${BIN_INSTALL_DIR}
+ COMPONENT runtime)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmxana.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmxana.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxana.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig RENAME "libgmxana${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxana.pc
+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+ RENAME "libgmxana${GMX_LIBS_SUFFIX}.pc"
+ COMPONENT development)
(blk->sub[0].nr < 1) ||
(blk->sub[1].nr < 1) )
{
- gmx_fatal(FARGS, "Unexpected block data in file %s", filename);
+ gmx_fatal(FARGS,
+ "Unexpected/corrupted block data in file %s around time %g.",
+ filename, start_time);
}
derivative = blk->sub[0].ival[0];
( (derivative!=0) != (s->derivative!=0) ) )
{
fprintf(stderr, "Got foreign lambda=%g, expected: %g\n",
- s->foreign_lambda, foreign_lambda);
- fprintf(stderr, "Got derivative=%d, derivative: %d\n",
+ foreign_lambda, s->foreign_lambda);
+ fprintf(stderr, "Got derivative=%d, expected: %d\n",
derivative, s->derivative);
- gmx_fatal(FARGS, "Inconsistent data in file %s around t=%g", filename,
- start_time);
+ gmx_fatal(FARGS, "Corrupted data in file %s around t=%g.",
+ filename, start_time);
}
/* make room for the data */
(blk->sub[0].nr < 2) ||
(blk->sub[1].nr < 2) )
{
- gmx_fatal(FARGS, "Unexpected block data in file %s", filename);
+ gmx_fatal(FARGS,
+ "Unexpected/corrupted block data in file %s around time %g",
+ filename, start_time);
}
nhist=blk->nsub-2;
}
if (nhist > 2)
{
- gmx_fatal(FARGS, "Unexpected block data in file %s", filename);
+ gmx_fatal(FARGS,
+ "Unexpected/corrupted block data in file %s around time %g",
+ filename, start_time);
}
snew(s, 1);
srenew(hb0->h[0],4+nnframes/hb->wordlen);
srenew(hb0->g[0],4+nnframes/hb->wordlen);
}
- clearPshift(&(hb->per->pHist[a1][a2]));
+ if (NULL != hb->per->pHist)
+ {
+ clearPshift(&(hb->per->pHist[a1][a2]));
+ }
/* Copy temp array to target array */
for(m=0; (m<=nnframes); m++) {
trrStatus = (read_next_x(oenv,status,&t,natoms,x,box));
nframes++; /* + */
} /* + */
-#ifdef HAVE_OPENMP /* ++++++++++++++++� */
+#ifdef HAVE_OPENMP /* +++++++++++++++++ */
#pragma omp barrier
#endif
} while (trrStatus);
for (n = 0; n < nr_grps; n++)
{
+ /* Check whether we actually have all positions of the requested index
+ * group in the trajectory file */
+ if (gnx[n] > natoms)
+ {
+ gmx_fatal(FARGS, "You selected a group with %d atoms, but only %d atoms\n"
+ "were found in the trajectory.\n", gnx[n], natoms);
+ }
for (i = 0; i < gnx[n]; i++) /* loop over all atoms in index file */
{
if (bSpherical)
for (n = 0; n < nr_grps; n++)
{
fprintf(pot," %20.16g", potential[n][slice]);
- fprintf(fie," %20.16g", field[n][slice]);
+ fprintf(fie," %20.16g", field[n][slice]/1e9); /* convert to V/nm */
fprintf(cha," %20.16g", charge[n][slice]);
}
fprintf(pot,"\n");
static gmx_bool check_have_atoms(t_atoms *atoms, char *string)
{
if ( atoms==NULL ) {
- printf("Can not process '%s' without atoms info\n", string);
+ printf("Can not process '%s' without atom info, use option -f\n", string);
return FALSE;
} else
return TRUE;