# Keep CMake suitably quiet on Cygwin
set(CMAKE_LEGACY_CYGWIN_WIN32 0) # Remove when CMake >= 2.8.4 is required
-# override bugs on OS X where Cmake picks gcc (GNU) for C instead of system default cc (Clang).
-if(APPLE)
- set(CMAKE_C_COMPILER_INIT "cc")
-endif(APPLE)
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+# This needs to be set before project() in order to pick up toolchain files
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Platform)
project(Gromacs)
include(Dart)
# provide backward compatibility of software written against the Gromacs API.
set(API_VERSION ${NUM_VERSION})
-# Cmake modules/macros are in a subdirectory to keep this file cleaner
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
endif()
endif()
########################################################################
-set(CMAKE_PREFIX_PATH "" CACHE STRING "Extra locations to search for external libraries and tools (give directory without lib, bin, or include)")
-# Fix stupid flags on Windows
-########################################################################
-SET(SHARED_LIBS_DEFAULT ON)
-IF( WIN32 AND NOT CYGWIN)
- option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
- mark_as_advanced(GMX_PREFER_STATIC_LIBS)
- SET(SHARED_LIBS_DEFAULT OFF) #is currently not working on Windows
- # This makes windows.h not declare min/max as macros that would break
- # C++ code using std::min/std::max.
- add_definitions(-DNOMINMAX)
-
- IF (GMX_PREFER_STATIC_LIBS)
- #Only setting Debug and Release flags. Others configurations current not used.
- STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
- SET(CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE} CACHE STRING "" FORCE)
- STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
- SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
- STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
- SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
- STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
- SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
- ENDIF()
-
- #Workaround for cmake bug 13174. Replace deprecated options.
- IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
- STRING(REPLACE /GZ /RTC1 CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
- SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
- ENDIF()
- IF( CMAKE_CXX_COMPILER_ID MATCHES "Intel" )
- STRING(REPLACE /GZ /RTC1 CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
- STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
- SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
-
- STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
- SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
- ENDIF()
-ENDIF()
-
+option(GMX_COOL_QUOTES "Enable Gromacs cool quotes" ON)
+mark_as_advanced(GMX_COOL_QUOTES)
-########################################################################
+set(CMAKE_PREFIX_PATH "" CACHE STRING "Extra locations to search for external libraries and tools (give directory without lib, bin, or include)")
# User input options #
########################################################################
option(GMX_DOUBLE "Use double precision (much slower, use only if you really need it)" OFF)
option(GMX_THREAD_MPI "Build a thread-MPI-based multithreaded version of GROMACS (not compatible with MPI)" ON)
option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
mark_as_advanced(GMX_SOFTWARE_INVSQRT)
-option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" OFF)
-mark_as_advanced(GMX_POWERPC_INVSQRT)
option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
mark_as_advanced(GMX_FAHCORE)
include(gmxDetectAcceleration)
if(NOT DEFINED GMX_CPU_ACCELERATION)
if(CMAKE_CROSSCOMPILING)
- set(GMX_SUGGESTED_CPU_ACCELERATION "None")
+ if("${CMAKE_SYSTEM_NAME}" MATCHES "BlueGeneQ")
+ set(GMX_SUGGESTED_CPU_ACCELERATION "IBM_QPX")
+ else()
+ set(GMX_SUGGESTED_CPU_ACCELERATION "None")
+ endif()
else(CMAKE_CROSSCOMPILING)
gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
endif(CMAKE_CROSSCOMPILING)
endif(NOT DEFINED GMX_CPU_ACCELERATION)
set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
- CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
+ CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
set(GMX_FFT_LIBRARY "fftw3"
CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
option(GMX_CYCLE_SUBCOUNTERS "Enable cycle subcounters to get a more detailed cycle timings" OFF)
mark_as_advanced(GMX_CYCLE_SUBCOUNTERS)
+option(GMX_SKIP_DEFAULT_CFLAGS "Don't automatically add suggested/required Compiler flags." OFF)
+mark_as_advanced(GMX_SKIP_DEFAULT_CFLAGS)
+
######################################################################
# Compiler tests
# These need to be done early (before further tests).
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
+# First exclude compilers known to not work with OpenMP although claim to support it:
+# gcc 4.2.1 and gcc-llvm 4.2.1 (also claims to be 4.2.1) on Mac OS X
+# This fixes redmine 900 and needs to run before OpenMP flags are set below.
+message("CMAKE_COMPILER_IS_GNUCC: ${CMAKE_COMPILER_IS_GNUCC}")
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND
+ CMAKE_COMPILER_IS_GNUCC AND C_COMPILER_VERSION VERSION_LESS 4.3)
+ message(STATUS "OpenMP multithreading not supported with gcc/llvm-gcc 4.2 on Mac OS X, disabled")
+ set(GMX_OPENMP OFF CACHE BOOL
+        "OpenMP multithreading not supported with gcc/llvm-gcc 4.2 on Mac OS X, disabled!" FORCE)
+endif()
+
# OpenMP check must come before other CFLAGS!
if(GMX_OPENMP)
find_package(OpenMP)
if(GMX_SOFTWARE_INVSQRT)
set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
endif(GMX_SOFTWARE_INVSQRT)
-if(GMX_POWERPC_INVSQRT)
- set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
-endif(GMX_POWERPC_INVSQRT)
-
-########################################################################
-#Process MPI settings
-########################################################################
-include(gmxManageMPI)
#######################################################################
# Check for options incompatible with OpenMM build #
test_big_endian(GMX_INTEGER_BIG_ENDIAN)
+if(APPLE OR CYGWIN OR ${CMAKE_SYSTEM_NAME} MATCHES "Linux|.*BSD")
+ # Maybe Solaris should be here? Patch this if you know!
+ SET(SHARED_LIBS_DEFAULT ON)
+elseif(WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "BlueGene")
+ # Support for shared libs on native Windows is a bit new. Its
+ # default might change later if/when we sort things out. Also,
+ # Cray should go here. What variable value can detect it?
+ SET(SHARED_LIBS_DEFAULT OFF)
+else()
+ message(STATUS "Defaulting to building static libraries")
+ SET(SHARED_LIBS_DEFAULT OFF)
+endif()
+
+# Management of GROMACS options for specific toolchains should go
+# here. Because the initial settings for some of the main options have
+# already happened, but things like library detection and MPI compiler
+# feature detection have not, the docstrings for any over-rides of
+# GROMACS defaults or user settings will make sense. Also, any
+# toolchain-related reasons for choosing whether to detect various
+# things can be sorted out now, before the detection takes place.
+if(${CMAKE_SYSTEM_NAME} MATCHES BlueGene)
+ include(gmxManageBlueGene)
+endif()
+
+if(UNIX AND GMX_PREFER_STATIC_LIBS AND SHARED_LIBS_DEFAULT)
+ if(BUILD_SHARED_LIBS)
+ # Warn the user about the combination. But don't overwrite the request.
+ message(WARNING "Searching for static libraries requested, and building shared Gromacs libraries requested. This might cause problems linking later.")
+ elseif(NOT DEFINED BUILD_SHARED_LIBS)
+ # Change default to OFF. Don't warn if it's already off.
+ message(WARNING "Searching for static libraries requested, so the GROMACS libraries will also be built statically (BUILD_SHARED_LIBS=OFF)")
+ set(SHARED_LIBS_DEFAULT OFF)
+ endif()
+endif()
+
+# By now, all tool chains should have spoken up if they care about
+# the setting of SHARED_LIBS_DEFAULT.
+option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic e.g. with MPI, or on some HPC systems)" ${SHARED_LIBS_DEFAULT})
+########################################################################
+#Process MPI settings
+########################################################################
+include(gmxManageMPI)
########################################################################
# Find external packages #
########################################################################
-if(UNIX)
- if(GMX_PREFER_STATIC_LIBS)
- # On Linux .a is the static library suffix, on Mac OS X .lib can also
- # be used, so we'll add both to the preference list.
- SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib;.a" ${CMAKE_FIND_LIBRARY_SUFFIXES})
- if(SHARED_LIBS_DEFAULT)
- if(BUILD_SHARED_LIBS) #Warn the user about the combination. But don't overwrite the request.
- message(WARNING "Static libraries requested, and shared Gromacs libraries requested.")
- elseif(NOT DEFINED BUILD_SHARED_LIBS) #Change default to OFF. Don't warn if it's already off.
- message(WARNING "Static libraries requested, the GROMACS libraries will also be build static (BUILD_SHARED_LIBS=OFF)")
- set(SHARED_LIBS_DEFAULT OFF)
- endif()
- endif()
- endif()
+if(UNIX AND GMX_PREFER_STATIC_LIBS)
+ # On Linux .a is the static library suffix, on Mac OS X .lib can also
+ # be used, so we'll add both to the preference list.
+ SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib;.a" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
-option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic with MPI, Windows)" ${SHARED_LIBS_DEFAULT})
+
+IF( WIN32 AND NOT CYGWIN)
+ # This makes windows.h not declare min/max as macros that would break
+ # C++ code using std::min/std::max.
+ add_definitions(-DNOMINMAX)
+
+ if (NOT BUILD_SHARED_LIBS)
+ option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
+ if(NOT GMX_PREFER_STATIC_LIBS)
+ message(WARNING "Shared system libraries requested, and static Gromacs libraries requested.")
+ endif()
+ else()
+ message(FATAL_ERROR "BUILD_SHARED_LIBS not yet working for Windows in the master branch")
+ option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" OFF)
+ if(GMX_PREFER_STATIC_LIBS)
+      #this combination segfaults (illegal passing of file handles)
+ message(FATAL_ERROR "Static system libraries requested, and shared Gromacs libraries requested.")
+ endif()
+ add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
+      set(PKG_CFLAGS "${PKG_CFLAGS} -DUSE_VISIBILITY -DTMPI_USE_VISIBILITY")
+ endif()
+ mark_as_advanced(GMX_PREFER_STATIC_LIBS)
+
+ IF (GMX_PREFER_STATIC_LIBS)
+    #Only setting Debug and Release flags. Other configurations are currently not used.
+ STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+ STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+ if(CMAKE_CXX_COMPILER_LOADED)
+ STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+ STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+ endif()
+ ENDIF()
+ IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
+ if(BUILD_SHARED_LIBS) #not sure why incremental building with shared libs doesn't work
+ STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+ endif()
+ ENDIF()
+ENDIF()
option(GMX_XML "Use libxml2 to parse xml files" ON)
if (GMX_XML)
gmx_test__finite(HAVE__FINITE)
include(gmxTestCXX11)
-gmx_test_cxx11(GMX_CXX11 CXX11_FLAG)
-set(GROMACS_CXX_FLAGS "${CXX11_FLAG} ${GROMACS_CXX_FLAGS}")
+gmx_test_cxx11(GMX_CXX11 GMX_CXX11_FLAGS)
if(GMX_CXX11_FLAGS AND GMX_GPU)
#FIXME: add proper solution for progate all but cxx11 flag
set(CUDA_PROPAGATE_HOST_FLAGS no)
set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_INTERNAL_XDR")
endif(NOT GMX_SYSTEM_XDR)
+# include avx test source, used if the AVX flags are set below
+include(gmxTestAVXMaskload)
+
# Process nonbonded accelerated kernels settings
string(TOUPPER ${GMX_CPU_ACCELERATION} GMX_CPU_ACCELERATION)
if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
# nothing to do
elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
- GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" ACCELERATION_C_FLAGS)
if(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
endif(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" ACCELERATION_CXX_FLAGS)
if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
    # We don't warn for lacking SSE2 flag support, since that is probably standard today.
# Only test the include after we have tried to add the correct flag for SSE2 support
- check_include_file(emmintrin.h HAVE_EMMINTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(emmintrin.h HAVE_EMMINTRIN_H ${ACCELERATION_C_FLAGS})
if(NOT HAVE_EMMINTRIN_H)
message(FATAL_ERROR "Cannot find emmintrin.h, which is required for SSE2 intrinsics support.")
elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
- GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" ACCELERATION_C_FLAGS)
if (NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" ACCELERATION_C_FLAGS)
endif(NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
- message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
# Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
- # intrinsics when SSE2 support is enabled, so we try that instead.
+ # intrinsics when SSE2 support is enabled, so we try that instead first.
if (GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
+            message(WARNING "Neither SSE4.1 nor SSE2 seems to be supported by your Windows compiler. Something is likely broken.")
+ else()
+ message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance")
endif()
endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
    GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" ACCELERATION_CXX_FLAGS)
if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
# Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
# intrinsics when SSE2 support is enabled, so we try that instead.
if (GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
endif()
endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
# This must come after we have added the -msse4.1 flag on some platforms.
- check_include_file(smmintrin.h HAVE_SMMINTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(smmintrin.h HAVE_SMMINTRIN_H ${ACCELERATION_C_FLAGS})
if(NOT HAVE_SMMINTRIN_H)
message(FATAL_ERROR "Cannot find smmintrin.h, which is required for SSE4.1 intrinsics support.")
# Set the AVX compiler flag for both these choices!
- GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" ACCELERATION_C_FLAGS)
if (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" ACCELERATION_C_FLAGS)
endif (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
endif (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
- GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
# Set the FMA4 flags (MSVC doesn't require any)
if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
- GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
+    if (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+ message(FATAL_ERROR "Clang up to at least version 3.2 produces incorrect code for AVX_128_FMA. Sorry, but you will have to select a different compiler or acceleration.")
+ endif()
+ GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" ACCELERATION_C_FLAGS)
if (NOT GNU_FMA_CFLAG)
message(WARNING "No C FMA4 flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
endif(NOT GNU_FMA_CFLAG)
- GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" ACCELERATION_C_FLAGS)
# No big deal if we do not have xop, so no point yelling warnings about it.
if (CMAKE_CXX_COMPILER_LOADED)
- GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" ACCELERATION_CXX_FLAGS)
if (NOT GNU_FMA_CXXFLAG)
message(WARNING "No C++ FMA flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
endif (NOT GNU_FMA_CXXFLAG)
- GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" ACCELERATION_CXX_FLAGS)
# No big deal if we do not have xop, so no point yelling warnings about it.
endif()
endif()
# Only test the header after we have tried to add the flag for AVX support
- check_include_file(immintrin.h HAVE_IMMINTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(immintrin.h HAVE_IMMINTRIN_H ${ACCELERATION_C_FLAGS})
if(NOT HAVE_IMMINTRIN_H)
message(FATAL_ERROR "Cannot find immintrin.h, which is required for AVX intrinsics support. Consider switching compiler.")
if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_256")
try_compile(TEST_AVX ${CMAKE_BINARY_DIR}
"${CMAKE_SOURCE_DIR}/cmake/TestAVX.c"
- COMPILE_DEFINITIONS "${GROMACS_C_FLAGS}")
+ COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}")
if(NOT TEST_AVX)
message(FATAL_ERROR "Cannot compile AVX intrinsics. Consider switching compiler.")
endif()
endif()
# GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
- check_include_file(x86intrin.h HAVE_X86INTRIN_H ${GROMACS_C_FLAGS})
- check_include_file(intrin.h HAVE_INTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(x86intrin.h HAVE_X86INTRIN_H ${ACCELERATION_C_FLAGS})
+ check_include_file(intrin.h HAVE_INTRIN_H ${ACCELERATION_C_FLAGS})
# The user should not be able to set this orthogonally to the acceleration
set(GMX_X86_SSE4_1 1)
endif()
endif()
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
-# GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
- if (NOT ACCELERATION_QUIETLY)
- message(STATUS "Configuring for BlueGene")
+ # Unfortunately gcc-4.5.2 and gcc-4.6.0 has a bug where they use the wrong datatype for the formal
+ # parameter of the mask for maskload/maskstore arguments. Check if this is present, since we can work around it.
+ gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG)
+
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX")
+ # Used on BlueGene/Q
+ if (CMAKE_C_COMPILER_ID MATCHES "XL")
+ GMX_TEST_CFLAG(XLC_BLUEGENEQ_CFLAG "-qarch=qp -qtune=qp" ACCELERATION_C_FLAGS)
+ try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
+ "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c"
+ COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}")
+ if(NOT TEST_QPX)
+ message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics.")
+ endif()
+ endif()
+ if (CMAKE_CXX_COMPILER_ID MATCHES "XL" AND CMAKE_CXX_COMPILER_LOADED)
+ GMX_TEST_CXXFLAG(XLC_BLUEGENEQ_CXXFLAG "-qarch=qp -qtune=qp" ACCELERATION_CXX_FLAGS)
+        try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
+            "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c"
+            COMPILE_DEFINITIONS "${ACCELERATION_CXX_FLAGS}")
+ if(NOT TEST_QPX)
+ message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics.")
+ endif()
endif()
- set(GMX_BLUEGENE 1)
- if (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
- set(SHARED_LIBS_DEFAULT OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
- endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
- set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
- set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
- set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
- set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI not compatible with BlueGene, disabled!" FORCE)
- set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
-# Access to /etc/passwd is not available on the back end of BlueGene,
-# despite being detected by CMake. This can cause linker warnings
-# about harmless things in src/gmxlib/string2.h.
- set(HAVE_PWD_H OFF)
-# The automatic testing for endianness does not work for the BlueGene cross-compiler
- set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
- set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
- set(GMX_POWER6 1)
- set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
- set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
+
+ if (TEST_QPX)
+ message(WARNING "IBM QPX acceleration was selected and could be compiled, but the accelerated kernels are not yet available.")
+ set(GMX_CPU_ACCELERATION_IBM_QPX 1)
+ else()
+ message(FATAL_ERROR "Cannot compile IBM QPX intrinsics without the XL compiler. If you are compiling for BlueGene/Q, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C' to set up the tool chain.")
+ endif()
+
else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
- MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
+ MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
set(COREWRAP_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/../corewrap" CACHE STRING
"Path to swindirect.h")
include_directories(${COREWRAP_INCLUDE_DIR})
+  set_property(CACHE GMX_COOL_QUOTES PROPERTY VALUE OFF)
endif(GMX_FAHCORE)
# # # # # # # # # # NO MORE TESTS AFTER THIS LINE! # # # # # # # # # # #
# these are set after everything else
-if (NOT DEFINED GROMACS_C_FLAGS_SET)
- set(GROMACS_C_FLAGS_SET true CACHE INTERNAL "Whether to reset the C flags"
- FORCE)
- set(CMAKE_C_FLAGS "${GROMACS_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING
- "Flags used by the compiler during all build types" FORCE)
- set(CMAKE_CXX_FLAGS "${GROMACS_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING
- "Flags used by the compiler during all build types" FORCE)
- set(CMAKE_EXE_LINKER_FLAGS
- "${GROMACS_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}"
- CACHE STRING "Linker flags for creating executables" FORCE)
- set(CMAKE_SHARED_LINKER_FLAGS
- "${GROMACS_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}"
- CACHE STRING "Linker flags for creating shared libraries" FORCE)
-endif (NOT DEFINED GROMACS_C_FLAGS_SET)
+if (NOT GMX_SKIP_DEFAULT_CFLAGS)
+ set(CMAKE_C_FLAGS "${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
+ set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMX_CXX11_FLAGS} ${CMAKE_CXX_FLAGS}")
+ set(CMAKE_EXE_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+ set(CMAKE_SHARED_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+else()
+ message("Recommended flags which are not added because GMX_SKIP_DEFAULT_CFLAGS=yes:")
+ message("CMAKE_C_FLAGS: ${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CFLAGS}")
+ message("CMAKE_C_FLAGS_RELEASE: ${GMXC_CFLAGS_RELEASE}")
+ message("CMAKE_C_FLAGS_DEBUG: ${GMXC_CFLAGS_DEBUG}")
+ message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMX_CXX11_FLAGS} ${GMXC_CXXFLAGS}")
+ message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
+ message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
+ message("CMAKE_EXE_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+ message("CMAKE_SHARED_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+endif()
if(NOT GMX_OPENMP)
#Unset all OpenMP flags in case OpenMP was disabled either by the user
set(GMX_EXE_LINKER_FLAGS ${GMX_EXE_LINKER_FLAGS} ${OpenMP_LINKER_FLAGS})
set(GMX_SHARED_LINKER_FLAGS ${GMX_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS})
endif()
+set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
######################################
# Output compiler and CFLAGS used
HEAD|Analyzing bonded interactions
g_angle|calculates distributions and correlations for angles and dihedrals
g_bond|calculates bond length distributions
-g_dih|analyzes dihedral transitions
mk_angndx|generates index files for g_angle
END
endif (${FFTW}_FOUND)
set(${FFTW}_HAVE_SIMD FALSE CACHE BOOL "If ${${FFTW}_PKG} was built with SIMD support")
-mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD)
+mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD ${FFTW}_HAVE_AVX)
--- /dev/null
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+#
+# BlueGeneQ base platform file.
+#
+# NOTE: Do not set your platform to "BlueGeneQ-base". This file is included
+# by the real platform files. Use one of these two platforms instead:
+#
+# BlueGeneQ-dynamic For dynamically linked builds
+# BlueGeneQ-static For statically linked builds
+#
+# This platform file tries its best to adhere to the behavior of the MPI
+# compiler wrappers included with the latest BG/Q drivers.
+#
+
+
+#
+# For BG/Q builds, we're cross compiling, but we don't want to re-root things
+# (e.g. with CMAKE_FIND_ROOT_PATH) because users may have libraries anywhere on
+# the shared filesystems, and this may lie outside the root. Instead, we set the
+# system directories so that the various system BG/Q CNK library locations are
+# searched first. This is not the clearest thing in the world, given IBM's driver
+# layout, but this should cover all the standard ones.
+#
+set(CMAKE_SYSTEM_LIBRARY_PATH
+ /bgsys/drivers/ppcfloor/comm/xl/lib # default comm layer (used by mpi compiler wrappers)
+ /bgsys/drivers/ppcfloor/spi/lib/ # other low-level stuff
+ /bgsys/drivers/ppcfloor/gnu-linux/powerpc64-bgq-linux/lib # CNK Linux image -- standard runtime libs, pthread, etc.
+)
+
+#
+# This adds directories that find commands should specifically ignore for cross compiles.
+# Most of these directories are the include and lib directories for the frontend on BG/Q systems.
+# Not ignoring these can cause things like FindX11 to find a frontend PPC version mistakenly.
+# We use this on BG instead of re-rooting because backend libraries are typically strewn about
+# the filesystem, and we can't re-root ALL backend libraries to a single place.
+#
+set(CMAKE_SYSTEM_IGNORE_PATH
+ /lib /lib64 /include
+ /usr/lib /usr/lib64 /usr/include
+ /usr/local/lib /usr/local/lib64 /usr/local/include
+ /usr/X11/lib /usr/X11/lib64 /usr/X11/include
+ /usr/lib/X11 /usr/lib64/X11 /usr/include/X11
+ /usr/X11R6/lib /usr/X11R6/lib64 /usr/X11R6/include
+ /usr/X11R7/lib /usr/X11R7/lib64 /usr/X11R7/include
+)
+
+#
+# Indicate that this is a unix-like system
+#
+set(UNIX 1)
+
+#
+# Library prefixes, suffixes, extra libs.
+#
+set(CMAKE_LINK_LIBRARY_SUFFIX "")
+set(CMAKE_STATIC_LIBRARY_PREFIX "lib") # lib
+set(CMAKE_STATIC_LIBRARY_SUFFIX ".a") # .a
+
+set(CMAKE_SHARED_LIBRARY_PREFIX "lib") # lib
+set(CMAKE_SHARED_LIBRARY_SUFFIX ".so") # .so
+set(CMAKE_EXECUTABLE_SUFFIX "") # .exe
+set(CMAKE_DL_LIBS "dl")
+
+#
+# This macro needs to be called for dynamic library support. Unfortunately on BG/Q,
+# We can't support both static and dynamic links in the same platform file. The
+# dynamic link platform file needs to call this explicitly to set up dynamic linking.
+#
+macro(__BlueGeneQ_set_dynamic_flags compiler_id lang)
+ if (${compiler_id} STREQUAL XL)
+ # Flags for XL compilers if we explicitly detected XL
+ set(CMAKE_SHARED_LIBRARY_${lang}_FLAGS "-qpic")
+ set(CMAKE_SHARED_LIBRARY_CREATE_${lang}_FLAGS "-qmkshrobj -qnostaticlink")
+ set(BG/Q_${lang}_DYNAMIC_EXE_FLAGS "-qnostaticlink -qnostaticlink=libgcc")
+ else()
+ # Assume flags for GNU compilers (if the ID is GNU *or* anything else).
+ set(CMAKE_SHARED_LIBRARY_${lang}_FLAGS "-fPIC")
+ set(CMAKE_SHARED_LIBRARY_CREATE_${lang}_FLAGS "-shared")
+ set(BG/Q_${lang}_DYNAMIC_EXE_FLAGS "-dynamic")
+ endif()
+
+ # Both toolchains use the GNU linker on BG/Q, so these options are shared.
+ set(CMAKE_SHARED_LIBRARY_RUNTIME_${lang}_FLAG "-Wl,-rpath,")
+ set(CMAKE_SHARED_LIBRARY_RPATH_LINK_${lang}_FLAG "-Wl,-rpath-link,")
+ set(CMAKE_SHARED_LIBRARY_SONAME_${lang}_FLAG "-Wl,-soname,")
+ set(CMAKE_EXE_EXPORTS_${lang}_FLAG "-Wl,--export-dynamic")
+ set(CMAKE_SHARED_LIBRARY_LINK_${lang}_FLAGS "") # +s, flag for exe link to use shared lib
+ set(CMAKE_SHARED_LIBRARY_RUNTIME_${lang}_FLAG_SEP ":") # : or empty
+
+ set(BG/Q_${lang}_DEFAULT_EXE_FLAGS
+ "<FLAGS> <CMAKE_${lang}_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+ set(CMAKE_${lang}_LINK_EXECUTABLE
+ "<CMAKE_${lang}_COMPILER> ${BG/Q_${lang}_DYNAMIC_EXE_FLAGS} ${BG/Q_${lang}_DEFAULT_EXE_FLAGS}")
+endmacro()
+
+#
+# This macro needs to be called for static builds. Right now it just adds -Wl,-relax
+# to the link line.
+#
+macro(__BlueGeneQ_set_static_flags compiler_id lang)
+ set(BG/Q_${lang}_DEFAULT_EXE_FLAGS
+ "<FLAGS> <CMAKE_${lang}_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+ set(CMAKE_${lang}_LINK_EXECUTABLE
+ "<CMAKE_${lang}_COMPILER> ${BG/Q_${lang}_DEFAULT_EXE_FLAGS}")
+endmacro()
--- /dev/null
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+include(BlueGeneQ-static)
+__BlueGeneQ_set_static_flags(XL C)
+
+set(CMAKE_SYSTEM_NAME BlueGeneQ-static)
+# xl.ndebug is appropriate for production calculations. For debugging,
+# use xl to add back error checks and assertions
+set(CMAKE_C_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpicc)
+set(CMAKE_C_FLAGS_RELEASE "-O4 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+
+mark_as_advanced(CMAKE_XL_CreateExportList) # No idea what spams this
--- /dev/null
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+include(BlueGeneQ-static)
+__BlueGeneQ_set_static_flags(XL CXX)
+
+set(CMAKE_SYSTEM_NAME BlueGeneQ-static CACHE STRING "Cross-compiling for BlueGene/Q" FORCE)
+# xl.ndebug is appropriate for production calculations. For debugging,
+# use xl to add back error checks and assertions
+set(CMAKE_CXX_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpicxx)
+set(CMAKE_CXX_FLAGS_RELEASE "-O4 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+
+mark_as_advanced(CMAKE_XL_CreateExportList) # No idea what spams this
--- /dev/null
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+include(BlueGeneQ-base)
+set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+set(CMAKE_FIND_LIBRARY_PREFIXES "lib")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
--- /dev/null
+#include<immintrin.h>
+int main()
+{
+ __m256d a;
+ __m256i mask;
+ double d[4]={1,2,3,4};
+
+ a = _mm256_setzero_pd();
+ mask = _mm256_castpd_si256(a);
+
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+ a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
+#else
+ a = _mm256_maskload_pd(d,mask);
+#endif
+}
+
--- /dev/null
+int main()
+{
+ vector4double one = vec_splats(1.0);
+ vector4double zero = vec_sub(one,one);
+ return 0;
+}
# gcc
if(CMAKE_COMPILER_IS_GNUCC)
-
- #Fix for LLVM OpenMP bug (redmine 900). Needs to run before OpenMP flags are set below.
- if(GMX_OPENMP)
- exec_program(${CMAKE_C_COMPILER} ARGS --version OUTPUT_VARIABLE _compiler_output)
- if(_compiler_output MATCHES "llvm.*4\\.2")
- message(STATUS "OpenMP multithreading not supported with llvm-gcc 4.2, disabled")
- set(GMX_OPENMP OFF CACHE BOOL
- "OpenMP multithreading not not supported with llvm-gcc 4.2, disabled!" FORCE)
- endif()
- endif()
-
#flags are added in reverse order and -Wno* need to appear after -Wall
if(NOT GMX_OPENMP)
GMX_TEST_CFLAG(CFLAGS_PRAGMA "-Wno-unknown-pragmas" GMXC_CFLAGS)
GMX_TEST_CFLAG(CFLAGS_WARN "-Wall" GMXC_CFLAGS)
GMX_TEST_CFLAG(CFLAGS_STDGNU "-std=gnu99" GMXC_CFLAGS)
GMX_TEST_CFLAG(CFLAGS_OPT "-ip -funroll-all-loops" GMXC_CFLAGS_RELEASE)
- GMX_TEST_CFLAG(CFLAGS_SSE2 "-msse2" GMXC_CFLAGS_RELEASE)
GMX_TEST_CFLAG(CFLAGS_X86 "-mtune=core2" GMXC_CFLAGS_RELEASE)
GMX_TEST_CFLAG(CFLAGS_IA64 "-mtune=itanium2" GMXC_CFLAGS_RELEASE)
else()
GMX_TEST_CFLAG(CFLAGS_WARN "/W2" GMXC_CFLAGS)
- GMX_TEST_CFLAG(CFLAGS_SSE2 "/arch:SSE2" GMXC_CFLAGS_RELEASE)
GMX_TEST_CFLAG(CFLAGS_X86 "/Qip" GMXC_CFLAGS_RELEASE)
endif()
endif()
endif()
GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wall" GMXC_CXXFLAGS)
GMX_TEST_CXXFLAG(CXXFLAGS_OPT "-ip -funroll-all-loops" GMXC_CXXFLAGS_RELEASE)
- GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "-msse2" GMXC_CXXFLAGS_RELEASE)
GMX_TEST_CXXFLAG(CXXFLAGS_X86 "-mtune=core2" GMXC_CXXFLAGS_RELEASE)
GMX_TEST_CXXFLAG(CXXFLAGS_IA64 "-mtune=itanium2"
GMXC_CXXFLAGS_RELEASE)
else()
GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W2" GMXC_CXXFLAGS)
- GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "/arch:SSE2" GMXC_CXXFLAGS_RELEASE)
GMX_TEST_CXXFLAG(CXXFLAGS_X86 "/Qip" GMXC_CXXFLAGS_RELEASE)
endif()
endif()
# now actually set the flags:
# C
- if ( NOT DEFINED GMXCFLAGS_SET AND NOT DEFINED ENV{CFLAGS} )
- set(GMXCFLAGS_SET true CACHE INTERNAL "Whether to reset the C flags"
- FORCE)
-
- set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}"
- CACHE STRING "Flags used by the compiler during all build types."
- FORCE)
- set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}"
- CACHE STRING "Flags used by the compiler during release builds."
- FORCE)
- set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}"
- CACHE STRING "Flags used by the compiler during debug builds."
- FORCE)
+ if ( NOT GMX_SKIP_DEFAULT_CFLAGS )
+ set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}")
+ set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}")
+ set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}")
endif()
# C++
- if ( NOT DEFINED GMXCXXFLAGS_SET AND NOT DEFINED ENV{CXXFLAGS} )
- set(GMXCXXFLAGS_SET true CACHE INTERNAL "Whether to reset the C++ flags"
- FORCE)
- set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}"
- CACHE STRING "Flags used by the compiler during all build types."
- FORCE)
+ if ( NOT GMX_SKIP_DEFAULT_CFLAGS)
+ set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE
- "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}"
- CACHE STRING "Flags used by the compiler during release builds."
- FORCE)
+ "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_DEBUG
- "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}"
- CACHE STRING "Flags used by the compiler during debug builds."
- FORCE)
+ "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}")
endif()
ENDMACRO(gmx_c_flags)
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# Managing configuration for all kinds of BlueGene systems
+# BlueGene/L is probably obsolete, but does no harm
+# BlueGene/P needs testing, but hasn't changed
+# BlueGene/Q works
+message(STATUS "Configuring for BlueGene")
+
+if (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
+ # BlueGene/L never had shared lib support.
+ set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
+endif()
+if (${CMAKE_SYSTEM_NAME} MATCHES "BlueGene.*static")
+ # BlueGene/P claims shared library support, but Mark Abraham never
+ # got it to work. BlueGene/Q claims it, but discourages it for
+ # performance reasons. So unless information to the contrary ever
+ # comes to light, we should not mess about giving the user options
+ # that are useless when they've already selected a static toolchain.
+ set(BUILD_SHARED_LIBS OFF CACHE BOOL "Static BlueGene build toolchain selected, so shared libraries are disabled" FORCE)
+endif()
+
+set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
+set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
+set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on BlueGene" FORCE)
+
+# It is conceivable you could use ThreadMPI on BlueGene/Q by using its
+# facility to run lots of jobs on small chunks of the machine. You
+# certainly need proper MPI to use a whole chunk of the machine that
+# the scheduler will allocate.
+set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI generally not compatible with BlueGene, defaulting to disabled!")
+set(GMX_MPI ON CACHE BOOL "MPI is normally required on BlueGene" FORCE)
+
+# Access to /etc/passwd is not available on the back end of BlueGeneP
+# (at least), despite being detected by CMake. This can cause linker
+# warnings about harmless things in src/gmxlib/string2.h.
+set(HAVE_PWD_H OFF)
+
+# The automatic testing for endianness does not work for the BlueGene cross-compiler
+set(GMX_FLOAT_FORMAT_IEEE754 1 CACHE INTERNAL "" FORCE)
+set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big-endian floating-point byte order (by default)" FORCE)
+set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big-endian floating-point word order (by default)" FORCE)
endif()
# detect GPUs in the build host machine
-if (GMX_GPU OR GMX_GPU_AUTO AND NOT GMX_GPU_DETECTION_DONE)
+if ((GMX_GPU OR GMX_GPU_AUTO) AND NOT GMX_GPU_DETECTION_DONE)
include(gmxDetectGpu)
gmx_detect_gpu()
endif()
# We need to call find_package even when we've already done the detection/setup
if(GMX_GPU OR GMX_GPU_AUTO)
- if(NOT GMX_GPU AND GMX_GPU_AUTO AND GMX_GPU_DETECTION_DONE)
+ if(NOT GMX_GPU AND NOT GMX_DETECT_GPU_AVAILABLE)
# Stay quiet when detection has occured and found no GPU.
# Noise is acceptable when there is a GPU or the user required one.
set(FIND_CUDA_QUIETLY QUIET)
# - ON , FALSE: The user requested GPU builds, will require CUDA and will fail
# if it is not available.
# - ON , TRUE : Can't happen (GMX_GPU=ON can only be user-set at this point)
-if(GMX_GPU OR GMX_GPU_AUTO AND NOT GMX_GPU_DETECTION_DONE)
+if((GMX_GPU OR GMX_GPU_AUTO) AND NOT GMX_GPU_DETECTION_DONE)
if (EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
set(CUDA_FOUND TRUE CACHE INTERNAL "Whether the CUDA toolkit was found" FORCE)
else()
# user turns GMX_GPU=OFF after a failed cmake pass, these variables will be
# left behind in the cache.
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_SDK_ROOT_DIR CUDA_VERBOSE_BUILD)
+if(NOT GMX_GPU)
+ mark_as_advanced(CUDA_TOOLKIT_ROOT_DIR)
+endif()
macro(gmx_gpu_setup)
# set up nvcc options
endif()
find_package(MPI)
if(${${MPI_PREFIX}_FOUND})
- set(GROMACS_C_FLAGS ${GROMACS_C_FLAGS} ${${MPI_PREFIX}_COMPILE_FLAGS})
- set(GROMACS_LINKER_FLAGS ${GROMACS_LINKER_FLAGS} ${${MPI_PREFIX}_LINK_FLAGS})
+ set(MPI_COMPILE_FLAGS ${${MPI_PREFIX}_COMPILE_FLAGS})
+ set(MPI_LINKER_FLAGS ${${MPI_PREFIX}_LINK_FLAGS})
include_directories(${${MPI_PREFIX}_INCLUDE_PATH})
list(APPEND GMX_EXTRA_LIBRARIES ${${MPI_PREFIX}_LIBRARIES})
endif()
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+# GMX_TEST_AVX_GCC_MASKLOAD_BUG(VARIABLE)
+#
+# VARIABLE will be set if the compiler is a buggy version
+# of GCC (prior to 4.5.3, and maybe 4.6) that has an incorrect second
+# argument to the AVX maskload intrinsics (e.g. _mm256_maskload_pd(),
+# which is what the compiled test program exercises).
+#
+# You need to use this variable in a cmakedefine, and then handle
+# the case separately in your code - no automatic cure, unfortunately.
+#
+MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG AVX_CFLAGS VARIABLE)
+ IF(NOT DEFINED ${VARIABLE})
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug")
+ # some compilers like clang accept both cases,
+ # so first try a normal compile to avoid flagging those as buggy.
+ TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+ "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+ COMPILE_DEFINITIONS "${AVX_CFLAGS}" )
+ IF(${VARIABLE}_COMPILEOK)
+ SET(${VARIABLE} 0 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+ ELSE()
+ TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+ "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+ COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" )
+ IF(${VARIABLE}_COMPILEOK)
+ SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around")
+ ELSE()
+ MESSAGE(WARNING "Cannot compile AVX code - assuming gcc AVX maskload bug not present." )
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+ ENDIF()
+ ENDIF()
+ ENDIF(NOT DEFINED ${VARIABLE})
+ENDMACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE)
+
+
+
+
include(CheckCXXSourceCompiles)
MACRO(GMX_TEST_CXX11 VARIABLE FLAG)
- IF(NOT DEFINED HAVE_${VARIABLE})
- MESSAGE(STATUS "Checking for C++11 support")
- if(NOT WIN32)
- set(CXX11_FLAG "-std=c++0x")
- else()
- set(CXX11_FLAG "/Qstd=c++0x")
- endif()
- CHECK_CXX_COMPILER_FLAG("${CXX11_FLAG}" CXXFLAG_STD_CXX0X)
- if(NOT CXXFLAG_STD_CXX0X)
- set(CXX11_FLAG "")
- endif()
- set(CMAKE_REQUIRED_DEFINITIONS "${CXX11_FLAG}")
- check_cxx_source_compiles(
+ if(NOT WIN32)
+ set(CXX11_FLAG "-std=c++0x")
+ else()
+ set(CXX11_FLAG "/Qstd=c++0x")
+ endif()
+ CHECK_CXX_COMPILER_FLAG("${CXX11_FLAG}" CXXFLAG_STD_CXX0X)
+ if(NOT CXXFLAG_STD_CXX0X)
+ set(CXX11_FLAG "")
+ endif()
+ set(CMAKE_REQUIRED_DEFINITIONS "${CXX11_FLAG}")
+ check_cxx_source_compiles(
"#include <vector>
#include <memory>
#include <utility>
std::vector<A> v2;
v2.push_back(A()); //requires default move constructor
v2.push_back(A(new int(5))); //detects bug in ICC
-}" HAVE_${VARIABLE})
- set(CMAKE_REQUIRED_DEFINITIONS "")
- if(HAVE_${VARIABLE})
- set(${VARIABLE} 1 CACHE INTERNAL "Result of C++11 support test" FORCE)
- set(${FLAG} ${CXX11_FLAG} CACHE INTERNAL "Compiler flag for C++11 support" FORCE)
- MESSAGE(STATUS "Checking for C++11 support - yes")
- else()
- set(${VARIABLE} 0 CACHE INTERNAL "Result of C++11 support test" FORCE)
- set(${FLAG} "" CACHE INTERNAL "Compiler flag for C++11 support" FORCE)
- MESSAGE(STATUS "Checking for C++11 support - no")
- endif()
- ENDIF(NOT DEFINED HAVE_${VARIABLE})
+}" ${VARIABLE})
+ set(CMAKE_REQUIRED_DEFINITIONS "")
+ if(${VARIABLE})
+ set(${FLAG} ${CXX11_FLAG})
+ endif()
ENDMACRO()
# If you only use one shell you can copy that GMXRC.* instead.
-# only csh/tcsh understand 'set'
-set is_csh = 123
-test "$is_csh" = 123 && goto CSH
+# only csh/tcsh set the variable $shell (note: lower case!)
+test $shell && goto CSH
# if we got here, shell is bsh/bash/zsh/ksh
. @BIN_INSTALL_DIR@/GMXRC.bash
<br><a href=online/g_density.html>g_density</a>
<br><a href=online/g_densmap.html>g_densmap</a>
<br><a href=online/g_dielectric.html>g_dielectric</a>
-<br><a href=online/g_dih.html>g_dih</a>
<br><a href=online/g_dipoles.html>g_dipoles</a>
<br><a href=online/g_disre.html>g_disre</a>
<br><a href=online/g_dist.html>g_dist</a>
<TR><TD><A HREF="online/g_bond.html">g_bond</A></TD><TD>calculates bond length distributions</TD>
<TR><TD><A HREF="online/mk_angndx.html">mk_angndx</A></TD><TD>generates index files for g_angle</TD>
<TR><TD><A HREF="online/g_angle.html">g_angle</A></TD><TD>calculates distributions and correlations for angles and dihedrals</TD>
-<TR><TD><A HREF="online/g_dih.html">g_dih</A></TD><TD>analyzes dihedral transitions</TD>
</TABLE>
<A NAME="HNR11">
+++ /dev/null
-<HTML>
-<HEAD>
-<TITLE>g_dih</TITLE>
-<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
-<TABLE WIDTH="98%" NOBORDER >
-<TR><TD WIDTH=400>
-<TABLE WIDTH=400 NOBORDER>
-<TD WIDTH=116>
-<a href="http://www.gromacs.org/"><img SRC="../images/gmxlogo_small.png"BORDER=0 </a></td>
-<td ALIGN=LEFT VALIGN=TOP WIDTH=280><br><h2>g_dih</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
-</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p><B>VERSION 4.5<br>
-Thu 26 Aug 2010</B></td></tr></TABLE>
-<HR>
-<H3>Description</H3>
-<p>
-g_dih can do two things. The default is to analyze dihedral transitions
-by merely computing all the dihedral angles defined in your topology
-for the whole trajectory. When a dihedral flips over to another minimum
-an angle/time plot is made.<p>
-The opther option is to discretize the dihedral space into a number of
-bins, and group each conformation in dihedral space in the
-appropriate bin. The output is then given as a number of dihedral
-conformations sorted according to occupancy.
-<P>
-<H3>Files</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>filename</TH><TH>type</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-f</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> traj.xtc</a></tt> </TD><TD> Input </TD><TD> Trajectory: <a href="xtc.html">xtc</a> <a href="trr.html">trr</a> <a href="trj.html">trj</a> <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> cpt </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-s</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> topol.tpr</a></tt> </TD><TD> Input </TD><TD> Run input file: <a href="tpr.html">tpr</a> <a href="tpb.html">tpb</a> <a href="tpa.html">tpa</a> </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-o</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="out.html"> hello.out</a></tt> </TD><TD> Output </TD><TD> Generic output file </TD></TR>
-</TABLE>
-<P>
-<H3>Other options</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>type</TH><TH>default</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> Print help info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]version</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> Print version info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0 </tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0 </tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0 </tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]sa</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> Perform cluster analysis in dihedral space instead of analysing dihedral transitions. </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-mult</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> mulitiplicity for dihedral angles (by default read from topology) </TD></TD>
-</TABLE>
-<P>
-<hr>
-<div ALIGN=RIGHT>
-<font size="-1"><a href="http://www.gromacs.org">http://www.gromacs.org</a></font><br>
-<font size="-1"><a href="mailto:gromacs@gromacs.org">gromacs@gromacs.org</a></font><br>
-</div>
-</BODY>
<dl>
<dt>C format
-<dd><tt>"%5d%5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
+<dd><tt>"%5d%-5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
<dt>Fortran format
<dd><tt>(i5,2a5,i5,3f8.3,3f8.4)</tt>
<dt>Pascal format
wall-density, wall-ewald-zfac)
<li><A HREF="#pull"><b>COM pulling</b></A> (pull, ...)
<li><A HREF="#nmr"><b>NMR refinement</b></A> (disre, disre-weighting, disre-mixed, disre-fc, disre-tau, nstdisreout, orire, orire-fc, orire-tau, orire-fitgrp, nstorireout)
-<li><A HREF="#free"><b>Free energy calculations</b></A> (free-energy, nstdhdl, dhdl-print-energy, init-lambda, delta-lambda, fep-lambdas, coul-lambdas, vdw-lambdas, bonded-lambdas, restraint-lambdas, mass-lambdas, sc-alpha, sc-coul, sc-power, sc-r-power, sc-sigma, couple-moltype, couple-lambda0, couple-lambda1, couple-intramol)
-<li><A HREF="#expanded"><b>Expanded ensemble simulation</b></A> (lmc-stats, lmc-mc-move, lmc-seed, lmc-gibbsdelta, mc-temperature, nst-transition-matrix,init-lambda-weights,initial-wl-delta,wl-scale,wl-ratio,symmetrized-transition-matrix,lmc-forced-nstart,weight-c-range,mininum-var-min,lmc-weights-equil,weight-equil-wl-delta,weight-equil-number-all-lambda,weight-equil-number-steps,weight-equil-number-samples,weight-equil-count-ratio,simulated-tempering,simulated-tempering-scaling,sim-temp-low,sim-temp-high)
+<li><A HREF="#free"><b>Free energy calculations</b></A> (free-energy, nstdhdl, dhdl-print-energy, init-lambda, delta-lambda, fep-lambdas, coul-lambdas, vdw-lambdas, bonded-lambdas, restraint-lambdas, mass-lambdas, temperature-lambdas, sc-alpha, sc-coul, sc-power, sc-r-power, sc-sigma, couple-moltype, couple-lambda0, couple-lambda1, couple-intramol)
+<li><A HREF="#expanded"><b>Expanded ensemble simulation</b></A> (lmc-stats, lmc-mc-move, lmc-seed, lmc-gibbsdelta, mc-temperature, nst-transition-matrix, init-lambda-weights, initial-wl-delta, wl-scale, wl-ratio, symmetrized-transition-matrix, lmc-forced-nstart, mininum-var-min, lmc-weights-equil, weight-equil-wl-delta, weight-equil-number-all-lambda, weight-equil-number-steps, weight-equil-number-samples, weight-equil-count-ratio, simulated-tempering, simulated-tempering-scaling, sim-temp-low, sim-temp-high)
<li><A HREF="#neq"><b>Non-equilibrium MD</b></A> (acc-grps, accelerate, freezegrps, freezedim, cos-acceleration, deform)
<li><A HREF="#ef"><b>Electric fields</b></A> (E-x, E-xt, E-y, E-yt, E-z, E-zt )
<li><A HREF="#qmmm"><b>Mixed quantum/classical dynamics</b></A> (QMMM, QMMM-grps, QMMMscheme, QMmethod, QMbasis, QMcharge, Qmmult, CASorbitals, CASelectrons, SH)
use <b>verlet-buffer-drift</b>=-1 and set <b>rlist</b> manually.</dd>
<dt><b>rlist: (1) [nm]</b></dt>
-<dd>Cut-off distance for the short-range neighbor list, should be ≥ 0.
+<dd>Cut-off distance for the short-range neighbor list.
With <b>cutoff-scheme</b>=<b>Verlet</b>, this is by default set by the
<b>verlet-buffer-drift</b> option and the value of <b>rlist</b> is ignored.</dd>
<dt><b>rcoulomb-switch: (0) [nm]</b></dt>
<dd>where to start switching the Coulomb potential</dd>
-<dt><b>rcoulomb: (-1) [nm]</b></dt>
-<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx-->, should be ≥ 0</dd>
+<dt><b>rcoulomb: (1) [nm]</b></dt>
+<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx--></dd>
<dt><b>epsilon-r: (1)</b></dt>
<dd>The relative <!--Idx-->dielectric constant<!--EIdx-->.
<dt><b>rvdw-switch: (0) [nm]</b></dt>
<dd>where to start switching the LJ potential</dd>
-<dt><b>rvdw: (-1) [nm]</b></dt>
-<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx-->, should be ≥ 0</dd>
+<dt><b>rvdw: (1) [nm]</b></dt>
+<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx--></dd>
<dt><b>DispCorr:</b></dt>
<dd><dl compact></dd>
<dd>starting value for the lambda state (integer). Specified which columm of the lambda vector should be used.</dd>
<dt><b>delta-lambda: (0)</b></dt>
<dd>increment per time step for lambda</dd>
+<dt><b>fep-lambdas: ()</b></dt>
+<dd>Zero, one or more lambda values for which Delta H values will
+be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
+Free energy differences between different lambda values can then
+be determined with <tt>g_bar</tt>. <b>fep-lambdas</b> is different from the other -lambdas keywords because
+all components of the lambda vector that are not specified will use <b>fep-lambdas</b>.</dd>
<dt><b>coul-lambdas: ()</b></dt>
<dd>Zero, one or more lambda values for which Delta H values will
be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
Only the temperatures controlled with this component of the lambda vector.
Note that these lambdas should not be used for replica exchange, only for simulated tempering.</dd>
-<dt><b>fep-lambdas: ()</b></dt>
-<dd>Zero, one or more lambda values for which Delta H values will
-be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
-Free energy differences between different lambda values can then
-be determined with <tt>g_bar</tt>. <b>fep-lambdas</b> is different from the other -lambdas keywords because
-all components of the lambda vector that are not specified will use <b>fep-lambdas</b>.</dd>
-<dt><b>dhdl-derivatives: (yes)</b></dt>
-<dd>If yes (the default), the derivatives of the Hamiltonian with respect to lambda at each <b>nstdhdl</b> step are written out. These values are needed for interpolation of linear energy differences with <tt>g_bar</tt> (although the same can also be achieved with the right <b>foreign lambda</b> setting, that may not be as flexible), or with thermodynamic integration</dd>
<dt><b>sc-alpha: (0)</b></dt>
<dd>the soft-core parameter, a value of 0 results in linear interpolation of
the LJ and Coulomb interactions</dd>
<dt><b>q</b></dt>
<dd>the Van der Waals interactions are turned at lambda=0; soft-core interactions will be required to avoid singularities
<dt><b>none</b></dt>
-<dd>the Van der Waals interactions are turned off and the charges are zero at lambda=0; soft-core interactions will be required to avoid singularities
+<dd>the Van der Waals interactions are turned off and the charges are zero at lambda=0; soft-core interactions will be required to avoid singularities.
</dl>
<dt><b>couple-lambda1:</b></dt>
<dd> analogous to <b>couple-lambda1</b>, but for lambda=1
<dt><b>nstdhdl: (100)</b></dt>
<dd>the frequency for writing dH/dlambda and possibly Delta H to dhdl.xvg,
0 means no ouput, should be a multiple of <b>nstcalcenergy</b></dd>.</dd>
+<dt><b>dhdl-derivatives: (yes)</b></dt>
+<dd>If yes (the default), the derivatives of the Hamiltonian with respect to lambda at each <b>nstdhdl</b> step are written out. These values are needed for interpolation of linear energy differences with <tt>g_bar</tt> (although the same can also be achieved with the right <b>foreign lambda</b> setting, that may not be as flexible), or with thermodynamic integration</dd>
+<dt><b>dhdl-print-energy: (no)</b></dt>
+<dd> Include the total energy in the dhdl file. This information is needed for later analysis if the states of interest in the free energy calculation are at different temperatures. If all are at the same temperature, this information is not needed.</dd>
<dt><b>separate-dhdl-file: (yes)</b></dt>
<dd><dl compact>
<dt><b>yes</b></dt>
<dt><b>lmc-stats:</b></dt>
<dd><dl compact>
<dt><b>no</b></dt>
-<dd>No Monte Carlo in state space</dd>
+<dd>No Monte Carlo in state space is performed.</dd>
<dt><b>metropolis-transition</b></dt>
-<dd> Uses the Metropolis weights to update the expanded ensemble weight of the state.
+<dd> Uses the Metropolis weights to update the expanded ensemble weight of each state.
Min{1,exp(-(beta_new u_new - beta_old u_old)}</dd>
<dt><b>barker-transition</b></dt>
-<dd> Uses the Barker transition critera to update the expanded ensemble weight of the state.</dd>
+<dd> Uses the Barker transition criteria to update the expanded ensemble weight of each state i, defined by
+exp(-beta_new u_new)/[exp(-beta_new u_new)+exp(-beta_old u_old)]</dd>
<dt><b>wang-landau</b></dt>
-<dd>Uses the Wang-Landau algorithm (in state space) to update the expanded ensemble weights.</dd>
+<dd>Uses the Wang-Landau algorithm (in state space, not energy space) to update the expanded ensemble weights.</dd>
<dt><b>min-variance</b></dt>
-<dd>Uses the minimum variance updating method of Escobedo et al to update the expanded ensemble weights. Weights
-will not be the free energies, but will rather emphasize states that need more sampling to give even uncertainty.
+<dd>Uses the minimum variance updating method of Escobedo et al. to update the expanded ensemble weights. Weights
+will not be the free energies, but will rather emphasize states that need more sampling to give even uncertainty.</dd>
</dl>
<dt><b>lmc-mc-move:</b></dt>
<dd><dl compact>
<dd> Randomly chooses a new state up or down, then uses the Metropolis critera to decide whether to accept or reject:
Min{1,exp(-(beta_new u_new - beta_old u_old)}</dd>
<dt><b>barker-transition</b></dt>
-<dd> Randomly chooses a new state up or down, then uses the Barker transition critera to decide whether to accept or reject: exp(-beta_new u_new)/[exp(-beta_new u_new)+exp(-beta_old u_old)] </dd>
+<dd> Randomly chooses a new state up or down, then uses the Barker transition criteria to decide whether to accept or reject: exp(-beta_new u_new)/[exp(-beta_new u_new)+exp(-beta_old u_old)]</dd>
<dt><b>gibbs</b></dt>
<dd> Uses the conditional weights of the state given the coordinate (exp(-beta_i u_i) / sum_k exp(beta_i u_i) to
decide which state to move to.</dd>
<dd>
<dd> Uses the conditional weights of the state given the coordinate (exp(-beta_i u_i) / sum_k exp(beta_i u_i) to
decide which state to move to, EXCLUDING the current state, then uses a rejection step to ensure detailed
-balance. Always more efficient that Gibbs, though marginally so in many situations.</dd>
+balance. Always more efficient than Gibbs, though only marginally so in many situations, such as when only the nearest neighbors have decent phase space overlap.</dd>
</dl>
-
<dt><b>lmc-seed:</b></dt>
-<dd> random seed to use for Monte Carlo moves in state space. If not specified, <b>ld-seed</b> is used instead. </dd>
+<dd> random seed to use for Monte Carlo moves in state space. If not specified, <b>ld-seed</b> is used instead.</dd>
<dt><b>mc-temperature:</b></dt>
<dd> Temperature used for acceptance/rejection for Monte Carlo moves. If not specified, the temperature of the
simulation specified in the first group of <b>ref_t</b> is used.</dd>
-
-<dt><b>wl-scale: (0.8)</b></dt>
<dt><b>wl-ratio: (0.8)</b></dt>
-<dt><b>init-wl-delta: (1.0) </b></dt>
-<dt><b>wl-oneovert: (no) </b></dt>
+<dd>The cutoff for the histogram of state occupancies to be reset, and the free energy incrementor to be reset as delta -> delta*wl-scale. If we define Nratio = (number of samples at each histogram) / (average number of samples at each histogram), a <b>wl-ratio</b> of 0.8 means that the histogram is only considered flat if all Nratio > 0.8 AND simultaneously all 1/Nratio > 0.8.</dd>
+<dt><b>wl-scale: (0.8)</b></dt>
+<dd> Each time the histogram is considered flat, then the current value of the Wang-Landau incrementor for the free energies is multiplied by <b>wl-scale</b>. Value must be between 0 and 1.</dd>
+<dt><b>init-wl-delta: (1.0)</b></dt>
+<dd>The initial value of the Wang-Landau incrementor in kT. Some value near 1 kT is usually most efficient, though sometimes a value of 2-3 in units of kT works better if the free energy differences are large.</dd>
+<dt><b>wl-oneovert: (no)</b></dt>
+<dd>Set Wang-Landau incrementor to scale with 1/(simulation time) in the large sample limit. There is significant evidence that the standard Wang-Landau algorithms in state space presented here result in free energies getting 'burned in' to incorrect values that depend on the initial state. When <b>wl-oneovert</b> is true, then when the incrementor becomes less than 1/N, where N is the number of samples collected (and thus proportional to the data collection time, hence '1 over t'), the Wang-Landau incrementor is set to 1/N, decreasing every step. Once this occurs, <b>wl-ratio</b> is ignored, but the weights will still stop updating when the equilibration criteria set in <b>lmc-weights-equil</b> are achieved.</dd>
<dt><b>lmc-repeats: (1)</b></dt>
-<dt><b>lmc-gibbsdelta: (-1) </b></dt>
-<dt><b>lmc-forced-nstart: (0) </b></dt>
+<dd>Controls the number of times that each Monte Carlo swap type is performed each iteration. In the limit of large numbers of Monte Carlo repeats, then all methods converge to Gibbs sampling. The value will generally not need to be different from 1.</dd>
+<dt><b>lmc-gibbsdelta: (-1)</b></dt>
+<dd> Limit Gibbs sampling to selected numbers of neighboring states. For Gibbs sampling, it is sometimes inefficient to perform Gibbs sampling over all of the states that are defined. A positive value of <b>lmc-gibbsdelta</b> means that only states plus or minus <b>lmc-gibbsdelta</b> are considered in exchanges up and down. A value of -1 means that all states are considered. For less than 100 states, it is probably not that expensive to include all states.</dd>
+<dt><b>lmc-forced-nstart: (0)</b></dt>
+<dd> Force initial state space sampling to generate weights. In order to come up with reasonable initial weights, this setting allows the simulation to drive from the initial to the final lambda state, with <b>lmc-forced-nstart</b> steps at each state before moving on to the next lambda state. If <b>lmc-forced-nstart</b> is sufficiently long (thousands of steps, perhaps), then the weights will be close to correct. However, in most cases, it is probably better to simply run the standard weight equilibration algorithms.</dd>
<dt><b>nst-transition-matrix: (-1)</b></dt>
<dd>Frequency of outputting the expanded ensemble transition matrix. A negative number means it will only be printed at the end of the simulation.<dd>
<dt><b>symmetrized-transition-matrix: (no) </b></dt>
-<dd>Whether to symmetrize the empirical transition matrix</dd>
-<dt><b>mininum-var-min</b></dt>
-<dt><b>weight-c-range</b></dt>
-
+<dd>Whether to symmetrize the empirical transition matrix. In the infinite limit the matrix will be symmetric, but will diverge with statistical noise for short timescales. Forced symmetrization, by using the matrix T_sym = 1/2 (T + transpose(T)), removes problems like the existence of (small magnitude) negative eigenvalues.</dd>
+<dt><b>mininum-var-min: (100)</b></dt>
+<dd> The <b>min-variance</b> strategy (an option of <b>lmc-stats</b>) is only valid for larger numbers of samples, and can get stuck if too few samples are used at each state. <b>mininum-var-min</b> is the minimum number of samples allowed at each state before the <b>min-variance</b> strategy is activated if selected.</dd>
+<dt><b>init-lambda-weights: </b></dt>
+<dd>The initial weights (free energies) used for the expanded ensemble states. Default is a vector of zero weights. The format is similar to the lambda vector settings in <b>fep-lambdas</b>, except the weights can be any floating point number. Units are kT. Its length must match the lambda vector lengths.</dd>
+<dt><b>lmc-weights-equil: (no)</b></dt>
+<dd><dl compact>
+<dt><b>no</b></dt>
+<dd>Expanded ensemble weights continue to be updated throughout the simulation.</dd>
+<dt><b>yes</b></dt>
+<dd>The input expanded ensemble weights are treated as equilibrated, and are not updated throughout the simulation.</dd>
+<dt><b>wl-delta</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the Wang-Landau incrementor falls below the value specified by <b>weight-equil-wl-delta</b>.</dd>
+<dt><b>number-all-lambda</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the number of samples at all of the lambda states is greater than the value specified by <b>weight-equil-number-all-lambda</b>.</dd>
+<dt><b>number-steps</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the number of steps is greater than the level specified by <b>weight-equil-number-steps</b>.</dd>
+<dt><b>number-samples</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the number of total samples across all lambda states is greater than the level specified by <b>weight-equil-number-samples</b>.</dd>
+<dt><b>count-ratio</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the ratio of samples at the least sampled lambda state and the most sampled lambda state is greater than the value specified by <b>weight-equil-count-ratio</b>.</dd>
+</dl>
<dt><b>simulated-tempering: (no)</b></dt>
-<dt><b>simulated-tempering-scaling: ()</b></dt>
-<dt><b>sim-temp-low: (300):</b></dt>
-<dd>Low temperature for simulated tempering</dd>
-<dt><b>sim-temp-high: (300):</b></dt>
-<dd>High temperature for simulated tempering</dd>
+<dd>Turn simulated tempering on or off. Simulated tempering is implemented as expanded ensemble sampling with different temperatures instead of different Hamiltonians.</dd>
+<dt><b>sim-temp-low: (300)</b></dt>
+<dd>Low temperature for simulated tempering.</dd>
+<dt><b>sim-temp-high: (300)</b></dt>
+<dd>High temperature for simulated tempering.</dd>
+<dt><b>simulated-tempering-scaling: (linear)</b></dt>
+<dd>Controls the way that the temperatures at intermediate lambdas are calculated from the <b>temperature-lambdas</b> part of the lambda vector.</dd>
+<dd><dl compact>
+<dt><b>linear</b></dt>
+<dd>Linearly interpolates the temperatures using the values of <b>temperature-lambdas</b>, i.e. if <b>sim-temp-low</b>=300 and <b>sim-temp-high</b>=400, then lambda=0.5 corresponds to a temperature of 350. A nonlinear set of temperatures can always be implemented with uneven spacing in lambda.</dd>
+<dt><b>geometric</b></dt>
+<dd> Interpolates temperatures geometrically between <b>sim-temp-low</b> and <b>sim-temp-high</b>. The ith state has temperature <b>sim-temp-low</b> * (<b>sim-temp-high</b>/<b>sim-temp-low</b>)^(i/(ntemps-1)). Should give roughly equal exchange for constant heat capacity, though of course simulations that involve protein folding have very high heat capacity peaks.</dd>
+<dt><b>exponential</b></dt>
+<dd> Interpolates temperatures exponentially between <b>sim-temp-low</b> and <b>sim-temp-high</b>. The ith state has temperature
+<b>sim-temp-low</b> + (<b>sim-temp-high</b>-<b>sim-temp-low</b>)*((exp(<b>temperature-lambdas</b>[i])-1)/(exp(1.0)-1)).</dd>
+</dl>
</dl>
<A NAME="neq"><br>
<A HREF="#vel">gen-vel</A><br>
<A HREF="#pp">include</A><br>
<A HREF="#free">init-lambda</A><br>
+<A HREF="#expanded">init-lambda-weights</A><br>
<A HREF="#run">init-step</A><br>
+<A HREF="#expanded">initial-wl-delta</A><br>
<A HREF="#run">integrator</A><br>
<A HREF="#ld">ld-seed</A><br>
<A HREF="#bond2">lincs-iter</A><br>
<A HREF="#bond2">lincs-order</A><br>
<A HREF="#bond2">lincs-warnangle</A><br>
+<A HREF="#expanded">lmc-forced-nstart</A><br>
+<A HREF="#expanded">lmc-gibbsdelta</A><br>
+<A HREF="#expanded">lmc-mc-move</A><br>
+<A HREF="#expanded">lmc-seed</A><br>
+<A HREF="#expanded">lmc-stats</A><br>
+<A HREF="#expanded">lmc-weights-equil</A><br>
+<A HREF="#expanded">mc-temperature</A><br>
+<A HREF="#expanded">mininum-var-min</A><br>
<A HREF="#bond2">morse</A><br>
<A HREF="#em">nbfgscorr</A><br>
<A HREF="#xmdrun">niter</A><br>
<A HREF="#out">nstvout</A><br>
<A HREF="#out">nstxout</A><br>
<A HREF="#out">nstxtcout</A><br>
+<A HREF="#expanded">nst-transition-matrix</A><br>
<A HREF="#nl">ns-type</A><br>
<A HREF="#wall">nwall</A><br>
<A HREF="#ewald">optimize-fft</A><br>
<A HREF="#free">sc-power</A><br>
<A HREF="#free">sc-sigma</A><br>
<A HREF="#bond2">shake-tol</A><br>
+<A HREF="#expanded">sim-temp-low</A><br>
+<A HREF="#expanded">sim-temp-high</A><br>
+<A HREF="#expanded">simulated-tempering</A><br>
+<A HREF="#expanded">simulated-tempering-scaling</A><br>
+<A HREF="#expanded">symmetrized-transition-matrix</A><br>
<A HREF="#table">table-extension</A><br>
<A HREF="#pc">tau-p</A><br>
<A HREF="#tc">tau-t</A><br>
<A HREF="#walls">wall-ewald-zfac</A><br>
<A HREF="#walls">wall-r-linpot</A><br>
<A HREF="#walls">wall-type</A><br>
+<A HREF="#expanded">weight-equil-count-ratio</A><br>
+<A HREF="#expanded">weight-equil-number-all-lambda</A><br>
+<A HREF="#expanded">weight-equil-number-samples</A><br>
+<A HREF="#expanded">weight-equil-number-steps</A><br>
+<A HREF="#expanded">weight-equil-wl-delta</A><br>
+<A HREF="#expanded">wl-ratio</A><br>
+<A HREF="#expanded">wl-scale</A><br>
</multicol>
<hr>
g_covar
g_density
g_dielectric
-g_dih
g_dipoles
g_disre
g_dist
*/
#cmakedefine GMX_FLOAT_FORMAT_IEEE754
-/* Use assembly intrinsics kernels for BlueGene */
-#cmakedefine GMX_BLUEGENE
-
/* Work around broken calloc() */
#cmakedefine GMX_BROKEN_CALLOC
/* AVX 256-bit instructions available */
#cmakedefine GMX_X86_AVX_256
+/* GCC bug in AVX maskload/maskstore arguments - worked around internally */
+#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG
+
/* SSE2 was selected as CPU acceleration level */
#cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
/* AVX 256-bit was selected as CPU acceleration level */
#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_256
+/* IBM QPX was selected as CPU acceleration type (e.g. BlueGene/Q) */
+#cmakedefine GMX_CPU_ACCELERATION_IBM_QPX
+
/* String for CPU acceleration choice (for writing to log files and stdout) */
#define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
/* Use the GROMACS software 1/sqrt(x) */
#cmakedefine GMX_SOFTWARE_INVSQRT
-/* Use the PowerPC hardware 1/sqrt(x) */
-#cmakedefine GMX_POWERPC_INVSQRT
-
/* Use sub-counters */
#cmakedefine GMX_CYCLE_SUBCOUNTERS
/* Build special-purpose mdrun library */
#cmakedefine GMX_FAHCORE
+/* Enable gromacs quotes */
+#cmakedefine GMX_COOL_QUOTES
+
#ifdef GMX_FAHCORE
#define FULLINDIRECT 1
#define USE_FAH_XDR 1
gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
endif()
-if(GMX_GPU)
- include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif()
-
add_library(libgromacs ${LIBGROMACS_SOURCES})
if (GMX_GIT_VERSION_INFO)
add_dependencies(libgromacs gmx_version)
ap->bAvail[i] = FALSE;
}
}
- upstring(atomnm);
- upstring(resnm);
ap->atomnm[ap->nprop] = strdup(atomnm);
ap->resnm[ap->nprop] = strdup(resnm);
j = ap->nprop;
else {
strncpy(atomname,atomnm,MAXQ-1);
}
- upstring(atomname);
strncpy(resname,resnm,MAXQ-1);
- upstring(resname);
j = get_prop_index(&(ap->prop[eprop]),ap->restype,resname,
atomname,&bExact);
* But old code can not read a new entry that is present in the file
* (but can read a new format when new entries are not present).
*/
-static const int cpt_version = 14;
+static const int cpt_version = 15;
const char *est_names[estNR]=
}
}
+static void do_cpt_real_err(XDR *xd,const char *desc,real *f)
+{
+ bool_t res=0;
+
+#ifdef GMX_DOUBLE
+ res = xdr_double(xd,f);
+#else
+ res = xdr_float(xd,f);
+#endif
+ if (res == 0)
+ {
+ cp_error();
+ }
+}
+
+static void do_cpt_n_rvecs_err(XDR *xd,const char *desc,int n, rvec f[],FILE *list)
+{
+ int i,j;
+
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<DIM; j++)
+ {
+ do_cpt_real_err(xd, desc, &f[i][j]);
+ }
+ }
+
+ if (list)
+ {
+ pr_rvecs(list,0,desc,f,n);
+ }
+}
+
/* If nval >= 0, nval is used; on read this should match the passed value.
* If nval n<0, *nptr is used; on read the value is stored in nptr
*/
int *natoms,int *ngtc, int *nnhpres, int *nhchainlength,
int *nlambda, int *flags_state,
int *flags_eks,int *flags_enh, int *flags_dfh,
+ int *nED,
FILE *list)
{
bool_t res=0;
} else {
*flags_dfh = 0;
}
+
+ if (*file_version >= 15)
+ {
+ do_cpt_int_err(xd,"ED data sets",nED,list);
+ }
+ else
+ {
+ *nED = 0;
+ }
}
static int do_cpt_footer(XDR *xd,gmx_bool bRead,int file_version)
return ret;
}
+
+/* This function stores the last whole configuration of the reference and
+ * average structure in the .cpt file
+ */
+static int do_cpt_EDstate(XDR *xd,gmx_bool bRead,
+ edsamstate_t *EDstate, FILE *list)
+{
+ int i,j;
+ int ret=0;
+ char buf[STRLEN];
+
+
+ EDstate->bFromCpt = bRead;
+
+ if (EDstate->nED <= 0)
+ {
+ return ret;
+ }
+
+ /* When reading, init_edsam has not been called yet,
+ * so we have to allocate memory first. */
+ if (bRead)
+ {
+ snew(EDstate->nref , EDstate->nED);
+ snew(EDstate->old_sref, EDstate->nED);
+ snew(EDstate->nav , EDstate->nED);
+ snew(EDstate->old_sav , EDstate->nED);
+ }
+
+ /* Read/write the last whole conformation of SREF and SAV for each ED dataset (usually only one) */
+ for (i=0; i< EDstate->nED; i++)
+ {
+ /* Reference structure SREF */
+ sprintf(buf, "ED%d # of atoms in reference structure", i+1);
+ do_cpt_int_err(xd, buf, &EDstate->nref[i],list);
+ sprintf(buf, "ED%d x_ref", i+1);
+ if (bRead)
+ {
+ snew(EDstate->old_sref[i], EDstate->nref[i]);
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref[i], list);
+ }
+ else
+ {
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref_p[i], list);
+ }
+
+ /* Average structure SAV */
+ sprintf(buf, "ED%d # of atoms in average structure", i+1);
+ do_cpt_int_err(xd, buf, &EDstate->nav[i] ,list);
+ sprintf(buf, "ED%d x_av", i+1);
+ if (bRead)
+ {
+ snew(EDstate->old_sav[i], EDstate->nav[i]);
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav[i], list);
+ }
+ else
+ {
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav_p[i], list);
+ }
+ }
+
+ return ret;
+}
+
+
static int do_cpt_files(XDR *xd, gmx_bool bRead,
gmx_file_position_t **p_outputfiles, int *nfiles,
FILE *list, int file_version)
DOMAINDECOMP(cr) ? cr->dd->nc : NULL,&npmenodes,
&state->natoms,&state->ngtc,&state->nnhpres,
&state->nhchainlength,&(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+ &state->edsamstate.nED,
NULL);
sfree(version);
(do_cpt_ekinstate(gmx_fio_getxdr(fp),FALSE,flags_eks,&state->ekinstate,NULL) < 0)||
(do_cpt_enerhist(gmx_fio_getxdr(fp),FALSE,flags_enh,&state->enerhist,NULL) < 0) ||
(do_cpt_df_hist(gmx_fio_getxdr(fp),FALSE,flags_dfh,&state->dfhist,NULL) < 0) ||
+ (do_cpt_EDstate(gmx_fio_getxdr(fp),FALSE,&state->edsamstate,NULL) < 0) ||
(do_cpt_files(gmx_fio_getxdr(fp),FALSE,&outputfiles,&noutputfiles,NULL,
file_version) < 0))
{
&eIntegrator_f,simulation_part,step,t,
&nppnodes_f,dd_nc_f,&npmenodes_f,
&natoms,&ngtc,&nnhpres,&nhchainlength,&nlambda,
- &fflags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+ &fflags,&flags_eks,&flags_enh,&flags_dfh,
+ &state->edsamstate.nED,NULL);
if (bAppendOutputFiles &&
file_version >= 13 && double_prec != GMX_CPT_BUILD_DP)
cp_error();
}
+ ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+ if (ret)
+ {
+ cp_error();
+ }
+
if (file_version < 6)
{
const char *warn="Reading checkpoint file in old format, assuming that the run that generated this file started at step 0, if this is not the case the averages stored in the energy file will be incorrect.";
&version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
&eIntegrator,simulation_part,step,t,&nppnodes,dd_nc,&npme,
&state->natoms,&state->ngtc,&state->nnhpres,&state->nhchainlength,
- &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+ &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+ &state->edsamstate.nED,NULL);
ret =
do_cpt_state(gmx_fio_getxdr(fp),TRUE,state->flags,state,bReadRNG,NULL);
if (ret)
cp_error();
}
+ ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+ if (ret)
+ {
+ cp_error();
+ }
+
ret = do_cpt_files(gmx_fio_getxdr(fp),TRUE,
outputfiles != NULL ? outputfiles : &files_loc,
outputfiles != NULL ? nfiles : &nfiles_loc,
&eIntegrator,&simulation_part,&step,&t,&nppnodes,dd_nc,&npme,
&state.natoms,&state.ngtc,&state.nnhpres,&state.nhchainlength,
&(state.dfhist.nlambda),&state.flags,
- &flags_eks,&flags_enh,&flags_dfh,out);
+ &flags_eks,&flags_enh,&flags_dfh,&state.edsamstate.nED,out);
ret = do_cpt_state(gmx_fio_getxdr(fp),TRUE,state.flags,&state,TRUE,out);
if (ret)
{
ret = do_cpt_df_hist(gmx_fio_getxdr(fp),TRUE,
flags_dfh,&state.dfhist,out);
}
+
+ if (ret == 0)
+ {
+ ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state.edsamstate,out);
+ }
+
if (ret == 0)
{
do_cpt_files(gmx_fio_getxdr(fp),TRUE,&outputfiles,&nfiles,out,file_version);
#ifdef HAVE_LIBMKL
#include <mkl.h>
#endif
-#ifdef GMX_GPU
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#endif
#ifdef GMX_FFT_FFTW3
#include <fftw3.h>
#endif
* but we dont call this routine often, and it avoids using
* a mutex for locking the variable...
*/
-#ifdef GMX_FAHCORE
+#ifdef GMX_COOL_QUOTES
+ return (getenv("GMX_NO_QUOTES") == NULL);
+#else
/*be uncool*/
return FALSE;
-#else
- return (getenv("GMX_NO_QUOTES") == NULL);
#endif
}
return _gmx_ver_string;
}
+void gmx_print_version_info_gpu(FILE *fp);
+
void gmx_print_version_info(FILE *fp)
{
-#ifdef GMX_GPU
- int cuda_driver,cuda_runtime;
-#endif
-
fprintf(fp, "Gromacs version: %s\n", _gmx_ver_string);
#ifdef GMX_GIT_VERSION_INFO
fprintf(fp, "GIT SHA1 hash: %s\n", _gmx_full_git_hash);
#else
fprintf(fp, "Precision: single\n");
#endif
+ fprintf(fp, "Memory model: %lu bit\n",8*sizeof(void *));
#ifdef GMX_THREAD_MPI
fprintf(fp, "MPI library: thread_mpi\n");
__INTEL_MKL__,__INTEL_MKL_MINOR__,__INTEL_MKL_UPDATE__);
#endif
#ifdef GMX_GPU
- fprintf(fp, "CUDA compiler: %s\n",CUDA_NVCC_COMPILER_INFO);
- cuda_driver = 0;
- cudaDriverGetVersion(&cuda_driver);
- cuda_runtime = 0;
- cudaRuntimeGetVersion(&cuda_runtime);
- fprintf(fp, "CUDA driver: %d.%d\n",cuda_driver/1000, cuda_driver%100);
- fprintf(fp, "CUDA runtime: %d.%d\n",cuda_runtime/1000, cuda_runtime%100);
+ gmx_print_version_info_gpu(fp);
#endif
}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+#include "buildinfo.h"
+
+extern "C" void gmx_print_version_info_gpu(FILE *fp)
+{
+ int cuda_driver,cuda_runtime;
+ fprintf(fp, "CUDA compiler: %s\n",CUDA_NVCC_COMPILER_INFO);
+ cuda_driver = 0;
+ cudaDriverGetVersion(&cuda_driver);
+ cuda_runtime = 0;
+ cudaRuntimeGetVersion(&cuda_runtime);
+ fprintf(fp, "CUDA driver: %d.%d\n",cuda_driver/1000, cuda_driver%100);
+ fprintf(fp, "CUDA runtime: %d.%d\n",cuda_runtime/1000, cuda_runtime%100);
+}
{ eftASC, ".m2p", "ps", NULL, "Input file for mat2ps"},
{ eftXDR, ".mtx", "hessian","-m", "Hessian matrix"},
{ eftASC, ".edi", "sam", NULL, "ED sampling input"},
- { eftASC, ".edo", "sam", NULL, "ED sampling output"},
{ eftASC, ".hat", "gk", NULL, "Fourier transform of spread function" },
{ eftASC, ".cub", "pot", NULL, "Gaussian cube file" },
{ eftASC, ".xpm", "root", NULL, "X PixMap compatible matrix file" },
#ifdef _MSC_VER
/* MSVC definition for __cpuid() */
#include <intrin.h>
+/* sysinfo functions */
+#include <windows.h>
#endif
#ifdef HAVE_UNISTD_H
/* sysconf() definition */
#include <unistd.h>
#endif
+#include "gmx_cpuid.h"
-#include "gmx_cpuid.h"
-
+/* For convenience, and to enable configure-time invocation, we keep all architectures
+ * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
+ */
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+# define GMX_CPUID_X86
+#endif
/* Global constant character strings corresponding to our enumerated types */
const char *
int stepping;
/* Not using gmx_bool here, since this file must be possible to compile without simple.h */
char feature[GMX_CPUID_NFEATURES];
+
+ /* Basic CPU topology information. For x86 this is a bit complicated since the topology differs between
+ * operating systems and sometimes even settings. For most other architectures you can likely just check
+ * the documentation and then write static information to these arrays rather than detecting on-the-fly.
+ */
+ int have_cpu_topology;
+ int nproc; /* total number of logical processors from OS */
+ int npackages;
+ int ncores_per_package;
+ int nhwthreads_per_core;
+ int * package_id;
+ int * core_id; /* Local core id in each package */
+ int * hwthread_id; /* Local hwthread id in each core */
+ int * locality_order; /* Processor indices sorted in locality order */
};
#endif
-/* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
- * if the compiler handles GNU-style inline assembly.
- */
-#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+#ifdef GMX_CPUID_X86
/* Execute CPUID on x86 class CPUs. level sets function to exec, and the
* contents of register output is returned. See Intel/AMD docs for details.
{
int rc = 0;
+ /* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
+ * if the compiler handles GNU-style inline assembly.
+ */
+
#if (defined _MSC_VER)
int CPUInfo[4];
#endif
return rc;
}
-#endif /* architecture is x86 */
/* Identify CPU features common to Intel & AMD - mainly brand string,
execute_x86cpuid(0x80000007,0,&eax,&ebx,&ecx,&edx);
cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC] = (edx & (1 << 8)) != 0;
}
-
return 0;
}
+/* This routine returns the number of unique different elements found in the array,
+ * and renumbers these starting from 0. For example, the array {0,1,2,8,9,10,8,9,10,0,1,2}
+ * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the
+ * number of unique elements.
+ */
+static int
+cpuid_renumber_elements(int *data, int n)
+{
+ int *unique;
+ int i,j,nunique,found;
+
+ unique = malloc(sizeof(int)*n);
+
+ nunique=0;
+ for(i=0;i<n;i++)
+ {
+ for(j=0,found=0;j<nunique && !found;j++)
+ {
+ found = (data[i]==unique[j]);
+ }
+ if(!found)
+ {
+ /* Insert in sorted order! */
+ for(j=nunique++;j>0 && unique[j-1]>data[i];j--)
+ {
+ unique[j]=unique[j-1];
+ }
+ unique[j]=data[i];
+ }
+ }
+ /* renumber */
+ for(i=0;i<n;i++)
+ {
+ for(j=0;j<nunique;j++)
+ {
+ if(data[i]==unique[j])
+ {
+ data[i]=j;
+ }
+ }
+ }
+ return nunique;
+}
+
+/* APIC IDs, or everything you wanted to know about your x86 cores but were afraid to ask...
+ *
+ * Raw APIC IDs are unfortunately somewhat dirty. For technical reasons they are assigned
+ * in power-of-2 chunks, and even then there are no guarantees about specific numbers - all
+ * we know is that the part for each thread/core/package is unique, and how many bits are
+ * reserved for that part.
+ * This routine does internal renumbering so we get continuous indices, and also
+ * decodes the actual number of packages, cores-per-package and hwthreads-per-core.
+ */
+static void
+cpuid_x86_decode_apic_id(gmx_cpuid_t cpuid,int *apic_id,int core_bits,int hwthread_bits)
+{
+ int i,idx;
+ int hwthread_mask,core_mask_after_shift;
+
+ cpuid->hwthread_id = malloc(sizeof(int)*cpuid->nproc);
+ cpuid->core_id = malloc(sizeof(int)*cpuid->nproc);
+ cpuid->package_id = malloc(sizeof(int)*cpuid->nproc);
+ cpuid->locality_order = malloc(sizeof(int)*cpuid->nproc);
+
+ hwthread_mask = (1 << hwthread_bits) - 1;
+ core_mask_after_shift = (1 << core_bits) - 1;
+
+ for(i=0;i<cpuid->nproc;i++)
+ {
+ cpuid->hwthread_id[i] = apic_id[i] & hwthread_mask;
+ cpuid->core_id[i] = (apic_id[i] >> hwthread_bits) & core_mask_after_shift;
+ cpuid->package_id[i] = apic_id[i] >> (core_bits + hwthread_bits);
+ }
+
+ cpuid->npackages = cpuid_renumber_elements(cpuid->package_id,cpuid->nproc);
+ cpuid->ncores_per_package = cpuid_renumber_elements(cpuid->core_id,cpuid->nproc);
+ cpuid->nhwthreads_per_core = cpuid_renumber_elements(cpuid->hwthread_id,cpuid->nproc);
+
+ /* Create a locality order array, i.e. first all resources in package0, which in turn
+ * are sorted so we first have all resources in core0, where threads are sorted in order, etc.
+ */
+ for(i=0;i<cpuid->nproc;i++)
+ {
+ idx = (cpuid->package_id[i]*cpuid->ncores_per_package + cpuid->core_id[i])*cpuid->nhwthreads_per_core + cpuid->hwthread_id[i];
+ cpuid->locality_order[idx]=i;
+ }
+}
+
+
/* Detection of AMD-specific CPU features */
static int
cpuid_check_amd_x86(gmx_cpuid_t cpuid)
{
int max_stdfn,max_extfn;
unsigned int eax,ebx,ecx,edx;
-
+ int hwthread_bits,core_bits;
+ int * apic_id;
+
cpuid_check_common_x86(cpuid);
execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
cpuid->feature[GMX_CPUID_FEATURE_X86_XOP] = (ecx & (1 << 11)) != 0;
cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4] = (ecx & (1 << 16)) != 0;
}
-
+
+ /* Query APIC information on AMD */
+ if(max_extfn>=0x80000008)
+ {
+#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
+ /* Linux */
+ unsigned int i;
+ cpu_set_t cpuset,save_cpuset;
+ cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
+ apic_id = malloc(sizeof(int)*cpuid->nproc);
+ sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+ /* Get APIC id from each core */
+ CPU_ZERO(&cpuset);
+ for(i=0;i<cpuid->nproc;i++)
+ {
+ CPU_SET(i,&cpuset);
+ sched_setaffinity(0,sizeof(cpu_set_t),&cpuset);
+ execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx);
+ apic_id[i]=ebx >> 24;
+ CPU_CLR(i,&cpuset);
+ }
+ /* Reset affinity to the value it had when calling this routine */
+ sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+#define CPUID_HAVE_APIC
+#elif defined GMX_NATIVE_WINDOWS
+ /* Windows */
+ DWORD_PTR i;
+ SYSTEM_INFO sysinfo;
+ unsigned int save_affinity,affinity;
+ GetSystemInfo( &sysinfo );
+ cpuid->nproc = sysinfo.dwNumberOfProcessors;
+ apic_id = malloc(sizeof(int)*cpuid->nproc);
+ /* Get previous affinity mask */
+ save_affinity = SetThreadAffinityMask(GetCurrentThread(),1);
+ for(i=0;i<cpuid->nproc;i++)
+ {
+ SetThreadAffinityMask(GetCurrentThread(),(((DWORD_PTR)1)<<i));
+ Sleep(0);
+ execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx);
+ apic_id[i]=ebx >> 24;
+ }
+ SetThreadAffinityMask(GetCurrentThread(),save_affinity);
+#define CPUID_HAVE_APIC
+#endif
+#ifdef CPUID_HAVE_APIC
+ /* AMD does not support SMT yet - there are no hwthread bits in apic ID */
+ hwthread_bits = 0;
+ /* Get number of core bits in apic ID - try modern extended method first */
+ execute_x86cpuid(0x80000008,0,&eax,&ebx,&ecx,&edx);
+ core_bits = (ecx >> 12) & 0xf;
+ if(core_bits==0)
+ {
+ /* Legacy method for old single/dual core AMD CPUs */
+ int i = ecx & 0xF;
+ for(core_bits=0;(i>>core_bits)>0;core_bits++) ;
+ }
+ cpuid_x86_decode_apic_id(cpuid,apic_id,core_bits,hwthread_bits);
+ cpuid->have_cpu_topology = 1;
+#endif
+ }
return 0;
}
{
unsigned int max_stdfn,max_extfn;
unsigned int eax,ebx,ecx,edx;
- unsigned int i;
unsigned int max_logical_cores,max_physical_cores;
+ int hwthread_bits,core_bits;
+ int * apic_id;
cpuid_check_common_x86(cpuid);
cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0;
}
}
+
+ if(max_stdfn>=0xB)
+ {
+ /* Query x2 APIC information from cores */
+#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
+ /* Linux */
+ unsigned int i;
+ cpu_set_t cpuset,save_cpuset;
+ cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
+ apic_id = malloc(sizeof(int)*cpuid->nproc);
+ sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+ /* Get x2APIC ID from each hardware thread */
+ CPU_ZERO(&cpuset);
+ for(i=0;i<cpuid->nproc;i++)
+ {
+ CPU_SET(i,&cpuset);
+ sched_setaffinity(0,sizeof(cpu_set_t),&cpuset);
+ execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
+ apic_id[i]=edx;
+ CPU_CLR(i,&cpuset);
+ }
+ /* Reset affinity to the value it had when calling this routine */
+ sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+#define CPUID_HAVE_APIC
+#elif defined GMX_NATIVE_WINDOWS
+ /* Windows */
+ DWORD_PTR i;
+ SYSTEM_INFO sysinfo;
+ unsigned int save_affinity,affinity;
+ GetSystemInfo( &sysinfo );
+ cpuid->nproc = sysinfo.dwNumberOfProcessors;
+ apic_id = malloc(sizeof(int)*cpuid->nproc);
+ /* Get previous affinity mask */
+ save_affinity = SetThreadAffinityMask(GetCurrentThread(),1);
+ for(i=0;i<cpuid->nproc;i++)
+ {
+ SetThreadAffinityMask(GetCurrentThread(),(((DWORD_PTR)1)<<i));
+ Sleep(0);
+ execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
+ apic_id[i]=edx;
+ }
+ SetThreadAffinityMask(GetCurrentThread(),save_affinity);
+#define CPUID_HAVE_APIC
+#endif
+#ifdef CPUID_HAVE_APIC
+ execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
+ hwthread_bits = eax & 0x1F;
+ execute_x86cpuid(0xB,1,&eax,&ebx,&ecx,&edx);
+ core_bits = (eax & 0x1F) - hwthread_bits;
+ cpuid_x86_decode_apic_id(cpuid,apic_id,core_bits,hwthread_bits);
+ cpuid->have_cpu_topology = 1;
+#endif
+ }
return 0;
}
+#endif /* GMX_CPUID_X86 */
+
+
/* Try to find the vendor of the current CPU, so we know what specific
* detection routine to call.
/* Set default first */
vendor = GMX_CPUID_VENDOR_UNKNOWN;
+#ifdef GMX_CPUID_X86
execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
memcpy(vendorstring,&ebx,4);
vendor = i;
}
}
-
+#else
+ vendor = GMX_CPUID_VENDOR_UNKNOWN;
+#endif
+
return vendor;
}
+int
+gmx_cpuid_topology(gmx_cpuid_t cpuid,
+ int * nprocessors,
+ int * npackages,
+ int * ncores_per_package,
+ int * nhwthreads_per_core,
+ const int ** package_id,
+ const int ** core_id,
+ const int ** hwthread_id,
+ const int ** locality_order)
+{
+ int rc;
+
+ if(cpuid->have_cpu_topology)
+ {
+ *nprocessors = cpuid->nproc;
+ *npackages = cpuid->npackages;
+ *ncores_per_package = cpuid->ncores_per_package;
+ *nhwthreads_per_core = cpuid->nhwthreads_per_core;
+ *package_id = cpuid->package_id;
+ *core_id = cpuid->core_id;
+ *hwthread_id = cpuid->hwthread_id;
+ *locality_order = cpuid->locality_order;
+ rc = 0;
+ }
+ else
+ {
+ rc = -1;
+ }
+ return rc;
+}
+
+
+enum gmx_cpuid_x86_smt
+gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
+{
+ enum gmx_cpuid_x86_smt rc;
+
+ if(cpuid->have_cpu_topology)
+ {
+ rc = (cpuid->nhwthreads_per_core>1) ? GMX_CPUID_X86_SMT_ENABLED : GMX_CPUID_X86_SMT_DISABLED;
+ }
+ else if(cpuid->vendor==GMX_CPUID_VENDOR_AMD || gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)==0)
+ {
+ rc = GMX_CPUID_X86_SMT_DISABLED;
+ }
+ else
+ {
+ rc = GMX_CPUID_X86_SMT_CANNOTDETECT;
+ }
+ return rc;
+}
+
int
gmx_cpuid_init (gmx_cpuid_t * pcpuid)
{
cpuid->feature[i]=0;
}
-
+ cpuid->have_cpu_topology = 0;
+ cpuid->nproc = 0;
+ cpuid->npackages = 0;
+ cpuid->ncores_per_package = 0;
+ cpuid->nhwthreads_per_core = 0;
+ cpuid->package_id = NULL;
+ cpuid->core_id = NULL;
+ cpuid->hwthread_id = NULL;
+ cpuid->locality_order = NULL;
+
cpuid->vendor = cpuid_check_vendor();
-
+
switch(cpuid->vendor)
{
+#ifdef GMX_CPUID_X86
case GMX_CPUID_VENDOR_INTEL:
cpuid_check_intel_x86(cpuid);
break;
case GMX_CPUID_VENDOR_AMD:
cpuid_check_amd_x86(cpuid);
break;
+#endif
default:
/* Could not find vendor */
strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
cpuid->family = 0;
cpuid->model = 0;
cpuid->stepping = 0;
-
+
for(i=0;i<GMX_CPUID_NFEATURES;i++)
{
cpuid->feature[i]=0;
}
-enum gmx_cpuid_x86_smt
-gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
-{
-
-#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
- int i;
- int nproc;
- cpu_set_t cpuset,save_cpuset;
- int * apic_id;
- unsigned int eax,ebx,ecx,edx;
- int core_shift_bits;
- int smt_found;
-
- if( gmx_cpuid_vendor(cpuid)!=GMX_CPUID_VENDOR_INTEL ||
- gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)==0)
- {
- return GMX_CPUID_X86_SMT_DISABLED;
- }
-
- /* Check cpuid max standard function */
- execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
-
- /* Early CPUs that do not support function 11 do not support SMT either */
- if(eax<0xB)
- {
- return GMX_CPUID_X86_SMT_DISABLED;
- }
-
- /* If we got here, it is a modern Intel CPU that supports detection, as does our OS */
-
- /* How many processors? */
- nproc = sysconf(_SC_NPROCESSORS_ONLN);
-
- apic_id = malloc(sizeof(int)*nproc);
-
- sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset);
-
- /* Get x2APIC ID from each hardware thread */
- CPU_ZERO(&cpuset);
- for(i=0;i<nproc;i++)
- {
- CPU_SET(i,&cpuset);
- sched_setaffinity(0,sizeof(cpu_set_t),&cpuset);
- execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
- apic_id[i]=edx;
- CPU_CLR(i,&cpuset);
- }
- /* Reset affinity to the value it had when calling this routine */
- sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset);
-
- core_shift_bits = eax & 0x1F;
-
- /* Check if there is any other APIC id that is identical to [0], apart from
- * the hardware thread bit.
- */
- smt_found = 0;
- for(i=1;i<nproc && smt_found==0;i++)
- {
- smt_found = (apic_id[i]>>core_shift_bits == apic_id[0] >> core_shift_bits);
- }
-
- free(apic_id);
-
- if(smt_found==1)
- {
- return GMX_CPUID_X86_SMT_ENABLED;
- }
- else
- {
- return GMX_CPUID_X86_SMT_DISABLED;
- }
-#else
- /* Do the trivial stuff first. If Hyper-Threading isn't even supported it
- * cannot be enabled, no matter what OS detection we use!
- */
- if(0==gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT))
- {
- return GMX_CPUID_X86_SMT_DISABLED;
- }
- else
- {
- return GMX_CPUID_X86_SMT_CANNOTDETECT;
- }
-#endif
-}
-
-
-
#ifdef GMX_CPUID_STANDALONE
/* Stand-alone program to enable queries of CPU features from Cmake.
}
else
{
- sprintf(sbuf, "%d GPU%s %sselected to be used for this run: ",
+ sprintf(sbuf, "%d GPU%s %sselected for this run: ",
ngpu, (ngpu > 1) ? "s" : "",
gpu_info->bUserSet ? "user-" : "auto-");
for (i = 0; i < ngpu; i++)
if (idstr[i] < '0' || idstr[i] > '9')
{
gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
- invalid_gpuid_hint, idstr[i]);
+ idstr[i], invalid_gpuid_hint);
}
idlist[i] = idstr[i] - '0';
}
#endif
/* inform the user about the settings */
- if (SIMMASTER(cr) && bOMP)
+ if (bOMP)
{
#ifdef GMX_THREAD_MPI
const char *mpi_str="per tMPI thread";
/* for group scheme we print PME threads info only */
if (bFullOmpSupport)
{
- fprintf(stderr, "Using %d OpenMP thread%s %s\n",
- modth.gnth,modth.gnth > 1 ? "s" : "",
- cr->nnodes > 1 ? mpi_str : "");
+ md_print_info(cr, fplog, "Using %d OpenMP thread%s %s\n",
+ modth.gnth,modth.gnth > 1 ? "s" : "",
+ cr->nnodes > 1 ? mpi_str : "");
}
if (bSepPME && modth.gnth_pme != modth.gnth)
{
- fprintf(stderr, "Using %d OpenMP thread%s %s for PME\n",
- modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "",
- cr->nnodes > 1 ? mpi_str : "");
+ md_print_info(cr, fplog, "Using %d OpenMP thread%s %s for PME\n",
+ modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "",
+ cr->nnodes > 1 ? mpi_str : "");
}
}
def_bonded ("RBDIHS", "Ryckaert-Bell.", 4, 6, 6, eNR_RB, rbdihs ),
def_bonded ("FOURDIHS", "Fourier Dih.", 4, 4, 4, eNR_FOURDIH, rbdihs ),
def_bonded ("IDIHS", "Improper Dih.", 4, 2, 2, eNR_IMPROPER,idihs ),
- def_bonded ("PIDIHS", "Improper Dih.", 4, 3, 3, eNR_PROPER, pdihs ),
+ def_bonded ("PIDIHS", "Improper Dih.", 4, 3, 3, eNR_IMPROPER, pdihs ),
def_bondedt ("TABDIHS", "Tab. Dih.", 4, 2, 2, eNR_TABDIHS, tab_dihs ),
def_bonded ("CMAP", "CMAP Dih.", 5, -1, -1, eNR_CMAP, unimplemented ),
def_bonded ("GB12", "GB 1-2 Pol.", 2, 4, 0, eNR_GB, unimplemented ),
int i,nbtot;
gmx_bool bMultiPart;
- if (at_start != 0) {
- gmx_incons("In mk_graph_ilist at_start can not be != 0");
- }
- g->natoms = at_end;
+ /* The naming is somewhat confusing, but we need g->at0 and g->at1
+   * for shifting coordinates to a new array (not in place) when
+ * some atoms are not connected by the graph, which runs from
+ * g->at_start (>= g->at0) to g->at_end (<= g->at1).
+ */
+ g->at0 = at_start;
+ g->at1 = at_end;
snew(nbond,at_end);
nbtot = calc_start_end(fplog,g,ilist,at_start,at_end,nbond);
sfree(nbond);
- snew(g->ishift,g->natoms);
+ snew(g->ishift,g->at1);
if (gmx_debug_at)
p_graph(debug,"graph",g);
* at all. If we return without doing this for a system without bonds
* (i.e. only settles) all water molecules are moved to the opposite octant
*/
- for(i=0; (i<g->natoms); i++) {
+ for(i=g->at0; (i<g->at1); i++) {
g->ishift[i][XX]=g->ishift[i][YY]=g->ishift[i][ZZ]=0;
}
g1 = g->at_end;
is = g->ishift;
- for(j=0; j<g0; j++) {
+ for(j=g->at0; j<g0; j++) {
copy_rvec(x[j],x_s[j]);
}
}
}
- for(j=g1; j<g->natoms; j++) {
+ for(j=g1; j<g->at1; j++) {
copy_rvec(x[j],x_s[j]);
}
}
g1 = g->at_end;
is = g->ishift;
- for(j=0; j<g0; j++) {
+ for(j=g->at0; j<g0; j++) {
copy_rvec(x_s[j],x[j]);
}
}
}
- for(j=g1; j<g->natoms; j++) {
+ for(j=g1; j<g->at1; j++) {
copy_rvec(x_s[j],x[j]);
}
}
* written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
* a full list of developers and information, check out http://www.gromacs.org
*
- * This program is free software; you can redistribute it and/or modify it under
- * the terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option) any
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
* later version.
* As a special exception, you may use this file as part of a free software
* library without restriction. Specifically, if other files instantiate
* templates or use macros or inline functions from this file, or you compile
* this file and link it with other files to produce an executable, this
* file does not by itself cause the resulting executable to be covered by
- * the GNU Lesser General Public License.
+ * the GNU Lesser General Public License.
*
* In plain-speak: do not worry about classes/macros/templates either - only
* changes to the library have to be LGPL, not an application linking with it.
__m128d xmm1)
{
__m128d t2;
-
+
t2 = _mm_unpackhi_pd(xmm1,xmm1);
- _mm_store_sd(ptrA,xmm1);
- _mm_store_sd(ptrB,t2);
+ _mm_store_sd(ptrA,xmm1);
+ _mm_store_sd(ptrB,t2);
}
static void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
- _mm_store_sd(ptrA,xmm1);
+ _mm_store_sd(ptrA,xmm1);
}
double * gmx_restrict ptrB, __m128d xmm1)
{
__m128d t1;
-
+
t1 = _mm_unpackhi_pd(xmm1,xmm1);
xmm1 = _mm_add_sd(xmm1,_mm_load_sd(ptrA));
t1 = _mm_add_sd(t1,_mm_load_sd(ptrB));
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
-
+
tmp = gmx_mm_load_1real_pd(ptrA);
tmp = _mm_add_sd(tmp,xmm1);
gmx_mm_store_1real_pd(ptrA,tmp);
__m128d * gmx_restrict c12)
{
__m128d t1,t2,t3;
-
+
/* The c6/c12 array should be aligned */
t1 = _mm_loadu_pd(p1);
t2 = _mm_loadu_pd(p2);
- *c6 = _mm_unpacklo_pd(t1,t2);
- *c12 = _mm_unpackhi_pd(t1,t2);
+ *c6 = _mm_unpacklo_pd(t1,t2);
+ *c12 = _mm_unpackhi_pd(t1,t2);
}
static gmx_inline void
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1,
- __m128d * gmx_restrict y1,
- __m128d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1,
+ __m128d * gmx_restrict y1,
+ __m128d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz;
-
+
mem_xy = _mm_loadu_pd(xyz);
mem_z = _mm_load_sd(xyz+2);
mem_sxy = _mm_loadu_pd(xyz_shift);
mem_sz = _mm_load_sd(xyz_shift+2);
-
+
mem_xy = _mm_add_pd(mem_xy,mem_sxy);
mem_z = _mm_add_pd(mem_z,mem_sz);
-
+
*x1 = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
*y1 = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
*z1 = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
-
+
t1 = _mm_loadu_pd(xyz);
t2 = _mm_loadu_pd(xyz+2);
t3 = _mm_loadu_pd(xyz+4);
t4 = _mm_loadu_pd(xyz+6);
t5 = _mm_load_sd(xyz+8);
-
+
sxy = _mm_loadu_pd(xyz_shift);
sz = _mm_load_sd(xyz_shift+2);
szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-
+
t1 = _mm_add_pd(t1,sxy);
t2 = _mm_add_pd(t2,szx);
t3 = _mm_add_pd(t3,syz);
t4 = _mm_add_pd(t4,sxy);
t5 = _mm_add_sd(t5,sz);
-
+
*x1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
*y1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
*z1 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
- __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+ __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
-
+
t1 = _mm_loadu_pd(xyz);
t2 = _mm_loadu_pd(xyz+2);
t3 = _mm_loadu_pd(xyz+4);
t4 = _mm_loadu_pd(xyz+6);
t5 = _mm_loadu_pd(xyz+8);
t6 = _mm_loadu_pd(xyz+10);
-
+
sxy = _mm_loadu_pd(xyz_shift);
sz = _mm_load_sd(xyz_shift+2);
szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-
+
t1 = _mm_add_pd(t1,sxy);
t2 = _mm_add_pd(t2,szx);
t3 = _mm_add_pd(t3,syz);
t4 = _mm_add_pd(t4,sxy);
t5 = _mm_add_pd(t5,szx);
t6 = _mm_add_pd(t6,syz);
-
+
*x1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
*y1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
*z1 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
- *x = _mm_load_sd(p1);
- *y = _mm_load_sd(p1+1);
- *z = _mm_load_sd(p1+2);
+ *x = _mm_load_sd(p1);
+ *y = _mm_load_sd(p1+1);
+ *z = _mm_load_sd(p1+2);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
- *x1 = _mm_load_sd(p1);
- *y1 = _mm_load_sd(p1+1);
- *z1 = _mm_load_sd(p1+2);
- *x2 = _mm_load_sd(p1+3);
- *y2 = _mm_load_sd(p1+4);
- *z2 = _mm_load_sd(p1+5);
- *x3 = _mm_load_sd(p1+6);
- *y3 = _mm_load_sd(p1+7);
- *z3 = _mm_load_sd(p1+8);
+ *x1 = _mm_load_sd(p1);
+ *y1 = _mm_load_sd(p1+1);
+ *z1 = _mm_load_sd(p1+2);
+ *x2 = _mm_load_sd(p1+3);
+ *y2 = _mm_load_sd(p1+4);
+ *z2 = _mm_load_sd(p1+5);
+ *x3 = _mm_load_sd(p1+6);
+ *y3 = _mm_load_sd(p1+7);
+ *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
-__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+ __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrB);
t3 = _mm_loadu_pd(ptrA+2);
/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy, __m128d z)
-{
- __m128d t1,t2;
-
- t1 = _mm_loadu_pd(ptrA);
- t2 = _mm_load_sd(ptrA+2);
-
- t1 = _mm_sub_pd(t1,xy);
- t2 = _mm_sub_sd(t2,z);
-
- _mm_storeu_pd(ptrA,t1);
- _mm_store_sd(ptrA+2,t2);
-}
-
-
-static void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3)
-{
- __m128d t1,t2;
- __m128d tA,tB,tC,tD,tE;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_load_sd(ptrA+8);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_sd(tE,z3);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_store_sd(ptrA+8,tE);
-}
-
-static void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3,
- __m128d xy4, __m128d z4)
-{
- __m128d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_loadu_pd(ptrA+8);
- tF = _mm_loadu_pd(ptrA+10);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
- t3 = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
- t4 = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_pd(tE,t3);
- tF = _mm_sub_pd(tF,t4);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_storeu_pd(ptrA+8,tE);
- _mm_storeu_pd(ptrA+10,tF);
-}
-
-
static void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
__m128d t1,t2,t3;
-
+
t1 = _mm_load_sd(ptrA);
t2 = _mm_load_sd(ptrA+1);
t3 = _mm_load_sd(ptrA+2);
-
+
t1 = _mm_sub_sd(t1,x1);
t2 = _mm_sub_sd(t2,y1);
t3 = _mm_sub_sd(t3,z1);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_t1 = _mm_sub_pd(_t1,_x1);\
+_t2 = _mm_sub_pd(_t2,_z1);\
+_t3 = _mm_sub_pd(_t3,_y2);\
+_t4 = _mm_sub_pd(_t4,_x3);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
- __m128d x3, __m128d y3, __m128d z3)
+ __m128d x3, __m128d y3, __m128d z3)
{
__m128d t1,t2,t3,t4,t5;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t4 = _mm_loadu_pd(ptrA+6);
t5 = _mm_load_sd(ptrA+8);
-
+
x1 = _mm_unpacklo_pd(x1,y1);
z1 = _mm_unpacklo_pd(z1,x2);
y2 = _mm_unpacklo_pd(y2,z2);
x3 = _mm_unpacklo_pd(x3,y3);
/* nothing to be done for z3 */
-
+
t1 = _mm_sub_pd(t1,x1);
t2 = _mm_sub_pd(t2,z1);
t3 = _mm_sub_pd(t3,y2);
_mm_storeu_pd(ptrA+6,t4);
_mm_store_sd(ptrA+8,t5);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_z3 = _mm_unpacklo_pd(_z3,_x4);\
+_y4 = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA, _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
__m128d x3, __m128d y3, __m128d z3,
- __m128d x4, __m128d y4, __m128d z4)
+ __m128d x4, __m128d y4, __m128d z4)
{
__m128d t1,t2,t3,t4,t5,t6;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t4 = _mm_loadu_pd(ptrA+6);
t5 = _mm_loadu_pd(ptrA+8);
t6 = _mm_loadu_pd(ptrA+10);
-
+
x1 = _mm_unpacklo_pd(x1,y1);
z1 = _mm_unpacklo_pd(z1,x2);
y2 = _mm_unpacklo_pd(y2,z2);
x3 = _mm_unpacklo_pd(x3,y3);
z3 = _mm_unpacklo_pd(z3,x4);
y4 = _mm_unpacklo_pd(y4,z4);
-
+
_mm_storeu_pd(ptrA, _mm_sub_pd( t1,x1 ));
_mm_storeu_pd(ptrA+2, _mm_sub_pd( t2,z1 ));
_mm_storeu_pd(ptrA+4, _mm_sub_pd( t3,y2 ));
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5,z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
}
+#endif
+
static void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
__m128d t1,t2,t3,t4,t5,t6,t7;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_load_sd(ptrA+2);
t3 = _mm_loadu_pd(ptrB);
t4 = _mm_load_sd(ptrB+2);
-
+
t5 = _mm_unpacklo_pd(x1,y1);
t6 = _mm_unpackhi_pd(x1,y1);
t7 = _mm_unpackhi_pd(z1,z1);
-
+
t1 = _mm_sub_pd(t1,t5);
t2 = _mm_sub_sd(t2,z1);
-
+
t3 = _mm_sub_pd(t3,t6);
t4 = _mm_sub_sd(t4,t7);
-
+
_mm_storeu_pd(ptrA,t1);
_mm_store_sd(ptrA+2,t2);
_mm_storeu_pd(ptrB,t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrB);\
+_t7 = _mm_loadu_pd(ptrB+2);\
+_t8 = _mm_loadu_pd(ptrB+4);\
+_t9 = _mm_loadu_pd(ptrB+6);\
+_t10 = _mm_load_sd(ptrB+8);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpackhi_pd(_z3,_z3);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_t6 = _mm_sub_pd(_t6,_tB);\
+_t7 = _mm_sub_pd(_t7,_tD);\
+_t8 = _mm_sub_pd(_t8,_tF);\
+_t9 = _mm_sub_pd(_t9,_tH);\
+_t10 = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
- __m128d x3, __m128d y3, __m128d z3)
+ __m128d x3, __m128d y3, __m128d z3)
{
__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128d tA,tB,tC,tD,tE,tF,tG,tH,tI;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t8 = _mm_loadu_pd(ptrB+4);
t9 = _mm_loadu_pd(ptrB+6);
t10 = _mm_load_sd(ptrB+8);
-
+
tA = _mm_unpacklo_pd(x1,y1);
tB = _mm_unpackhi_pd(x1,y1);
tC = _mm_unpacklo_pd(z1,x2);
tG = _mm_unpacklo_pd(x3,y3);
tH = _mm_unpackhi_pd(x3,y3);
tI = _mm_unpackhi_pd(z3,z3);
-
+
t1 = _mm_sub_pd(t1,tA);
t2 = _mm_sub_pd(t2,tC);
t3 = _mm_sub_pd(t3,tE);
t4 = _mm_sub_pd(t4,tG);
t5 = _mm_sub_sd(t5,z3);
-
+
t6 = _mm_sub_pd(t6,tB);
t7 = _mm_sub_pd(t7,tD);
t8 = _mm_sub_pd(t8,tF);
t9 = _mm_sub_pd(t9,tH);
t10 = _mm_sub_sd(t10,tI);
-
+
_mm_storeu_pd(ptrA,t1);
_mm_storeu_pd(ptrA+2,t2);
_mm_storeu_pd(ptrA+4,t3);
_mm_storeu_pd(ptrB+6,t9);
_mm_store_sd(ptrB+8,t10);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_t7 = _mm_loadu_pd(ptrB);\
+_t8 = _mm_loadu_pd(ptrB+2);\
+_t9 = _mm_loadu_pd(ptrB+4);\
+_t10 = _mm_loadu_pd(ptrB+6);\
+_t11 = _mm_loadu_pd(ptrB+8);\
+_t12 = _mm_loadu_pd(ptrB+10);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpacklo_pd(_z3,_x4);\
+_tJ = _mm_unpackhi_pd(_z3,_x4);\
+_tK = _mm_unpacklo_pd(_y4,_z4);\
+_tL = _mm_unpackhi_pd(_y4,_z4);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_pd(_t5,_tI);\
+_t6 = _mm_sub_pd(_t6,_tK);\
+_t7 = _mm_sub_pd(_t7,_tB);\
+_t8 = _mm_sub_pd(_t8,_tD);\
+_t9 = _mm_sub_pd(_t9,_tF);\
+_t10 = _mm_sub_pd(_t10,_tH);\
+_t11 = _mm_sub_pd(_t11,_tJ);\
+_t12 = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA, _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB, _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
__m128d x3, __m128d y3, __m128d z3,
- __m128d x4, __m128d y4, __m128d z4)
+ __m128d x4, __m128d y4, __m128d z4)
{
__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
__m128d tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t10 = _mm_loadu_pd(ptrB+6);
t11 = _mm_loadu_pd(ptrB+8);
t12 = _mm_loadu_pd(ptrB+10);
-
+
tA = _mm_unpacklo_pd(x1,y1);
tB = _mm_unpackhi_pd(x1,y1);
tC = _mm_unpacklo_pd(z1,x2);
tJ = _mm_unpackhi_pd(z3,x4);
tK = _mm_unpacklo_pd(y4,z4);
tL = _mm_unpackhi_pd(y4,z4);
-
+
t1 = _mm_sub_pd(t1,tA);
t2 = _mm_sub_pd(t2,tC);
t3 = _mm_sub_pd(t3,tE);
t4 = _mm_sub_pd(t4,tG);
t5 = _mm_sub_pd(t5,tI);
t6 = _mm_sub_pd(t6,tK);
-
+
t7 = _mm_sub_pd(t7,tB);
t8 = _mm_sub_pd(t8,tD);
t9 = _mm_sub_pd(t9,tF);
t10 = _mm_sub_pd(t10,tH);
t11 = _mm_sub_pd(t11,tJ);
t12 = _mm_sub_pd(t12,tL);
-
+
_mm_storeu_pd(ptrA, t1);
_mm_storeu_pd(ptrA+2,t2);
_mm_storeu_pd(ptrA+4,t3);
_mm_storeu_pd(ptrB+8,t11);
_mm_storeu_pd(ptrB+10,t12);
}
-
+#endif
static gmx_inline void
{
fix1 = _mm_hadd_pd(fix1,fiy1);
fiz1 = _mm_hadd_pd(fiz1,fiz1);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));
-
+
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+_mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+fix1 = _mm_add_pd(fix1,fix3);\
+_t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+fiz1 = _mm_add_sd(fiz1,_t2);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
double * gmx_restrict fshiftptr)
{
__m128d t1,t2;
-
+
fix1 = _mm_hadd_pd(fix1,fiy1);
fiz1 = _mm_hadd_pd(fiz1,fix2);
fiy2 = _mm_hadd_pd(fiy2,fiz2);
fix3 = _mm_hadd_pd(fix3,fiy3);
fiz3 = _mm_hadd_pd(fiz3,fiz3);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
_mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));
-
+
fix1 = _mm_add_pd(fix1,fix3);
t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
fix1 = _mm_add_pd(fix1,t1); /* x and y sums */
-
+
t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));
fiz1 = _mm_add_sd(fiz1,fiz3);
fiz1 = _mm_add_sd(fiz1,t2); /* z sum */
-
+
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fix4);\
+fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+_mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));\
+_mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+_t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+fix3 = _mm_add_pd(fix3,_t2);\
+fix1 = _mm_add_pd(fix1,fix3);\
+fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
double * gmx_restrict fshiftptr)
{
__m128d t1,t2;
-
+
fix1 = _mm_hadd_pd(fix1,fiy1);
fiz1 = _mm_hadd_pd(fiz1,fix2);
fiy2 = _mm_hadd_pd(fiy2,fiz2);
fix3 = _mm_hadd_pd(fix3,fiy3);
fiz3 = _mm_hadd_pd(fiz3,fix4);
fiy4 = _mm_hadd_pd(fiy4,fiz4);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
_mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));
_mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));
-
+
t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
fix1 = _mm_add_pd(fix1,t1);
t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));
fix3 = _mm_add_pd(fix3,t2);
fix1 = _mm_add_pd(fix1,fix3); /* x and y sums */
-
+
fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));
fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));
fiz1 = _mm_add_sd(fiz1,fiz3); /* z sum */
-
+
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
+#endif
static gmx_inline void
{
pot1 = _mm_hadd_pd(pot1,pot2);
pot2 = _mm_unpackhi_pd(pot1,pot1);
-
+
_mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA)));
_mm_store_sd(ptrB,_mm_add_sd(pot2,_mm_load_sd(ptrB)));
}
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1,
- __m128 * gmx_restrict y1,
- __m128 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
{
__m128 t1,t2,t3,t4;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = _mm_maskload_ps(ptrA,mask);
- t2 = _mm_maskload_ps(ptrB,mask);
- t3 = _mm_maskload_ps(ptrC,mask);
- t4 = _mm_maskload_ps(ptrD,mask);
+ t1 = gmx_mm_maskload_ps(ptrA,mask);
+ t2 = gmx_mm_maskload_ps(ptrB,mask);
+ t3 = gmx_mm_maskload_ps(ptrC,mask);
+ t4 = gmx_mm_maskload_ps(ptrD,mask);
_MM_TRANSPOSE4_PS(t1,t2,t3,t4);
*x1 = t1;
*y1 = t2;
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+ __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+ __m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+ __m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+ _t13 = _mm_unpackhi_ps(_x1,_y1);\
+ _x1 = _mm_unpacklo_ps(_x1,_y1);\
+ _t14 = _mm_unpackhi_ps(_z1,_x2);\
+ _z1 = _mm_unpacklo_ps(_z1,_x2);\
+ _t15 = _mm_unpackhi_ps(_y2,_z2);\
+ _y2 = _mm_unpacklo_ps(_y2,_z2);\
+ _t16 = _mm_unpackhi_ps(_x3,_y3);\
+ _x3 = _mm_unpacklo_ps(_x3,_y3);\
+ _t17 = _mm_permute_ps(_z3,_MM_SHUFFLE(0,0,0,1));\
+ _t18 = _mm_movehl_ps(_z3,_z3);\
+ _t19 = _mm_permute_ps(_t18,_MM_SHUFFLE(0,0,0,1));\
+ _t20 = _mm_movelh_ps(_x1,_z1);\
+ _t21 = _mm_movehl_ps(_z1,_x1);\
+ _t22 = _mm_movelh_ps(_t13,_t14);\
+ _t14 = _mm_movehl_ps(_t14,_t13);\
+ _t23 = _mm_movelh_ps(_y2,_x3);\
+ _t24 = _mm_movehl_ps(_x3,_y2);\
+ _t25 = _mm_movelh_ps(_t15,_t16);\
+ _t16 = _mm_movehl_ps(_t16,_t15);\
+ _t1 = _mm_loadu_ps(ptrA);\
+ _t2 = _mm_loadu_ps(ptrA+4);\
+ _t3 = _mm_load_ss(ptrA+8);\
+ _t1 = _mm_sub_ps(_t1,_t20);\
+ _t2 = _mm_sub_ps(_t2,_t23);\
+ _t3 = _mm_sub_ss(_t3,_z3);\
+ _mm_storeu_ps(ptrA,_t1);\
+ _mm_storeu_ps(ptrA+4,_t2);\
+ _mm_store_ss(ptrA+8,_t3);\
+ _t4 = _mm_loadu_ps(ptrB);\
+ _t5 = _mm_loadu_ps(ptrB+4);\
+ _t6 = _mm_load_ss(ptrB+8);\
+ _t4 = _mm_sub_ps(_t4,_t21);\
+ _t5 = _mm_sub_ps(_t5,_t24);\
+ _t6 = _mm_sub_ss(_t6,_t17);\
+ _mm_storeu_ps(ptrB,_t4);\
+ _mm_storeu_ps(ptrB+4,_t5);\
+ _mm_store_ss(ptrB+8,_t6);\
+ _t7 = _mm_loadu_ps(ptrC);\
+ _t8 = _mm_loadu_ps(ptrC+4);\
+ _t9 = _mm_load_ss(ptrC+8);\
+ _t7 = _mm_sub_ps(_t7,_t22);\
+ _t8 = _mm_sub_ps(_t8,_t25);\
+ _t9 = _mm_sub_ss(_t9,_t18);\
+ _mm_storeu_ps(ptrC,_t7);\
+ _mm_storeu_ps(ptrC+4,_t8);\
+ _mm_store_ss(ptrC+8,_t9);\
+ _t10 = _mm_loadu_ps(ptrD);\
+ _t11 = _mm_loadu_ps(ptrD+4);\
+ _t12 = _mm_load_ss(ptrD+8);\
+ _t10 = _mm_sub_ps(_t10,_t14);\
+ _t11 = _mm_sub_ps(_t11,_t16);\
+ _t12 = _mm_sub_ss(_t12,_t19);\
+ _mm_storeu_ps(ptrD,_t10);\
+ _mm_storeu_ps(ptrD+4,_t11);\
+ _mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
_mm_storeu_ps(ptrD+4,t11);
_mm_store_ss(ptrD+8,t12);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+ __m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+ __m128 _t23,_t24;\
+ _t13 = _mm_unpackhi_ps(_x1,_y1);\
+ _x1 = _mm_unpacklo_ps(_x1,_y1);\
+ _t14 = _mm_unpackhi_ps(_z1,_x2);\
+ _z1 = _mm_unpacklo_ps(_z1,_x2);\
+ _t15 = _mm_unpackhi_ps(_y2,_z2);\
+ _y2 = _mm_unpacklo_ps(_y2,_z2);\
+ _t16 = _mm_unpackhi_ps(_x3,_y3);\
+ _x3 = _mm_unpacklo_ps(_x3,_y3);\
+ _t17 = _mm_unpackhi_ps(_z3,_x4);\
+ _z3 = _mm_unpacklo_ps(_z3,_x4);\
+ _t18 = _mm_unpackhi_ps(_y4,_z4);\
+ _y4 = _mm_unpacklo_ps(_y4,_z4);\
+ _t19 = _mm_movelh_ps(_x1,_z1);\
+ _z1 = _mm_movehl_ps(_z1,_x1);\
+ _t20 = _mm_movelh_ps(_t13,_t14);\
+ _t14 = _mm_movehl_ps(_t14,_t13);\
+ _t21 = _mm_movelh_ps(_y2,_x3);\
+ _x3 = _mm_movehl_ps(_x3,_y2);\
+ _t22 = _mm_movelh_ps(_t15,_t16);\
+ _t16 = _mm_movehl_ps(_t16,_t15);\
+ _t23 = _mm_movelh_ps(_z3,_y4);\
+ _y4 = _mm_movehl_ps(_y4,_z3);\
+ _t24 = _mm_movelh_ps(_t17,_t18);\
+ _t18 = _mm_movehl_ps(_t18,_t17);\
+ _t1 = _mm_loadu_ps(ptrA);\
+ _t2 = _mm_loadu_ps(ptrA+4);\
+ _t3 = _mm_loadu_ps(ptrA+8);\
+ _t1 = _mm_sub_ps(_t1,_t19);\
+ _t2 = _mm_sub_ps(_t2,_t21);\
+ _t3 = _mm_sub_ps(_t3,_t23);\
+ _mm_storeu_ps(ptrA,_t1);\
+ _mm_storeu_ps(ptrA+4,_t2);\
+ _mm_storeu_ps(ptrA+8,_t3);\
+ _t4 = _mm_loadu_ps(ptrB);\
+ _t5 = _mm_loadu_ps(ptrB+4);\
+ _t6 = _mm_loadu_ps(ptrB+8);\
+ _t4 = _mm_sub_ps(_t4,_z1);\
+ _t5 = _mm_sub_ps(_t5,_x3);\
+ _t6 = _mm_sub_ps(_t6,_y4);\
+ _mm_storeu_ps(ptrB,_t4);\
+ _mm_storeu_ps(ptrB+4,_t5);\
+ _mm_storeu_ps(ptrB+8,_t6);\
+ _t7 = _mm_loadu_ps(ptrC);\
+ _t8 = _mm_loadu_ps(ptrC+4);\
+ _t9 = _mm_loadu_ps(ptrC+8);\
+ _t7 = _mm_sub_ps(_t7,_t20);\
+ _t8 = _mm_sub_ps(_t8,_t22);\
+ _t9 = _mm_sub_ps(_t9,_t24);\
+ _mm_storeu_ps(ptrC,_t7);\
+ _mm_storeu_ps(ptrC+4,_t8);\
+ _mm_storeu_ps(ptrC+8,_t9);\
+ _t10 = _mm_loadu_ps(ptrD);\
+ _t11 = _mm_loadu_ps(ptrD+4);\
+ _t12 = _mm_loadu_ps(ptrD+8);\
+ _t10 = _mm_sub_ps(_t10,_t14);\
+ _t11 = _mm_sub_ps(_t11,_t16);\
+ _t12 = _mm_sub_ps(_t12,_t18);\
+ _mm_storeu_ps(ptrD,_t10);\
+ _mm_storeu_ps(ptrD+4,_t11);\
+ _mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
_mm_storeu_ps(ptrD+4,t11);
_mm_storeu_ps(ptrD+8,t12);
}
-
+#endif
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
_mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{\
+ __m128 _t1,_t2,_t3,_t4;\
+\
+ fix1 = _mm_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+ fix1 = _mm_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+ _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+ _t4 = _mm_load_ss(fshiftptr+2);\
+ _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+ _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+ _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+ _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+ _t3 = _mm_permute_ps(_t3 ,_MM_SHUFFLE(1,2,0,0));\
+ _t1 = _mm_add_ps(_t1,_t2);\
+ _t3 = _mm_add_ps(_t3,_t4);\
+ _t1 = _mm_add_ps(_t1,_t3);\
+ _mm_store_ss(fshiftptr+2,_t1);\
+ _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t1);
_mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m128 _t1,_t2,_t3,_t4,_t5;\
+\
+ fix1 = _mm_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm_hadd_ps(fiz3,fix4);\
+ fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+ fix1 = _mm_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+ _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+ _t5 = _mm_load_ss(fshiftptr+2);\
+ _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+ _t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));\
+ _t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));\
+ _t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));\
+ _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+ _t4 = _mm_shuffle_ps(fiz3,_t4 ,_MM_SHUFFLE(2,0,3,3));\
+ _t1 = _mm_add_ps(_t1,_t2);\
+ _t3 = _mm_add_ps(_t3,_t4);\
+ _t1 = _mm_add_ps(_t1,_t3);\
+ _t5 = _mm_add_ps(_t5,_t1);\
+ _mm_store_ss(fshiftptr+2,_t5);\
+ _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t5);
_mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
-
+#endif
static gmx_inline void
}
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
- __m128 pot2, float * gmx_restrict ptrB,
- __m128 pot3, float * gmx_restrict ptrC,
- __m128 pot4, float * gmx_restrict ptrD)
-{
- _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
- pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
- pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(1,1,1,1));
- pot3 = _mm_permute_ps(pot1,_MM_SHUFFLE(2,2,2,2));
- pot4 = _mm_permute_ps(pot1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
- _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
- _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
#endif /* _kernelutil_x86_avx_128_fma_single_h_ */
t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC),_mm_load_sd(ptrD));
- return gmx_mm256_set_m128(t2,t1);
+ return gmx_mm256_set_m128d(t2,t1);
}
{
__m256d t1,t2;
- t1 = gmx_mm256_set_m128(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
- t2 = gmx_mm256_set_m128(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */
+ t1 = gmx_mm256_set_m128d(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
+ t2 = gmx_mm256_set_m128d(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */
*c6 = _mm256_unpacklo_pd(t1,t2); /* c6d c6c | c6b c6a */
*c12 = _mm256_unpackhi_pd(t1,t2); /* c12d c12c | c12b c12a */
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m256d * gmx_restrict x1,
- __m256d * gmx_restrict y1,
- __m256d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m256d * gmx_restrict x1,
+ __m256d * gmx_restrict y1,
+ __m256d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;
ty = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
- *x1 = gmx_mm256_set_m128(tx,tx);
- *y1 = gmx_mm256_set_m128(ty,ty);
- *z1 = gmx_mm256_set_m128(tz,tz);
+ *x1 = gmx_mm256_set_m128d(tx,tx);
+ *y1 = gmx_mm256_set_m128d(ty,ty);
+ *z1 = gmx_mm256_set_m128d(tz,tz);
}
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+ __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+ __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;
tx = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
- *x1 = gmx_mm256_set_m128(tx,tx);
- *y1 = gmx_mm256_set_m128(ty,ty);
- *z1 = gmx_mm256_set_m128(tz,tz);
+ *x1 = gmx_mm256_set_m128d(tx,tx);
+ *y1 = gmx_mm256_set_m128d(ty,ty);
+ *z1 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
ty = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
tz = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
- *x2 = gmx_mm256_set_m128(tx,tx);
- *y2 = gmx_mm256_set_m128(ty,ty);
- *z2 = gmx_mm256_set_m128(tz,tz);
+ *x2 = gmx_mm256_set_m128d(tx,tx);
+ *y2 = gmx_mm256_set_m128d(ty,ty);
+ *z2 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
- *x3 = gmx_mm256_set_m128(tx,tx);
- *y3 = gmx_mm256_set_m128(ty,ty);
- *z3 = gmx_mm256_set_m128(tz,tz);
+ *x3 = gmx_mm256_set_m128d(tx,tx);
+ *y3 = gmx_mm256_set_m128d(ty,ty);
+ *z3 = gmx_mm256_set_m128d(tz,tz);
}
static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
- __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+ __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+ __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
+ __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;
tx = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
- *x1 = gmx_mm256_set_m128(tx,tx);
- *y1 = gmx_mm256_set_m128(ty,ty);
- *z1 = gmx_mm256_set_m128(tz,tz);
+ *x1 = gmx_mm256_set_m128d(tx,tx);
+ *y1 = gmx_mm256_set_m128d(ty,ty);
+ *z1 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
ty = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
tz = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
- *x2 = gmx_mm256_set_m128(tx,tx);
- *y2 = gmx_mm256_set_m128(ty,ty);
- *z2 = gmx_mm256_set_m128(tz,tz);
+ *x2 = gmx_mm256_set_m128d(tx,tx);
+ *y2 = gmx_mm256_set_m128d(ty,ty);
+ *z2 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
ty = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
tz = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
- *x3 = gmx_mm256_set_m128(tx,tx);
- *y3 = gmx_mm256_set_m128(ty,ty);
- *z3 = gmx_mm256_set_m128(tz,tz);
+ *x3 = gmx_mm256_set_m128d(tx,tx);
+ *y3 = gmx_mm256_set_m128d(ty,ty);
+ *z3 = gmx_mm256_set_m128d(tz,tz);
tx = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
ty = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
tz = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
- *x4 = gmx_mm256_set_m128(tx,tx);
- *y4 = gmx_mm256_set_m128(ty,ty);
- *z4 = gmx_mm256_set_m128(tz,tz);
+ *x4 = gmx_mm256_set_m128d(tx,tx);
+ *y4 = gmx_mm256_set_m128d(ty,ty);
+ *z4 = gmx_mm256_set_m128d(tz,tz);
}
}
-static void
-gmx_mm256_load_2rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
- __m256d t1,t2,t3;
-
- t1 = _mm256_loadu_pd(p1); /* x2 z1 | y1 x1 */
- t2 = _mm256_castpd128_pd256(_mm_loadu_pd(p1+4)); /* - - | z2 y2 */
-
- *x1 = t1;
- *y2 = t2;
-
- t3 = gmx_mm256_unpack128hi_pd(t1,t1);
-
- *z1 = t3;
- *y1 = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
- *z2 = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
- *x2 = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
-}
-
static void
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
t1 = _mm256_loadu_pd(p1);
t2 = _mm256_loadu_pd(p1+4);
t3 = _mm256_loadu_pd(p1+8);
-
+
t4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
t5 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
t6 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
*z1 = t4;
*x3 = t5;
*y4 = t6;
-
+
*y1 = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
*z2 = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
*x4 = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
}
-static void
-gmx_mm256_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
-{
- __m256d tA,tB,tC;
-
- tA = _mm256_loadu_pd(ptrA); /* - z1 | y1 x1 */
- tB = _mm256_loadu_pd(ptrB); /* - z2 | y2 x2 */
-
- tC = _mm256_unpacklo_pd(tA,tB); /* z2 z1 | x2 x1 */
-
- *x1 = tC;
- *y1 = _mm256_unpackhi_pd(tA,tB);
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(tC,0x1));
-}
-
-
-static void
-gmx_mm256_load_2rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
- __m256d t1,t2,t3,t4,t5;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4)); /* - - | z2a y2a */
- t4 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4)); /* - - | z2b y2b */
-
- t5 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t1 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
- *y2 = _mm256_unpacklo_pd(t3,t4); /* - - | y2b y2a */
- *z2 = _mm256_unpackhi_pd(t3,t4); /* - - | z2b z2a */
- *x1 = t5;
- *y1 = t1;
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));;
- *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-}
-
-
-static void
-gmx_mm256_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_loadu_pd(ptrA+4); /* y3a x3a | z2a y2a */
- t4 = _mm256_loadu_pd(ptrB+4); /* y3b x3b | z2b y2b */
- t5 = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /* - - | - z3a */
- t6 = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /* - - | - z3b */
-
- t7 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t1 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
-
- t2 = _mm256_unpacklo_pd(t3,t4); /* x3b x3a | y2b y2a */
- t3 = _mm256_unpackhi_pd(t3,t4); /* y3b y3a | z2b z2a */
-
- *z3 = _mm256_unpacklo_pd(t5,t6); /* - - | z3b z3a */
-
- *x1 = t7;
- *y1 = t1;
- *y2 = t2;
- *z2 = t3;
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
- *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
- *x3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
- *y3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-}
-
-
-static void
-gmx_mm256_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
- __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_loadu_pd(ptrA+4); /* y3a x3a | z2a y2a */
- t4 = _mm256_loadu_pd(ptrB+4); /* y3b x3b | z2b y2b */
- t5 = _mm256_loadu_pd(ptrA+8); /* z4a y4a | x4a z3a */
- t6 = _mm256_loadu_pd(ptrB+8); /* z4b y4b | x4b z3b */
-
- t7 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t1 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
-
- t2 = _mm256_unpacklo_pd(t3,t4); /* x3b x3a | y2b y2a */
- t3 = _mm256_unpackhi_pd(t3,t4); /* y3b y3a | z2b z2a */
-
- t4 = _mm256_unpacklo_pd(t5,t6); /* y4b y4a | z3b z3a */
- t5 = _mm256_unpackhi_pd(t5,t6); /* z4b z4a | x4b x4a */
-
- *x1 = t7;
- *y1 = t1;
- *y2 = t2;
- *z2 = t3;
- *z3 = t4;
- *x4 = t5;
-
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
- *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
- *x3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
- *y3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
- *y4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t4,0x1));;
- *z4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
-}
-
-
-
static void
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
- __m256d t1,t2,t3,t4,t5,t6;
+ __m256d t1,t2,t3,t4,t5,t6;
t1 = _mm256_loadu_pd(ptrA); /* - z1a | y1a x1a */
t2 = _mm256_loadu_pd(ptrB); /* - z1b | y1b x1b */
*z1 = gmx_mm256_unpack128hi_pd(t5,t1);
}
-static void
-gmx_mm256_load_2rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_loadu_pd(ptrC); /* x2c z1c | y1c x1c */
- t4 = _mm256_loadu_pd(ptrD); /* x2d z1d | y1d x1d */
- t5 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4)); /* - - | z2a y2a */
- t6 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4)); /* - - | z2b y2b */
- t7 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrC+4)); /* - - | z2c y2c */
- t8 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrD+4)); /* - - | z2d y2d */
-
- t9 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t10 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
- t1 = _mm256_unpacklo_pd(t3,t4); /* z1d z1c | x1d x1c */
- t2 = _mm256_unpackhi_pd(t3,t4); /* x2d x2c | y1d y1c */
- t3 = _mm256_unpacklo_pd(t5,t6); /* - - | y2b y2a */
- t4 = _mm256_unpackhi_pd(t5,t6); /* - - | z2b z2a */
- t5 = _mm256_unpacklo_pd(t7,t8); /* - - | y2d y2c */
- t6 = _mm256_unpackhi_pd(t7,t8); /* - - | z2d z2c */
-
- *x1 = gmx_mm256_unpack128lo_pd(t9,t1);
- *y1 = gmx_mm256_unpack128lo_pd(t10,t2);
- *z1 = gmx_mm256_unpack128hi_pd(t9,t1);
-
- *x2 = gmx_mm256_unpack128hi_pd(t10,t2);
- *y2 = gmx_mm256_unpack128lo_pd(t3,t5);
- *z2 = gmx_mm256_unpack128lo_pd(t4,t6);
-}
static void
-/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
-{
- __m256d t1,t2;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
- t1 = _mm256_sub_pd(t1,t2);
- /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
- _mm256_storeu_pd(ptrA,t1);
-}
-
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m256d xyz1, __m256d xyz2, __m256d xyz3)
-{
- __m256d t1,t2;
- __m256d tA,tB;
- __m128d tC;
-
- tA = _mm256_loadu_pd(ptrA);
- tB = _mm256_loadu_pd(ptrA+4);
- tC = _mm_load_sd(ptrA+8);
-
- /* xyz1: - z1 | y1 x1 */
- /* xyz2: - z2 | y2 x2 */
- /* xyz3: - z3 | y3 x3 */
-
- xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 - | x2 y2 */
- t1 = _mm256_permute2f128_pd(xyz2,xyz2,0x21); /* x2 y2 | z2 - | */
- xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
- xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /* - - | z2 y2 */
- t2 = _mm256_permute2f128_pd(xyz3,xyz3,0x21); /* y3 x3 | - z3 | */
- xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /* y3 x3 | z2 y2 */
-
- tA = _mm256_sub_pd(tA,xyz1);
- tB = _mm256_sub_pd(tB,xyz2);
- tC = _mm_sub_sd(tC, _mm256_castpd256_pd128(t2));
-
- _mm256_storeu_pd(ptrA,tA);
- _mm256_storeu_pd(ptrA+4,tB);
- _mm_store_sd(ptrA+8,tC);
-}
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
-{
- __m256d t1,t2,t3;
- __m256d tA,tB,tC;
-
- tA = _mm256_loadu_pd(ptrA);
- tB = _mm256_loadu_pd(ptrA+4);
- tC = _mm256_loadu_pd(ptrA+8);
-
- /* xyz1: - z1 | y1 x1 */
- /* xyz2: - z2 | y2 x2 */
- /* xyz3: - z3 | y3 x3 */
- /* xyz4: - z4 | y4 x4 */
-
- xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 - | x2 y2 */
- t1 = _mm256_permute2f128_pd(xyz2,xyz2,0x21); /* x2 y2 | z2 - | */
- xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
- xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /* - - | z2 y2 */
- t2 = _mm256_permute2f128_pd(xyz3,xyz3,0x21); /* y3 x3 | - z3 | */
- xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /* y3 x3 | z2 y2 */
- xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z4 - | x4 y4 */
- t3 = _mm256_permute2f128_pd(xyz4,xyz4,0x21); /* x4 y4 | z4 - */
- t3 = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0)); /* z4 y4| x4 - */
- xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1)); /* xz y4 | x4 z3 */
-
- tA = _mm256_sub_pd(tA,xyz1);
- tB = _mm256_sub_pd(tB,xyz2);
- tC = _mm256_sub_pd(tC,xyz4);
-
- _mm256_storeu_pd(ptrA,tA);
- _mm256_storeu_pd(ptrA+4,tB);
- _mm256_storeu_pd(ptrA+8,tC);
-}
-
-
-
-static void
-gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1)
-{
- __m128d t1,t2,t3;
-
- t1 = _mm_sub_sd(_mm256_castpd256_pd128(x1),_mm_load_sd(ptrA));
- t2 = _mm_sub_sd(_mm256_castpd256_pd128(y1),_mm_load_sd(ptrA+1));
- t3 = _mm_sub_sd(_mm256_castpd256_pd128(z1),_mm_load_sd(ptrA+2));
- _mm_store_sd(ptrA,t1);
- _mm_store_sd(ptrA+1,t2);
- _mm_store_sd(ptrA+2,t3);
-}
-
-
-static void
-gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1;
- __m128d tA;
- t1 = _mm256_loadu_pd(ptrA);
- tA = _mm_loadu_pd(ptrA+4);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-
- t1 = _mm256_sub_pd(x1,t1);
- tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm_storeu_pd(ptrA+4,tA);
-}
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3)
-{
- __m256d t1,t2;
- __m128d tA;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrA+4);
- tA = _mm_load_sd(ptrA+8);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
- y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
- t1 = _mm256_sub_pd(t1,x1);
- t2 = _mm256_sub_pd(t2,y2);
- tA = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrA+4,t2);
- _mm_store_sd(ptrA+8,tA);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
-{
- __m256d t1,t2,t3;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrA+4);
- t3 = _mm256_loadu_pd(ptrA+8);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- z3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
- y4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
- y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
- z3 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */
-
- t1 = _mm256_sub_pd(t1,x1);
- t2 = _mm256_sub_pd(t2,y2);
- t3 = _mm256_sub_pd(t3,z3);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrA+4,t2);
- _mm256_storeu_pd(ptrA+8,t3);
-}
-
-static void
-gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
- double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1)
-{
- __m256d t1,t2,t3,t4;
- __m256i mask;
-
- t3 = _mm256_loadu_pd(ptrA);
- t4 = _mm256_loadu_pd(ptrB);
-
- t1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- t2 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- t1 = gmx_mm256_unpack128lo_pd(t1,z1); /* - z1a | y1a x1a */
- z1 = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
- t2 = gmx_mm256_unpack128lo_pd(t2,z1); /* z1b z1a | y1b x1b */
-
- /* Construct a mask without executing any data loads */
- mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
- _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
-
- t3 = _mm256_sub_pd(t3,t1);
- t4 = _mm256_sub_pd(t4,t2);
-
- /* Careful with potentially overlapping stores, need to be masked */
- _mm256_maskstore_pd(ptrA,mask,t3);
- _mm256_maskstore_pd(ptrB,mask,t4);
-}
-
-static void
-gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1,t2,t5;
- __m128d t3,t4;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm_loadu_pd(ptrA+4);
- t4 = _mm_loadu_pd(ptrB+4);
-
- t5 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t1 = _mm256_sub_pd(t1,z2);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
- t4 = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));
-
- /* Careful with potentially overlapping stores, need to be masked */
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm_storeu_pd(ptrA+4,t3);
- _mm_storeu_pd(ptrB+4,t4);
-}
-
-static void
-gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3)
-{
- __m256d t1,t2,t3,t4,t5,t6;
- __m128d tA,tB;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm256_loadu_pd(ptrA+4);
- t4 = _mm256_loadu_pd(ptrB+4);
- tA = _mm_load_sd(ptrA+8);
- tB = _mm_load_sd(ptrB+8);
-
- t5 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- x3 = _mm256_unpackhi_pd(x3,y3); /* - - | y3b x3b */
-
- t6 = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* - - | - z3b */
-
- y3 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t5 = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
- x1 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
- t1 = _mm256_sub_pd(t1,y3);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm256_sub_pd(t3,t5);
- t4 = _mm256_sub_pd(t4,x1);
- tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
- tB = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm256_storeu_pd(ptrA+4,t3);
- _mm256_storeu_pd(ptrB+4,t4);
- _mm_store_sd(ptrA+8,tA);
- _mm_store_sd(ptrB+8,tB);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm256_loadu_pd(ptrA+4);
- t4 = _mm256_loadu_pd(ptrB+4);
- t5 = _mm256_loadu_pd(ptrA+8);
- t6 = _mm256_loadu_pd(ptrB+8);
-
- t7 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- x3 = _mm256_unpackhi_pd(x3,y3); /* - - | y3b x3b */
-
- y3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
- z3 = _mm256_unpackhi_pd(z3,x4); /* - - | x4b z3b */
- x4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */
- y4 = _mm256_unpackhi_pd(y4,z4); /* - - | z4b y4b */
-
- z4 = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t7 = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
- x1 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
- x2 = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
- y2 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */
-
- t1 = _mm256_sub_pd(t1,z4);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm256_sub_pd(t3,t7);
- t4 = _mm256_sub_pd(t4,x1);
- t5 = _mm256_sub_pd(t5,x2);
- t6 = _mm256_sub_pd(t6,y2);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm256_storeu_pd(ptrA+4,t3);
- _mm256_storeu_pd(ptrB+4,t4);
- _mm256_storeu_pd(ptrA+8,t5);
- _mm256_storeu_pd(ptrB+8,t6);
-}
-
-
-
static void
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1)
+ double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+ __m256d x1, __m256d y1, __m256d z1)
{
__m256d t1,t2,tA,tB,tC,tD;
__m256i mask;
/* Construct a mask without executing any data loads */
mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
- _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
+ _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
tA = _mm256_loadu_pd(ptrA);
tB = _mm256_loadu_pd(ptrB);
_mm256_maskstore_pd(ptrD,mask,tD);
}
-static void
-gmx_mm256_decrement_2rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1,t2,t3,t4,t5,t6;
- __m128d tA,tB,tC,tD,tE,tF;
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm256_loadu_pd(ptrC);
- t4 = _mm256_loadu_pd(ptrD);
- tA = _mm_loadu_pd(ptrA+4);
- tB = _mm_loadu_pd(ptrB+4);
- tC = _mm_loadu_pd(ptrC+4);
- tD = _mm_loadu_pd(ptrD+4);
-
- t5 = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
- y1 = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
- x2 = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
- t6 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
- z2 = gmx_mm256_unpack128hi_pd(t5,y1); /* x2c z1c | y1c x1c */
- t5 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
- y1 = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */
-
- tE = _mm256_extractf128_pd(x2,0x1); /* z2c y2c */
- tF = _mm256_extractf128_pd(y2,0x1); /* z2d y2d */
-
- t1 = _mm256_sub_pd(t1,t6);
- t2 = _mm256_sub_pd(t2,t5);
- t3 = _mm256_sub_pd(t3,z2);
- t4 = _mm256_sub_pd(t4,y1);
- tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(x2));
- tB = _mm_sub_pd(tB,_mm256_castpd256_pd128(y2));
- tC = _mm_sub_pd(tC,tE);
- tD = _mm_sub_pd(tD,tF);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm256_storeu_pd(ptrC,t3);
- _mm256_storeu_pd(ptrD,t4);
- _mm_storeu_pd(ptrA+4,tA);
- _mm_storeu_pd(ptrB+4,tB);
- _mm_storeu_pd(ptrC+4,tC);
- _mm_storeu_pd(ptrD+4,tD);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than three xmm/ymm (SIMD) parameters by value */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+ __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+ __m128d _tA,_tB,_tC,_tD,_tE;\
+ _t1 = _mm256_loadu_pd(ptrA);\
+ _t2 = _mm256_loadu_pd(ptrB);\
+ _t3 = _mm256_loadu_pd(ptrC);\
+ _t4 = _mm256_loadu_pd(ptrD);\
+ _t5 = _mm256_loadu_pd(ptrA+4);\
+ _t6 = _mm256_loadu_pd(ptrB+4);\
+ _t7 = _mm256_loadu_pd(ptrC+4);\
+ _t8 = _mm256_loadu_pd(ptrD+4);\
+ _tA = _mm_load_sd(ptrA+8);\
+ _tB = _mm_load_sd(ptrB+8);\
+ _tC = _mm_load_sd(ptrC+8);\
+ _tD = _mm_load_sd(ptrD+8);\
+ _t9 = _mm256_unpacklo_pd(_x1,_y1);\
+ _x1 = _mm256_unpackhi_pd(_x1,_y1);\
+ _y1 = _mm256_unpacklo_pd(_z1,_x2);\
+ _z1 = _mm256_unpackhi_pd(_z1,_x2);\
+ _x2 = _mm256_unpacklo_pd(_y2,_z2);\
+ _y2 = _mm256_unpackhi_pd(_y2,_z2);\
+ _z2 = _mm256_unpacklo_pd(_x3,_y3);\
+ _x3 = _mm256_unpackhi_pd(_x3,_y3);\
+ _t10 = gmx_mm256_unpack128lo_pd(_t9,_y1);\
+ _y3 = gmx_mm256_unpack128hi_pd(_t9,_y1);\
+ _t9 = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+ _y1 = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+ _x1 = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+ _z1 = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+ _x2 = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+ _z2 = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+ _t1 = _mm256_sub_pd(_t1,_t10);\
+ _t2 = _mm256_sub_pd(_t2,_t9);\
+ _t3 = _mm256_sub_pd(_t3,_y3);\
+ _t4 = _mm256_sub_pd(_t4,_y1);\
+ _t5 = _mm256_sub_pd(_t5,_x1);\
+ _t6 = _mm256_sub_pd(_t6,_x2);\
+ _t7 = _mm256_sub_pd(_t7,_z1);\
+ _t8 = _mm256_sub_pd(_t8,_z2);\
+ _tA = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3));\
+ _tB = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3),_GMX_MM_PERMUTE128D(1,1)));\
+ _tE = _mm256_extractf128_pd(_z3,0x1);\
+ _tC = _mm_sub_sd(_tC, _tE);\
+ _tD = _mm_sub_sd(_tD, _mm_permute_pd(_tE,_GMX_MM_PERMUTE128D(1,1)));\
+ _mm256_storeu_pd(ptrA,_t1);\
+ _mm256_storeu_pd(ptrB,_t2);\
+ _mm256_storeu_pd(ptrC,_t3);\
+ _mm256_storeu_pd(ptrD,_t4);\
+ _mm256_storeu_pd(ptrA+4,_t5);\
+ _mm256_storeu_pd(ptrB+4,_t6);\
+ _mm256_storeu_pd(ptrC+4,_t7);\
+ _mm256_storeu_pd(ptrD+4,_t8);\
+ _mm_store_sd(ptrA+8,_tA);\
+ _mm_store_sd(ptrB+8,_tB);\
+ _mm_store_sd(ptrC+8,_tC);\
+ _mm_store_sd(ptrD+8,_tD);\
}
-
-
+#else
+/* Real function for sane compilers */
static void
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3)
+ double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+ __m256d x1, __m256d y1, __m256d z1,
+ __m256d x2, __m256d y2, __m256d z2,
+ __m256d x3, __m256d y3, __m256d z3)
{
__m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128d tA,tB,tC,tD,tE;
_mm_store_sd(ptrC+8,tC);
_mm_store_sd(ptrD+8,tD);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than three xmm/ymm (SIMD) parameters by value */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{ \
+ __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14;\
+ __m128d _tA,_tB,_tC,_tD,_tE;\
+ _t1 = _mm256_loadu_pd(ptrA);\
+ _t2 = _mm256_loadu_pd(ptrB);\
+ _t3 = _mm256_loadu_pd(ptrC);\
+ _t4 = _mm256_loadu_pd(ptrD);\
+ _t5 = _mm256_loadu_pd(ptrA+4);\
+ _t6 = _mm256_loadu_pd(ptrB+4);\
+ _t7 = _mm256_loadu_pd(ptrC+4);\
+ _t8 = _mm256_loadu_pd(ptrD+4);\
+ _t9 = _mm256_loadu_pd(ptrA+8);\
+ _t10 = _mm256_loadu_pd(ptrB+8);\
+ _t11 = _mm256_loadu_pd(ptrC+8);\
+ _t12 = _mm256_loadu_pd(ptrD+8);\
+ _t13 = _mm256_unpacklo_pd(_x1,_y1);\
+ _x1 = _mm256_unpackhi_pd(_x1,_y1);\
+ _y1 = _mm256_unpacklo_pd(_z1,_x2);\
+ _z1 = _mm256_unpackhi_pd(_z1,_x2);\
+ _x2 = _mm256_unpacklo_pd(_y2,_z2);\
+ _y2 = _mm256_unpackhi_pd(_y2,_z2);\
+ _z2 = _mm256_unpacklo_pd(_x3,_y3);\
+ _x3 = _mm256_unpackhi_pd(_x3,_y3);\
+ _y3 = _mm256_unpacklo_pd(_z3,_x4);\
+ _z3 = _mm256_unpackhi_pd(_z3,_x4);\
+ _x4 = _mm256_unpacklo_pd(_y4,_z4);\
+ _y4 = _mm256_unpackhi_pd(_y4,_z4);\
+ _z4 = gmx_mm256_unpack128lo_pd(_t13,_y1);\
+ _t13 = gmx_mm256_unpack128hi_pd(_t13,_y1);\
+ _y1 = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+ _x1 = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+ _z1 = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+ _x2 = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+ _z2 = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+ _y2 = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+ _x3 = gmx_mm256_unpack128lo_pd(_y3,_x4);\
+ _y3 = gmx_mm256_unpack128hi_pd(_y3,_x4);\
+ _x4 = gmx_mm256_unpack128lo_pd(_z3,_y4);\
+ _z3 = gmx_mm256_unpack128hi_pd(_z3,_y4);\
+ _t1 = _mm256_sub_pd(_t1,_z4);\
+ _t2 = _mm256_sub_pd(_t2,_y1);\
+ _t3 = _mm256_sub_pd(_t3,_t13);\
+ _t4 = _mm256_sub_pd(_t4,_x1);\
+ _t5 = _mm256_sub_pd(_t5,_z1);\
+ _t6 = _mm256_sub_pd(_t6,_z2);\
+ _t7 = _mm256_sub_pd(_t7,_x2);\
+ _t8 = _mm256_sub_pd(_t8,_y2);\
+ _t9 = _mm256_sub_pd(_t9,_x3);\
+ _t10 = _mm256_sub_pd(_t10,_x4);\
+ _t11 = _mm256_sub_pd(_t11,_y3);\
+ _t12 = _mm256_sub_pd(_t12,_z3);\
+ _mm256_storeu_pd(ptrA,_t1);\
+ _mm256_storeu_pd(ptrB,_t2);\
+ _mm256_storeu_pd(ptrC,_t3);\
+ _mm256_storeu_pd(ptrD,_t4);\
+ _mm256_storeu_pd(ptrA+4,_t5);\
+ _mm256_storeu_pd(ptrB+4,_t6);\
+ _mm256_storeu_pd(ptrC+4,_t7);\
+ _mm256_storeu_pd(ptrD+4,_t8);\
+ _mm256_storeu_pd(ptrA+8,_t9);\
+ _mm256_storeu_pd(ptrB+8,_t10);\
+ _mm256_storeu_pd(ptrC+8,_t11);\
+ _mm256_storeu_pd(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
+ double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+ __m256d x1, __m256d y1, __m256d z1,
+ __m256d x2, __m256d y2, __m256d z2,
+ __m256d x3, __m256d y3, __m256d z3,
+ __m256d x4, __m256d y4, __m256d z4)
{
__m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
__m128d tA,tB,tC,tD,tE;
_mm256_storeu_pd(ptrC+8,t11);
_mm256_storeu_pd(ptrD+8,t12);
}
+#endif
static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
+ double * gmx_restrict fptr,
+ double * gmx_restrict fshiftptr)
{
__m256d t1,t2;
__m128d tA,tB;
tA = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1));
tB = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1));
- fix1 = gmx_mm256_set_m128(tB,tA); /* 0 fiz fiy fix */
+ fix1 = gmx_mm256_set_m128d(tB,tA); /* 0 fiz fiy fix */
t1 = _mm256_loadu_pd(fptr);
t2 = _mm256_loadu_pd(fshiftptr);
_mm256_storeu_pd(fshiftptr,t2);
}
-static gmx_inline void
-gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- __m256d fix2, __m256d fiy2, __m256d fiz2,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
-{
- __m256d t1,t2,t3;
- __m128d tA,tB,tC,tD,tE;
- fix1 = _mm256_hadd_pd(fix1,fiy1);
- fiz1 = _mm256_hadd_pd(fiz1,fix2);
- fiy2 = _mm256_hadd_pd(fiy2,fiz2);
- /* Add across the two lanes by swapping and adding back */
- tA = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1)); /* fiy1 fix1 */
- tB = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
- tC = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
-
- t1 = gmx_mm256_set_m128(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
-
- t2 = _mm256_loadu_pd(fptr);
- tD = _mm_loadu_pd(fptr+4);
-
- t2 = _mm256_add_pd(t2,t1);
- tD = _mm_add_pd(tD,tC);
- _mm256_storeu_pd(fptr,t2);
- _mm_storeu_pd(fptr+4,tD);
-
- /* Add up shift force */
- /* t1: fix2 fiz1 | fiy1 fix1 */
- /* tC: fiz2 fiy2 */
-
- tA = _mm256_extractf128_pd(t1,0x1); /* fix2 fiz1 */
- tB = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1)); /* fiy2 fix2 */
- tC = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1)); /* - fiz2 */
-
- tB = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));
- tC = _mm_add_sd(tC,tA);
-
- tD = _mm_loadu_pd(fshiftptr);
- tE = _mm_load_sd(fshiftptr+2);
-
- tD = _mm_add_pd(tD,tB);
- tE = _mm_add_pd(tE,tC);
-
- _mm_storeu_pd(fshiftptr,tD);
- _mm_store_sd(fshiftptr+2,tE);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than three xmm/ymm (SIMD) parameters by value */
+#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{ \
+ __m256d _t1,_t2,_t3,_t4;\
+ __m128d _tz3,_tA,_tB,_tC,_tD;\
+ fix1 = _mm256_hadd_pd(fix1,fiy1);\
+ fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+ fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+ fix3 = _mm256_hadd_pd(fix3,fiy3);\
+ fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd());\
+ _t1 = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+ _t2 = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+ _t1 = _mm256_add_pd(_t1,_t2);\
+ _t3 = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+ _t4 = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+ _t3 = _mm256_add_pd(_t3,_t4);\
+ _tz3 = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1));\
+ _t2 = _mm256_loadu_pd(fptr);\
+ _t4 = _mm256_loadu_pd(fptr+4);\
+ _tA = _mm_load_sd(fptr+8);\
+ _t2 = _mm256_add_pd(_t2,_t1);\
+ _t4 = _mm256_add_pd(_t4,_t3);\
+ _tA = _mm_add_sd(_tA,_tz3);\
+ _mm256_storeu_pd(fptr,_t2);\
+ _mm256_storeu_pd(fptr+4,_t4);\
+ _mm_store_sd(fptr+8,_tA);\
+ _tB = _mm256_extractf128_pd(_t1,0x1);\
+ _tC = _mm256_extractf128_pd(_t3,0x1);\
+ _tz3 = _mm_add_sd(_tz3,_tB);\
+ _tD = _mm_permute_pd(_mm256_castpd256_pd128(_t3),_GMX_MM_PERMUTE128D(1,1));\
+ _tz3 = _mm_add_sd(_tz3,_tD);\
+ _tC = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t1));\
+ _tD = _mm_shuffle_pd(_tB,_mm256_castpd256_pd128(_t3),_MM_SHUFFLE2(0,1));\
+ _tC = _mm_add_pd(_tC,_tD);\
+ _tA = _mm_loadu_pd(fshiftptr);\
+ _tB = _mm_load_sd(fshiftptr+2);\
+ _tA = _mm_add_pd(_tA,_tC);\
+ _tB = _mm_add_sd(_tB,_tz3);\
+ _mm_storeu_pd(fshiftptr,_tA);\
+ _mm_store_sd(fshiftptr+2,_tB);\
}
-
-
-
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- __m256d fix2, __m256d fiy2, __m256d fiz2,
- __m256d fix3, __m256d fiy3, __m256d fiz3,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
+ __m256d fix2, __m256d fiy2, __m256d fiz2,
+ __m256d fix3, __m256d fiy3, __m256d fiz3,
+ double * gmx_restrict fptr,
+ double * gmx_restrict fshiftptr)
{
__m256d t1,t2,t3,t4;
__m128d tz3,tA,tB,tC,tD;
_mm_storeu_pd(fshiftptr,tA);
_mm_store_sd(fshiftptr+2,tB);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than three xmm/ymm (SIMD) parameters by value */
+#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m256d _t1,_t2,_t3,_t4,_t5,_t6;\
+ __m128d _tA,_tB,_tC,_tD;\
+ fix1 = _mm256_hadd_pd(fix1,fiy1);\
+ fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+ fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+ fix3 = _mm256_hadd_pd(fix3,fiy3);\
+ fiz3 = _mm256_hadd_pd(fiz3,fix4);\
+ fiy4 = _mm256_hadd_pd(fiy4,fiz4);\
+ _t1 = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+ _t2 = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+ _t1 = _mm256_add_pd(_t1,_t2);\
+ _t3 = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+ _t4 = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+ _t3 = _mm256_add_pd(_t3,_t4);\
+ _t5 = gmx_mm256_unpack128lo_pd(fiz3,fiy4);\
+ _t6 = gmx_mm256_unpack128hi_pd(fiz3,fiy4);\
+ _t5 = _mm256_add_pd(_t5,_t6);\
+ _t2 = _mm256_loadu_pd(fptr);\
+ _t4 = _mm256_loadu_pd(fptr+4);\
+ _t6 = _mm256_loadu_pd(fptr+8);\
+ _t2 = _mm256_add_pd(_t2,_t1);\
+ _t4 = _mm256_add_pd(_t4,_t3);\
+ _t6 = _mm256_add_pd(_t6,_t5);\
+ _mm256_storeu_pd(fptr,_t2);\
+ _mm256_storeu_pd(fptr+4,_t4);\
+ _mm256_storeu_pd(fptr+8,_t6);\
+ _tA = _mm256_extractf128_pd(_t1,0x1);\
+ _tB = _mm256_extractf128_pd(_t3,0x1);\
+ _tC = _mm256_extractf128_pd(_t5,0x1);\
+ _tB = _mm_add_pd(_tB,_mm256_castpd256_pd128(_t1));\
+ _tA = _mm_add_pd(_tA,_mm256_castpd256_pd128(_t5));\
+ _tC = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t3));\
+ _tD = _mm_shuffle_pd(_tA,_tC,_MM_SHUFFLE2(0,1));\
+ _tB = _mm_add_pd(_tB,_tD);\
+ _tC = _mm_permute_pd(_tC,_GMX_MM_PERMUTE128D(1,1));\
+ _tC = _mm_add_sd(_tC,_tA);\
+ _tA = _mm_loadu_pd(fshiftptr);\
+ _tD = _mm_load_sd(fshiftptr+2);\
+ _tA = _mm_add_pd(_tA,_tB);\
+ _tD = _mm_add_sd(_tD,_tC);\
+ _mm_storeu_pd(fshiftptr,_tA);\
+ _mm_store_sd(fshiftptr+2,_tD);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- __m256d fix2, __m256d fiy2, __m256d fiz2,
- __m256d fix3, __m256d fiy3, __m256d fiz3,
- __m256d fix4, __m256d fiy4, __m256d fiz4,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
+ __m256d fix2, __m256d fiy2, __m256d fiz2,
+ __m256d fix3, __m256d fiy3, __m256d fiz3,
+ __m256d fix4, __m256d fiy4, __m256d fiz4,
+ double * gmx_restrict fptr,
+ double * gmx_restrict fshiftptr)
{
__m256d t1,t2,t3,t4,t5,t6;
__m128d tA,tB,tC,tD;
_mm_storeu_pd(fshiftptr,tA);
_mm_store_sd(fshiftptr+2,tD);
}
+#endif
static void
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
- __m256d pot2, double * gmx_restrict ptrB)
+ __m256d pot2, double * gmx_restrict ptrB)
{
__m128d t1,t2;
}
-static void
-gmx_mm256_update_4pot_pd(__m256d pot1, double * gmx_restrict ptrA,
- __m256d pot2, double * gmx_restrict ptrB,
- __m256d pot3, double * gmx_restrict ptrC,
- __m256d pot4, double * gmx_restrict ptrD)
-{
- __m256d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF,tG,tH;
-
- tA = _mm_load_sd(ptrA);
- tB = _mm_load_sd(ptrB);
- tC = _mm_load_sd(ptrC);
- tD = _mm_load_sd(ptrD);
-
- /* do a transpose */
- t1 = _mm256_unpacklo_pd(pot1, pot2); /* p2c p1c | p2a p1a */
- t2 = _mm256_unpackhi_pd(pot1, pot2); /* p2d p1d | p2b p1b */
- t3 = _mm256_unpacklo_pd(pot3, pot4); /* p4c p3c | p4a p3a */
- t4 = _mm256_unpackhi_pd(pot3, pot4); /* p4d p3d | p4b p3b */
- pot1 = _mm256_permute2f128_pd(t1, t3, 0x20); /* p4a p3a | p2a p1a */
- pot2 = _mm256_permute2f128_pd(t2, t4, 0x20); /* p4b p3b | p2b p1b */
- pot3 = _mm256_permute2f128_pd(t1, t3, 0x31); /* p4c p3c | p2c p1c */
- pot4 = _mm256_permute2f128_pd(t2, t4, 0x31); /* p4d p3d | p2d p1d */
-
- pot1 = _mm256_add_pd(pot1,pot2);
- pot3 = _mm256_add_pd(pot3,pot4);
- pot1 = _mm256_add_pd(pot1,pot3); /* Sum in the four elements */
-
- tE = _mm256_castpd256_pd128(pot1);
- tF = _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1));
- tG = _mm256_extractf128_pd(pot1,0x1);
- tH = _mm_permute_pd(tG,_GMX_MM_PERMUTE128D(1,1));
-
- tA = _mm_add_sd(tA,tE);
- tB = _mm_add_sd(tB,tF);
- tC = _mm_add_sd(tC,tG);
- tD = _mm_add_sd(tD,tH);
-
- _mm_store_sd(ptrA,tA);
- _mm_store_sd(ptrB,tB);
- _mm_store_sd(ptrC,tC);
- _mm_store_sd(ptrD,tD);
-}
-
-
#endif /* _kernelutil_x86_avx_256_double_h_ */
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
*/
- tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+ tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m256 * gmx_restrict x1,
- __m256 * gmx_restrict y1,
- __m256 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m256 * gmx_restrict x1,
+ __m256 * gmx_restrict y1,
+ __m256 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
- __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
- __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+ __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+ __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9;
static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
- __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
- __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
- __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+ __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+ __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
+ __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
{
__m128 t1,t2,t3,t4;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = _mm_maskload_ps(ptrA,mask);
- t2 = _mm_maskload_ps(ptrB,mask);
- t3 = _mm_maskload_ps(ptrC,mask);
- t4 = _mm_maskload_ps(ptrD,mask);
+ t1 = gmx_mm_maskload_ps(ptrA,mask);
+ t2 = gmx_mm_maskload_ps(ptrB,mask);
+ t3 = gmx_mm_maskload_ps(ptrC,mask);
+ t4 = gmx_mm_maskload_ps(ptrD,mask);
_MM_TRANSPOSE4_PS(t1,t2,t3,t4);
*x1 = _mm256_castps128_ps256(t1);
*y1 = _mm256_castps128_ps256(t2);
__m256 t1,t2,t3,t4,t5,t6,t7,t8;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask)); /* - zE yE xE | - zA yA xA */
- t2 = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask)); /* - zF yF xF | - zB yB xB */
- t3 = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask)); /* - zG yG xG | - zC yC xC */
- t4 = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask)); /* - zH yH xH | - zD yD xD */
+ t1 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask)); /* - zE yE xE | - zA yA xA */
+ t2 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask)); /* - zF yF xF | - zB yB xB */
+ t3 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask)); /* - zG yG xG | - zC yC xC */
+ t4 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask)); /* - zH yH xH | - zD yD xD */
t5 = _mm256_unpacklo_ps(t1,t2); /* yF yE xF xE | yB yA xB xA */
t6 = _mm256_unpacklo_ps(t3,t4); /* yH yG xH xG | yD yC xD xC */
t1 = _mm256_unpacklo_ps(t1,t3); /* - - z3g z3e | - - z3c z3a */
t2 = _mm256_unpacklo_ps(t2,t4); /* - - z3h z3f | - - z3d z3b */
-
+
*z3 = _mm256_unpacklo_ps(t1,t2);
}
t6 = _mm256_unpackhi_ps(t1,t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
t7 = _mm256_unpacklo_ps(t3,t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
t8 = _mm256_unpackhi_ps(t3,t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */
-
+
*z3 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
*x4 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
*y4 = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC,float * gmx_restrict ptrD,
- __m256 x1, __m256 y1, __m256 z1)
+ float * gmx_restrict ptrC,float * gmx_restrict ptrD,
+ __m256 x1, __m256 y1, __m256 z1)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8;
__m128i mask;
t3 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0)); /* - z1c y1c x1c */
t4 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2)); /* - z1d y1d x1d */
- t5 = _mm_maskload_ps(ptrA,mask);
- t6 = _mm_maskload_ps(ptrB,mask);
- t7 = _mm_maskload_ps(ptrC,mask);
- t8 = _mm_maskload_ps(ptrD,mask);
+ t5 = gmx_mm_maskload_ps(ptrA,mask);
+ t6 = gmx_mm_maskload_ps(ptrB,mask);
+ t7 = gmx_mm_maskload_ps(ptrC,mask);
+ t8 = gmx_mm_maskload_ps(ptrD,mask);
t5 = _mm_sub_ps(t5,t1);
t6 = _mm_sub_ps(t6,t2);
t7 = _mm_sub_ps(t7,t3);
t8 = _mm_sub_ps(t8,t4);
- _mm_maskstore_ps(ptrA,mask,t5);
- _mm_maskstore_ps(ptrB,mask,t6);
- _mm_maskstore_ps(ptrC,mask,t7);
- _mm_maskstore_ps(ptrD,mask,t8);
+ gmx_mm_maskstore_ps(ptrA,mask,t5);
+ gmx_mm_maskstore_ps(ptrB,mask,t6);
+ gmx_mm_maskstore_ps(ptrC,mask,t7);
+ gmx_mm_maskstore_ps(ptrD,mask,t8);
}
-
-
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ x1,y1,z1,x2,y2,z2,x3,y3,z3) \
+{\
+ __m256 _t1,_t2,_t3,_t4,_t5,_t6;\
+ __m128 _tA,_tB,_tC,_tD;\
+\
+ _t1 = _mm256_loadu_ps(ptrA);\
+ _t2 = _mm256_loadu_ps(ptrB);\
+ _t3 = _mm256_loadu_ps(ptrC);\
+ _t4 = _mm256_loadu_ps(ptrD);\
+ _tA = _mm_load_ss(ptrA+8);\
+ _tB = _mm_load_ss(ptrB+8);\
+ _tC = _mm_load_ss(ptrC+8);\
+ _tD = _mm_load_ss(ptrD+8);\
+ _t5 = _mm256_unpacklo_ps(x1,y1);\
+ x1 = _mm256_unpackhi_ps(x1,y1);\
+ y1 = _mm256_unpacklo_ps(z1,x2);\
+ z1 = _mm256_unpackhi_ps(z1,x2);\
+ x2 = _mm256_unpacklo_ps(y2,z2);\
+ y2 = _mm256_unpackhi_ps(y2,z2);\
+ _t6 = _mm256_unpacklo_ps(x3,y3);\
+ x3 = _mm256_unpackhi_ps(x3,y3);\
+ _t5 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+ x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+ y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1);\
+ z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+ z2 = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(1,0,1,0));\
+ _t5 = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(3,2,3,2));\
+ y1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+ x1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_sub_ps(_t1,z2);\
+ _t2 = _mm256_sub_ps(_t2,_t5);\
+ _t3 = _mm256_sub_ps(_t3,y1);\
+ _t4 = _mm256_sub_ps(_t4,x1);\
+ _tA = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3));\
+ _tB = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(1,1,1,1)));\
+ _tC = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(2,2,2,2)));\
+ _tD = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(3,3,3,3)));\
+ _mm256_storeu_ps(ptrA,_t1);\
+ _mm256_storeu_ps(ptrB,_t2);\
+ _mm256_storeu_ps(ptrC,_t3);\
+ _mm256_storeu_ps(ptrD,_t4);\
+ _mm_store_ss(ptrA+8,_tA);\
+ _mm_store_ss(ptrB+8,_tB);\
+ _mm_store_ss(ptrC+8,_tC);\
+ _mm_store_ss(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3)
{
__m256 t1,t2,t3,t4,t5,t6;
__m128 tA,tB,tC,tD;
_mm_store_ss(ptrC+8,tC);
_mm_store_ss(ptrD+8,tD);
}
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4) \
+{\
+ __m256 _t1,_t2,_t3,_t4,_t5;\
+ __m128 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH;\
+\
+ _t1 = _mm256_loadu_ps(ptrA);\
+ _t2 = _mm256_loadu_ps(ptrB);\
+ _t3 = _mm256_loadu_ps(ptrC);\
+ _t4 = _mm256_loadu_ps(ptrD);\
+ _tA = _mm_loadu_ps(ptrA+8);\
+ _tB = _mm_loadu_ps(ptrB+8);\
+ _tC = _mm_loadu_ps(ptrC+8);\
+ _tD = _mm_loadu_ps(ptrD+8);\
+ _t5 = _mm256_unpacklo_ps(x1,y1);\
+ x1 = _mm256_unpackhi_ps(x1,y1);\
+ y1 = _mm256_unpacklo_ps(z1,x2);\
+ z1 = _mm256_unpackhi_ps(z1,x2);\
+ x2 = _mm256_unpacklo_ps(y2,z2);\
+ y2 = _mm256_unpackhi_ps(y2,z2);\
+ z2 = _mm256_unpacklo_ps(x3,y3);\
+ x3 = _mm256_unpackhi_ps(x3,y3);\
+ y3 = _mm256_unpacklo_ps(z3,x4);\
+ z3 = _mm256_unpackhi_ps(z3,x4);\
+ x4 = _mm256_unpacklo_ps(y4,z4);\
+ y4 = _mm256_unpackhi_ps(y4,z4);\
+ x2 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+ x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+ y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);\
+ z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+ z2 = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(1,0,1,0));\
+ _t5 = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(3,2,3,2));\
+ y1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+ x1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+ _tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(1,0,1,0));\
+ _tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(3,2,3,2));\
+ _tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(1,0,1,0));\
+ _tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_sub_ps(_t1,z2);\
+ _t2 = _mm256_sub_ps(_t2,_t5);\
+ _t3 = _mm256_sub_ps(_t3,y1);\
+ _t4 = _mm256_sub_ps(_t4,x1);\
+ _tA = _mm_sub_ps(_tA,_tE);\
+ _tB = _mm_sub_ps(_tB,_tF);\
+ _tC = _mm_sub_ps(_tC,_tG);\
+ _tD = _mm_sub_ps(_tD,_tH);\
+ _mm256_storeu_ps(ptrA,_t1);\
+ _mm256_storeu_ps(ptrB,_t2);\
+ _mm256_storeu_ps(ptrC,_t3);\
+ _mm256_storeu_ps(ptrD,_t4);\
+ _mm_storeu_ps(ptrA+8,_tA);\
+ _mm_storeu_ps(ptrB+8,_tB);\
+ _mm_storeu_ps(ptrC+8,_tC);\
+ _mm_storeu_ps(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3,
- __m256 x4, __m256 y4, __m256 z4)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3,
+ __m256 x4, __m256 y4, __m256 z4)
{
__m256 t1,t2,t3,t4,t5;
__m128 tA,tB,tC,tD,tE,tF,tG,tH;
_mm_storeu_ps(ptrC+8,tC);
_mm_storeu_ps(ptrD+8,tD);
}
-
+#endif
static gmx_inline void
gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- float * gmx_restrict ptrE, float * gmx_restrict ptrF,
- float * gmx_restrict ptrG, float * gmx_restrict ptrH,
- __m256 x1, __m256 y1, __m256 z1)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+ float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+ __m256 x1, __m256 y1, __m256 z1)
{
__m256 t1,t2,t3,t4,t5,t6;
__m256 tA,tB,tC,tD;
/* Construct a mask without executing any data loads */
mask = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
- tA = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask));
- tB = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask));
- tC = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask));
- tD = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask));
+ tA = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
+ tB = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
+ tC = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
+ tD = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
t1 = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
t2 = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
tC = _mm256_sub_ps(tC,t5);
tD = _mm256_sub_ps(tD,t6);
- _mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
- _mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
- _mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
- _mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
- _mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
- _mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
- _mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
- _mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
+ gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
+ gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
+ gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
+ gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
+ gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
+ gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
+ gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
+ gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+ __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+ __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+ _tA = _mm256_loadu_ps(ptrA);\
+ _tB = _mm256_loadu_ps(ptrB);\
+ _tC = _mm256_loadu_ps(ptrC);\
+ _tD = _mm256_loadu_ps(ptrD);\
+ _tE = _mm256_loadu_ps(ptrE);\
+ _tF = _mm256_loadu_ps(ptrF);\
+ _tG = _mm256_loadu_ps(ptrG);\
+ _tH = _mm256_loadu_ps(ptrH);\
+ _t1 = _mm256_unpacklo_ps(_x1,_y1);\
+ _t2 = _mm256_unpackhi_ps(_x1,_y1);\
+ _t3 = _mm256_unpacklo_ps(_z1,_x2);\
+ _t4 = _mm256_unpackhi_ps(_z1,_x2);\
+ _t5 = _mm256_unpacklo_ps(_y2,_z2);\
+ _t6 = _mm256_unpackhi_ps(_y2,_z2);\
+ _t7 = _mm256_unpacklo_ps(_x3,_y3);\
+ _t8 = _mm256_unpackhi_ps(_x3,_y3);\
+ _t9 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+ _t10 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+ _t11 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+ _t12 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+ _t2 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+ _t3 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+ _t4 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+ _t5 = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+ _t6 = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+ _t7 = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+ _t8 = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+ _t1 = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+ _t2 = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+ _t9 = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+ _t10 = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+ _tA = _mm256_sub_ps(_tA,_t5);\
+ _tB = _mm256_sub_ps(_tB,_t7);\
+ _tC = _mm256_sub_ps(_tC,_t1);\
+ _tD = _mm256_sub_ps(_tD,_t9);\
+ _tE = _mm256_sub_ps(_tE,_t6);\
+ _tF = _mm256_sub_ps(_tF,_t8);\
+ _tG = _mm256_sub_ps(_tG,_t2);\
+ _tH = _mm256_sub_ps(_tH,_t10);\
+ _mm256_storeu_ps(ptrA,_tA);\
+ _mm256_storeu_ps(ptrB,_tB);\
+ _mm256_storeu_ps(ptrC,_tC);\
+ _mm256_storeu_ps(ptrD,_tD);\
+ _mm256_storeu_ps(ptrE,_tE);\
+ _mm256_storeu_ps(ptrF,_tF);\
+ _mm256_storeu_ps(ptrG,_tG);\
+ _mm256_storeu_ps(ptrH,_tH);\
+ _tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));\
+ _tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));\
+ _tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));\
+ _tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));\
+ _tI = _mm256_unpacklo_ps(_tI,_tK);\
+ _tJ = _mm256_unpacklo_ps(_tJ,_tL);\
+ _tI = _mm256_unpacklo_ps(_tI,_tJ);\
+ _tI = _mm256_sub_ps(_tI,_z3);\
+ _tJ = _mm256_permute_ps(_tI,_MM_SHUFFLE(1,1,1,1));\
+ _tK = _mm256_permute_ps(_tI,_MM_SHUFFLE(2,2,2,2));\
+ _tL = _mm256_permute_ps(_tI,_MM_SHUFFLE(3,3,3,3));\
+ _mm_store_ss(ptrA+8,_mm256_castps256_ps128(_tI));\
+ _mm_store_ss(ptrB+8,_mm256_castps256_ps128(_tJ));\
+ _mm_store_ss(ptrC+8,_mm256_castps256_ps128(_tK));\
+ _mm_store_ss(ptrD+8,_mm256_castps256_ps128(_tL));\
+ _mm_store_ss(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+ _mm_store_ss(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+ _mm_store_ss(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+ _mm_store_ss(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- float * gmx_restrict ptrE, float * gmx_restrict ptrF,
- float * gmx_restrict ptrG, float * gmx_restrict ptrH,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+ float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3)
{
__m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
__m256 tA,tB,tC,tD,tE,tF,tG,tH;
_mm256_storeu_ps(ptrF,tF);
_mm256_storeu_ps(ptrG,tG);
_mm256_storeu_ps(ptrH,tH);
-
+
tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));
-
+
tI = _mm256_unpacklo_ps(tI,tK); /* - - zG zE | - - zC zA */
tJ = _mm256_unpacklo_ps(tJ,tL); /* - - zH zF | - - zD zB */
tI = _mm256_unpacklo_ps(tI,tJ); /* zH zG zF zE | zD zC zB zA */
_mm_store_ss(ptrG+8,_mm256_extractf128_ps(tK,0x1));
_mm_store_ss(ptrH+8,_mm256_extractf128_ps(tL,0x1));
}
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+ __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+ _tA = _mm256_loadu_ps(ptrA);\
+ _tB = _mm256_loadu_ps(ptrB);\
+ _tC = _mm256_loadu_ps(ptrC);\
+ _tD = _mm256_loadu_ps(ptrD);\
+ _tE = _mm256_loadu_ps(ptrE);\
+ _tF = _mm256_loadu_ps(ptrF);\
+ _tG = _mm256_loadu_ps(ptrG);\
+ _tH = _mm256_loadu_ps(ptrH);\
+ _t1 = _mm256_unpacklo_ps(_x1,_y1);\
+ _t2 = _mm256_unpackhi_ps(_x1,_y1);\
+ _t3 = _mm256_unpacklo_ps(_z1,_x2);\
+ _t4 = _mm256_unpackhi_ps(_z1,_x2);\
+ _t5 = _mm256_unpacklo_ps(_y2,_z2);\
+ _t6 = _mm256_unpackhi_ps(_y2,_z2);\
+ _t7 = _mm256_unpacklo_ps(_x3,_y3);\
+ _t8 = _mm256_unpackhi_ps(_x3,_y3);\
+ _t9 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+ _t10 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+ _t11 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+ _t12 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+ _t2 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+ _t3 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+ _t4 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+ _t5 = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+ _t6 = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+ _t7 = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+ _t8 = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+ _t1 = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+ _t2 = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+ _t9 = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+ _t10 = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+ _tA = _mm256_sub_ps(_tA,_t5);\
+ _tB = _mm256_sub_ps(_tB,_t7);\
+ _tC = _mm256_sub_ps(_tC,_t1);\
+ _tD = _mm256_sub_ps(_tD,_t9);\
+ _tE = _mm256_sub_ps(_tE,_t6);\
+ _tF = _mm256_sub_ps(_tF,_t8);\
+ _tG = _mm256_sub_ps(_tG,_t2);\
+ _tH = _mm256_sub_ps(_tH,_t10);\
+ _mm256_storeu_ps(ptrA,_tA);\
+ _mm256_storeu_ps(ptrB,_tB);\
+ _mm256_storeu_ps(ptrC,_tC);\
+ _mm256_storeu_ps(ptrD,_tD);\
+ _mm256_storeu_ps(ptrE,_tE);\
+ _mm256_storeu_ps(ptrF,_tF);\
+ _mm256_storeu_ps(ptrG,_tG);\
+ _mm256_storeu_ps(ptrH,_tH);\
+ _tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8));\
+ _tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));\
+ _tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));\
+ _tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));\
+ _t1 = _mm256_unpacklo_ps(_z3,_x4);\
+ _t2 = _mm256_unpackhi_ps(_z3,_x4);\
+ _t3 = _mm256_unpacklo_ps(_y4,_z4);\
+ _t4 = _mm256_unpackhi_ps(_y4,_z4);\
+ _t5 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+ _t6 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+ _t7 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+ _t8 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+ _tI = _mm256_sub_ps(_tI,_t5);\
+ _tJ = _mm256_sub_ps(_tJ,_t6);\
+ _tK = _mm256_sub_ps(_tK,_t7);\
+ _tL = _mm256_sub_ps(_tL,_t8);\
+ _mm_storeu_ps(ptrA+8,_mm256_castps256_ps128(_tI));\
+ _mm_storeu_ps(ptrB+8,_mm256_castps256_ps128(_tJ));\
+ _mm_storeu_ps(ptrC+8,_mm256_castps256_ps128(_tK));\
+ _mm_storeu_ps(ptrD+8,_mm256_castps256_ps128(_tL));\
+ _mm_storeu_ps(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+ _mm_storeu_ps(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+ _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+ _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- float * gmx_restrict ptrE, float * gmx_restrict ptrF,
- float * gmx_restrict ptrG, float * gmx_restrict ptrH,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3,
- __m256 x4, __m256 y4, __m256 z4)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+ float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3,
+ __m256 x4, __m256 y4, __m256 z4)
{
__m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
__m256 tA,tB,tC,tD,tE,tF,tG,tH;
tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));
tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));
tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));
-
+
t1 = _mm256_unpacklo_ps(z3,x4); /* x4f z3f x4e z3e | x4b z3b x4a z3a */
t2 = _mm256_unpackhi_ps(z3,x4); /* x4h z3h x4g z3g | x4d z3d x4c z3c */
t3 = _mm256_unpacklo_ps(y4,z4); /* z4f y4f z4e y4e | z4b y4b z4a y4a */
_mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(tK,0x1));
_mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(tL,0x1));
}
-
+#endif
static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
- float * gmx_restrict fptr,
- float * gmx_restrict fshiftptr)
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m128 t1,t2,t3;
/* Add across the two lanes */
t1 = _mm_add_ps(_mm256_castps256_ps128(fix1),_mm256_extractf128_ps(fix1,0x1));
-
+
t2 = _mm_load_ss(fptr);
t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
t3 = _mm_load_ss(fshiftptr);
_mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{ \
+ __m256 _t1,_t2,_t3;\
+ __m128 _tA,_tB,_tC;\
+\
+ fix1 = _mm256_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm256_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+ fix1 = _mm256_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+\
+ _t1 = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+ _t2 = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+ _t1 = _mm256_add_ps(_t1,_t2);\
+ _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+ _t3 = _mm256_loadu_ps(fptr);\
+ _t3 = _mm256_add_ps(_t3,_t1);\
+ _mm256_storeu_ps(fptr,_t3);\
+ _tB = _mm_load_ss(fptr+8);\
+ _tB = _mm_add_ss(_tB,_tA);\
+ _mm_store_ss(fptr+8,_tB);\
+\
+ _tB = _mm256_extractf128_ps(_t1,0x1);\
+ _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+ _tB = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+ _tC = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+ _tB = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+ _tA = _mm_add_ps(_tB,_tC);\
+ _tA = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+ _tC = _mm_loadu_ps(fshiftptr);\
+ _tC = _mm_add_ps(_tC,_tA);\
+ _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
- __m256 fix2, __m256 fiy2, __m256 fiz2,
- __m256 fix3, __m256 fiy3, __m256 fiz3,
- float * gmx_restrict fptr,
- float * gmx_restrict fshiftptr)
+ __m256 fix2, __m256 fiy2, __m256 fiz2,
+ __m256 fix3, __m256 fiy3, __m256 fiz3,
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m256 t1,t2,t3;
__m128 tA,tB,tC;
tB = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
tA = _mm_add_ps(tB,tC); /* - z y x */
-
+
tA = _mm_blend_ps(_mm_setzero_ps(),tA,0x7); /* 0 z y x */
tC = _mm_loadu_ps(fshiftptr);
tC = _mm_add_ps(tC,tA);
_mm_storeu_ps(fshiftptr,tC);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{ \
+ __m256 _t1,_t2,_t3; \
+ __m128 _tA,_tB,_tC; \
+\
+ fix1 = _mm256_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm256_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm256_hadd_ps(fiz3,fix4);\
+ fiy4 = _mm256_hadd_ps(fiy4,fiz4);\
+\
+ fix1 = _mm256_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm256_hadd_ps(fiz3,fiy4);\
+\
+ _t1 = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+ _t2 = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+ _t1 = _mm256_add_ps(_t1,_t2);\
+ _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+ _t3 = _mm256_loadu_ps(fptr);\
+ _t3 = _mm256_add_ps(_t3,_t1);\
+ _mm256_storeu_ps(fptr,_t3);\
+ _tB = _mm_loadu_ps(fptr+8);\
+ _tB = _mm_add_ps(_tB,_tA);\
+ _mm_storeu_ps(fptr+8,_tB);\
+\
+ _tB = _mm256_extractf128_ps(_t1,0x1);\
+ _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+ _tB = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+ _tC = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+ _tA = _mm_permute_ps(_tA,_MM_SHUFFLE(0,3,2,1));\
+ _tB = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+ _tA = _mm_add_ps(_tA,_tC);\
+ _tA = _mm_add_ps(_tA,_tB);\
+ _tA = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+ _tC = _mm_loadu_ps(fshiftptr);\
+ _tC = _mm_add_ps(_tC,_tA);\
+ _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
- __m256 fix2, __m256 fiy2, __m256 fiz2,
- __m256 fix3, __m256 fiy3, __m256 fiz3,
- __m256 fix4, __m256 fiy4, __m256 fiz4,
- float * gmx_restrict fptr,
- float * gmx_restrict fshiftptr)
+ __m256 fix2, __m256 fiy2, __m256 fiz2,
+ __m256 fix3, __m256 fiy3, __m256 fiz3,
+ __m256 fix4, __m256 fiy4, __m256 fiz4,
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m256 t1,t2,t3;
__m128 tA,tB,tC;
tC = _mm_add_ps(tC,tA);
_mm_storeu_ps(fshiftptr,tC);
}
+#endif
}
-static gmx_inline void
-gmx_mm256_update_4pot_ps(__m256 pot1, float * gmx_restrict ptrA,
- __m256 pot2, float * gmx_restrict ptrB,
- __m256 pot3, float * gmx_restrict ptrC,
- __m256 pot4, float * gmx_restrict ptrD)
-{
- __m128 t1,t2,t3,t4;
-
- pot1 = _mm256_hadd_ps(pot1,pot2);
- pot3 = _mm256_hadd_ps(pot3,pot4);
- pot1 = _mm256_hadd_ps(pot1,pot3);
- t1 = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
- t2 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
- t3 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
- t4 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
- _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
- _mm_store_ss(ptrC,_mm_add_ss(_mm_load_ss(ptrC),t3));
- _mm_store_ss(ptrD,_mm_add_ss(_mm_load_ss(ptrD),t4));
-}
-
-
#endif /* _kernelutil_x86_avx_256_single_h_ */
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1,
- __m128d * gmx_restrict y1,
- __m128d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1,
+ __m128d * gmx_restrict y1,
+ __m128d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz;
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
- __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+ __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
- *x = _mm_load_sd(p1);
- *y = _mm_load_sd(p1+1);
- *z = _mm_load_sd(p1+2);
+ *x = _mm_load_sd(p1);
+ *y = _mm_load_sd(p1+1);
+ *z = _mm_load_sd(p1+2);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
- *x1 = _mm_load_sd(p1);
- *y1 = _mm_load_sd(p1+1);
- *z1 = _mm_load_sd(p1+2);
- *x2 = _mm_load_sd(p1+3);
- *y2 = _mm_load_sd(p1+4);
- *z2 = _mm_load_sd(p1+5);
- *x3 = _mm_load_sd(p1+6);
- *y3 = _mm_load_sd(p1+7);
- *z3 = _mm_load_sd(p1+8);
+ *x1 = _mm_load_sd(p1);
+ *y1 = _mm_load_sd(p1+1);
+ *z1 = _mm_load_sd(p1+2);
+ *x2 = _mm_load_sd(p1+3);
+ *y2 = _mm_load_sd(p1+4);
+ *z2 = _mm_load_sd(p1+5);
+ *x3 = _mm_load_sd(p1+6);
+ *y3 = _mm_load_sd(p1+7);
+ *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void
/* Routines to decrement rvec in memory, typically use for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy, __m128d z)
+ __m128d xy, __m128d z)
{
__m128d t1,t2;
_mm_store_sd(ptrA+2,t2);
}
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3)
-{
- __m128d t1,t2;
- __m128d tA,tB,tC,tD,tE;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_load_sd(ptrA+8);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_sd(tE,z3);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3,
- __m128d xy4, __m128d z4)
-{
- __m128d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_loadu_pd(ptrA+8);
- tF = _mm_loadu_pd(ptrA+10);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
- t3 = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
- t4 = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_pd(tE,t3);
- tF = _mm_sub_pd(tF,t4);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_storeu_pd(ptrA+8,tE);
- _mm_storeu_pd(ptrA+10,tF);
-}
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_load_sd(ptrA+8);\
+ _x1 = _mm_unpacklo_pd(_x1,_y1);\
+ _z1 = _mm_unpacklo_pd(_z1,_x2);\
+ _y2 = _mm_unpacklo_pd(_y2,_z2);\
+ _x3 = _mm_unpacklo_pd(_x3,_y3);\
+ _t1 = _mm_sub_pd(_t1,_x1);\
+ _t2 = _mm_sub_pd(_t2,_z1);\
+ _t3 = _mm_sub_pd(_t3,_y2);\
+ _t4 = _mm_sub_pd(_t4,_x3);\
+ _t5 = _mm_sub_sd(_t5,_z3);\
+ _mm_storeu_pd(ptrA,_t1);\
+ _mm_storeu_pd(ptrA+2,_t2);\
+ _mm_storeu_pd(ptrA+4,_t3);\
+ _mm_storeu_pd(ptrA+6,_t4);\
+ _mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+6,t4);
_mm_store_sd(ptrA+8,t5);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_loadu_pd(ptrA+8);\
+ _t6 = _mm_loadu_pd(ptrA+10);\
+ _x1 = _mm_unpacklo_pd(_x1,_y1);\
+ _z1 = _mm_unpacklo_pd(_z1,_x2);\
+ _y2 = _mm_unpacklo_pd(_y2,_z2);\
+ _x3 = _mm_unpacklo_pd(_x3,_y3);\
+ _z3 = _mm_unpacklo_pd(_z3,_x4);\
+ _y4 = _mm_unpacklo_pd(_y4,_z4);\
+ _mm_storeu_pd(ptrA, _mm_sub_pd( _t1,_x1 ));\
+ _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2,_z1 ));\
+ _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3,_y2 ));\
+ _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4,_x3 ));\
+ _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5,_z3 ));\
+ _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5,z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
}
+#endif
+
static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
_mm_store_sd(ptrB+2,t4);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+ __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_load_sd(ptrA+8);\
+ _t6 = _mm_loadu_pd(ptrB);\
+ _t7 = _mm_loadu_pd(ptrB+2);\
+ _t8 = _mm_loadu_pd(ptrB+4);\
+ _t9 = _mm_loadu_pd(ptrB+6);\
+ _t10 = _mm_load_sd(ptrB+8);\
+ _tA = _mm_unpacklo_pd(_x1,_y1);\
+ _tB = _mm_unpackhi_pd(_x1,_y1);\
+ _tC = _mm_unpacklo_pd(_z1,_x2);\
+ _tD = _mm_unpackhi_pd(_z1,_x2);\
+ _tE = _mm_unpacklo_pd(_y2,_z2);\
+ _tF = _mm_unpackhi_pd(_y2,_z2);\
+ _tG = _mm_unpacklo_pd(_x3,_y3);\
+ _tH = _mm_unpackhi_pd(_x3,_y3);\
+ _tI = _mm_unpackhi_pd(_z3,_z3);\
+ _t1 = _mm_sub_pd(_t1,_tA);\
+ _t2 = _mm_sub_pd(_t2,_tC);\
+ _t3 = _mm_sub_pd(_t3,_tE);\
+ _t4 = _mm_sub_pd(_t4,_tG);\
+ _t5 = _mm_sub_sd(_t5,_z3);\
+ _t6 = _mm_sub_pd(_t6,_tB);\
+ _t7 = _mm_sub_pd(_t7,_tD);\
+ _t8 = _mm_sub_pd(_t8,_tF);\
+ _t9 = _mm_sub_pd(_t9,_tH);\
+ _t10 = _mm_sub_sd(_t10,_tI);\
+ _mm_storeu_pd(ptrA,_t1);\
+ _mm_storeu_pd(ptrA+2,_t2);\
+ _mm_storeu_pd(ptrA+4,_t3);\
+ _mm_storeu_pd(ptrA+6,_t4);\
+ _mm_store_sd(ptrA+8,_t5);\
+ _mm_storeu_pd(ptrB,_t6);\
+ _mm_storeu_pd(ptrB+2,_t7);\
+ _mm_storeu_pd(ptrB+4,_t8);\
+ _mm_storeu_pd(ptrB+6,_t9);\
+ _mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+6,t9);
_mm_store_sd(ptrB+8,t10);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+ __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_loadu_pd(ptrA+8);\
+ _t6 = _mm_loadu_pd(ptrA+10);\
+ _t7 = _mm_loadu_pd(ptrB);\
+ _t8 = _mm_loadu_pd(ptrB+2);\
+ _t9 = _mm_loadu_pd(ptrB+4);\
+ _t10 = _mm_loadu_pd(ptrB+6);\
+ _t11 = _mm_loadu_pd(ptrB+8);\
+ _t12 = _mm_loadu_pd(ptrB+10);\
+ _tA = _mm_unpacklo_pd(_x1,_y1);\
+ _tB = _mm_unpackhi_pd(_x1,_y1);\
+ _tC = _mm_unpacklo_pd(_z1,_x2);\
+ _tD = _mm_unpackhi_pd(_z1,_x2);\
+ _tE = _mm_unpacklo_pd(_y2,_z2);\
+ _tF = _mm_unpackhi_pd(_y2,_z2);\
+ _tG = _mm_unpacklo_pd(_x3,_y3);\
+ _tH = _mm_unpackhi_pd(_x3,_y3);\
+ _tI = _mm_unpacklo_pd(_z3,_x4);\
+ _tJ = _mm_unpackhi_pd(_z3,_x4);\
+ _tK = _mm_unpacklo_pd(_y4,_z4);\
+ _tL = _mm_unpackhi_pd(_y4,_z4);\
+ _t1 = _mm_sub_pd(_t1,_tA);\
+ _t2 = _mm_sub_pd(_t2,_tC);\
+ _t3 = _mm_sub_pd(_t3,_tE);\
+ _t4 = _mm_sub_pd(_t4,_tG);\
+ _t5 = _mm_sub_pd(_t5,_tI);\
+ _t6 = _mm_sub_pd(_t6,_tK);\
+ _t7 = _mm_sub_pd(_t7,_tB);\
+ _t8 = _mm_sub_pd(_t8,_tD);\
+ _t9 = _mm_sub_pd(_t9,_tF);\
+ _t10 = _mm_sub_pd(_t10,_tH);\
+ _t11 = _mm_sub_pd(_t11,_tJ);\
+ _t12 = _mm_sub_pd(_t12,_tL);\
+ _mm_storeu_pd(ptrA, _t1);\
+ _mm_storeu_pd(ptrA+2,_t2);\
+ _mm_storeu_pd(ptrA+4,_t3);\
+ _mm_storeu_pd(ptrA+6,_t4);\
+ _mm_storeu_pd(ptrA+8,_t5);\
+ _mm_storeu_pd(ptrA+10,_t6);\
+ _mm_storeu_pd(ptrB, _t7);\
+ _mm_storeu_pd(ptrB+2,_t8);\
+ _mm_storeu_pd(ptrB+4,_t9);\
+ _mm_storeu_pd(ptrB+6,_t10);\
+ _mm_storeu_pd(ptrB+8,_t11);\
+ _mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+8,t11);
_mm_storeu_pd(ptrB+10,t12);
}
+#endif
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{\
+ __m128d _t1,_t2;\
+ GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+ GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+ GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+ _t1 = fix3;\
+ fix3 = _mm_unpacklo_pd(fix3,fiy3);\
+ fiy3 = _mm_unpackhi_pd(_t1,fiy3);\
+ fix1 = _mm_add_pd(fix1,fiy1);\
+ fiz1 = _mm_add_pd(fiz1,fix2);\
+ fiy2 = _mm_add_pd(fiy2,fiz2);\
+ fix3 = _mm_add_pd(fix3,fiy3);\
+ fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3));\
+ _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+ _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+ _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+ _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+ _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+ fix1 = _mm_add_pd(fix1,fix3);\
+ _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+ fix1 = _mm_add_pd(fix1,_t1);\
+ _t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+ fiz1 = _mm_add_sd(fiz1,fiz3);\
+ fiz1 = _mm_add_sd(fiz1,_t2);\
+ _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+ _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m128d _t1,_t2;\
+ GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+ GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+ GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+ GMX_MM_TRANSPOSE2_PD(fix3,fiy3);\
+ GMX_MM_TRANSPOSE2_PD(fiz3,fix4);\
+ GMX_MM_TRANSPOSE2_PD(fiy4,fiz4);\
+ fix1 = _mm_add_pd(fix1,fiy1);\
+ fiz1 = _mm_add_pd(fiz1,fix2);\
+ fiy2 = _mm_add_pd(fiy2,fiz2);\
+ fix3 = _mm_add_pd(fix3,fiy3);\
+ fiz3 = _mm_add_pd(fiz3,fix4);\
+ fiy4 = _mm_add_pd(fiy4,fiz4);\
+ _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+ _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+ _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+ _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+ _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));\
+ _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+ _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+ fix1 = _mm_add_pd(fix1,_t1);\
+ _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+ fix3 = _mm_add_pd(fix3,_t2);\
+ fix1 = _mm_add_pd(fix1,fix3);\
+ fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+ fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+ fiz1 = _mm_add_sd(fiz1,fiz3);\
+ _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+ _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
fix3 = _mm_add_pd(fix3,fiy3);
fiz3 = _mm_add_pd(fiz3,fix4);
fiy4 = _mm_add_pd(fiy4,fiz4);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
+#endif
static gmx_inline void
/* We require SSE2 now! */
-#include <math.h>
+#include <math.h>
#include "gmx_x86_sse2.h"
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1,
- __m128 * gmx_restrict y1,
- __m128 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
-
+
t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
t3 = _mm_load_ss(xyz_shift+2);
t4 = _mm_load_ss(xyz+2);
t1 = _mm_add_ps(t1,t2);
t3 = _mm_add_ss(t3,t4);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
tB = _mm_load_ss(xyz_shift+2);
-
+
t1 = _mm_loadu_ps(xyz);
t2 = _mm_loadu_ps(xyz+4);
t3 = _mm_load_ss(xyz+8);
-
+
tA = _mm_movelh_ps(tA,tB);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ss(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
tB = _mm_load_ss(xyz_shift+2);
-
+
t1 = _mm_loadu_ps(xyz);
t2 = _mm_loadu_ps(xyz+4);
t3 = _mm_loadu_ps(xyz+8);
-
+
tA = _mm_movelh_ps(tA,tB);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ps(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 t1,t2,t3,t4;
t1 = _mm_loadu_ps(ptrA);
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
__m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 t1,t2,t3,t4;
t1 = _mm_loadu_ps(ptrA);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18 = _mm_movehl_ps(_z3,_z3);\
+_t19 = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20 = _mm_movelh_ps(_x1,_z1);\
+_t21 = _mm_movehl_ps(_z1,_x1);\
+_t22 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t23 = _mm_movelh_ps(_y2,_x3);\
+_t24 = _mm_movehl_ps(_x3,_y2);\
+_t25 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_load_ss(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t20);\
+_t2 = _mm_sub_ps(_t2,_t23);\
+_t3 = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_load_ss(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_t21);\
+_t5 = _mm_sub_ps(_t5,_t24);\
+_t6 = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_load_ss(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t22);\
+_t8 = _mm_sub_ps(_t8,_t25);\
+_t9 = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_load_ss(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3)
+ __m128 x3, __m128 y3, __m128 z3)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
_mm_storeu_ps(ptrD+4,t11);
_mm_store_ss(ptrD+8,t12);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_unpackhi_ps(_z3,_x4);\
+_z3 = _mm_unpacklo_ps(_z3,_x4);\
+_t18 = _mm_unpackhi_ps(_y4,_z4);\
+_y4 = _mm_unpacklo_ps(_y4,_z4);\
+_t19 = _mm_movelh_ps(_x1,_z1);\
+_z1 = _mm_movehl_ps(_z1,_x1);\
+_t20 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t21 = _mm_movelh_ps(_y2,_x3);\
+_x3 = _mm_movehl_ps(_x3,_y2);\
+_t22 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t23 = _mm_movelh_ps(_z3,_y4);\
+_y4 = _mm_movehl_ps(_y4,_z3);\
+_t24 = _mm_movelh_ps(_t17,_t18);\
+_t18 = _mm_movehl_ps(_t18,_t17);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_loadu_ps(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t19);\
+_t2 = _mm_sub_ps(_t2,_t21);\
+_t3 = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_loadu_ps(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_z1);\
+_t5 = _mm_sub_ps(_t5,_x3);\
+_t6 = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_loadu_ps(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t20);\
+_t8 = _mm_sub_ps(_t8,_t22);\
+_t9 = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_loadu_ps(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
__m128 x3, __m128 y3, __m128 z3,
- __m128 x4, __m128 y4, __m128 z4)
+ __m128 x4, __m128 y4, __m128 z4)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
__m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
_mm_storeu_ps(ptrD+4,t11);
_mm_storeu_ps(ptrD+8,t12);
}
-
+#endif
static gmx_inline void
_mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Reduces the nine i-atom force accumulators (3 atoms x x/y/z): transposes to
+ * per-atom layout, sums the four SIMD lanes per output, adds the sums into
+ * fptr[0..8] and the lane-summed shift-force total into fshiftptr[0..2].
+ * NOTE(review): the macro assigns to its register arguments and evaluates
+ * fptr/fshiftptr several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4;\
+\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _t2 = _mm_movehl_ps(_mm_setzero_ps(),fiz3);\
+    _t1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));\
+    _t3 = _mm_shuffle_ps(_t2,_t2,_MM_SHUFFLE(0,0,0,1));\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,_t1) , _mm_add_ps(_t2,_t3));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+    _t4 = _mm_load_ss(fshiftptr+2);\
+    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+    _t3 = _mm_shuffle_ps(_t3 ,_t3 ,_MM_SHUFFLE(1,2,0,0));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _mm_store_ss(fshiftptr+2,_t1);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t1);
_mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Reduces the twelve i-atom force accumulators (4 atoms x x/y/z): transposes
+ * to per-atom layout, sums the four SIMD lanes per output, adds the sums into
+ * fptr[0..11] and the lane-summed shift-force total into fshiftptr[0..2].
+ * NOTE(review): the macro assigns to its register arguments and evaluates
+ * fptr/fshiftptr several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5;\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+    _t5 = _mm_load_ss(fshiftptr+2);\
+    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+    _t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+    _t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+    _t4 = _mm_shuffle_ps(fiz3,_t4 ,_MM_SHUFFLE(2,0,3,3));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _t5 = _mm_add_ps(_t5,_t1);\
+    _mm_store_ss(fshiftptr+2,_t5);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t5);
_mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
-
+#endif
static void
}
-static void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
- __m128 pot2, float * gmx_restrict ptrB,
- __m128 pot3, float * gmx_restrict ptrC,
- __m128 pot4, float * gmx_restrict ptrD)
-{
- _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
- pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
- pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
- pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
- pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
- _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
- _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
#endif /* _kernelutil_x86_sse2_single_h_ */
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1,
- __m128d * gmx_restrict y1,
- __m128d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1,
+ __m128d * gmx_restrict y1,
+ __m128d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz;
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
- __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+ __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
- *x = _mm_load_sd(p1);
- *y = _mm_load_sd(p1+1);
- *z = _mm_load_sd(p1+2);
+ *x = _mm_load_sd(p1);
+ *y = _mm_load_sd(p1+1);
+ *z = _mm_load_sd(p1+2);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
- *x1 = _mm_load_sd(p1);
- *y1 = _mm_load_sd(p1+1);
- *z1 = _mm_load_sd(p1+2);
- *x2 = _mm_load_sd(p1+3);
- *y2 = _mm_load_sd(p1+4);
- *z2 = _mm_load_sd(p1+5);
- *x3 = _mm_load_sd(p1+6);
- *y3 = _mm_load_sd(p1+7);
- *z3 = _mm_load_sd(p1+8);
+ *x1 = _mm_load_sd(p1);
+ *y1 = _mm_load_sd(p1+1);
+ *z1 = _mm_load_sd(p1+2);
+ *x2 = _mm_load_sd(p1+3);
+ *y2 = _mm_load_sd(p1+4);
+ *z2 = _mm_load_sd(p1+5);
+ *x3 = _mm_load_sd(p1+6);
+ *y3 = _mm_load_sd(p1+7);
+ *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void
/* Routines to decrement rvec in memory, typically use for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy, __m128d z)
+ __m128d xy, __m128d z)
{
__m128d t1,t2;
_mm_store_sd(ptrA+2,t2);
}
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3)
-{
- __m128d t1,t2;
- __m128d tA,tB,tC,tD,tE;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_load_sd(ptrA+8);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_sd(tE,z3);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3,
- __m128d xy4, __m128d z4)
-{
- __m128d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_loadu_pd(ptrA+8);
- tF = _mm_loadu_pd(ptrA+10);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
- t3 = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
- t4 = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_pd(tE,t3);
- tF = _mm_sub_pd(tF,t4);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_storeu_pd(ptrA+8,tE);
- _mm_storeu_pd(ptrA+10,tF);
-}
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Packs the low lane of each of the nine coordinate registers into memory
+ * order (unpacklo pairs) and subtracts them from the nine doubles at
+ * ptrA[0..8] in place; the last element uses sub_sd/store_sd so ptrA[9..]
+ * is untouched.
+ * NOTE(review): the macro assigns to its register arguments and evaluates
+ * ptrA several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_t1 = _mm_sub_pd(_t1,_x1);\
+_t2 = _mm_sub_pd(_t2,_z1);\
+_t3 = _mm_sub_pd(_t3,_y2);\
+_t4 = _mm_sub_pd(_t4,_x3);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+6,t4);
_mm_store_sd(ptrA+8,t5);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Packs the low lane of each of the twelve coordinate registers into memory
+ * order (unpacklo pairs) and subtracts them from the twelve doubles at
+ * ptrA[0..11] in place.
+ * NOTE(review): the macro assigns to its register arguments and evaluates
+ * ptrA several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_z3 = _mm_unpacklo_pd(_z3,_x4);\
+_y4 = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5,z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
}
+#endif
+
static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
_mm_store_sd(ptrB+2,t4);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Splits each coordinate register into its two lanes (unpacklo -> ptrA data,
+ * unpackhi -> ptrB data) and subtracts them from the nine doubles at
+ * ptrA[0..8] and ptrB[0..8] in place.
+ * NOTE(review): evaluates ptrA/ptrB several times - pass side-effect-free
+ * expressions only.
+ */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrB);\
+_t7 = _mm_loadu_pd(ptrB+2);\
+_t8 = _mm_loadu_pd(ptrB+4);\
+_t9 = _mm_loadu_pd(ptrB+6);\
+_t10 = _mm_load_sd(ptrB+8);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpackhi_pd(_z3,_z3);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_t6 = _mm_sub_pd(_t6,_tB);\
+_t7 = _mm_sub_pd(_t7,_tD);\
+_t8 = _mm_sub_pd(_t8,_tF);\
+_t9 = _mm_sub_pd(_t9,_tH);\
+_t10 = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+6,t9);
_mm_store_sd(ptrB+8,t10);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Splits each of the twelve coordinate registers into its two lanes
+ * (unpacklo -> ptrA data, unpackhi -> ptrB data) and subtracts them from the
+ * twelve doubles at ptrA[0..11] and ptrB[0..11] in place.
+ * NOTE(review): evaluates ptrA/ptrB several times - pass side-effect-free
+ * expressions only.
+ */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_t7 = _mm_loadu_pd(ptrB);\
+_t8 = _mm_loadu_pd(ptrB+2);\
+_t9 = _mm_loadu_pd(ptrB+4);\
+_t10 = _mm_loadu_pd(ptrB+6);\
+_t11 = _mm_loadu_pd(ptrB+8);\
+_t12 = _mm_loadu_pd(ptrB+10);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpacklo_pd(_z3,_x4);\
+_tJ = _mm_unpackhi_pd(_z3,_x4);\
+_tK = _mm_unpacklo_pd(_y4,_z4);\
+_tL = _mm_unpackhi_pd(_y4,_z4);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_pd(_t5,_tI);\
+_t6 = _mm_sub_pd(_t6,_tK);\
+_t7 = _mm_sub_pd(_t7,_tB);\
+_t8 = _mm_sub_pd(_t8,_tD);\
+_t9 = _mm_sub_pd(_t9,_tF);\
+_t10 = _mm_sub_pd(_t10,_tH);\
+_t11 = _mm_sub_pd(_t11,_tJ);\
+_t12 = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+8,t11);
_mm_storeu_pd(ptrB+10,t12);
}
-
+#endif
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Pairwise-sums the two lanes of each accumulator with _mm_hadd_pd, adds the
+ * per-atom results into fptr[0..8], and accumulates the total of all three
+ * atoms into fshiftptr[0..2].
+ * NOTE(review): the macro assigns to its register arguments and evaluates
+ * fptr/fshiftptr several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Pairwise-sums the two lanes of each accumulator with _mm_hadd_pd, adds the
+ * per-atom results into fptr[0..11], and accumulates the total of all four
+ * atoms into fshiftptr[0..2].
+ * NOTE(review): the macro assigns to its register arguments and evaluates
+ * fptr/fshiftptr several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fix4);\
+    fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
#ifndef _kernelutil_x86_sse4_1_single_h_
#define _kernelutil_x86_sse4_1_single_h_
-#include <math.h>
+#include <math.h>
#include "gmx_x86_sse4_1.h"
#undef gmx_restrict
-#define gmx_restrict
+#define gmx_restrict
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0,t1,t2,t3) _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
const float * gmx_restrict ptrD)
{
__m128 t1,t2;
-
+
t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
return _mm_unpacklo_ps(t1,t2);
__m128 xmm1)
{
__m128 t2,t3,t4;
-
- t3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
- t2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
- t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
- _mm_store_ss(ptrA,xmm1);
- _mm_store_ss(ptrB,t2);
- _mm_store_ss(ptrC,t3);
- _mm_store_ss(ptrD,t4);
+
+ t3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
+ t2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+ t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+ _mm_store_ss(ptrA,xmm1);
+ _mm_store_ss(ptrB,t2);
+ _mm_store_ss(ptrC,t3);
+ _mm_store_ss(ptrD,t4);
}
/* Similar to store, but increments value in memory */
float * gmx_restrict ptrD, __m128 xmm1)
{
__m128 tmp;
-
+
tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
tmp = _mm_add_ps(tmp,xmm1);
gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
__m128 * gmx_restrict c12)
{
__m128 t1,t2,t3,t4;
-
+
t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1); /* - - c12a c6a */
t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2); /* - - c12b c6b */
t3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3); /* - - c12c c6c */
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1,
- __m128 * gmx_restrict y1,
- __m128 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
-
+
t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
t3 = _mm_load_ss(xyz_shift+2);
t4 = _mm_load_ss(xyz+2);
t1 = _mm_add_ps(t1,t2);
t3 = _mm_add_ss(t3,t4);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
tB = _mm_load_ss(xyz_shift+2);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ss(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
tB = _mm_load_ss(xyz_shift+2);
-
+
t1 = _mm_loadu_ps(xyz);
t2 = _mm_loadu_ps(xyz+4);
t3 = _mm_loadu_ps(xyz+8);
-
+
tA = _mm_movelh_ps(tA,tB);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ps(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 t1,t2,t3,t4;
t1 = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrA ) );
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
__m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 t1,t2,t3,t4;
t1 = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA) ) );
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Transposes the four SIMD lanes of the nine coordinate registers back to
+ * per-particle memory order and subtracts lane j from the nine floats at
+ * ptrA/ptrB/ptrC/ptrD respectively; the ninth element of each uses
+ * load_ss/sub_ss/store_ss so ptrX[9..] is untouched.
+ * NOTE(review): the macro assigns to its register arguments and evaluates the
+ * pointer arguments several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18 = _mm_movehl_ps(_z3,_z3);\
+_t19 = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20 = _mm_movelh_ps(_x1,_z1);\
+_t21 = _mm_movehl_ps(_z1,_x1);\
+_t22 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t23 = _mm_movelh_ps(_y2,_x3);\
+_t24 = _mm_movehl_ps(_x3,_y2);\
+_t25 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_load_ss(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t20);\
+_t2 = _mm_sub_ps(_t2,_t23);\
+_t3 = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_load_ss(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_t21);\
+_t5 = _mm_sub_ps(_t5,_t24);\
+_t6 = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_load_ss(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t22);\
+_t8 = _mm_sub_ps(_t8,_t25);\
+_t9 = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_load_ss(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3)
+ __m128 x3, __m128 y3, __m128 z3)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
t10 = _mm_loadu_ps(ptrD);
t11 = _mm_loadu_ps(ptrD+4);
t12 = _mm_load_ss(ptrD+8);
-
+
t1 = _mm_sub_ps(t1,t20);
t2 = _mm_sub_ps(t2,t23);
t3 = _mm_sub_ss(t3,z3);
_mm_storeu_ps(ptrD+4,t11);
_mm_store_ss(ptrD+8,t12);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Transposes the four SIMD lanes of the twelve coordinate registers back to
+ * per-particle memory order and subtracts lane j from the twelve floats at
+ * ptrA/ptrB/ptrC/ptrD respectively (12 floats read-modify-written per
+ * pointer, all via unaligned load/store).
+ * NOTE(review): the macro assigns to its register arguments and evaluates the
+ * pointer arguments several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_unpackhi_ps(_z3,_x4);\
+_z3 = _mm_unpacklo_ps(_z3,_x4);\
+_t18 = _mm_unpackhi_ps(_y4,_z4);\
+_y4 = _mm_unpacklo_ps(_y4,_z4);\
+_t19 = _mm_movelh_ps(_x1,_z1);\
+_z1 = _mm_movehl_ps(_z1,_x1);\
+_t20 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t21 = _mm_movelh_ps(_y2,_x3);\
+_x3 = _mm_movehl_ps(_x3,_y2);\
+_t22 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t23 = _mm_movelh_ps(_z3,_y4);\
+_y4 = _mm_movehl_ps(_y4,_z3);\
+_t24 = _mm_movelh_ps(_t17,_t18);\
+_t18 = _mm_movehl_ps(_t18,_t17);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_loadu_ps(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t19);\
+_t2 = _mm_sub_ps(_t2,_t21);\
+_t3 = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_loadu_ps(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_z1);\
+_t5 = _mm_sub_ps(_t5,_x3);\
+_t6 = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_loadu_ps(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t20);\
+_t8 = _mm_sub_ps(_t8,_t22);\
+_t9 = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_loadu_ps(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
__m128 x3, __m128 y3, __m128 z3,
- __m128 x4, __m128 y4, __m128 z4)
+ __m128 x4, __m128 y4, __m128 z4)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
__m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
_mm_storeu_ps(ptrD+4,t11);
_mm_storeu_ps(ptrD+8,t12);
}
-
+#endif
static gmx_inline void
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
{
- __m128 t2,t3;
-
+ __m128 t2,t3;
+
fix1 = _mm_hadd_ps(fix1,fix1);
- fiy1 = _mm_hadd_ps(fiy1,fiz1);
-
- fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
-
- t2 = _mm_load_ss(fptr);
- t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
- t3 = _mm_load_ss(fshiftptr);
- t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
-
- t2 = _mm_add_ps(t2,fix1);
- t3 = _mm_add_ps(t3,fix1);
-
- _mm_store_ss(fptr,t2);
- _mm_storeh_pi((__m64 *)(fptr+1),t2);
- _mm_store_ss(fshiftptr,t3);
- _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
+ fiy1 = _mm_hadd_ps(fiy1,fiz1);
+
+ fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
+
+ t2 = _mm_load_ss(fptr);
+ t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
+ t3 = _mm_load_ss(fshiftptr);
+ t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
+
+ t2 = _mm_add_ps(t2,fix1);
+ t3 = _mm_add_ps(t3,fix1);
+
+ _mm_store_ss(fptr,t2);
+ _mm_storeh_pi((__m64 *)(fptr+1),t2);
+ _mm_store_ss(fshiftptr,t3);
+ _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+/* Reduces the nine i-atom force accumulators with _mm_hadd_ps, adds the
+ * per-atom sums into fptr[0..8] and the total over the three atoms into
+ * fshiftptr[0..2].
+ * NOTE(review): the macro assigns to its register arguments and evaluates
+ * fptr/fshiftptr several times - pass side-effect-free lvalues only.
+ */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+_mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+_t4 = _mm_load_ss(fshiftptr+2);\
+_t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+_t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+_t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+_t3 = _mm_shuffle_ps(_t3,_t3,_MM_SHUFFLE(1,2,0,0));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_mm_store_ss(fshiftptr+2,_t1);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
{
- __m128 t1,t2,t3,t4;
-
- fix1 = _mm_hadd_ps(fix1,fiy1);
- fiz1 = _mm_hadd_ps(fiz1,fix2);
- fiy2 = _mm_hadd_ps(fiy2,fiz2);
- fix3 = _mm_hadd_ps(fix3,fiy3);
- fiz3 = _mm_hadd_ps(fiz3,fiz3);
-
- fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
- fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
- fiz3 = _mm_hadd_ps(fiz3,fiz3); /* - - - fiz3 */
-
- _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
- _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
- _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
-
- t4 = _mm_load_ss(fshiftptr+2);
- t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-
- t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
- t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
- t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
- t3 = _mm_shuffle_ps(t3 ,t3 ,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
-
- t1 = _mm_add_ps(t1,t2);
- t3 = _mm_add_ps(t3,t4);
- t1 = _mm_add_ps(t1,t3); /* y x - z */
-
- _mm_store_ss(fshiftptr+2,t1);
- _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
+ __m128 t1,t2,t3,t4;
+
+ fix1 = _mm_hadd_ps(fix1,fiy1);
+ fiz1 = _mm_hadd_ps(fiz1,fix2);
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);
+ fix3 = _mm_hadd_ps(fix3,fiy3);
+ fiz3 = _mm_hadd_ps(fiz3,fiz3);
+
+ fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+ fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+ fiz3 = _mm_hadd_ps(fiz3,fiz3); /* - - - fiz3 */
+
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+ _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
+
+ t4 = _mm_load_ss(fshiftptr+2);
+ t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+ t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
+ t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
+ t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
+ t3 = _mm_shuffle_ps(t3 ,t3 ,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
+ t1 = _mm_add_ps(t1,t2);
+ t3 = _mm_add_ps(t3,t4);
+ t1 = _mm_add_ps(t1,t3); /* y x - z */
+ _mm_store_ss(fshiftptr+2,t1);
+ _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fix4);\
+fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+_mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+_t5 = _mm_load_ss(fshiftptr+2);\
+_t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+_t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+_t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+_t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+_t4 = _mm_shuffle_ps(fiz3,_t4 ,_MM_SHUFFLE(2,0,3,3));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_t5 = _mm_add_ps(_t5,_t1);\
+_mm_store_ss(fshiftptr+2,_t5);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
{
- __m128 t1,t2,t3,t4,t5;
-
- fix1 = _mm_hadd_ps(fix1,fiy1);
- fiz1 = _mm_hadd_ps(fiz1,fix2);
- fiy2 = _mm_hadd_ps(fiy2,fiz2);
- fix3 = _mm_hadd_ps(fix3,fiy3);
- fiz3 = _mm_hadd_ps(fiz3,fix4);
- fiy4 = _mm_hadd_ps(fiy4,fiz4);
-
- fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
- fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
- fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
-
- _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
- _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
- _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
-
- t5 = _mm_load_ss(fshiftptr+2);
- t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
-
- t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
- t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
- t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
- t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
- t4 = _mm_shuffle_ps(fiz3,t4 ,_MM_SHUFFLE(2,0,3,3));
-
- t1 = _mm_add_ps(t1,t2);
- t3 = _mm_add_ps(t3,t4);
- t1 = _mm_add_ps(t1,t3);
- t5 = _mm_add_ps(t5,t1);
-
- _mm_store_ss(fshiftptr+2,t5);
- _mm_storeh_pi((__m64 *)(fshiftptr),t5);
+ __m128 t1,t2,t3,t4,t5;
+
+ fix1 = _mm_hadd_ps(fix1,fiy1);
+ fiz1 = _mm_hadd_ps(fiz1,fix2);
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);
+ fix3 = _mm_hadd_ps(fix3,fiy3);
+ fiz3 = _mm_hadd_ps(fiz3,fix4);
+ fiy4 = _mm_hadd_ps(fiy4,fiz4);
+
+ fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+ fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+ fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
+
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+ _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
+
+ t5 = _mm_load_ss(fshiftptr+2);
+ t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
+
+ t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
+ t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
+ t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
+ t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
+ t4 = _mm_shuffle_ps(fiz3,t4 ,_MM_SHUFFLE(2,0,3,3));
+
+ t1 = _mm_add_ps(t1,t2);
+ t3 = _mm_add_ps(t3,t4);
+ t1 = _mm_add_ps(t1,t3);
+ t5 = _mm_add_ps(t5,t1);
+
+ _mm_store_ss(fshiftptr+2,t5);
+ _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
-
+#endif
static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
- __m128 t1,t2;
- t1 = _mm_movehl_ps(pot2,pot1);
- t2 = _mm_movelh_ps(pot1,pot2);
- t1 = _mm_add_ps(t1,t2);
- t2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
- pot1 = _mm_add_ps(t1,t2);
- pot2 = _mm_movehl_ps(t2,pot1);
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-}
-
-
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
- __m128 pot2, float * gmx_restrict ptrB,
- __m128 pot3, float * gmx_restrict ptrC,
- __m128 pot4, float * gmx_restrict ptrD)
-{
- _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
- pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
- pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
- pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
- pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
- _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
- _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
+ __m128 t1,t2;
+ t1 = _mm_movehl_ps(pot2,pot1);
+ t2 = _mm_movelh_ps(pot1,pot2);
+ t1 = _mm_add_ps(t1,t2);
+ t2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
+ pot1 = _mm_add_ps(t1,t2);
+ pot2 = _mm_movehl_ps(t2,pot1);
+ _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+ _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
}
t_complex ***t;
int i,j;
- snew(t,x);
t = (t_complex ***)calloc(x,sizeof(t_complex**));
if(!t) exit(fprintf(stderr,"\nallocation error"));
t[0] = (t_complex **)calloc(x*y,sizeof(t_complex*));
/* We tried again, and this time there was a copied buffer.
We use that, and indicate that we're not reading from the
regular buf. This case should be pretty rare. */
- tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount),-1);
+ tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount),-1);
tMPI_Atomic_memory_barrier_acq();
srcbuf=try_again_srcbuf;
}
{
/* we decrement the read count; potentially releasing the buffer. */
tMPI_Atomic_memory_barrier_rel();
- tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
+ tMPI_Atomic_add_return( &(cev->met[rank].buf_readcount), -1);
}
#endif
}
else
{
/* wait until everybody else is done copying the original buffer.
- We use fetch_add because we want to be sure of coherency.
+ We use atomic add-return because we want to be sure of coherency.
This wait is bound to be very short (otherwise it wouldn't
be double-buffering) so we always spin here. */
/*tMPI_Atomic_memory_barrier_rel();*/
-100000))
#endif
#if 0
- while (tMPI_Atomic_fetch_add( &(cev->met[myrank].buf_readcount), 0)
+ while (tMPI_Atomic_add_return( &(cev->met[myrank].buf_readcount), 0)
!= 0)
#endif
#if 1
tMPI_Atomic_memory_barrier_rel();
/* signal that we're done */
- tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
+ tMPI_Atomic_add_return(&(cev->coll.current_sync), 1);
/* we need to keep being in sync */
csync->syncs++;
}
tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
}
}
- /* the main thread now also runs start_fn if we don't want
+ /* the main thread also runs start_fn if we don't want
it to return */
if (!main_returns)
tMPI_Thread_starter((void*)&(threads[0]));
tMPI_Trace_print("tMPI_Init(%p, %p, %p)", argc, argv, start_function);
#endif
-
if (TMPI_COMM_WORLD==0) /* we're the main process */
{
int N=0;
tMPI_Get_N(argc, argv, "-nt", &N);
- tMPI_Start_threads(FALSE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
+ tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
NULL, NULL, start_function);
}
else
CTYPE ("a value of -1 means: use rlist");
RTYPE("verlet-buffer-drift", ir->verletbuf_drift, 0.005);
CTYPE ("nblist cut-off");
- RTYPE ("rlist", ir->rlist, -1);
+ RTYPE ("rlist", ir->rlist, 1.0);
CTYPE ("long-range cut-off for switched potentials");
RTYPE ("rlistlong", ir->rlistlong, -1);
ITYPE ("nstcalclr", ir->nstcalclr, -1);
EETYPE("coulomb-modifier", ir->coulomb_modifier, eintmod_names);
CTYPE ("cut-off lengths");
RTYPE ("rcoulomb-switch", ir->rcoulomb_switch, 0.0);
- RTYPE ("rcoulomb", ir->rcoulomb, -1);
+ RTYPE ("rcoulomb", ir->rcoulomb, 1.0);
CTYPE ("Relative dielectric constant for the medium and the reaction field");
RTYPE ("epsilon-r", ir->epsilon_r, 1.0);
RTYPE ("epsilon-rf", ir->epsilon_rf, 0.0);
EETYPE("vdw-modifier", ir->vdw_modifier, eintmod_names);
CTYPE ("cut-off lengths");
RTYPE ("rvdw-switch", ir->rvdw_switch, 0.0);
- RTYPE ("rvdw", ir->rvdw, -1);
+ RTYPE ("rvdw", ir->rvdw, 1.0);
CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
EETYPE("DispCorr", ir->eDispCorr, edispc_names);
CTYPE ("Extension of the potential lookup tables beyond the cut-off");
RTYPE ("table-extension", ir->tabext, 1.0);
- CTYPE ("Seperate tables between energy group pairs");
+ CTYPE ("Separate tables between energy group pairs");
STYPE ("energygrp-table", egptable, NULL);
CTYPE ("Spacing for the PME/PPPM FFT grid");
RTYPE ("fourierspacing", ir->fourier_spacing,0.12);
extern "C" {
#endif
-void do_edsam(t_inputrec *ir,gmx_large_int_t step,t_mdatoms *md,
+void do_edsam(t_inputrec *ir,gmx_large_int_t step,
t_commrec *cr,rvec xs[],rvec v[],matrix box,gmx_edsam_t ed);
/* Essential dynamics constraints, called from constrain() */
-gmx_edsam_t ed_open(int nfile,const t_filenm fnm[],unsigned long Flags,t_commrec *cr);
-/* Sets the ED input/output filenames, opens output (.edo) file */
+gmx_edsam_t ed_open(int natoms, edsamstate_t *EDstate, int nfile,const t_filenm fnm[],
+ unsigned long Flags, const output_env_t oenv, t_commrec *cr);
+/* Sets the ED input/output filenames, opens output file */
void init_edsam(gmx_mtop_t *mtop,t_inputrec *ir,t_commrec *cr,
- gmx_edsam_t ed, rvec x[], matrix box);
+ gmx_edsam_t ed, rvec x[], matrix box, edsamstate_t *edsamstate);
/* Init routine for ED and flooding. Calls init_edi in a loop for every .edi-cycle
* contained in the input file, creates a NULL terminated list of t_edpar structures */
/* Make a selection of the home atoms for the ED groups.
* Should be called at every domain decomposition. */
-void do_flood(FILE *log, t_commrec *cr, rvec x[],rvec force[], gmx_edsam_t ed,
+void do_flood(t_commrec *cr, t_inputrec *ir, rvec x[],rvec force[], gmx_edsam_t ed,
matrix box, gmx_large_int_t step, gmx_bool bNS);
/* Flooding - called from do_force() */
int
gmx_dielectric(int argc,char *argv[]);
-int
-gmx_dih(int argc,char *argv[]);
-
int
gmx_dipoles(int argc,char *argv[]);
#include <stdio.h>
+
#ifdef __cplusplus
extern "C" {
#endif
enum gmx_cpuid_feature feature);
+/* Return pointers to cpu topology information.
+ *
+ * Important: CPU topology requires more OS support than most other
+ * functions in this file, including support for thread pinning to hardware.
+ * This means it will not work on some platforms, including e.g. Mac OS X.
+ * Thus, it is IMPERATIVE that you check the return value from this routine
+ * before doing anything with the information. It is only if the return
+ * value is zero that the data is valid.
+ *
+ * For the returned values we have:
+ * - nprocessors Total number of logical processors reported by OS
+ * - npackages Usually number of CPU sockets
+ * - ncores_per_package Number of cores in each package
+ * - nhwthreads_per_core Number of hardware threads per core; 2 for hyperthreading.
+ * - package_id Array with the package index for each logical cpu
+ * - core_id Array with local core index for each logical cpu
+ * - hwthread_id Array with local hwthread index for each logical cpu
+ * - locality_order Array with logical cpu numbers, sorted in order
+ * of physical and logical locality in the system.
+ *
+ * All arrays are of length nprocessors.
+ */
+int
+gmx_cpuid_topology(gmx_cpuid_t cpuid,
+ int * nprocessors,
+ int * npackages,
+ int * ncores_per_package,
+ int * nhwthreads_per_core,
+ const int ** package_id,
+ const int ** core_id,
+ const int ** hwthread_id,
+ const int ** locality_order);
+
/* Enumerated values for x86 SMT enabled-status. Note that this does not refer
* to Hyper-Threading support (that is the flag GMX_CPUID_FEATURE_X86_HTT), but
* whether Hyper-Threading is _enabled_ and _used_ in bios right now.
* in order not to give the impression we can detect any SMT. We haven't
* even tested the performance on other SMT implementations, so it is not
* obvious we shouldn't use SMT there.
+ *
+ * Note that you can get more complete topology information from
+ * gmx_cpuid_topology(), although that requires slightly more OS support.
*/
enum gmx_cpuid_x86_smt
gmx_cpuid_x86_smt(gmx_cpuid_t cpuid);
-
/* Formats a text string (up to n characters) from the data structure.
* The output will have max 80 chars between newline characters.
*/
#ifndef _gmx_math_x86_avx_128_fma_double_h_
#define _gmx_math_x86_avx_128_fma_double_h_
+#include <immintrin.h> /* AVX */
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h> /* FMA */
+#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
#include <math.h>
#include "gmx_x86_avx_128_fma.h"
* with different settings from the same source file.
*/
-/* NOTE: floor and blend are NOT available with SSE2 only acceleration */
+/* NOTE: floor and blendv are NOT available with SSE2 only acceleration */
#undef GMX_SIMD_WIDTH_HERE
#undef gmx_epi32
+/* float/double SIMD register type */
#undef gmx_mm_pr
#undef gmx_load_pr
#undef gmx_set1_pr
#undef gmx_setzero_pr
#undef gmx_store_pr
+/* Only used for debugging */
#undef gmx_storeu_pr
#undef gmx_add_pr
#undef gmx_or_pr
#undef gmx_andnot_pr
+/* Only used to speed up the nbnxn tabulated PME kernels */
#undef gmx_floor_pr
+/* Only used with x86 when blendv is faster than comparison */
#undef gmx_blendv_pr
#undef gmx_movemask_pr
+/* Integer casts are only used for nbnxn x86 exclusion masks */
#undef gmx_mm_castsi128_pr
+#undef gmx_mm_castsi256_pr
+/* Conversions only used for nbnxn x86 exclusion masks and PME table lookup */
#undef gmx_cvttpr_epi32
#undef gmx_cvtepi32_pr
#undef gmx_calc_rsq_pr
#undef gmx_sum4_pr
+/* Only required for nbnxn analytical PME kernels */
#undef gmx_pmecorrF_pr
#undef gmx_pmecorrV_pr
+/* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
+#undef gmx_mm_hpr
+
+#undef gmx_load_hpr
+#undef gmx_load1_hpr
+#undef gmx_store_hpr
+#undef gmx_add_hpr
+#undef gmx_sub_hpr
+
+#undef gmx_sum4_hpr
+
+#undef gmx_2hpr_to_pr
+
+
/* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
* the same intrinsics, with defines, can be compiled for either 128 or 256
* bit wide SSE or AVX instructions.
#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
#endif
+
+#ifdef GMX_X86_SSE2
+
#ifdef GMX_MM128_HERE
#define gmx_epi32 __m128i
#endif
#endif /* GMX_MM256_HERE */
+
+#endif /* GMX_X86_SSE2 */
#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h> /* FMA */
#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
#include <stdio.h>
return sse_overflow;
}
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
+
#endif /* _gmx_x86_avx_128_fma_h_ */
return sse_overflow;
}
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
#endif /* _gmx_x86_avx_256_h_ */
* The string name is used to print to the log file and in a fatal error
* if the val's don't match.
*/
-
void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
int nfile, const t_filenm fnm[], gmx_bool bParFn);
/* Splits the communication into nsim separate simulations
gmx_bool bShakeOnly,gmx_bool bSettle);
/* Build a graph from an idef description. The graph can be used
* to generate mol-shift indices.
+ * at_start and at_end should coincide with molecule boundaries,
+ * for the whole system this is simply 0 and natoms.
* If bShakeOnly, only the connections in the shake list are used.
* If bSettle && bShakeOnly the settles are used too.
*/
as the 486, and gcc on some Linux versions still target 80386 by default).
We also specifically check for icc, because intrinsics are not always
- supported there. */
-#if ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) && \
- !defined(__INTEL_COMPILER) )
+ supported there.
+
+ llvm has issues with inline assembly and also in 32 bits has support for
+ the gcc intrinsics */
+#if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) && \
+ !defined(__INTEL_COMPILER) ) || defined(__llvm__) )
#include "gcc_intrinsics.h"
#else
__asm__ __volatile__("lock ; xaddl %0, %1;"
:"=r"(i) :"m"(a->value), "0"(i) : "memory");
return i + __i;
-}
+}
static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
{
efDAT, efDLG,
efMAP, efEPS, efMAT, efM2P,
efMTX,
- efEDI, efEDO,
+ efEDI,
efHAT,
efCUB,
efXPM,
typedef enum { egcolWhite, egcolGrey, egcolBlack, egcolNR } egCol;
typedef struct {
+ int at0; /* The first atom the graph was constructed for */
+ int at1; /* The last atom the graph was constructed for */
int nnodes; /* The number of nodes, nnodes=at_end-at_start */
int nbound; /* The number of nodes with edges */
- int natoms; /* Total range for this graph: 0 to natoms */
int at_start; /* The first connected atom in this graph */
int at_end; /* The last+1 connected atom in this graph */
int *nedge; /* For each node the number of edges */
#define GMX_NBNXN_SIMD
#ifdef GMX_X86_AVX_256
-/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */
+/* Note that setting this to 128 will also work with AVX-256, but slower */
#define GMX_NBNXN_SIMD_BITWIDTH 256
#else
#define GMX_NBNXN_SIMD_BITWIDTH 128
#endif
/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
- * Currently the 2xNN SIMD kernels only make sense and are only implemented
- * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8.
+ * Currently the 2xNN SIMD kernels only make sense with:
+ * 8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
*/
#define GMX_NBNXN_SIMD_4XN
#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
nbnxnkNR
} nbnxn_kernel_type;
-/* Note that _mm_... intrinsics can be converted to either SSE or AVX
- * depending on compiler flags.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
-static const char *nbnxn_kernel_name[nbnxnkNR] =
- { "not set", "plain C",
-#if !(defined GMX_X86_SSE2)
- "not available", "not available",
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
-#ifndef GMX_X86_SSE4_1
- "SSE2", "SSE2",
-#else
- "SSE4.1", "SSE4.1",
-#endif
-#else
- "AVX-128", "AVX-128",
-#endif
-#else
- "AVX-256", "AVX-256",
-#endif
-#endif
- "CUDA", "plain C" };
+/*! Return a string identifying the kernel type */
+const char *lookup_nbnxn_kernel_name(int kernel_type);
enum { ewaldexclTable, ewaldexclAnalytical };
unsigned excl; /* The exclusion (interaction) bits */
} nbnxn_cj_t;
+/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ * The upper bits contain information for non-bonded kernel optimization.
+ * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ * But three flags can be used to skip interactions, currently only for subc=0
+ * !(shift & NBNXN_CI_DO_LJ(subc)) => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc) => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
#define NBNXN_CI_SHIFT 127
#define NBNXN_CI_DO_LJ(subc) (1<<(7+3*(subc)))
#define NBNXN_CI_HALF_LJ(subc) (1<<(8+3*(subc)))
/* Simple pair-list i-unit */
typedef struct {
int ci; /* i-cluster */
- int shift; /* Shift vector index plus possible flags */
+ int shift; /* Shift vector index plus possible flags, see above */
int cj_ind_start; /* Start index into cj */
int cj_ind_end; /* End index into cj */
} nbnxn_ci_t;
int xstride; /* stride for a coordinate in x (usually 3 or 4) */
int fstride; /* stride for a coordinate in f (usually 3 or 4) */
real *x; /* x and possibly q, size natoms*xstride */
+ real *simd_4xn_diag; /* indices to set the SIMD 4xN diagonal masks */
+ real *simd_2xnn_diag; /* indices to set the SIMD 2x(N+N) diagonal masks */
int nout; /* The number of force arrays */
nbnxn_atomdata_output_t *out; /* Output data structures */
int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */
}
energyhistory_t;
+typedef struct
+{
+ /* If one uses essential dynamics or flooding on a group of atoms from
+ * more than one molecule, we cannot make this group whole with
+ * do_pbc_first_mtop(). We assume that the ED group has the correct PBC
+ * representation at the beginning of the simulation and keep track
+ * of the shifts to always get it into that representation.
+ * For proper restarts from a checkpoint we store the positions of the
+ * reference group at the time of checkpoint writing */
+ gmx_bool bFromCpt; /* Did we start from a checkpoint file? */
+ int nED; /* No. of ED/Flooding data sets, if <1 no ED */
+ int *nref; /* No. of atoms in i'th reference structure */
+ int *nav; /* Same for average structure */
+ rvec **old_sref; /* Positions of the reference atoms
+ at the last time step (with correct PBC
+ representation) */
+ rvec **old_sref_p; /* Pointer to these positions */
+ rvec **old_sav; /* Same for the average positions */
+ rvec **old_sav_p;
+}
+edsamstate_t;
+
typedef struct
{
int natoms;
energyhistory_t enerhist; /* Energy history for statistics */
df_history_t dfhist; /*Free energy history for free energy analysis */
+ edsamstate_t edsamstate; /* Essential dynamics / flooding history */
int ddp_count; /* The DD partitioning count for this state */
int ddp_count_cg_gl; /* The DD part. count for index_gl */
#define INVSQRT_DONE
#endif /* gmx_invsqrt */
-#ifdef GMX_POWERPC_SQRT
-static real gmx_powerpc_invsqrt(real x)
-{
- const real half=0.5;
- const real three=3.0;
- t_convert result,bit_pattern;
- unsigned int exp,fract;
- real lu;
- real y;
-#ifdef GMX_DOUBLE
- real y2;
-#endif
-
- lu = __frsqrte((double)x);
-
- y=(half*lu*(three-((x*lu)*lu)));
-
-#if (GMX_POWERPC_SQRT==2)
- /* Extra iteration required */
- y=(half*y*(three-((x*y)*y)));
-#endif
-
-#ifdef GMX_DOUBLE
- y2=(half*y*(three-((x*y)*y)));
-
- return y2; /* 10 Flops */
-#else
- return y; /* 5 Flops */
-#endif
-}
-#define gmx_invsqrt(x) gmx_powerpc_invsqrt(x)
-#define INVSQRT_DONE
-#endif /* powerpc_invsqrt */
-
#ifndef INVSQRT_DONE
# ifdef GMX_DOUBLE
# ifdef HAVE_RSQRT
URL: http://www.gromacs.org
Version: @PROJECT_VERSION@
Requires: @PKG_FFT@ @PKG_XML@
-Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@
+Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
Libs: -L${libdir} -lgromacs@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@ -lm
Cflags: -I${includedir} @PKG_CFLAGS@
if (constr->ed && delta_step > 0)
{
/* apply the essential dynamcs constraints here */
- do_edsam(ir,step,md,cr,xprime,v,box,constr->ed);
+ do_edsam(ir,step,cr,xprime,v,box,constr->ed);
}
}
/* Initialize the essential dynamics sampling.
* Put the pointer to the ED struct in constr */
constr->ed = ed;
- if (ed != NULL)
+ if (ed != NULL || state->edsamstate.nED > 0)
{
- init_edsam(mtop,ir,cr,ed,state->x,state->box);
+ init_edsam(mtop,ir,cr,ed,state->x,state->box,&state->edsamstate);
}
constr->warn_mtop = mtop;
}
static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box,
- gmx_ekindata_t *ekind, tensor vir, real pcorr, real ecorr, t_extmass *MassQ)
+ gmx_ekindata_t *ekind, tensor vir, real pcorr, t_extmass *MassQ)
{
real pscal;
/* for now, we use Elr = 0, because if you want to get it right, you
really should be using PME. Maybe print a warning? */
- pscal = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres);
+ pscal = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres)+pcorr;
vol = det(box);
GW = (vol*(MassQ->Winv/PRESFAC))*(DIM*pscal - trace(ir->ref_p)); /* W is in ps^2 * bar * nm^3 */
case etrtBAROV:
case etrtBAROV2:
boxv_trotter(ir,&(state->veta),dt,state->box,ekind,vir,
- enerd->term[F_PDISPCORR],enerd->term[F_DISPCORR],MassQ);
+ enerd->term[F_PDISPCORR],MassQ);
break;
case etrtBARONHC:
case etrtBARONHC2:
if (lossf >= DD_PERF_LOSS)
{
sprintf(buf,
- "NOTE: %.1f %% performance was lost due to load imbalance\n"
+ "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
" in the domain decomposition.\n",lossf*100);
if (!comm->bDynLoadBal)
{
comm->zones.dens_zone0,
fr->cginfo,
state_local->x,
- ncg_moved,comm->moved,
+ ncg_moved,bRedist ? comm->moved : NULL,
fr->nbv->grp[eintLocal].kernel_type,
fr->nbv->grp[eintLocal].nbat);
#include "mtop_util.h"
#include "edsam.h"
#include "gmxfio.h"
+#include "xvgr.h"
#include "groupcoord.h"
#define nblock_bc(cr,nr,d) gmx_bcast((nr)*sizeof((d)[0]), (d),(cr))
#define snew_bc(cr,d,nr) { if (!MASTER(cr)) snew((d),(nr)); }
+/* These macros determine the column width in the output file */
+#define EDcol_sfmt "%17s"
+#define EDcol_efmt "%17.5e"
+#define EDcol_ffmt "%17f"
/* enum to identify the type of ED: none, normal ED, flooding */
enum {eEDnone, eEDedsam, eEDflood, eEDnr};
real dt;
real constEfl;
real alpha2;
- int flood_id;
rvec *forces_cartesian;
t_eigvec vecs; /* use flooding for these */
} t_edflood;
* with respect to the collective
* anrs[0...nr-1] array */
rvec *x; /* positions for this structure */
- rvec *x_old; /* used to keep track of the shift vectors
- such that the ED molecule can always be
- made whole in the parallel case */
+ rvec *x_old; /* Last positions which have the correct PBC
+ representation of the ED group. In
+ combination with keeping track of the
+ shift vectors, the ED group can always
+ be made whole */
real *m; /* masses */
real mtot; /* total mass (only used in sref) */
real *sqrtm; /* sqrt of the masses used for mass-
* is used (i.e. apart from flooding) */
t_edflood flood; /* parameters especially for flooding */
struct t_ed_buffer *buf; /* handle to local buffers */
- struct edpar *next_edi; /* Pointer to another ed dataset */
+ struct edpar *next_edi; /* Pointer to another ED group */
} t_edpar;
typedef struct gmx_edsam
{
int eEDtype; /* Type of ED: see enums above */
- const char *edinam; /* name of ED sampling input file */
- const char *edonam; /* output */
FILE *edo; /* output file pointer */
t_edpar *edpar;
gmx_bool bFirst;
- gmx_bool bStartFromCpt;
} t_gmx_edsam;
ivec *shifts_xc_ref; /* Shifts for xc_ref */
ivec *extra_shifts_xc_ref; /* xc_ref shift changes since last NS step */
gmx_bool bUpdateShifts; /* TRUE in NS steps to indicate that the
- ED shifts for this ED dataset need to
+ ED shifts for this ED group need to
be updated */
};
/* Function declarations */
static void fit_to_reference(rvec *xcoll,rvec transvec,matrix rotmat,t_edpar *edi);
-
static void translate_and_rotate(rvec *x,int nat,rvec transvec,matrix rotmat);
+static real rmsd_from_structure(rvec *x, struct gmx_edx *s);
+static int read_edi_file(const char *fn, t_edpar *edi, int nr_mdatoms);
+static void crosscheck_edi_file_vs_checkpoint(gmx_edsam_t ed, edsamstate_t *EDstate);
+static void init_edsamstate(gmx_edsam_t ed, edsamstate_t *EDstate);
+static void write_edo_legend(gmx_edsam_t ed, int nED, const output_env_t oenv);
/* End function declarations */
+/* Multiple ED groups will be labeled with letters instead of numbers
+ * to avoid confusion with eigenvector indices */
+static char get_EDgroupChar(int nr_edi, int nED)
+{
+ if (nED == 1)
+ {
+ return ' ';
+ }
+
+ /* nr_edi = 1 -> A
+ * nr_edi = 2 -> B ...
+ */
+ return 'A' + nr_edi - 1;
+}
+
+
/* Does not subtract average positions, projection on single eigenvector is returned
* used by: do_linfix, do_linacc, do_radfix, do_radacc, do_radcon
* Average position is subtracted in ed_apply_constraints prior to calling projectx
for (i=0; i<edi->sav.nr; i++)
+ {
proj += edi->sav.sqrtm[i]*iprod(vec[i], xcoll[i]);
+ }
return proj;
}
/* Specialized: projection is stored in vec->refproj
* -> used for radacc, radfix, radcon and center of flooding potential
* subtracts average positions, projects vector x */
-static void rad_project(t_edpar *edi, rvec *x, t_eigvec *vec, t_commrec *cr)
+static void rad_project(t_edpar *edi, rvec *x, t_eigvec *vec)
{
int i;
real rad=0.0;
/* Subtract average positions */
for (i = 0; i < edi->sav.nr; i++)
+ {
rvec_dec(x[i], edi->sav.x[i]);
+ }
for (i = 0; i < vec->neig; i++)
{
/* Add average positions */
for (i = 0; i < edi->sav.nr; i++)
+ {
rvec_inc(x[i], edi->sav.x[i]);
+ }
}
/* Subtract average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_dec(x[i], edi->sav.x[i]);
+ }
for (i=0; i<vec->neig; i++)
+ {
vec->xproj[i] = projectx(edi, x, vec->vec[i]);
+ }
/* Add average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_inc(x[i], edi->sav.x[i]);
+ }
}
for (i=0; i<vec->neig; i++)
+ {
rad += pow((vec->refproj[i]-vec->xproj[i]),2);
+ }
return rad=sqrt(rad);
}
fp = fopen(fn, "w");
for (i=0; i<edi->sav.nr; i++)
+ {
fprintf(fp, "%d %9.5f %9.5f %9.5f %d %d %d %d %d %d\n",
edi->sav.anrs[i]+1,
xcoll[i][XX] , xcoll[i][YY] , xcoll[i][ZZ],
shifts[i][XX] , shifts[i][YY] , shifts[i][ZZ],
eshifts[i][XX], eshifts[i][YY], eshifts[i][ZZ]);
+ }
fclose(fp);
}
fprintf(out, "#%s positions:\n%d\n", name, s->nr);
if (s->nr == 0)
+ {
return;
+ }
fprintf(out, "#index, x, y, z");
if (s->sqrtm)
+ {
fprintf(out, ", sqrt(m)");
+ }
for (i=0; i<s->nr; i++)
{
fprintf(out, "\n%6d %11.6f %11.6f %11.6f",s->anrs[i], s->x[i][XX], s->x[i][YY], s->x[i][ZZ]);
if (s->sqrtm)
+ {
fprintf(out,"%9.3f",s->sqrtm[i]);
+ }
}
fprintf(out, "\n");
}
fprintf(out, "EV %4d\ncomponents %d\nstepsize %f\nxproj %f\nfproj %f\nrefproj %f\nradius %f\nComponents:\n",
ev->ieig[i], length, ev->stpsz[i], ev->xproj[i], ev->fproj[i], ev->refproj[i], ev->radius);
for (j=0; j<length; j++)
+ {
fprintf(out, "%11.6f %11.6f %11.6f\n", ev->vec[i][j][XX], ev->vec[i][j][YY], ev->vec[i][j][ZZ]);
+ }
}
}
for (i=0; i<dim; i++)
+ {
fprintf(out,"%4d %f %f %f\n",i,x[i][XX],x[i][YY],x[i][ZZ]);
+ }
}
for (i=0;i<dim;i++)
{
for (j=0;j<dim;j++)
+ {
fprintf(out,"%f ",mat[i][j]);
+ }
fprintf(out,"\n");
}
}
gmx_bool bFirst;
if(edi->buf->do_edfit != NULL)
+ {
bFirst = FALSE;
+ }
else
{
bFirst = TRUE;
/* construct loc->omega */
/* loc->omega is symmetric -> loc->omega==loc->omega' */
for(r=0;(r<6);r++)
+ {
for(c=0;(c<=r);c++)
+ {
if ((r>=3) && (c<3))
{
loc->omega[r][c]=u[r-3][c];
loc->omega[r][c]=0;
loc->omega[c][r]=0;
}
+ }
+ }
/* determine h and k */
#ifdef DEBUG
int i;
dump_mat(stderr,2*DIM,loc->omega);
for (i=0; i<6; i++)
+ {
fprintf(stderr,"d[%d] = %f\n",i,d[i]);
+ }
}
#endif
jacobi(loc->omega,6,d,loc->om,&irot);
if (irot==0)
+ {
fprintf(stderr,"IROT=0\n");
+ }
index=0; /* For the compiler only */
{
max_d=-1000;
for(i=0;(i<6);i++)
+ {
if (d[i]>max_d)
{
max_d=d[i];
index=i;
}
+ }
d[index]=-10000;
for(i=0;(i<3);i++)
{
/* determine R */
for(c=0;(c<3);c++)
+ {
for(r=0;(r<3);r++)
+ {
R[c][r]=vk[0][r]*vh[0][c]+
- vk[1][r]*vh[1][c]+
- vk[2][r]*vh[2][c];
+ vk[1][r]*vh[1][c]+
+ vk[2][r]*vh[2][c];
+ }
+ }
if (det(R) < 0)
+ {
for(c=0;(c<3);c++)
+ {
for(r=0;(r<3);r++)
+ {
R[c][r]=vk[0][r]*vh[0][c]+
- vk[1][r]*vh[1][c]-
- vk[2][r]*vh[2][c];
+ vk[1][r]*vh[1][c]-
+ vk[2][r]*vh[2][c];
+ }
+ }
+ }
}
two edsam files from two peptide chains
*/
-static void write_edo_flood(t_edpar *edi, FILE *fp, gmx_large_int_t step)
+static void write_edo_flood(t_edpar *edi, FILE *fp, real rmsd)
{
int i;
- char buf[22];
- gmx_bool bOutputRef=FALSE;
- fprintf(fp,"%d.th FL: %s %12.5e %12.5e %12.5e\n",
- edi->flood.flood_id, gmx_step_str(step,buf),
- edi->flood.Efl, edi->flood.Vfl, edi->flood.deltaF);
+ /* Output how well we fit to the reference structure */
+ fprintf(fp, EDcol_ffmt, rmsd);
-
- /* Check whether any of the references changes with time (this can happen
- * in case flooding is used as harmonic restraint). If so, output all the
- * current reference projections. */
- if (edi->flood.bHarmonic)
+ for (i=0; i<edi->flood.vecs.neig; i++)
{
- for (i = 0; i < edi->flood.vecs.neig; i++)
+ fprintf(fp, EDcol_efmt, edi->flood.vecs.xproj[i]);
+
+ /* Check whether the reference projection changes with time (this can happen
+ * in case flooding is used as harmonic restraint). If so, output the
+ * current reference projection */
+ if (edi->flood.bHarmonic && edi->flood.vecs.refprojslope[i] != 0.0)
{
- if (edi->flood.vecs.refprojslope[i] != 0.0)
- bOutputRef=TRUE;
+ fprintf(fp, EDcol_efmt, edi->flood.vecs.refproj[i]);
}
- if (bOutputRef)
+
+ /* Output Efl if we are doing adaptive flooding */
+ if (0 != edi->flood.tau)
{
- fprintf(fp, "Ref. projs.: ");
- for (i = 0; i < edi->flood.vecs.neig; i++)
- {
- fprintf(fp, "%12.5e ", edi->flood.vecs.refproj[i]);
- }
- fprintf(fp, "\n");
+ fprintf(fp, EDcol_efmt, edi->flood.Efl);
}
- }
- fprintf(fp,"FL_FORCES: ");
-
- for (i=0; i<edi->flood.vecs.neig; i++)
- fprintf(fp," %12.5e",edi->flood.vecs.fproj[i]);
+ fprintf(fp, EDcol_efmt, edi->flood.Vfl);
- fprintf(fp,"\n");
+ /* Output deltaF if we are doing adaptive flooding */
+ if (0 != edi->flood.tau)
+ {
+ fprintf(fp, EDcol_efmt, edi->flood.deltaF);
+ }
+ fprintf(fp, EDcol_efmt, edi->flood.vecs.fproj[i]);
+ }
}
if (edi->flood.bHarmonic)
+ {
for (i=0; i<edi->flood.vecs.neig; i++)
{
edi->flood.vecs.fproj[i] = edi->flood.Efl* edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]);
}
+ }
else
+ {
for (i=0; i<edi->flood.vecs.neig; i++)
{
/* if Efl is zero the forces are zero if not use the formula */
edi->flood.vecs.fproj[i] = edi->flood.Efl!=0 ? edi->flood.kT/edi->flood.Efl/edi->flood.alpha2*energy*edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]) : 0;
}
+ }
}
/* Clear forces first */
for (j=0; j<edi->sav.nr_loc; j++)
+ {
clear_rvec(forces_cart[j]);
+ }
/* Now compute atomwise */
for (j=0; j<edi->sav.nr_loc; j++)
edi->flood.Efl = edi->flood.Efl+edi->flood.dt/edi->flood.tau*(edi->flood.deltaF0-edi->flood.deltaF);
/* check if restrain (inverted flooding) -> don't let EFL become positive */
if (edi->flood.alpha2<0 && edi->flood.Efl>-0.00000001)
+ {
edi->flood.Efl = 0;
+ }
edi->flood.deltaF = (1-edi->flood.dt/edi->flood.tau)*edi->flood.deltaF+edi->flood.dt/edi->flood.tau*edi->flood.Vfl;
}
matrix rotmat; /* rotation matrix */
matrix tmat; /* inverse rotation */
rvec transvec; /* translation vector */
+ real rmsdev;
struct t_do_edsam *buf;
/* Only assembly REFERENCE positions if their indices differ from the average ones */
if (!edi->bRefEqAv)
+ {
communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, bNS, x,
edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+ }
/* If bUpdateShifts was TRUE, the shifts have just been updated in get_positions.
* We do not need to update the shifts until the next NS step */
/* Fit the reference indices to the reference structure */
if (edi->bRefEqAv)
+ {
fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+ }
else
+ {
fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+ }
/* Now apply the translation and rotation to the ED structure */
translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
/* Finally add forces to the main force variable */
for (i=0; i<edi->sav.nr_loc; i++)
+ {
rvec_inc(force[edi->sav.anrs_loc[i]],edi->flood.forces_cartesian[i]);
+ }
/* Output is written by the master process */
if (do_per_step(step,edi->outfrq) && MASTER(cr))
- write_edo_flood(edi,edo,step);
+ {
+ /* Output how well we fit to the reference */
+ if (edi->bRefEqAv)
+ {
+ /* Indices of reference and average structures are identical,
+ * thus we can calculate the rmsd to SREF using xcoll */
+ rmsdev = rmsd_from_structure(buf->xcoll,&edi->sref);
+ }
+ else
+ {
+ /* We have to translate & rotate the reference atoms first */
+ translate_and_rotate(buf->xc_ref, edi->sref.nr, transvec, rotmat);
+ rmsdev = rmsd_from_structure(buf->xc_ref,&edi->sref);
+ }
+
+ write_edo_flood(edi,edo,rmsdev);
+ }
}
/* Main flooding routine, called from do_force */
extern void do_flood(
- FILE *log, /* md.log file */
t_commrec *cr, /* Communication record */
+ t_inputrec *ir, /* Input record */
rvec x[], /* Positions on the local processor */
rvec force[], /* forcefield forces, to these the flooding forces are added */
- gmx_edsam_t ed, /* ed data structure contains all ED and flooding datasets */
+ gmx_edsam_t ed, /* ed data structure contains all ED and flooding groups */
matrix box, /* the box */
gmx_large_int_t step, /* The relative time step since ir->init_step is already subtracted */
gmx_bool bNS) /* Are we in a neighbor searching step? */
t_edpar *edi;
+ edi = ed->edpar;
+
+    /* Write the time to edo when required. The time is output here in any
+     * case, since the output file also needs it for the ED constraints. */
+ if (MASTER(cr) && do_per_step(step,edi->outfrq))
+ {
+ fprintf(ed->edo, "\n%12f", ir->init_t + step*ir->delta_t);
+ }
+
if (ed->eEDtype != eEDflood)
+ {
return;
+ }
- edi = ed->edpar;
while (edi)
{
/* Call flooding for one matrix */
if (edi->flood.vecs.neig)
+ {
do_single_flood(ed->edo,x,force,edi,step,box,cr,bNS);
+ }
edi = edi->next_edi;
}
}
/* Called by init_edi, configure some flooding related variables and structures,
* print headers to output files */
-static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr)
+static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt)
{
int i;
if (edi->flood.vecs.neig)
{
- /* If in any of the datasets we find a flooding vector, flooding is turned on */
+ /* If in any of the ED groups we find a flooding vector, flooding is turned on */
ed->eEDtype = eEDflood;
- fprintf(stderr,"ED: Flooding of matrix %d is switched on.\n", edi->flood.flood_id);
+ fprintf(stderr,"ED: Flooding %d eigenvector%s.\n", edi->flood.vecs.neig, edi->flood.vecs.neig > 1 ? "s":"");
if (edi->flood.bConstForce)
{
edi->flood.vecs.ieig[i], edi->flood.vecs.fproj[i]);
}
}
- fprintf(ed->edo,"FL_HEADER: Flooding of matrix %d is switched on! The flooding output will have the following format:\n",
- edi->flood.flood_id);
- fprintf(ed->edo,"FL_HEADER: Step Efl Vfl deltaF\n");
}
}
count++;
}
if (nnames!=count-1)
+ {
gmx_fatal(FARGS,"Number of energies is not consistent with t_edi structure");
+ }
}
/************* END of FLOODING IMPLEMENTATION ****************************/
#endif
-gmx_edsam_t ed_open(int nfile,const t_filenm fnm[],unsigned long Flags,t_commrec *cr)
+gmx_edsam_t ed_open(int natoms, edsamstate_t *EDstate, int nfile,const t_filenm fnm[],unsigned long Flags, const output_env_t oenv, t_commrec *cr)
{
gmx_edsam_t ed;
+ int nED;
/* Allocate space for the ED data structure */
if (MASTER(cr))
{
- /* Open .edi input file: */
- ed->edinam=ftp2fn(efEDI,nfile,fnm);
- /* The master opens the .edo output file */
fprintf(stderr,"ED sampling will be performed!\n");
- ed->edonam = ftp2fn(efEDO,nfile,fnm);
- ed->edo = gmx_fio_fopen(ed->edonam,(Flags & MD_APPENDFILES)? "a+" : "w+");
- ed->bStartFromCpt = Flags & MD_STARTFROMCPT;
+ snew(ed->edpar,1);
+
+ /* Read the edi input file: */
+ nED = read_edi_file(ftp2fn(efEDI,nfile,fnm),ed->edpar,natoms);
+
+ /* Make sure the checkpoint was produced in a run using this .edi file */
+ if (EDstate->bFromCpt)
+ {
+ crosscheck_edi_file_vs_checkpoint(ed, EDstate);
+ }
+ else
+ {
+ EDstate->nED = nED;
+ }
+ init_edsamstate(ed, EDstate);
+
+ /* The master opens the ED output file */
+ if (Flags & MD_APPENDFILES)
+ {
+ ed->edo = gmx_fio_fopen(opt2fn("-eo",nfile,fnm),"a+");
+ }
+ else
+ {
+ ed->edo = xvgropen(opt2fn("-eo",nfile,fnm),
+ "Essential dynamics / flooding output",
+ "Time (ps)",
+ "RMSDs (nm), projections on EVs (nm), ...", oenv);
+
+ /* Make a descriptive legend */
+ write_edo_legend(ed, EDstate->nED, oenv);
+ }
}
return ed;
}
/* Broadcast flooding eigenvectors and, if needed, values for the moving reference */
bc_ed_vecs(cr, &edi->flood.vecs, edi->sav.nr, edi->flood.bHarmonic);
- /* Set the pointer to the next ED dataset */
+ /* Set the pointer to the next ED group */
if (edi->next_edi)
{
snew_bc(cr, edi->next_edi, 1);
/* init-routine called for every *.edi-cycle, initialises t_edpar structure */
-static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir,
- t_commrec *cr,gmx_edsam_t ed,t_edpar *edi)
+static void init_edi(gmx_mtop_t *mtop,t_edpar *edi)
{
int i;
real totalmass = 0.0;
static void check(const char *line, const char *label)
{
if (!strstr(line,label))
+ {
gmx_fatal(FARGS,"Could not find input parameter %s at expected position in edsam input-file (.edi)\nline read instead is %s",label,line);
+ }
}
sscanf (line,"%d%lf%lf%lf",&anrs[i],&d[0],&d[1],&d[2]);
anrs[i]--; /* we are reading FORTRAN indices */
for(j=0; j<3; j++)
+ {
x[i][j]=d[j]; /* always read as double and convert to single */
+ }
}
}
{
nscan = sscanf(line,"%d%lf",&idum,&rdum);
if (nscan != 2)
+ {
gmx_fatal(FARGS,"Expected 2 values for flooding vec: <nr> <stpsz>\n");
+ }
}
tvec->ieig[i]=idum;
tvec->stpsz[i]=rdum;
/* If the number of atoms differs between the two structures,
* they cannot be identical */
if (sref.nr != sav.nr)
+ {
return FALSE;
+ }
/* Now that we know that both stuctures have the same number of atoms,
* check if also the indices are identical */
for (i=0; i < sav.nr; i++)
{
if (sref.anrs[i] != sav.anrs[i])
+ {
return FALSE;
+ }
}
fprintf(stderr, "ED: Note: Reference and average structure are composed of the same atom indices.\n");
}
-static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int edi_nr, t_commrec *cr)
+static int read_edi(FILE* in,t_edpar *edi,int nr_mdatoms, const char *fn)
{
int readmagic;
const int magic=670;
readmagic=read_edint(in,&bEOF);
/* Check whether we have reached the end of the input file */
if (bEOF)
+ {
return 0;
+ }
if (readmagic != magic)
{
if (readmagic==666 || readmagic==667 || readmagic==668)
+ {
gmx_fatal(FARGS,"Wrong magic number: Use newest version of make_edi to produce edi file");
+ }
else if (readmagic != 669)
- gmx_fatal(FARGS,"Wrong magic number %d in %s",readmagic,ed->edinam);
+ {
+ gmx_fatal(FARGS,"Wrong magic number %d in %s",readmagic,fn);
+ }
}
/* check the number of atoms */
edi->nini=read_edint(in,&bEOF);
if (edi->nini != nr_mdatoms)
- gmx_fatal(FARGS,"Nr of atoms in %s (%d) does not match nr of md atoms (%d)",
- ed->edinam,edi->nini,nr_mdatoms);
+ {
+ gmx_fatal(FARGS,"Nr of atoms in %s (%d) does not match nr of md atoms (%d)", fn,edi->nini,nr_mdatoms);
+ }
/* Done checking. For the rest we blindly trust the input */
edi->fitmas = read_checked_edint(in,"FITMAS");
edi->flood.kT = read_checked_edreal(in,"KT");
edi->flood.bHarmonic = read_checked_edint(in,"HARMONIC");
if (readmagic > 669)
+ {
edi->flood.bConstForce = read_checked_edint(in,"CONST_FORCE_FLOODING");
+ }
else
+ {
edi->flood.bConstForce = FALSE;
- edi->flood.flood_id = edi_nr;
+ }
edi->sref.nr = read_checked_edint(in,"NREF");
/* allocate space for reference positions and read them */
edi->sori.nr=read_edint(in,&bEOF);
if (edi->sori.nr > 0)
{
- if (bHaveReference)
- {
- /* Both an -ori structure and a at least one manual reference point have been
- * specified. That's ambiguous and probably not intentional. */
- gmx_fatal(FARGS, "ED: An origin structure has been provided and a at least one (moving) reference\n"
- " point was manually specified in the edi file. That is ambiguous. Aborting.\n");
- }
+ if (bHaveReference)
+ {
+        /* Both an -ori structure and at least one manual reference point have been
+         * specified. That's ambiguous and probably not intentional. */
+ gmx_fatal(FARGS, "ED: An origin structure has been provided and a at least one (moving) reference\n"
+ " point was manually specified in the edi file. That is ambiguous. Aborting.\n");
+ }
snew(edi->sori.anrs,edi->sori.nr);
snew(edi->sori.x ,edi->sori.nr);
edi->sori.sqrtm =NULL;
/* Read in the edi input file. Note that it may contain several ED data sets which were
* achieved by concatenating multiple edi files. The standard case would be a single ED
* data set, though. */
-static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commrec *cr)
+static int read_edi_file(const char *fn, t_edpar *edi, int nr_mdatoms)
{
FILE *in;
t_edpar *curr_edi,*last_edi;
/* This routine is executed on the master only */
/* Open the .edi parameter input file */
- in = gmx_fio_fopen(ed->edinam,"r");
- fprintf(stderr, "ED: Reading edi file %s\n", ed->edinam);
+ in = gmx_fio_fopen(fn,"r");
+ fprintf(stderr, "ED: Reading edi file %s\n", fn);
/* Now read a sequence of ED input parameter sets from the edi file */
curr_edi=edi;
last_edi=edi;
- while( read_edi(in, ed, curr_edi, nr_mdatoms, edi_nr, cr) )
+ while( read_edi(in, curr_edi, nr_mdatoms, fn) )
{
edi_nr++;
- /* Make shure that the number of atoms in each dataset is the same as in the tpr file */
- if (edi->nini != nr_mdatoms)
- gmx_fatal(FARGS,"edi file %s (dataset #%d) was made for %d atoms, but the simulation contains %d atoms.",
- ed->edinam, edi_nr, edi->nini, nr_mdatoms);
+
/* Since we arrived within this while loop we know that there is still another data set to be read in */
/* We need to allocate space for the data: */
snew(edi_read,1);
/* Point the 'next_edi' entry to the next edi: */
curr_edi->next_edi=edi_read;
- /* Keep the curr_edi pointer for the case that the next dataset is empty: */
+ /* Keep the curr_edi pointer for the case that the next group is empty: */
last_edi = curr_edi;
/* Let's prepare to read in the next edi data set: */
curr_edi = edi_read;
}
if (edi_nr == 0)
- gmx_fatal(FARGS, "No complete ED data set found in edi file %s.", ed->edinam);
+ {
+ gmx_fatal(FARGS, "No complete ED data set found in edi file %s.", fn);
+ }
- /* Terminate the edi dataset list with a NULL pointer: */
+ /* Terminate the edi group list with a NULL pointer: */
last_edi->next_edi = NULL;
- fprintf(stderr, "ED: Found %d ED dataset%s.\n", edi_nr, edi_nr>1? "s" : "");
+ fprintf(stderr, "ED: Found %d ED group%s.\n", edi_nr, edi_nr>1? "s" : "");
/* Close the .edi file again */
gmx_fio_fclose(in);
+
+ return edi_nr;
}
struct t_fit_to_ref *loc;
- /* Allocate memory the first time this routine is called for each edi dataset */
+ /* Allocate memory the first time this routine is called for each edi group */
if (NULL == edi->buf->fit_to_ref)
{
snew(edi->buf->fit_to_ref, 1);
/* We do not touch the original positions but work on a copy. */
for (i=0; i<edi->sref.nr; i++)
+ {
copy_rvec(xcoll[i], loc->xcopy[i]);
+ }
/* Calculate the center of mass */
get_center(loc->xcopy, edi->sref.m, edi->sref.nr, com);
for (i=0; i < s->nr; i++)
+ {
rmsd += distance2(s->x[i], x[i]);
+ }
rmsd /= (real) s->nr;
rmsd = sqrt(rmsd);
if (ed->eEDtype != eEDnone)
{
- /* Loop over ED datasets (usually there is just one dataset, though) */
+ /* Loop over ED groups */
edi=ed->edpar;
while (edi)
{
/* Local atoms of the reference structure (for fitting), need only be assembled
* if their indices differ from the average ones */
if (!edi->bRefEqAv)
+ {
dd_make_local_group_indices(dd->ga2la, edi->sref.nr, edi->sref.anrs,
&edi->sref.nr_loc, &edi->sref.anrs_loc, &edi->sref.nalloc_loc, edi->sref.c_ind);
+ }
/* Local atoms of the average structure (on these ED will be performed) */
dd_make_local_group_indices(dd->ga2la, edi->sav.nr, edi->sav.anrs,
* at the next call to communicate_group_positions, since obviously we are in a NS step */
edi->buf->do_edsam->bUpdateShifts = TRUE;
- /* Set the pointer to the next ED dataset (if any) */
+ /* Set the pointer to the next ED group (if any) */
edi=edi->next_edi;
}
}
xu[XX] = x[XX]-tx*box[XX][XX]-ty*box[YY][XX]-tz*box[ZZ][XX];
xu[YY] = x[YY]-ty*box[YY][YY]-tz*box[ZZ][YY];
xu[ZZ] = x[ZZ]-tz*box[ZZ][ZZ];
- } else
+ }
+ else
{
xu[XX] = x[XX]-tx*box[XX][XX];
xu[YY] = x[YY]-ty*box[YY][YY];
}
-static void do_linfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
+static void do_linfix(rvec *xcoll, t_edpar *edi, gmx_large_int_t step)
{
int i, j;
real proj, add;
}
-static void do_linacc(rvec *xcoll, t_edpar *edi, t_commrec *cr)
+static void do_linacc(rvec *xcoll, t_edpar *edi)
{
int i, j;
real proj, add;
if (edi->vecs.linacc.stpsz[i] > 0.0)
{
if ((proj-edi->vecs.linacc.refproj[i]) < 0.0)
+ {
add = edi->vecs.linacc.refproj[i] - proj;
+ }
}
if (edi->vecs.linacc.stpsz[i] < 0.0)
{
if ((proj-edi->vecs.linacc.refproj[i]) > 0.0)
+ {
add = edi->vecs.linacc.refproj[i] - proj;
+ }
}
/* apply the correction */
}
-static void do_radfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
+static void do_radfix(rvec *xcoll, t_edpar *edi)
{
int i,j;
real *proj, rad=0.0, ratio;
/* apply the correction */
proj[i] /= edi->sav.sqrtm[i];
proj[i] *= ratio;
- for (j=0; j<edi->sav.nr; j++) {
+ for (j=0; j<edi->sav.nr; j++)
+ {
svmul(proj[i], edi->vecs.radfix.vec[i][j], vec_dum);
rvec_inc(xcoll[j], vec_dum);
}
}
-static void do_radacc(rvec *xcoll, t_edpar *edi, t_commrec *cr)
+static void do_radacc(rvec *xcoll, t_edpar *edi)
{
int i,j;
real *proj, rad=0.0, ratio=0.0;
real *proj;
};
-static void do_radcon(rvec *xcoll, t_edpar *edi, t_commrec *cr)
+static void do_radcon(rvec *xcoll, t_edpar *edi)
{
int i,j;
real rad=0.0, ratio=0.0;
loc = edi->buf->do_radcon;
if (edi->vecs.radcon.neig == 0)
+ {
return;
-
+ }
+
if (bFirst)
+ {
snew(loc->proj, edi->vecs.radcon.neig);
+ }
/* loop over radcon vectors */
for (i=0; i<edi->vecs.radcon.neig; i++)
}
-static void ed_apply_constraints(rvec *xcoll, t_edpar *edi, gmx_large_int_t step, t_commrec *cr)
+static void ed_apply_constraints(rvec *xcoll, t_edpar *edi, gmx_large_int_t step)
{
int i;
/* subtract the average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_dec(xcoll[i], edi->sav.x[i]);
+ }
/* apply the constraints */
if (step >= 0)
- do_linfix(xcoll, edi, step, cr);
- do_linacc(xcoll, edi, cr);
+ {
+ do_linfix(xcoll, edi, step);
+ }
+ do_linacc(xcoll, edi);
if (step >= 0)
- do_radfix(xcoll, edi, step, cr);
- do_radacc(xcoll, edi, cr);
- do_radcon(xcoll, edi, cr);
+ {
+ do_radfix(xcoll, edi);
+ }
+ do_radacc(xcoll, edi);
+ do_radcon(xcoll, edi);
/* add back the average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_inc(xcoll[i], edi->sav.x[i]);
+ }
}
-/* Write out the projections onto the eigenvectors */
-static void write_edo(int nr_edi, t_edpar *edi, gmx_edsam_t ed, gmx_large_int_t step,real rmsd)
+/* Write out the projections onto the eigenvectors. The order of output
+ * corresponds to ed_output_legend() */
+static void write_edo(t_edpar *edi, FILE *fp,real rmsd)
{
int i;
- char buf[22];
- if (edi->bNeedDoEdsam)
+ /* Output how well we fit to the reference structure */
+ fprintf(fp, EDcol_ffmt, rmsd);
+
+ for (i=0; i<edi->vecs.mon.neig; i++)
{
- if (step == -1)
- fprintf(ed->edo, "Initial projections:\n");
- else
- {
- fprintf(ed->edo,"Step %s, ED #%d ", gmx_step_str(step, buf), nr_edi);
- fprintf(ed->edo," RMSD %f nm\n",rmsd);
- }
+ fprintf(fp, EDcol_efmt, edi->vecs.mon.xproj[i]);
+ }
- if (edi->vecs.mon.neig)
- {
- fprintf(ed->edo," Monitor eigenvectors");
- for (i=0; i<edi->vecs.mon.neig; i++)
- fprintf(ed->edo," %d: %12.5e ",edi->vecs.mon.ieig[i],edi->vecs.mon.xproj[i]);
- fprintf(ed->edo,"\n");
- }
- if (edi->vecs.linfix.neig)
- {
- fprintf(ed->edo," Linfix eigenvectors");
- for (i=0; i<edi->vecs.linfix.neig; i++)
- fprintf(ed->edo," %d: %12.5e ",edi->vecs.linfix.ieig[i],edi->vecs.linfix.xproj[i]);
- fprintf(ed->edo,"\n");
- }
- if (edi->vecs.linacc.neig)
- {
- fprintf(ed->edo," Linacc eigenvectors");
- for (i=0; i<edi->vecs.linacc.neig; i++)
- fprintf(ed->edo," %d: %12.5e ",edi->vecs.linacc.ieig[i],edi->vecs.linacc.xproj[i]);
- fprintf(ed->edo,"\n");
- }
- if (edi->vecs.radfix.neig)
- {
- fprintf(ed->edo," Radfix eigenvectors");
- for (i=0; i<edi->vecs.radfix.neig; i++)
- fprintf(ed->edo," %d: %12.5e ",edi->vecs.radfix.ieig[i],edi->vecs.radfix.xproj[i]);
- fprintf(ed->edo,"\n");
- fprintf(ed->edo," fixed increment radius = %f\n", calc_radius(&edi->vecs.radfix));
- }
- if (edi->vecs.radacc.neig)
- {
- fprintf(ed->edo," Radacc eigenvectors");
- for (i=0; i<edi->vecs.radacc.neig; i++)
- fprintf(ed->edo," %d: %12.5e ",edi->vecs.radacc.ieig[i],edi->vecs.radacc.xproj[i]);
- fprintf(ed->edo,"\n");
- fprintf(ed->edo," acceptance radius = %f\n", calc_radius(&edi->vecs.radacc));
- }
- if (edi->vecs.radcon.neig)
- {
- fprintf(ed->edo," Radcon eigenvectors");
- for (i=0; i<edi->vecs.radcon.neig; i++)
- fprintf(ed->edo," %d: %12.5e ",edi->vecs.radcon.ieig[i],edi->vecs.radcon.xproj[i]);
- fprintf(ed->edo,"\n");
- fprintf(ed->edo," contracting radius = %f\n", calc_radius(&edi->vecs.radcon));
- }
+ for (i=0; i<edi->vecs.linfix.neig; i++)
+ {
+ fprintf(fp, EDcol_efmt, edi->vecs.linfix.xproj[i]);
+ }
+
+ for (i=0; i<edi->vecs.linacc.neig; i++)
+ {
+ fprintf(fp, EDcol_efmt, edi->vecs.linacc.xproj[i]);
+ }
+
+ for (i=0; i<edi->vecs.radfix.neig; i++)
+ {
+ fprintf(fp, EDcol_efmt, edi->vecs.radfix.xproj[i]);
+ }
+ if (edi->vecs.radfix.neig)
+ {
+ fprintf(fp, EDcol_ffmt, calc_radius(&edi->vecs.radfix)); /* fixed increment radius */
+ }
+
+ for (i=0; i<edi->vecs.radacc.neig; i++)
+ {
+ fprintf(fp, EDcol_efmt, edi->vecs.radacc.xproj[i]);
+ }
+ if (edi->vecs.radacc.neig)
+ {
+ fprintf(fp, EDcol_ffmt, calc_radius(&edi->vecs.radacc)); /* acceptance radius */
+ }
+
+ for (i=0; i<edi->vecs.radcon.neig; i++)
+ {
+ fprintf(fp, EDcol_efmt, edi->vecs.radcon.xproj[i]);
+ }
+ if (edi->vecs.radcon.neig)
+ {
+ fprintf(fp, EDcol_ffmt, calc_radius(&edi->vecs.radcon)); /* contracting radius */
}
}
if (NULL==floodvecs->refproj0)
+ {
snew(floodvecs->refproj0, floodvecs->neig);
+ }
for (i=0; i<floodvecs->neig; i++)
{
}
+/* Call on MASTER only. Check whether the essential dynamics / flooding
+ * groups of the checkpoint file are consistent with the provided .edi file.
+ * Issues a fatal error on any mismatch: number of groups, or number of atoms
+ * in the reference/average structure of any group. Note that nED=0 is passed
+ * to get_EDgroupChar() so the group is always labeled with a letter, even if
+ * there is only a single ED group. */
+static void crosscheck_edi_file_vs_checkpoint(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+    t_edpar *edi = NULL;    /* points to a single edi data set */
+    int edinum;
+
+
+    /* nref/nav are only allocated when the checkpoint stems from a run with ED */
+    if (NULL == EDstate->nref || NULL == EDstate->nav)
+    {
+        gmx_fatal(FARGS, "Essential dynamics and flooding can only be switched on (or off) at the\n"
+                  "start of a new simulation. If a simulation runs with/without ED constraints,\n"
+                  "it must also continue with/without ED constraints when checkpointing.\n"
+                  "To switch on (or off) ED constraints, please prepare a new .tpr to start\n"
+                  "from without a checkpoint.\n");
+    }
+
+    /* Walk the .edi group list in parallel with the per-group .cpt arrays */
+    edi=ed->edpar;
+    edinum = 0;
+    while(edi != NULL)
+    {
+        /* Check number of atoms in the reference and average structures */
+        if (EDstate->nref[edinum] != edi->sref.nr)
+        {
+            gmx_fatal(FARGS, "The number of reference structure atoms in ED group %c is\n"
+                      "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+                      get_EDgroupChar(edinum+1, 0), EDstate->nref[edinum], edi->sref.nr);
+        }
+        if (EDstate->nav[edinum] != edi->sav.nr)
+        {
+            gmx_fatal(FARGS, "The number of average structure atoms in ED group %c is\n"
+                      "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+                      get_EDgroupChar(edinum+1, 0), EDstate->nav[edinum], edi->sav.nr);
+        }
+        edi=edi->next_edi;
+        edinum++;
+    }
+
+    /* Finally, the group counts themselves must agree */
+    if (edinum != EDstate->nED)
+    {
+        gmx_fatal(FARGS, "The number of essential dynamics / flooding groups is not consistent.\n"
+                  "There are %d ED groups in the .cpt file, but %d in the .edi file!\n"
+                  "Are you sure this is the correct .edi file?\n", EDstate->nED, edinum);
+    }
+}
+
+
+/* The edsamstate struct stores the information we need to make the ED group
+ * whole again after restarts from a checkpoint file. Here we do the following:
+ * a) If we did not start from .cpt, we prepare the struct for proper .cpt writing,
+ * b) if we did start from .cpt, we copy over the last whole structures from .cpt,
+ * c) in any case, for subsequent checkpoint writing, we set the pointers in
+ * edsamstate to the x_old arrays, which contain the correct PBC representation of
+ * all ED structures at the last time step.
+ * Requires EDstate->nED to be set already (done in ed_open()). */
+static void init_edsamstate(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+    int i, nr_edi;
+    t_edpar *edi;
+
+
+    /* One pointer per ED group to its last whole reference/average positions */
+    snew(EDstate->old_sref_p, EDstate->nED);
+    snew(EDstate->old_sav_p , EDstate->nED);
+
+    /* If we did not read in a .cpt file, these arrays are not yet allocated */
+    if (!EDstate->bFromCpt)
+    {
+        snew(EDstate->nref, EDstate->nED);
+        snew(EDstate->nav , EDstate->nED);
+    }
+
+    /* Loop over all ED/flooding data sets (usually only one, though) */
+    edi = ed->edpar;
+    for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
+    {
+        /* We always need the last reference and average positions such that
+         * in the next time step we can make the ED group whole again
+         * if the atoms do not have the correct PBC representation */
+        if (EDstate->bFromCpt)
+        {
+            /* Copy the last whole positions of reference and average group from .cpt */
+            for (i=0; i<edi->sref.nr; i++)
+            {
+                copy_rvec(EDstate->old_sref[nr_edi-1][i], edi->sref.x_old[i]);
+            }
+            for (i=0; i<edi->sav.nr ; i++)
+            {
+                copy_rvec(EDstate->old_sav [nr_edi-1][i], edi->sav.x_old [i]);
+            }
+        }
+        else
+        {
+            /* Fresh start: record the group sizes for later .cpt writing */
+            EDstate->nref[nr_edi-1] = edi->sref.nr;
+            EDstate->nav [nr_edi-1] = edi->sav.nr;
+        }
+
+        /* For subsequent checkpoint writing, set the edsamstate pointers to the edi arrays: */
+        EDstate->old_sref_p[nr_edi-1] = edi->sref.x_old;
+        EDstate->old_sav_p [nr_edi-1] = edi->sav.x_old ;
+
+        edi = edi->next_edi;
+    }
+}
+
+
+/* Appends 'buf' to the string '*str', growing the allocation as needed.
+ * *str must be a NUL-terminated string that was allocated with
+ * snew/srenew so that it can be reallocated here. */
+static void add_to_string(char **str, char *buf)
+{
+    int len;
+
+
+    len = strlen(*str) + strlen(buf) + 1;    /* +1 for the terminating NUL */
+    srenew(*str, len);
+    strcat(*str, buf);
+}
+
+
+/* Appends 'buf' to '*str' as a fixed-width column (formatted with EDcol_sfmt)
+ * so that the legend header lines up with the data columns.
+ * NOTE(review): buf_aligned is a fixed STRLEN buffer; this assumes the
+ * formatted entry fits — confirm for very long legend entries. */
+static void add_to_string_aligned(char **str, char *buf)
+{
+    char buf_aligned[STRLEN];
+
+    sprintf(buf_aligned, EDcol_sfmt, buf);
+    add_to_string(str, buf_aligned);
+}
+
+
+/* Registers one output column: appends the aligned column header
+ * "<EDgroupchar> <value>" to LegendStr and stores a strdup'ed
+ * "<EDgroupchar> <value> (<unit>)" as the next xvg set name,
+ * incrementing *nsets. The setname array must be large enough. */
+static void nice_legend(const char ***setname, int *nsets, char **LegendStr, char *value, char *unit, char EDgroupchar)
+{
+    char tmp[STRLEN], tmp2[STRLEN];
+
+
+    sprintf(tmp, "%c %s", EDgroupchar, value);
+    add_to_string_aligned(LegendStr, tmp);
+    sprintf(tmp2, "%s (%s)", tmp, unit);
+    (*setname)[*nsets] = strdup(tmp2);
+    (*nsets)++;
+}
+
+
+/* Adds one legend entry per eigenvector of 'evec', labeled
+ * "EV<index>prj<EDtype>" with unit nm (projection onto that vector). */
+static void nice_legend_evec(const char ***setname, int *nsets, char **LegendStr, t_eigvec *evec, char EDgroupChar, const char *EDtype)
+{
+    int i;
+    char tmp[STRLEN];
+
+
+    for (i=0; i<evec->neig; i++)
+    {
+        sprintf(tmp, "EV%dprj%s", evec->ieig[i], EDtype);
+        nice_legend(setname, nsets, LegendStr, tmp, "nm", EDgroupChar);
+    }
+}
+
+
+/* Makes a legend for the xvg output file. Call on MASTER only! */
+static void write_edo_legend(gmx_edsam_t ed, int nED, const output_env_t oenv)
+{
+ t_edpar *edi = NULL;
+ int i;
+ int nr_edi, nsets, n_flood, n_edsam;
+ const char **setname;
+ char buf[STRLEN];
+ char *LegendStr=NULL;
+
+
+ edi = ed->edpar;
+
+ fprintf(ed->edo, "# Output will be written every %d step%s\n", ed->edpar->outfrq, ed->edpar->outfrq != 1 ? "s":"");
+
+ for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+ {
+ fprintf(ed->edo, "#\n");
+ fprintf(ed->edo, "# Summary of applied con/restraints for the ED group %c\n", get_EDgroupChar(nr_edi, nED));
+ fprintf(ed->edo, "# Atoms in average structure: %d\n", edi->sav.nr);
+ fprintf(ed->edo, "# monitor : %d vec%s\n" , edi->vecs.mon.neig , edi->vecs.mon.neig != 1 ? "s":"");
+ fprintf(ed->edo, "# LINFIX : %d vec%s\n" , edi->vecs.linfix.neig, edi->vecs.linfix.neig != 1 ? "s":"");
+ fprintf(ed->edo, "# LINACC : %d vec%s\n" , edi->vecs.linacc.neig, edi->vecs.linacc.neig != 1 ? "s":"");
+ fprintf(ed->edo, "# RADFIX : %d vec%s\n" , edi->vecs.radfix.neig, edi->vecs.radfix.neig != 1 ? "s":"");
+ fprintf(ed->edo, "# RADACC : %d vec%s\n" , edi->vecs.radacc.neig, edi->vecs.radacc.neig != 1 ? "s":"");
+ fprintf(ed->edo, "# RADCON : %d vec%s\n" , edi->vecs.radcon.neig, edi->vecs.radcon.neig != 1 ? "s":"");
+ fprintf(ed->edo, "# FLOODING : %d vec%s " , edi->flood.vecs.neig , edi->flood.vecs.neig != 1 ? "s":"");
+
+ if (edi->flood.vecs.neig)
+ {
+ /* If in any of the groups we find a flooding vector, flooding is turned on */
+ ed->eEDtype = eEDflood;
+
+ /* Print what flavor of flooding we will do */
+ if (0 == edi->flood.tau) /* constant flooding strength */
+ {
+ fprintf(ed->edo, "Efl_null = %g", edi->flood.constEfl);
+ if (edi->flood.bHarmonic)
+ {
+ fprintf(ed->edo, ", harmonic");
+ }
+ }
+ else /* adaptive flooding */
+ {
+ fprintf(ed->edo, ", adaptive");
+ }
+ }
+ fprintf(ed->edo, "\n");
+
+ edi = edi->next_edi;
+ }
+
+ /* Print a nice legend */
+ snew(LegendStr, 1);
+ LegendStr[0] = '\0';
+ sprintf(buf, "# %6s", "time");
+ add_to_string(&LegendStr, buf);
+
+ /* Calculate the maximum number of columns we could end up with */
+ edi = ed->edpar;
+ nsets = 0;
+ for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+ {
+ nsets += 5 +edi->vecs.mon.neig
+ +edi->vecs.linfix.neig
+ +edi->vecs.linacc.neig
+ +edi->vecs.radfix.neig
+ +edi->vecs.radacc.neig
+ +edi->vecs.radcon.neig
+ + 6*edi->flood.vecs.neig;
+ edi = edi->next_edi;
+ }
+ snew(setname, nsets);
+
+ /* Within each mdrun time step, the flooding forces are calculated in a
+ * first function call (do_flood()) and the ED constraints are applied
+ * in a second one (do_edsam()). To get a corresponding legend, we loop
+ * twice over the edi groups and output first the flooding, then the ED part */
+
+ /* The flooding-related legend entries, if flooding is done */
+ nsets = 0;
+ if (eEDflood == ed->eEDtype)
+ {
+ edi = ed->edpar;
+ for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+ {
+ /* Always write out the projection on the flooding EVs. Of course, this can also
+ * be achieved with the monitoring option in do_edsam() (if switched on by the
+ * user), but in that case the positions need to be communicated in do_edsam(),
+ * which is not necessary when doing flooding only. */
+ nice_legend(&setname, &nsets, &LegendStr, "RMSD to ref", "nm", get_EDgroupChar(nr_edi, nED) );
+
+ for (i=0; i<edi->flood.vecs.neig; i++)
+ {
+ sprintf(buf, "EV%dprjFLOOD", edi->flood.vecs.ieig[i]);
+ nice_legend(&setname, &nsets, &LegendStr, buf, "nm", get_EDgroupChar(nr_edi, nED));
+
+ /* Output the current reference projection if it changes with time;
+ * this can happen when flooding is used as harmonic restraint */
+ if (edi->flood.bHarmonic && edi->flood.vecs.refprojslope[i] != 0.0)
+ {
+ sprintf(buf, "EV%d ref.prj.", edi->flood.vecs.ieig[i]);
+ nice_legend(&setname, &nsets, &LegendStr, buf, "nm", get_EDgroupChar(nr_edi, nED));
+ }
+
+ /* For flooding we also output Efl, Vfl, deltaF, and the flooding forces */
+ if (0 != edi->flood.tau) /* only output Efl for adaptive flooding (constant otherwise) */
+ {
+ sprintf(buf, "EV%d-Efl", edi->flood.vecs.ieig[i]);
+ nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol", get_EDgroupChar(nr_edi, nED));
+ }
+
+ sprintf(buf, "EV%d-Vfl", edi->flood.vecs.ieig[i]);
+ nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol", get_EDgroupChar(nr_edi, nED));
+
+ if (0 != edi->flood.tau) /* only output deltaF for adaptive flooding (zero otherwise) */
+ {
+ sprintf(buf, "EV%d-deltaF", edi->flood.vecs.ieig[i]);
+ nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol", get_EDgroupChar(nr_edi, nED));
+ }
+
+ sprintf(buf, "EV%d-FLforces", edi->flood.vecs.ieig[i]);
+ nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol/nm", get_EDgroupChar(nr_edi, nED));
+ }
+
+ edi = edi->next_edi;
+ } /* End of flooding-related legend entries */
+ }
+ n_flood = nsets;
+
+ /* Now the ED-related entries, if essential dynamics is done */
+ edi = ed->edpar;
+ for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+ {
+ nice_legend(&setname, &nsets, &LegendStr, "RMSD to ref", "nm", get_EDgroupChar(nr_edi, nED) );
+
+ /* Essential dynamics, projections on eigenvectors */
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.mon , get_EDgroupChar(nr_edi, nED), "MON" );
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linfix, get_EDgroupChar(nr_edi, nED), "LINFIX");
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linacc, get_EDgroupChar(nr_edi, nED), "LINACC");
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radfix, get_EDgroupChar(nr_edi, nED), "RADFIX");
+ if (edi->vecs.radfix.neig)
+ {
+ nice_legend(&setname, &nsets, &LegendStr, "RADFIX radius", "nm", get_EDgroupChar(nr_edi, nED));
+ }
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radacc, get_EDgroupChar(nr_edi, nED), "RADACC");
+ if (edi->vecs.radacc.neig)
+ {
+ nice_legend(&setname, &nsets, &LegendStr, "RADACC radius", "nm", get_EDgroupChar(nr_edi, nED));
+ }
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radcon, get_EDgroupChar(nr_edi, nED), "RADCON");
+ if (edi->vecs.radcon.neig)
+ {
+ nice_legend(&setname, &nsets, &LegendStr, "RADCON radius", "nm", get_EDgroupChar(nr_edi, nED));
+ }
+
+ edi = edi->next_edi;
+ } /* end of 'pure' essential dynamics legend entries */
+ n_edsam = nsets - n_flood;
+
+ xvgr_legend(ed->edo, nsets, setname, oenv);
+ sfree(setname);
+
+ fprintf(ed->edo, "#\n"
+ "# Legend for %d column%s of flooding plus %d column%s of essential dynamics data:\n",
+ n_flood, 1 == n_flood ? "":"s",
+ n_edsam, 1 == n_edsam ? "":"s");
+ fprintf(ed->edo, "%s", LegendStr);
+ sfree(LegendStr);
+
+ fflush(ed->edo);
+}
+
+
void init_edsam(gmx_mtop_t *mtop, /* global topology */
t_inputrec *ir, /* input record */
t_commrec *cr, /* communication record */
gmx_edsam_t ed, /* contains all ED data */
rvec x[], /* positions of the whole MD system */
- matrix box) /* the box */
+ matrix box, /* the box */
+ edsamstate_t *EDstate)
{
t_edpar *edi = NULL; /* points to a single edi data set */
- int numedis=0; /* keep track of the number of ED data sets in edi file */
int i,nr_edi,avindex;
rvec *x_pbc = NULL; /* positions of the whole MD system with pbc removed */
- rvec *xfit = NULL; /* the positions which will be fitted to the reference structure */
- rvec *xstart = NULL; /* the positions which are subject to ED sampling */
+ rvec *xfit=NULL, *xstart=NULL; /* dummy arrays to determine initial RMSDs */
rvec fit_transvec; /* translation ... */
matrix fit_rotmat; /* ... and rotation from fit to reference structure */
if (!DOMAINDECOMP(cr) && PAR(cr) && MASTER(cr))
+ {
gmx_fatal(FARGS, "Please switch on domain decomposition to use essential dynamics in parallel.");
+ }
if (MASTER(cr))
+ {
fprintf(stderr, "ED: Initializing essential dynamics constraints.\n");
+ if (NULL == ed)
+ {
+ gmx_fatal(FARGS, "The checkpoint file you provided is from an essential dynamics or\n"
+ "flooding simulation. Please also provide the correct .edi file with -ei.\n");
+ }
+ }
+
/* Needed for initializing radacc radius in do_edsam */
- ed->bFirst = 1;
+ ed->bFirst = TRUE;
/* The input file is read by the master and the edi structures are
* initialized here. Input is stored in ed->edpar. Then the edi
* structures are transferred to the other nodes */
if (MASTER(cr))
{
- snew(ed->edpar,1);
- /* Read the whole edi file at once: */
- read_edi_file(ed,ed->edpar,mtop->natoms,cr);
-
- /* Initialization for every ED/flooding dataset. Flooding uses one edi dataset per
+ /* Initialization for every ED/flooding group. Flooding uses one edi group per
* flooding vector, Essential dynamics can be applied to more than one structure
* as well, but will be done in the order given in the edi file, so
* expect different results for different order of edi file concatenation! */
edi=ed->edpar;
while(edi != NULL)
{
- init_edi(mtop,ir,cr,ed,edi);
-
- /* Init flooding parameters if needed */
- init_flood(edi,ed,ir->delta_t,cr);
-
+ init_edi(mtop,edi);
+ init_flood(edi,ed,ir->delta_t);
edi=edi->next_edi;
- numedis++;
}
}
/* Reset pointer to first ED data set which contains the actual ED data */
edi=ed->edpar;
-
/* Loop over all ED/flooding data sets (usually only one, though) */
- for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+ for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
{
- /* We use srenew to allocate memory since the size of the buffers
- * is likely to change with every ED dataset */
- srenew(xfit , edi->sref.nr );
- srenew(xstart, edi->sav.nr );
-
- /* Extract the positions of the atoms to which will be fitted */
- for (i=0; i < edi->sref.nr; i++)
+ /* For multiple ED groups we use the output frequency that was specified
+ * in the first set */
+ if (nr_edi > 1)
{
- copy_rvec(x_pbc[edi->sref.anrs[i]], xfit[i]);
-
- /* Save the sref positions such that in the next time step we can make the ED group whole
- * in case any of the atoms do not have the correct PBC representation */
- copy_rvec(xfit[i], edi->sref.x_old[i]);
+ edi->outfrq = ed->edpar->outfrq;
}
- /* Extract the positions of the atoms subject to ED sampling */
- for (i=0; i < edi->sav.nr; i++)
+ /* Extract the initial reference and average positions. When starting
+ * from .cpt, these have already been read into sref.x_old
+ * in init_edsamstate() */
+ if (!EDstate->bFromCpt)
{
- copy_rvec(x_pbc[edi->sav.anrs[i]], xstart[i]);
+ /* If this is the first run (i.e. no checkpoint present) we assume
+ * that the starting positions give us the correct PBC representation */
+ for (i=0; i < edi->sref.nr; i++)
+ {
+ copy_rvec(x_pbc[edi->sref.anrs[i]], edi->sref.x_old[i]);
+ }
- /* Save the sav positions such that in the next time step we can make the ED group whole
- * in case any of the atoms do not have the correct PBC representation */
- copy_rvec(xstart[i], edi->sav.x_old[i]);
+ for (i=0; i < edi->sav.nr; i++)
+ {
+ copy_rvec(x_pbc[edi->sav.anrs[i]], edi->sav.x_old[i]);
+ }
}
+ /* Now we have the PBC-correct start positions of the reference and
+ average structure. We copy that over to dummy arrays on which we
+ can apply fitting to print out the RMSD. We srenew the memory since
+ the size of the buffers is likely different for every ED group */
+ srenew(xfit , edi->sref.nr );
+ srenew(xstart, edi->sav.nr );
+ copy_rvecn(edi->sref.x_old, xfit, 0, edi->sref.nr);
+ copy_rvecn(edi->sav.x_old, xstart, 0, edi->sav.nr);
+
/* Make the fit to the REFERENCE structure, get translation and rotation */
fit_to_reference(xfit, fit_transvec, fit_rotmat, edi);
/* Output how well we fit to the reference at the start */
translate_and_rotate(xfit, edi->sref.nr, fit_transvec, fit_rotmat);
- fprintf(stderr, "ED: Initial RMSD from reference after fit = %f nm (dataset #%d)\n",
- rmsd_from_structure(xfit, &edi->sref), nr_edi);
+ fprintf(stderr, "ED: Initial RMSD from reference after fit = %f nm",
+ rmsd_from_structure(xfit, &edi->sref));
+ if (EDstate->nED > 1)
+ {
+ fprintf(stderr, " (ED group %c)", get_EDgroupChar(nr_edi, EDstate->nED));
+ }
+ fprintf(stderr, "\n");
/* Now apply the translation and rotation to the atoms on which ED sampling will be performed */
translate_and_rotate(xstart, edi->sav.nr, fit_transvec, fit_rotmat);
* the average structure, which must be projected */
avindex = edi->star.nr - edi->sav.nr;
}
- rad_project(edi, &edi->star.x[avindex], &edi->vecs.radcon, cr);
- } else
- rad_project(edi, xstart, &edi->vecs.radcon, cr);
+ rad_project(edi, &edi->star.x[avindex], &edi->vecs.radcon);
+ }
+ else
+ {
+ rad_project(edi, xstart, &edi->vecs.radcon);
+ }
/* process structure that will serve as origin of expansion circle */
if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
+ {
fprintf(stderr, "ED: Setting center of flooding potential (0 = average structure)\n");
+ }
if (edi->sori.nr > 0)
{
avindex = edi->sori.nr - edi->sav.nr;
}
- rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radacc, cr);
- rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radfix, cr);
+ rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radacc);
+ rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radfix);
if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
{
fprintf(stderr, "ED: The ORIGIN structure will define the flooding potential center.\n");
/* Set center of flooding potential to the ORIGIN structure */
- rad_project(edi, &edi->sori.x[avindex], &edi->flood.vecs, cr);
+ rad_project(edi, &edi->sori.x[avindex], &edi->flood.vecs);
/* We already know that no (moving) reference position was provided,
* therefore we can overwrite refproj[0]*/
copyEvecReference(&edi->flood.vecs);
}
else /* No origin structure given */
{
- rad_project(edi, xstart, &edi->vecs.radacc, cr);
- rad_project(edi, xstart, &edi->vecs.radfix, cr);
+ rad_project(edi, xstart, &edi->vecs.radacc);
+ rad_project(edi, xstart, &edi->vecs.radfix);
if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
{
if (edi->flood.bHarmonic)
{
fprintf(stderr, "ED: A (possibly changing) ref. projection will define the flooding potential center.\n");
for (i=0; i<edi->flood.vecs.neig; i++)
+ {
edi->flood.vecs.refproj[i] = edi->flood.vecs.refproj0[i];
+ }
}
else
{
/* Set center of flooding potential to the center of the covariance matrix,
* i.e. the average structure, i.e. zero in the projected system */
for (i=0; i<edi->flood.vecs.neig; i++)
+ {
edi->flood.vecs.refproj[i] = 0.0;
+ }
}
}
}
{
for (i=0; i<edi->flood.vecs.neig; i++)
{
- fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", i, edi->flood.vecs.refproj[i]);
+ fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", edi->flood.vecs.ieig[i], edi->flood.vecs.refproj[i]);
if (edi->flood.bHarmonic)
+ {
fprintf(stdout, " (adding %11.4e/timestep)", edi->flood.vecs.refprojslope[i]);
+ }
fprintf(stdout, "\n");
}
}
/* set starting projections for linsam */
- rad_project(edi, xstart, &edi->vecs.linacc, cr);
- rad_project(edi, xstart, &edi->vecs.linfix, cr);
-
- /* Output to file, set the step to -1 so that write_edo knows it was called from init_edsam */
- if (ed->edo && !(ed->bStartFromCpt))
- write_edo(nr_edi, edi, ed, -1, 0);
+ rad_project(edi, xstart, &edi->vecs.linacc);
+ rad_project(edi, xstart, &edi->vecs.linfix);
/* Prepare for the next edi data set: */
edi=edi->next_edi;
if (PAR(cr))
{
/* First let everybody know how many ED data sets to expect */
- gmx_bcast(sizeof(numedis), &numedis, cr);
+ gmx_bcast(sizeof(EDstate->nED), &EDstate->nED, cr);
/* Broadcast the essential dynamics / flooding data to all nodes */
- broadcast_ed_data(cr, ed, numedis);
+ broadcast_ed_data(cr, ed, EDstate->nED);
}
else
{
/* Loop over all ED data sets (usually only one, though) */
edi=ed->edpar;
- for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+ for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
{
edi->sref.anrs_loc = edi->sref.anrs;
edi->sav.anrs_loc = edi->sav.anrs;
snew(edi->sav.c_ind, edi->sav.nr);
/* Initialize the array */
for (i=0; i<edi->sav.nr; i++)
+ {
edi->sav.c_ind[i] = i;
+ }
/* In the general case we will need a different-sized array for the reference indices: */
if (!edi->bRefEqAv)
{
snew(edi->sref.c_ind, edi->sref.nr);
for (i=0; i<edi->sref.nr; i++)
+ {
edi->sref.c_ind[i] = i;
+ }
}
/* Point to the very same array in case of other structures: */
edi->star.c_ind = edi->sav.c_ind;
edi->star.nr_loc = edi->star.nr;
edi->sori.nr_loc = edi->sori.nr;
- /* An on we go to the next edi dataset */
+ /* And on we go to the next ED group */
edi=edi->next_edi;
}
}
/* Allocate space for ED buffer variables */
/* Again, loop over ED data sets */
edi=ed->edpar;
- for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+ for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
{
/* Allocate space for ED buffer */
snew(edi->buf, 1);
dump_edi(edi, cr, nr_edi);
#endif
- /* An on we go to the next edi dataset */
+ /* Next ED group */
edi=edi->next_edi;
}
/* Flush the edo file so that the user can check some things
* when the simulation has started */
if (ed->edo)
+ {
fflush(ed->edo);
+ }
}
void do_edsam(t_inputrec *ir,
gmx_large_int_t step,
- t_mdatoms *md,
t_commrec *cr,
rvec xs[], /* The local current positions on this processor */
rvec v[], /* The velocities */
struct t_do_edsam *buf;
t_edpar *edi;
real rmsdev=-1; /* RMSD from reference structure prior to applying the constraints */
- gmx_bool bSuppress=FALSE; /* Write .edo file on master? */
+ gmx_bool bSuppress=FALSE; /* Write .xvg output file on master? */
/* Check if ED sampling has to be performed */
if ( ed->eEDtype==eEDnone )
+ {
return;
+ }
/* Suppress output on first call of do_edsam if
* two-step sd2 integrator is used */
if ( (ir->eI==eiSD2) && (v != NULL) )
+ {
bSuppress = TRUE;
+ }
dt_1 = 1.0/ir->delta_t;
- /* Loop over all ED datasets (usually one) */
+ /* Loop over all ED groups (usually one) */
edi = ed->edpar;
edinr = 0;
while (edi != NULL)
buf=edi->buf->do_edsam;
if (ed->bFirst)
+ {
/* initialise radacc radius for slope criterion */
buf->oldrad=calc_radius(&edi->vecs.radacc);
+ }
/* Copy the positions into buf->xc* arrays and after ED
* feed back corrections to the official positions */
communicate_group_positions(cr, buf->xcoll, buf->shifts_xcoll, buf->extra_shifts_xcoll, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
edi->sav.nr, edi->sav.nr_loc, edi->sav.anrs_loc, edi->sav.c_ind, edi->sav.x_old, box);
-#ifdef DEBUG_ED
- dump_xcoll(edi, buf, cr, step);
-#endif
/* Only assembly reference positions if their indices differ from the average ones */
if (!edi->bRefEqAv)
+ {
communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+ }
/* If bUpdateShifts was TRUE then the shifts have just been updated in communicate_group_positions.
* We do not need to update the shifts until the next NS step. Note that dd_make_local_ed_indices
/* Fit the reference indices to the reference structure */
if (edi->bRefEqAv)
+ {
fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+ }
else
+ {
fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+ }
/* Now apply the translation and rotation to the ED structure */
translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
if (do_per_step(step,edi->maxedsteps) && step >= edi->presteps)
{
project(buf->xcoll, edi);
- rad_project(edi, buf->xcoll, &edi->vecs.radacc, cr);
- rad_project(edi, buf->xcoll, &edi->vecs.radfix, cr);
+ rad_project(edi, buf->xcoll, &edi->vecs.radacc);
+ rad_project(edi, buf->xcoll, &edi->vecs.radfix);
buf->oldrad=-1.e5;
}
if (edi->vecs.radacc.radius - buf->oldrad < edi->slope)
{
project(buf->xcoll, edi);
- rad_project(edi, buf->xcoll, &edi->vecs.radacc, cr);
+ rad_project(edi, buf->xcoll, &edi->vecs.radacc);
buf->oldrad = 0.0;
- } else
+ }
+ else
+ {
buf->oldrad = edi->vecs.radacc.radius;
+ }
}
/* apply the constraints */
{
/* ED constraints should be applied already in the first MD step
* (which is step 0), therefore we pass step+1 to the routine */
- ed_apply_constraints(buf->xcoll, edi, step+1 - ir->init_step, cr);
+ ed_apply_constraints(buf->xcoll, edi, step+1 - ir->init_step);
}
/* write to edo, when required */
{
project(buf->xcoll, edi);
if (MASTER(cr) && !bSuppress)
- write_edo(edinr, edi, ed, step, rmsdev);
+ {
+ write_edo(edi, ed->edo, rmsdev);
+ }
}
/* Copy back the positions unless monitoring only */
}
} /* END of if (edi->bNeedDoEdsam) */
- /* Prepare for the next ED dataset */
+ /* Prepare for the next ED group */
edi = edi->next_edi;
- } /* END of loop over ED datasets */
+ } /* END of loop over ED groups */
ed->bFirst = FALSE;
}
#include "gmxcomplex.h"
#include "gmx_fft.h"
-#ifndef GMX_LIB_MPI
+#ifndef GMX_MPI
double MPI_Wtime();
#endif
nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
nbl->table_elec.ninteractions = 1;
nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
- snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),16);
+ snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),32);
nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
nbl->table_vdw.format = nbl->table_elec_vdw.format;
nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
nbl->table_vdw.ninteractions = 2;
nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
- snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),16);
+ snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),32);
for(i=0; i<=nbl->table_elec_vdw.n; i++)
{
#endif
if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
{
-#ifdef GMX_NBNXN_SIMD_2XNN
+#ifdef GMX_NBNXN_SIMD_4XN
*kernel_type = nbnxnk4xN_SIMD_4xN;
#else
gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
}
-/* Note that _mm_... intrinsics can be converted to either SSE or AVX
- * depending on compiler flags.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
-static const char *nbk_name[] =
- { "not set", "plain C 4x4",
-#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
+const char *lookup_nbnxn_kernel_name(int kernel_type)
+{
+ const char *returnvalue = NULL;
+ switch(kernel_type)
+ {
+ case nbnxnkNotSet: returnvalue = "not set"; break;
+ case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
+#ifndef GMX_NBNXN_SIMD
+ case nbnxnk4xN_SIMD_4xN: returnvalue = "not available"; break;
+ case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
+#else
+#ifdef GMX_X86_SSE2
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+ /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
+ * on compiler flags. As we use nearly identical intrinsics, using an AVX
+ * compiler flag without an AVX macro effectively results in AVX kernels.
+ * For gcc we check for __AVX__
+ * At least a check for icc should be added (if there is a macro)
+ */
+#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
#ifndef GMX_X86_SSE4_1
-#ifndef GMX_DOUBLE
- "SSE2 4x4",
+ case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE2"; break;
+ case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
#else
- "SSE2 4x2",
+ case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE4.1"; break;
+ case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
#endif
#else
-#ifndef GMX_DOUBLE
- "SSE4.1 4x4",
-#else
- "SSE4.1 4x2",
+ case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-128"; break;
+ case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
#endif
#endif
-#else
-#ifndef GMX_DOUBLE
- "AVX-128 4x4",
-#else
- "AVX-128 4x2",
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+ case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-256"; break;
+ case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
#endif
+#else /* not GMX_X86_SSE2 */
+ case nbnxnk4xN_SIMD_4xN: returnvalue = "SIMD"; break;
+ case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
#endif
-#ifndef GMX_DOUBLE
- "AVX-256 4x8",
-#else
- "AVX-256 4x4",
#endif
- "CUDA 8x8x8", "plain C 8x8x8" };
+ case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
+ case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
+
+ case nbnxnkNR:
+ default:
+ gmx_fatal(FARGS, "Illegal kernel type selected");
+ returnvalue = NULL;
+ break;
+ }
+ return returnvalue;
+};
static void pick_nbnxn_kernel(FILE *fp,
const t_commrec *cr,
if (bDoNonbonded && fp != NULL)
{
fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
- nbnxn_kernel_name[*kernel_type],
+ lookup_nbnxn_kernel_name(*kernel_type),
nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
nbnxn_kernel_to_cj_size(*kernel_type));
}
sfree_aligned(ic->tabq_coul_V);
/* Create the original table data in FDV0 */
- snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16);
- snew_aligned(ic->tabq_coul_F,ic->tabq_size,16);
- snew_aligned(ic->tabq_coul_V,ic->tabq_size,16);
+ snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,32);
+ snew_aligned(ic->tabq_coul_F,ic->tabq_size,32);
+ snew_aligned(ic->tabq_coul_V,ic->tabq_size,32);
table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,ic->tabq_coul_FDV0,
ic->tabq_size,1/ic->tabq_scale,ic->ewaldcoeff);
}
snew(ic, 1);
/* Just allocate something so we can free it */
- snew_aligned(ic->tabq_coul_FDV0,16,16);
- snew_aligned(ic->tabq_coul_F,16,16);
- snew_aligned(ic->tabq_coul_V,16,16);
+ snew_aligned(ic->tabq_coul_FDV0,16,32);
+ snew_aligned(ic->tabq_coul_F,16,32);
+ snew_aligned(ic->tabq_coul_V,16,32);
ic->rlist = fr->rlist;
ic->rlistlong = fr->rlistlong;
fprintf(fplog, "%s\n", hline);
gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
- fprintf(fplog, "\n Force evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
+ fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
gpu_cpu_ratio);
* but we currently can't check that here.
*/
md_print_warn(NULL,fplog,
- "NOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
+ "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
" performance loss. Maybe the domain decomposition limits the PME tuning.\n"
- " In that case, try setting the DD grid manually (-dd) or lowering -dds.\n");
+ " In that case, try setting the DD grid manually (-dd) or lowering -dds.");
}
else
{
* too small for increasing the cut-off for PME tuning.
*/
md_print_warn(NULL,fplog,
- "NOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
- " performance loss.\n");
+ "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
+ " performance loss.");
}
}
if (gpu_cpu_ratio > 1.2)
{
md_print_warn(NULL,fplog,
- "NOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
- " performance loss, consider using a shorter cut-off and a finer PME grid.\n");
+ "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
+ " performance loss, consider using a shorter cut-off and a finer PME grid.");
}
}
}
snew(state->cg_p,state->nalloc);
}
}
- if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE) {
+ if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc)) {
state->nrng = gmx_rng_n();
state->nrngi = 1;
- if (EI_SD(ir->eI) || ir->eI == eiBD) {
+ if (EI_SD(ir->eI) || ir->eI == eiBD || ETC_ANDERSEN(ir->etc)) {
/* This will be correct later with DD */
state->nrng *= nnodes;
state->nrngi *= nnodes;
{
bcast_ir_mtop(cr,inputrec,mtop);
- if (inputrec->eI == eiBD || EI_SD(inputrec->eI)) {
+ if (inputrec->eI == eiBD || EI_SD(inputrec->eI) || ETC_ANDERSEN(inputrec->etc)) {
/* Make sure the random seeds are different on each node */
inputrec->ld_seed += cr->nodeid;
}
#include "nbnxn_atomdata.h"
#include "gmx_omp_nthreads.h"
-/* Default nbnxn allocation routine, allocates 32 byte aligned,
- * which works for plain C and aligned SSE and AVX loads/stores.
- */
+/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
void nbnxn_alloc_aligned(void **ptr,size_t nbytes)
{
- *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,32);
+ *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,NBNXN_MEM_ALIGN);
}
/* Free function for memory allocated with nbnxn_alloc_aligned */
nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
nbat->x = NULL;
+
+#ifdef GMX_NBNXN_SIMD
+ if (simple)
+ {
+ /* Set the diagonal cluster pair exclusion mask setup data.
+ * In the kernel we check 0 < j - i to generate the masks.
+ * Here we store j - i for generating the mask for the first i,
+ * we subtract 0.5 to avoid rounding issues.
+ * In the kernel we can subtract 1 to generate the subsequent mask.
+ */
+ const int simd_width=GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+ int simd_4xn_diag_size,j;
+
+ simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE,simd_width);
+ snew_aligned(nbat->simd_4xn_diag,simd_4xn_diag_size,NBNXN_MEM_ALIGN);
+ for(j=0; j<simd_4xn_diag_size; j++)
+ {
+ nbat->simd_4xn_diag[j] = j - 0.5;
+ }
+
+ snew_aligned(nbat->simd_2xnn_diag,simd_width,NBNXN_MEM_ALIGN);
+ for(j=0; j<simd_width/2; j++)
+ {
+ /* The j-cluster size is half the SIMD width */
+ nbat->simd_2xnn_diag[j] = j - 0.5;
+ /* The next half of the SIMD width is for i + 1 */
+ nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
+ }
+ }
+#endif
+
+ /* Initialize the output data structures */
nbat->nout = nout;
snew(nbat->out,nbat->nout);
nbat->nalloc = 0;
}
static void
-nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
- gmx_bool bDestSet,
- real ** gmx_restrict src,
- int nsrc,
- int i0, int i1)
+nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
+ gmx_bool bDestSet,
+ real ** gmx_restrict src,
+ int nsrc,
+ int i0, int i1)
{
-#ifdef NBNXN_SEARCH_SSE
-/* We can use AVX256 here, but not when AVX128 kernels are selected.
- * As this reduction is not faster with AVX256 anyway, we use 128-bit SIMD.
+#ifdef GMX_NBNXN_SIMD
+/* The SIMD width here is actually independent of that in the kernels,
+ * but we use the same width for simplicity (usually optimal anyhow).
*/
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
#define GMX_MM128_HERE
#endif
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#endif
#include "gmx_simd_macros.h"
int i,s;
}
if (nfptr > 0)
{
-#ifdef NBNXN_SEARCH_SSE
- nbnxn_atomdata_reduce_reals_x86_simd
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_atomdata_reduce_reals_simd
#else
nbnxn_atomdata_reduce_reals
#endif
/* With GPU kernels the cluster size is 8 atoms */
#define NBNXN_GPU_CLUSTER_SIZE 8
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage */
-#define NBNXN_GPU_JGROUP_SIZE 4
+/* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
+ * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ */
+#define NBNXN_GPU_JGROUP_SIZE 4
+#define NBNXN_GPU_JGROUP_SIZE_2LOG 2
/* To avoid NaN when excluded atoms are at zero distance, we add a small
* number to r^2. NBNXN_AVOID_SING_R2_INC^-3 should fit in real.
#if !defined PRUNE_NBL && __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
#pragma unroll 4
#endif
- for (jm = 0; jm < 4; jm++)
+ for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
{
- if (imask & (255U << (jm * NCL_PER_SUPERCL)))
+ if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
{
mask_ji = (1U << (jm * NCL_PER_SUPERCL));
#if CUDA_VERSION >= 4010
#pragma unroll 4
#endif
- for (jm = 0; jm < 4; jm++)
+ for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
{
- imask_j = (imask >> (jm * 8)) & 255U;
+ imask_j = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
if (imask_j)
{
nsubi = __popc(imask_j);
#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE (CL_SIZE_SQ)
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+
/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
* Original idea: OpenMM
*/
#ifdef GMX_X86_SSE2
-#define NBNXN_SEARCH_SSE
+/* Always use 4-way, single-precision SIMD for bounding box calculations */
+#define NBNXN_SEARCH_BB_SSE
+#endif
+
+
+#ifdef GMX_NBNXN_SIMD
+/* Memory alignment in bytes as required by SIMD aligned loads/stores */
+#define NBNXN_MEM_ALIGN (GMX_NBNXN_SIMD_BITWIDTH/8)
+#else
+/* No alignment required, but set it so we can call the same routines */
+#define NBNXN_MEM_ALIGN 32
#endif
excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
- for(jm=0; jm<4; jm++)
+ for(jm=0; jm<NBNXN_GPU_JGROUP_SIZE; jm++)
{
cj = nbl->cj4[cj4_ind].cj[jm];
real *nbfp_i;
int n,ci,ci_sh;
int ish,ishf;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int cjind0,cjind1,cjind;
int ip,jp;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef CALC_ENERGIES
#ifndef ENERGY_GROUPS
}
}
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
#ifdef CALC_ENERGIES
real Vc_sub_self;
#ifdef CALC_LJ
#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+ load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE0,c12_SSE0);
#ifndef HALF_LJ
- load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+ load_lj_pair_params2(nbfp2,nbfp3,type,aj,c6_SSE2,c12_SSE2);
#endif
#endif /* not defined any LJ rule */
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+/* GMX_MM256_HERE should be set before including this file */
#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE 4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#if defined GMX_MM256_HERE
#define STRIDE 4
#endif
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
-#else
-/* SSE double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
-#endif
-#endif
-
#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 2x(4+4) kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
-/* AVX double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+#error "unsupported kernel configuration"
#endif
#endif
int nbfp_stride;
int n,ci,ci_sh;
int ish,ish3;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int sci,scix,sciy,sciz,sci2;
int cjind0,cjind1,cjind;
int ip,jp;
gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
- gmx_mm_pr diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- gmx_mm_pr diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
-
-#ifndef GMX_MM256_HERE
- __m128i zeroi_SSE = _mm_setzero_si128();
+ gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_SSE0,diag_SSE2;
+#else
+ gmx_mm_pr diag0_SSE0,diag0_SSE2;
+ gmx_mm_pr diag1_SSE0,diag1_SSE2;
#endif
-#ifdef GMX_X86_SSE4_1
+
gmx_mm_pr zero_SSE = gmx_set1_pr(0);
-#endif
gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
gmx_mm_pr iq_SSE0=gmx_setzero_pr();
const real *tab_coul_V;
#endif
#ifdef GMX_MM256_HERE
- int ti0_array[2*UNROLLJ-1],*ti0;
- int ti2_array[2*UNROLLJ-1],*ti2;
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
#endif
#ifdef CALC_ENERGIES
gmx_mm_pr mhalfsp_SSE;
nbfp_stride = NBFP_STRIDE;
#endif
+ /* Load j-i for the first i */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
+ /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+ diag_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if 2*UNROLLI == UNROLLJ
+ diag0_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+ diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+ diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+ diag0_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+ diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+ diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+ diag1_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+ diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+ diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+ diag1_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+#endif
+#endif
+
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
- /* Generate aligned table pointers */
- ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+ /* Generate aligned table index pointers */
+ ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
#endif
invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
egps_jshift = 2*nbat->neg_2log;
egps_jmask = (1<<egps_jshift) - 1;
egps_jstride = (UNROLLJ>>1)*UNROLLJ;
- /* Major division is over i-particles: divide nVS by 4 for i-stride */
+ /* Major division is over i-particle energy groups, determine the stride */
Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
ish = (nbln->shift & NBNXN_CI_SHIFT);
ish3 = ish*3;
- cjind0 = nbln->cj_ind_start;
- cjind1 = nbln->cj_ind_end;
- /* Currently only works super-cells equal to sub-cells */
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
sci += (ci & 1)*(STRIDE>>1);
#endif
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
egps_i = nbat->energrp[ci];
iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz) ,shZ_SSE);
iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
gmx_mm_pr facel_SSE;
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
-#ifndef GMX_MM256_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
{
/* Load integer interaction mask */
__m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
- /* The is no unequal sse instruction, so we need a not here */
int_SSE0 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask0),zeroi_SSE));
int_SSE1 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask1),zeroi_SSE));
int_SSE2 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask2),zeroi_SSE));
int_SSE3 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask3),zeroi_SSE));
}
-#else
+#endif
+#if defined GMX_X86_SSE2 && defined GMX_MM256_HERE
{
#ifndef GMX_DOUBLE
/* Load integer interaction mask */
#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#define TAB_FDV0
#else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])
#endif
#endif
#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#endif
#endif
int nbfp_stride;
int n,ci,ci_sh;
int ish,ish3;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int sci,scix,sciy,sciz,sci2;
int cjind0,cjind1,cjind;
int ip,jp;
__m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
__m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
__m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
__m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
__m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
#endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
/* AVX: use floating point masks, as there are no integer instructions */
#ifndef GMX_DOUBLE
gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
#endif
#endif
-#ifndef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
- __m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- __m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128 diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
- __m128 diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
+ gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_SSE0,diag_SSE1,diag_SSE2,diag_SSE3;
#else
- __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
- __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
- __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
-#else /* GMX_MM256_HERE */
-#ifndef GMX_DOUBLE
- gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#else
- gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
+ gmx_mm_pr diag0_SSE0,diag0_SSE1,diag0_SSE2,diag0_SSE3;
+ gmx_mm_pr diag1_SSE0,diag1_SSE1,diag1_SSE2,diag1_SSE3;
#endif
-#ifndef GMX_MM256_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
__m128i zeroi_SSE = _mm_setzero_si128();
#endif
-#ifdef GMX_X86_SSE4_1
gmx_mm_pr zero_SSE = gmx_set1_pr(0);
-#endif
gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
gmx_mm_pr iq_SSE0=gmx_setzero_pr();
const real *tab_coul_V;
#endif
#ifdef GMX_MM256_HERE
- int ti0_array[2*UNROLLJ-1],*ti0;
- int ti1_array[2*UNROLLJ-1],*ti1;
- int ti2_array[2*UNROLLJ-1],*ti2;
- int ti3_array[2*UNROLLJ-1],*ti3;
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+ int ti1_array[2*GMX_SIMD_WIDTH_HERE-1],*ti1;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
+ int ti3_array[2*GMX_SIMD_WIDTH_HERE-1],*ti3;
#endif
#ifdef CALC_ENERGIES
gmx_mm_pr mhalfsp_SSE;
nbfp_stride = NBFP_STRIDE;
#endif
+ /* Load j-i for the first i */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+ /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+ diag_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
+ diag0_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+
+#if UNROLLI == 2*UNROLLJ
+ /* Load j-i for the second half of the j-cluster */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+#endif
+
+ diag1_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#endif
+#endif
+
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
- /* Generate aligned table pointers */
- ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+ /* Generate aligned table index pointers */
+ ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
#endif
invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
egps_jshift = 2*nbat->neg_2log;
egps_jmask = (1<<egps_jshift) - 1;
egps_jstride = (UNROLLJ>>1)*UNROLLJ;
- /* Major division is over i-particles: divide nVS by 4 for i-stride */
+ /* Major division is over i-particle energy groups, determine the stride */
Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
ish = (nbln->shift & NBNXN_CI_SHIFT);
ish3 = ish*3;
- cjind0 = nbln->cj_ind_start;
- cjind1 = nbln->cj_ind_end;
- /* Currently only works super-cells equal to sub-cells */
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
sci += (ci & 1)*(STRIDE>>1);
#endif
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
egps_i = nbat->energrp[ci];
iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
iq_SSE0 = gmx_set1_pr(facel*q[sci]);
iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
#ifndef _nbnxn_kernel_sse_utils_h_
#define _nbnxn_kernel_sse_utils_h_
-/* This files contains all functions/macros for the SSE/AVX kernels
- * which have explicit dependencies on the j-size / SIMD-width, which
- * can be 2 (SSE-double), 4 (SSE-single,AVX-double) or 8 (AVX-single).
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
* The functionality which depends on the j-cluster size is:
* LJ-parameter lookup
* force table lookup
* energy group pair energy storage
*/
+#ifdef GMX_X86_SSE2
+
+/* Transpose 2 double precision registers */
#define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1) \
{ \
- out0 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(0,0)); \
- out1 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(1,1)); \
+ out0 = _mm_unpacklo_pd(in0,in1); \
+ out1 = _mm_unpackhi_pd(in0,in1); \
}
#if defined GMX_MM128_HERE || !defined GMX_DOUBLE
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1) \
{ \
__m128 _c01,_c23; \
- _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(1,0,1,0)); \
- _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(1,0,1,0)); \
+ _c01 = _mm_movelh_ps(in0,in1); \
+ _c23 = _mm_movelh_ps(in2,in3); \
out0 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0)); \
out1 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(3,1,3,1)); \
}
#else
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1) \
{ \
__m256d _c01,_c23; \
}
#endif
+/* Collect element 2 of the 4 inputs to out */
#define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0,in1,in2,in3,out) \
{ \
__m128 _c01,_c23; \
#ifndef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out) \
{ \
- _MM_TRANSPOSE4_PS(i_SSE0,i_SSE1,i_SSE2,i_SSE3); \
- i_SSE0 = _mm_add_ps(i_SSE0,i_SSE1); \
- i_SSE2 = _mm_add_ps(i_SSE2,i_SSE3); \
- o_SSE = _mm_add_ps(i_SSE0,i_SSE2); \
+ _MM_TRANSPOSE4_PS(in0,in1,in2,in3); \
+ in0 = _mm_add_ps(in0,in1); \
+ in2 = _mm_add_ps(in2,in3); \
+ out = _mm_add_ps(in0,in2); \
}
#else
-#define GMX_MM_TRANSPOSE_SUM2_PD(i_SSE0,i_SSE1,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM2_PD(in0,in1,out) \
{ \
- GMX_MM_TRANSPOSE2_PD(i_SSE0,i_SSE1); \
- o_SSE = _mm_add_pd(i_SSE0,i_SSE1); \
+ GMX_MM_TRANSPOSE2_PD(in0,in1); \
+ out = _mm_add_pd(in0,in1); \
}
#endif
#else
#ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out) \
{ \
- i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE1); \
- i_SSE2 = _mm256_hadd_ps(i_SSE2,i_SSE3); \
- i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
- o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
+ in0 = _mm256_hadd_ps(in0,in1); \
+ in2 = _mm256_hadd_ps(in2,in3); \
+ in1 = _mm256_hadd_ps(in0,in2); \
+ out = _mm_add_ps(_mm256_castps256_ps128(in1),_mm256_extractf128_ps(in1,1)); \
}
-#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE) \
+/* Sum the elements of the halves of each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4H_PR(in0,in2,out) \
{ \
- i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps()); \
- i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps()); \
- i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
- i_SSE2 = _mm256_permute_ps(i_SSE0,0b10110001); \
- o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+ in0 = _mm256_hadd_ps(in0,_mm256_setzero_ps()); \
+ in2 = _mm256_hadd_ps(in2,_mm256_setzero_ps()); \
+ in0 = _mm256_hadd_ps(in0,in2); \
+ in2 = _mm256_permute_ps(in0,_MM_SHUFFLE(2,3,0,1)); \
+ out = _mm_add_ps(_mm256_castps256_ps128(in0),_mm256_extractf128_ps(in2,1)); \
}
#else
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out) \
{ \
- i_SSE0 = _mm256_hadd_pd(i_SSE0,i_SSE1); \
- i_SSE2 = _mm256_hadd_pd(i_SSE2,i_SSE3); \
- o_SSE = _mm256_add_pd(_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x20),_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x31)); \
+ in0 = _mm256_hadd_pd(in0,in1); \
+ in2 = _mm256_hadd_pd(in2,in3); \
+ out = _mm256_add_pd(_mm256_permute2f128_pd(in0,in2,0x20),_mm256_permute2f128_pd(in0,in2,0x31)); \
}
#endif
#endif
return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
-/* Do 2/4 double precision invsqrt operations.
- * Doing the SSE rsqrt and the first Newton Raphson iteration
+/* Do 2 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
* in single precision gives full double precision accuracy.
- * The speed is more than twice as fast as two gmx_mm_invsqrt_pd calls.
+ * The speed is more than double that of two gmx_mm_invsqrt_pd calls.
*/
-#define GMX_MM128_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1) \
+#define GMX_MM128_INVSQRT2_PD(in0,in1,out0,out1) \
{ \
const __m128d half = _mm_set1_pd(0.5); \
const __m128d three = _mm_set1_pd(3.0); \
- __m128 s_SSE,ir_SSE; \
+ __m128 s,ir; \
__m128d lu0,lu1; \
\
- s_SSE = _mm_movelh_ps(_mm_cvtpd_ps(i_SSE0),_mm_cvtpd_ps(i_SSE1)); \
- ir_SSE = gmx_mm128_invsqrt_ps_single(s_SSE); \
- lu0 = _mm_cvtps_pd(ir_SSE); \
- lu1 = _mm_cvtps_pd(_mm_movehl_ps(ir_SSE,ir_SSE)); \
- o_SSE0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
- o_SSE1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+ s = _mm_movelh_ps(_mm_cvtpd_ps(in0),_mm_cvtpd_ps(in1)); \
+ ir = gmx_mm128_invsqrt_ps_single(s); \
+ lu0 = _mm_cvtps_pd(ir); \
+ lu1 = _mm_cvtps_pd(_mm_movehl_ps(ir,ir)); \
+ out0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),in0)),lu0)); \
+ out1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),in1)),lu1)); \
}
#define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
}
-#define GMX_MM256_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1) \
+/* Do 4 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
+ * in single precision gives full double precision accuracy.
+ */
+#define GMX_MM256_INVSQRT2_PD(in0,in1,out0,out1) \
{ \
const __m256d half = _mm256_set1_pd(0.5); \
const __m256d three = _mm256_set1_pd(3.0); \
- __m256 s_SSE,ir_SSE; \
+ __m256 s,ir; \
__m256d lu0,lu1; \
\
- s_SSE = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(i_SSE0)),_mm256_cvtpd_ps(i_SSE1),1); \
- ir_SSE = gmx_mm256_invsqrt_ps_single(s_SSE); \
- lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir_SSE)); \
- lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir_SSE,1)); \
- o_SSE0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
- o_SSE1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+ s = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(in0)),_mm256_cvtpd_ps(in1),1); \
+ ir = gmx_mm256_invsqrt_ps_single(s); \
+ lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir)); \
+ lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir,1)); \
+ out0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),in0)),lu0)); \
+ out1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),in1)),lu1)); \
}
#define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
}
-#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE) \
+#define load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE,c12_SSE) \
{ \
- __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2]; \
+ __m128 clj_SSE0[UNROLLJ],clj_SSE1[UNROLLJ],c6t_SSE[2],c12t_SSE[2]; \
int p; \
\
- for(p=0; p<2*UNROLLJ; p++) \
+ for(p=0; p<UNROLLJ; p++) \
{ \
/* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE); \
+ clj_SSE0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE); \
} \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+ for(p=0; p<UNROLLJ; p++) \
+ { \
+ /* Here we load 4 aligned floats, but we need just 2 */ \
+ clj_SSE1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE); \
+ } \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE0[0],clj_SSE0[1],clj_SSE0[2],clj_SSE0[3],c6t_SSE[0],c12t_SSE[0]); \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE1[0],clj_SSE1[1],clj_SSE1[2],clj_SSE1[3],c6t_SSE[1],c12t_SSE[1]); \
\
GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE); \
GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
* But AMD CPUs perform significantly worse with gcc than with icc.
* Performance is improved a bit by using the extract function UNROLLJ times,
* instead of doing an _mm_store_si128 for every i-particle.
- * With AVX this significantly deteriorates performance (8 extracts iso 4).
+ * This is only faster when we use FDV0 formatted tables, where we also need
+ * to multiply the index by 4, which can be done by a SIMD bit shift.
+ * With single precision AVX, 8 extracts are much slower than 1 store.
* Because of this, the load_table_f macro always takes the ti parameter,
* but it is only used with AVX.
*/
}
#endif
+#endif /* GMX_X86_SSE2 */
+
#endif /* _nbnxn_kernel_sse_utils_h_ */
#define BBU_Z 6
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
+/* We use SSE or AVX-128bit for bounding box calculations */
#ifndef GMX_DOUBLE
+/* Single precision BBs + coordinates, we can also load coordinates using SSE */
#define NBNXN_SEARCH_SSE_SINGLE
#endif
/* Include basic SSE2 stuff */
#include <emmintrin.h>
-#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
-#define NBNXN_8BB_SSE
+#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+#define NBNXN_PBB_SSE
#endif
/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
* Here AVX-256 turns out to be slightly slower than AVX-128.
*/
-#define STRIDE_8BB 4
-#define STRIDE_8BB_2LOG 2
+#define STRIDE_PBB 4
+#define STRIDE_PBB_2LOG 2
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
#ifdef GMX_NBNXN_SIMD
#define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
/* Size of bounding box corners quadruplet */
-#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_8BB)
+#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_PBB)
#endif
/* We shift the i-particles backward for PBC.
static int set_grid_size_xy(const nbnxn_search_t nbs,
nbnxn_grid_t *grid,
+ int dd_zone,
int n,rvec corner0,rvec corner1,
real atom_density,
int XFormat)
grid->ncy = 1;
}
+ grid->sx = size[XX]/grid->ncx;
+ grid->sy = size[YY]/grid->ncy;
+ grid->inv_sx = 1/grid->sx;
+ grid->inv_sy = 1/grid->sy;
+
+ if (dd_zone > 0)
+ {
+ /* This is a non-home zone, add an extra row of cells
+ * for particles communicated for bonded interactions.
+ * These can be beyond the cut-off. It doesn't matter where
+ * they end up on the grid, but for performance it's better
+ * if they don't end up in cells that can be within cut-off range.
+ */
+ grid->ncx++;
+ grid->ncy++;
+ }
+
/* We need one additional cell entry for particles moved by DD */
if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
{
grid->nc_nalloc = over_alloc_large(nc_max);
srenew(grid->nsubc,grid->nc_nalloc);
srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
-#ifdef NBNXN_8BB_SSE
- bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
+#ifdef NBNXN_PBB_SSE
+ bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
#else
bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
#endif
copy_rvec(corner0,grid->c0);
copy_rvec(corner1,grid->c1);
- grid->sx = size[XX]/grid->ncx;
- grid->sy = size[YY]/grid->ncy;
- grid->inv_sx = 1/grid->sx;
- grid->inv_sy = 1/grid->sy;
return nc_max;
}
-#define SORT_GRID_OVERSIZE 2
+/* We need to sort particles in grid columns on z-coordinate.
+ * As particles are very often distributed homogeneously, we use a sorting
+ * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ * by a factor, cast to an int and try to store in that hole. If the hole
+ * is full, we move this or another particle. A second pass is needed to make
+ * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
+ * 4 is the optimal value for homogeneous particle distribution and allows
+ * for an O(#particles) sort up till distributions where all particles are
+ * concentrated in 1/4 of the space. No NlogN fallback is implemented,
+ * as it can be expensive to detect inhomogeneous particle distributions.
+ * SGSF is the maximum ratio of holes used, in the worst case all particles
+ * end up in the last hole and we need #particles extra holes at the end.
+ */
+#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
+/* Sort particle index a on coordinates x along dim.
+ * Backwards tells if we want decreasing instead of increasing coordinates.
+ * h0 is the minimum of the coordinate range.
+ * invh is the inverse hole spacing.
+ * nsort, the theoretical hole limit, is only used for debugging.
+ * sort is the sorting work array.
+ */
static void sort_atoms(int dim,gmx_bool Backwards,
int *a,int n,rvec *x,
real h0,real invh,int nsort,int *sort)
{
int i,c;
- int zi,zim;
+ int zi,zim,zi_min,zi_max;
int cp,tmp;
if (n <= 1)
return;
}
- /* For small oversize factors clearing the whole area is fastest.
- * For large oversize we should clear the used elements after use.
- */
- for(i=0; i<nsort; i++)
- {
- sort[i] = -1;
- }
+ /* Determine the index range used, so we can limit it for the second pass */
+ zi_min = INT_MAX;
+ zi_max = -1;
+
/* Sort the particles using a simple index sort */
for(i=0; i<n; i++)
{
if (sort[zi] < 0)
{
sort[zi] = a[i];
+ zi_min = min(zi_min,zi);
+ zi_max = max(zi_max,zi);
}
else
{
zim++;
}
sort[zim] = cp;
+ zi_max = max(zi_max,zim);
}
sort[zi] = a[i];
+ zi_max = max(zi_max,zi);
}
}
if (sort[zi] >= 0)
{
a[c++] = sort[zi];
+ sort[zi] = -1;
}
}
}
else
{
- for(zi=nsort-1; zi>=0; zi--)
+ for(zi=zi_max; zi>=zi_min; zi--)
{
if (sort[zi] >= 0)
{
a[c++] = sort[zi];
+ sort[zi] = -1;
}
}
}
bb[BBU_Z] = R2F_U(zh);
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na,const real *x,
i += stride;
}
/* Note: possible double to float conversion here */
- bb[0*STRIDE_8BB] = R2F_D(xl);
- bb[1*STRIDE_8BB] = R2F_D(yl);
- bb[2*STRIDE_8BB] = R2F_D(zl);
- bb[3*STRIDE_8BB] = R2F_U(xh);
- bb[4*STRIDE_8BB] = R2F_U(yh);
- bb[5*STRIDE_8BB] = R2F_U(zh);
+ bb[0*STRIDE_PBB] = R2F_D(xl);
+ bb[1*STRIDE_PBB] = R2F_D(yl);
+ bb[2*STRIDE_PBB] = R2F_D(zl);
+ bb[3*STRIDE_PBB] = R2F_U(xh);
+ bb[4*STRIDE_PBB] = R2F_U(yh);
+ bb[5*STRIDE_PBB] = R2F_U(zh);
}
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
#ifdef NBNXN_SEARCH_SSE_SINGLE
{
calc_bounding_box_sse(na,x,bb_work);
- bb[0*STRIDE_8BB] = bb_work[BBL_X];
- bb[1*STRIDE_8BB] = bb_work[BBL_Y];
- bb[2*STRIDE_8BB] = bb_work[BBL_Z];
- bb[3*STRIDE_8BB] = bb_work[BBU_X];
- bb[4*STRIDE_8BB] = bb_work[BBU_Y];
- bb[5*STRIDE_8BB] = bb_work[BBU_Z];
+ bb[0*STRIDE_PBB] = bb_work[BBL_X];
+ bb[1*STRIDE_PBB] = bb_work[BBL_Y];
+ bb[2*STRIDE_PBB] = bb_work[BBL_Z];
+ bb[3*STRIDE_PBB] = bb_work[BBU_X];
+ bb[4*STRIDE_PBB] = bb_work[BBU_Y];
+ bb[5*STRIDE_PBB] = bb_work[BBU_Z];
}
#endif /* NBNXN_SEARCH_SSE_SINGLE */
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
for(c=0; c<grid->nc; c++)
{
#ifdef NBNXN_BBXXXX
- for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
+ for(s=0; s<grid->nsubc[c]; s+=STRIDE_PBB)
{
int cs_w,i,d;
- cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
- for(i=0; i<STRIDE_8BB; i++)
+ cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+ for(i=0; i<STRIDE_PBB; i++)
{
for(d=0; d<DIM; d++)
{
ba[d] +=
- grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
- grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_8BB+i];
+ grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+ grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
}
}
}
offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
bb_ptr = grid->bb + offset;
-#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
+#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
if (2*grid->na_cj == grid->na_c)
{
calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
*/
bb_ptr =
grid->bb +
- ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
- (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
+ ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+ (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
#ifdef NBNXN_SEARCH_SSE_SINGLE
if (nbat->XFormat == nbatXYZQ)
{
fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
sx,sy,sz,
- bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
- bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
- bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
+ bb_ptr[0*STRIDE_PBB],bb_ptr[3*STRIDE_PBB],
+ bb_ptr[1*STRIDE_PBB],bb_ptr[4*STRIDE_PBB],
+ bb_ptr[2*STRIDE_PBB],bb_ptr[5*STRIDE_PBB]);
}
}
#endif
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
int a0,int a1,
- rvec *x,const int *move,
+ rvec *x,
+ int dd_zone,const int *move,
int thread,int nthread,
int *cell,
int *cxy_na)
n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
- for(i=n0; i<n1; i++)
+ if (dd_zone == 0)
{
- if (move == NULL || move[i] >= 0)
+ /* Home zone */
+ for(i=n0; i<n1; i++)
{
- /* We need to be careful with rounding,
- * particles might be a few bits outside the local box.
- * The int cast takes care of the lower bound,
- * we need to explicitly take care of the upper bound.
- */
- cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
- if (cx == grid->ncx)
- {
- cx = grid->ncx - 1;
- }
- cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
- if (cy == grid->ncy)
+ if (move == NULL || move[i] >= 0)
{
- cy = grid->ncy - 1;
- }
- /* For the moment cell contains only the, grid local,
- * x and y indices, not z.
- */
- cell[i] = cx*grid->ncy + cy;
+ /* We need to be careful with rounding,
+ * particles might be a few bits outside the local zone.
+ * The int cast takes care of the lower bound,
+ * we will explicitly take care of the upper bound.
+ */
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
#ifdef DEBUG_NBNXN_GRIDDING
- if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
+ if (cx < 0 || cx >= grid->ncx ||
+ cy < 0 || cy >= grid->ncy)
+ {
+ gmx_fatal(FARGS,
+ "grid cell cx %d cy %d out of range (max %d %d)\n"
+ "atom %f %f %f, grid->c0 %f %f",
+ cx,cy,grid->ncx,grid->ncy,
+ x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+ }
+#endif
+ /* Take care of potential rounding issues */
+ cx = min(cx,grid->ncx - 1);
+ cy = min(cy,grid->ncy - 1);
+
+ /* For the moment cell will contain only the, grid local,
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+ }
+ else
{
- gmx_fatal(FARGS,
- "grid cell cx %d cy %d out of range (max %d %d)\n"
- "atom %f %f %f, grid->c0 %f %f",
- cx,cy,grid->ncx,grid->ncy,
- x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+ /* Put this moved particle after the end of the grid,
+ * so we can process it later without using conditionals.
+ */
+ cell[i] = grid->ncx*grid->ncy;
}
-#endif
+
+ cxy_na[cell[i]]++;
}
- else
+ }
+ else
+ {
+ /* Non-home zone */
+ for(i=n0; i<n1; i++)
{
- /* Put this moved particle after the end of the grid,
- * so we can process it later without using conditionals.
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+ /* For non-home zones there could be particles outside
+ * the non-bonded cut-off range, which have been communicated
+ * for bonded interactions only. For the result it doesn't
+ * matter where these end up on the grid. For performance
+ * we put them in an extra row at the border.
*/
- cell[i] = grid->ncx*grid->ncy;
- }
+ cx = max(cx,0);
+ cx = min(cx,grid->ncx - 1);
+ cy = max(cy,0);
+ cy = min(cy,grid->ncy - 1);
- cxy_na[cell[i]]++;
+ /* For the moment cell will contain only the, grid local,
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+
+ cxy_na[cell[i]]++;
+ }
}
}
#pragma omp parallel for num_threads(nthread) schedule(static)
for(thread=0; thread<nthread; thread++)
{
- calc_column_indices(grid,a0,a1,x,move,thread,nthread,
+ calc_column_indices(grid,a0,a1,x,dd_zone,move,thread,nthread,
nbs->cell,nbs->work[thread].cxy_na);
}
over_alloc_large(ncz_max*grid->na_sc*SGSF);
srenew(nbs->work[thread].sort_work,
nbs->work[thread].sort_work_nalloc);
+ /* When not in use, all elements should be -1 */
+ for(i=0; i<nbs->work[thread].sort_work_nalloc; i++)
+ {
+ nbs->work[thread].sort_work[i] = -1;
+ }
}
}
nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
}
- /* Set the cell indices for the moved particles */
- n0 = grid->nc*grid->na_sc;
- n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
- for(i=n0; i<n1; i++)
+ if (dd_zone == 0)
{
- nbs->cell[nbs->a[i]] = i;
+ /* Set the cell indices for the moved particles */
+ n0 = grid->nc*grid->na_sc;
+ n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+ if (dd_zone == 0)
+ {
+ for(i=n0; i<n1; i++)
+ {
+ nbs->cell[nbs->a[i]] = i;
+ }
+ }
}
/* Sort the super-cell columns along z into the sub-cells. */
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid,grid->bb);
nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
}
- nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
+ nc_max_grid = set_grid_size_xy(nbs,grid,
+ dd_zone,n-nmoved,corner0,corner1,
nbs->grid[0].atom_density,
nbat->XFormat);
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid,grid->bb_simple);
return d2;
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* SSE code for bb distance for bb format xyz0 */
static float subc_bb_dist2_sse(int na_c,
\
shi = si*NNBSBB_D*DIM; \
\
- xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB); \
- yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB); \
- zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB); \
- xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB); \
- yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB); \
- zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB); \
+ xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB); \
+ yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB); \
+ zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB); \
+ xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB); \
+ yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB); \
+ zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB); \
\
dx_0 = _mm_sub_ps(xi_l,xj_h); \
dy_0 = _mm_sub_ps(yi_l,yj_h); \
zero = _mm_setzero_ps();
- xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
- yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
- zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
- xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
- yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
- zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
+ xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+ yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+ zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+ xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+ yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+ zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
- /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
+ /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
* But as we know the number of iterations is 1 or 2, we unroll manually.
*/
SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
- if (STRIDE_8BB < nsi)
+ if (STRIDE_PBB < nsi)
{
- SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
+ SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB,bb_i,d2);
}
}
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
/* Plain C function which determines if any atom pair between two cells
* is within distance sqrt(rl2).
rc2_SSE = _mm_set1_ps(rl2);
- na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
- ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
- iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
- iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
- ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
- iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
- iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
+ na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+ ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+ iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+ iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+ ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+ iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+ iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
/* We loop from the outer to the inner particles to maximize
* the chance that we find a pair in range quickly and return.
/* Returns the j sub-cell for index cj_ind */
static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
{
- return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
}
/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
{
- return nbl->cj4[cj_ind>>2].imei[0].imask;
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
}
/* Ensures there is enough space for extra extra exclusion masks */
/* We can store 4 j-subcell - i-supercell pairs in one struct.
* since we round down, we need one extra entry.
*/
- ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
+ ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
if (ncj4_max > nbl->cj4_nalloc)
{
snew(nbl->work,1);
#ifdef NBNXN_BBXXXX
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX,NBNXN_MEM_ALIGN);
#else
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,NBNXN_MEM_ALIGN);
#endif
- snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+ snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,NBNXN_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
- snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
- snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
+ snew_aligned(nbl->work->x_ci_simd_4xn,1,NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->x_ci_simd_2xnn,1,NBNXN_MEM_ALIGN);
#endif
- snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
+ snew_aligned(nbl->work->d2,GPU_NSUBCELL,NBNXN_MEM_ALIGN);
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
fprintf(fp,"nbl average j super cell list length %.1f\n",
0.25*nbl->ncj4/(double)nbl->nsci);
fprintf(fp,"nbl average i sub cell list length %.1f\n",
- nbl->nci_tot/(0.25*nbl->ncj4));
+ nbl->nci_tot/((double)nbl->ncj4));
for(si=0; si<=GPU_NSUBCELL; si++)
{
{
for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
{
- for(j=0; j<4; j++)
+ for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
{
b = 0;
for(si=0; si<GPU_NSUBCELL; si++)
w = (ej>>2);
for(ei=ej; ei<nbl->na_ci; ei++)
{
- excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
- ~(1U << (sj_offset*GPU_NSUBCELL+si));
+ excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+ ~(1U << (sj_offset*GPU_NSUBCELL + si));
}
}
}
for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
{
- cj4_ind = (nbl->work->cj_ind >> 2);
+ cj4_ind = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
cj4 = &nbl->cj4[cj4_ind];
#ifdef NBNXN_BBXXXX
/* Determine all ci1 bb distances in one call with SSE */
- subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
+ subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
ci1,bb_ci,d2l);
*ndistc += na_c*2;
#endif
{
/* Avoid using function pointers here, as it's slower */
if (
-#ifdef NBNXN_8BB_SSE
+#ifdef NBNXN_PBB_SSE
!subc_in_range_sse8
#else
!subc_in_range_x
nbl->nci_tot += npair;
/* Increase the closing index in i super-cell list */
- nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
+ nbl->sci[nbl->nsci].cj4_ind_end =
+ ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
}
}
}
ndirect++;
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
else
{
while (cj_ind_first + ndirect <= cj_ind_last &&
inner_e = ge - se*na_c;
/* Macro for getting the index of atom a within a cluster */
-#define AMODI(a) ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
+#define AMODCJ4(a) ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
/* Macro for converting an atom number to a cluster number */
-#define A2CI(a) ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
+#define A2CJ4(a) ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+/* Macro for getting the index of an i-atom within a warp */
+#define AMODWI(a) ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
- if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
+ if (nbl_imask0(nbl,found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
{
w = (inner_e >> 2);
- get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
+ get_nbl_exclusions_1(nbl,A2CJ4(found),w,&nbl_excl);
- nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
- ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
+ nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+ ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
}
-#undef AMODI
-#undef A2CI
+#undef AMODCJ4
+#undef A2CJ4
+#undef AMODWI
}
}
}
{
sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
- if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+ /* The counts below are used for non-bonded pair/flop counts
+ * and should therefore match the available kernel setups.
+ */
+ if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
{
- nbl->work->ncj_hlj += jlen;
+ nbl->work->ncj_noq += jlen;
}
- else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+ else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+ !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
{
- nbl->work->ncj_noq += jlen;
+ nbl->work->ncj_hlj += jlen;
}
nbl->nci++;
/* We can only have complete blocks of 4 j-entries in a list,
* so round the count up before closing.
*/
- nbl->ncj4 = ((nbl->work->cj_ind + 4-1) >> 2);
+ nbl->ncj4 = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
nbl->nsci++;
int ia,m,i;
#ifdef NBNXN_BBXXXX
- ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
- for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
+ ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+ for(m=0; m<(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
{
- for(i=0; i<STRIDE_8BB; i++)
+ for(i=0; i<STRIDE_PBB; i++)
{
- bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
- bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
- bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
- bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
- bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
- bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
+ bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+ bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+ bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+ bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+ bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+ bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
}
}
#else
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Copies PBC shifted super-cell packed atom coordinates to working array */
static void icell_set_x_supersub_sse8(int ci,
real shx,real shy,real shz,
for(si=0; si<GPU_NSUBCELL; si++)
{
- for(i=0; i<na_c; i+=STRIDE_8BB)
+ for(i=0; i<na_c; i+=STRIDE_PBB)
{
io = si*na_c + i;
ia = ci*GPU_NSUBCELL*na_c + io;
- for(j=0; j<STRIDE_8BB; j++)
+ for(j=0; j<STRIDE_PBB; j++)
{
- x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
- x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
- x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
+ x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+ x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+ x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
}
}
}
for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
{
- for(j=0; j<4; j++)
+ for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
{
fprintf(fp," sj %5d imask %x\n",
nbl->cj4[j4].cj[j],
}
else
{
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
nbs->icell_set_x = icell_set_x_supersub_sse8;
#else
nbs->icell_set_x = icell_set_x_supersub;
if (ed)
{
- do_flood(fplog,cr,x,f,ed,box,step,bNS);
+ do_flood(cr,inputrec,x,f,ed,box,step,bNS);
}
if (bUseOrEmulGPU && !bDiffKernels)
wallcycle_stop(wcycle,ewcWAIT_GPU_NB_L);
/* now clear the GPU outputs while we finish the step on the CPU */
+
+ wallcycle_start_nocount(wcycle,ewcLAUNCH_GPU_NB);
nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
}
else
{
if (ed)
{
- do_flood(fplog,cr,x,f,ed,box,step,bNS);
+ do_flood(cr,inputrec,x,f,ed,box,step,bNS);
}
if (DOMAINDECOMP(cr))
* numbers per nx+1 data points. For performance reasons we want
* the table data to be aligned to 16-byte.
*/
- snew_aligned(table.data, 12*(nx+1)*sizeof(real),16);
+ snew_aligned(table.data, 12*(nx+1)*sizeof(real),32);
for(k=0; (k<etiNR); k++) {
if (tabsel[k] != etabUSER) {
* to do this :-)
*/
- snew_aligned(table.data,4*nx,16);
+ snew_aligned(table.data,4*nx,32);
init_table(out,nx,nx0,table.scale,&(td[0]),!bReadTab);
* to do this :-)
*/
- snew_aligned(table.data,4*nx,16);
+ snew_aligned(table.data,4*nx,32);
copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,1.0,table.data);
g_densmap
g_densorder
g_dielectric
- g_dih
g_dipoles
g_disre
g_dist
"Calculate surface fluctuations");
LegacyCmdLineWrapper::registerModule(manager, &gmx_dielectric, "dielectric",
"Calculate frequency dependent dielectric constants");
- LegacyCmdLineWrapper::registerModule(manager, &gmx_dih, "dih",
- "Analyze dihedral transitions");
LegacyCmdLineWrapper::registerModule(manager, &gmx_dipoles, "dipoles",
"Compute the total dipole plus fluctuations");
LegacyCmdLineWrapper::registerModule(manager, &gmx_disre, "disre",
b0 = 0;
switch (ftype) {
case F_BONDS:
- case F_G96BONDS:
b0 = idef->iparams[type].harmonic.rA;
+ break;
+ case F_G96BONDS:
+ b0 = sqrt(idef->iparams[type].harmonic.rA);
break;
case F_MORSE:
b0 = idef->iparams[type].morse.b0A;
}
}
-static void assign_param(t_functype ftype,t_iparams *newparam,
+/* A return value of 0 means parameters were assigned successfully,
+ * returning -1 means this is an all-zero interaction that should not be added.
+ */
+static int
+assign_param(t_functype ftype,t_iparams *newparam,
real old[MAXFORCEPARAM],int comb,double reppow)
{
int i,j;
real tmp;
+ gmx_bool all_param_zero=TRUE;
/* Set to zero */
for(j=0; (j<MAXFORCEPARAM); j++)
- {
+ {
newparam->generic.buf[j]=0.0;
- }
+ /* If all parameters are zero we might not add some interaction types (selected below).
+ * We cannot apply this to ALL interactions, since many have valid reasons for having
+ * zero parameters (e.g. an index to a Cmap interaction, or LJ parameters), but
+ * we use it for angles and torsions that are typically generated automatically.
+ */
+ all_param_zero = (all_param_zero==TRUE) && fabs(old[j])<GMX_REAL_MIN;
+ }
+
+ if(all_param_zero==TRUE)
+ {
+ if(IS_ANGLE(ftype) || IS_RESTRAINT_TYPE(ftype) || ftype==F_IDIHS ||
+ ftype==F_PDIHS || ftype==F_PIDIHS || ftype==F_RBDIHS || ftype==F_FOURDIHS)
+ {
+ return -1;
+ }
+ }
+
switch (ftype) {
case F_G96ANGLES:
/* Post processing of input data: store cosine iso angle itself */
case F_PIDIHS:
case F_ANGRES:
case F_ANGRESZ:
- newparam->pdihs.phiA = old[0];
- newparam->pdihs.cpA = old[1];
-
- /* Dont do any checks if all parameters are zero (such interactions will be removed).
- * Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
- * so I have changed the lower limit to -99 /EL
- *
- * Second, if the force constant is zero in both A and B states, we set the phase
- * and multiplicity to zero too so the interaction gets removed during clean-up.
- */
- newparam->pdihs.phiB = old[3];
- newparam->pdihs.cpB = old[4];
-
- if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
- {
- newparam->pdihs.phiA = 0.0;
- newparam->pdihs.phiB = 0.0;
- newparam->pdihs.mult = 0;
- }
- else
- {
- newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
- }
+ newparam->pdihs.phiA = old[0];
+ newparam->pdihs.cpA = old[1];
+
+ /* Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
+ * so I have changed the lower limit to -99 /EL
+ */
+ newparam->pdihs.phiB = old[3];
+ newparam->pdihs.cpB = old[4];
+ /* If both force constants are zero there is no interaction. Return -1 to signal
+ * this entry should NOT be added.
+ */
+ if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
+ {
+ return -1;
+ }
+
+ newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
break;
case F_POSRES:
newparam->rbdihs.rbcB[3]=-2.0*old[NR_FOURDIHS+2];
newparam->rbdihs.rbcB[4]=-4.0*old[NR_FOURDIHS+3];
newparam->rbdihs.rbcB[5]=0.0;
- break;
+ break;
case F_CONSTR:
case F_CONSTRNC:
newparam->constr.dA = old[0];
gmx_fatal(FARGS,"unknown function type %d in %s line %d",
ftype,__FILE__,__LINE__);
}
+ return 0;
}
static int enter_params(gmx_ffparams_t *ffparams, t_functype ftype,
{
t_iparams newparam;
int type;
-
- assign_param(ftype,&newparam,forceparams,comb,reppow);
+ int rc;
+
+ if( (rc=assign_param(ftype,&newparam,forceparams,comb,reppow))<0 )
+ {
+ /* -1 means this interaction is all-zero and should not be added */
+ return rc;
+ }
+
if (!bAppend) {
for (type=start; (type<ffparams->ntypes); type++) {
if (ffparams->functype[type]==ftype) {
__FILE__,__LINE__,*maxtypes);
}
type = enter_params(ffparams,ftype,p->param[k].c,comb,reppow,start,bAppend);
- if (!bNB) {
+ /* Type==-1 is used as a signal that this interaction is all-zero and should not be added. */
+ if (!bNB && type>=0) {
nral = NRAL(ftype);
delta = nr*(nral+1);
srenew(il->iatoms,il->nr+delta);
{
nstfep = ir->expandedvals->nstexpanded;
}
- if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
+ if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
{
nstfep = repl_ex_nst;
}
top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
cglo_flags
| CGLO_ENERGY
- | (bStopCM ? CGLO_STOPCM : 0)
| (bTemp ? CGLO_TEMPERATURE:0)
| (bPres ? CGLO_PRESSURE : 0)
| (bPres ? CGLO_CONSTRAINT : 0)
{
if (bTrotter)
{
+ m_add(force_vir,shake_vir,total_vir); /* we need the un-dispersion corrected total vir here */
trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
}
else
top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
}
-
-
- update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
}
}
saved_conserved_quantity -= enerd->term[F_DISPCORR];
}
/* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */
- sum_dhdl(enerd,state->lambda,ir->fepvals);
+ if (!bRerunMD)
+ {
+ sum_dhdl(enerd,state->lambda,ir->fepvals);
+ }
}
/* ######## END FIRST UPDATE STEP ############## */
gs.sig[eglsCHKPT] = 1;
}
-
- /* at the start of step, randomize the velocities */
- if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+ if (EI_VV(ir->eI))
{
- gmx_bool bDoAndersenConstr;
- bDoAndersenConstr = (constr && update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr));
- /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
- if (bDoAndersenConstr)
+ if (!bInitStep)
{
- update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
- state,fr->bMolPBC,graph,f,
- &top->idef,tmp_vir,NULL,
- cr,nrnb,wcycle,upd,constr,
- bInitStep,TRUE,bCalcVir,vetanew);
+ update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
+ }
+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+ {
+ gmx_bool bIfRandomize;
+ bIfRandomize = update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr);
+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ if (constr && bIfRandomize)
+ {
+ update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+ state,fr->bMolPBC,graph,f,
+ &top->idef,tmp_vir,NULL,
+ cr,nrnb,wcycle,upd,constr,
+ bInitStep,TRUE,bCalcVir,vetanew);
+ }
}
}
lastbox,
top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
cglo_flags
- | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0)
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
| (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
| (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
| (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
/* only add constraint dvdl after constraints */
enerd->term[F_DVDL_BONDED] += dvdl;
- if (!bVV)
+ if (!bVV || bRerunMD)
{
/* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
sum_dhdl(enerd,state->lambda,ir->fepvals);
state->fep_state = lamnew;
for (i=0;i<efptNR;i++)
{
- state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+ state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
}
}
/* Remaining runtime */
"ED (essential dynamics) sampling is switched on by using the [TT]-ei[tt]",
"flag followed by an [TT].edi[tt] file.",
"The [TT].edi[tt] file can be produced using options in the essdyn",
- "menu of the WHAT IF program. [TT]mdrun[tt] produces a [TT].edo[tt] file that",
+ "menu of the WHAT IF program. [TT]mdrun[tt] produces a [TT].xvg[tt] output file that",
"contains projections of positions, velocities and forces onto selected",
"eigenvectors.[PAR]",
"When user-defined potential functions have been selected in the",
{ efXVG, "-tpi", "tpi", ffOPTWR },
{ efXVG, "-tpid", "tpidist", ffOPTWR },
{ efEDI, "-ei", "sam", ffOPTRD },
- { efEDO, "-eo", "sam", ffOPTWR },
+ { efXVG, "-eo", "edsam", ffOPTWR },
{ efGCT, "-j", "wham", ffOPTRD },
{ efGCT, "-jo", "bam", ffOPTWR },
{ efXVG, "-ffout", "gct", ffOPTWR },
}
fprintf(fplog,"\n");
+ /* the "Order After Exchange" is the state label corresponding to the configuration that
+ started in state listed in order, i.e.
+
+ 3 0 1 2
+
+ means that the:
+ configuration starting in simulation 3 is now in simulation 0,
+ configuration starting in simulation 0 is now in simulation 1,
+ configuration starting in simulation 1 is now in simulation 2,
+ configuration starting in simulation 2 is now in simulation 3
+ */
fprintf(fplog,"Order After Exchange: ");
for (i=0;i<n;i++)
{
= [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
= [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
= de[b][a] + de[a][b] */
+
/* permuted:
ediff = E_new - E_old
= [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
= [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
= [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)]
= (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */
+ /* but, in the current code implementation, we flip configurations, not indices . . .
+ So let's examine that.
+ = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)]
+ = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_bp)]
+ = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+ So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+ So the simple solution is to flip the
+ position of perturbed and original indices in the tests.
+ */
+
ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
delta = ediff*beta[a]; /* assume all same temperature in this case */
break;
gmx_bool bPrint,bMultiEx;
gmx_bool *bEx = re->bEx;
real *prob = re->prob;
- int *pind = re->destinations;
+ int *pind = re->destinations; /* permuted index */
gmx_bool bEpot=FALSE;
gmx_bool bDLambda=FALSE;
gmx_bool bVol=FALSE;
for (i=0;i<re->nex;i++)
{
/* randomly select a pair */
- /* find out which state it is from, and what label that state currently has */
+ /* in theory, could reduce this by identifying only which switches had a non-negligible
+ probability of occurring (log p > -100) and only operate on those switches */
+ /* find out which state it is from, and what label that state currently has. Likely
+ more work than useful. */
i0 = (int)(re->nrepl*rando(&(re->seed)));
i1 = (int)(re->nrepl*rando(&(re->seed)));
if (i0==i1)
{
i--;
- continue; /* got the same pair, back up and do it again */
+ continue; /* self-exchange, back up and do it again */
}
- a = re->ind[i0];
+ a = re->ind[i0]; /* what are the indices of these states? */
b = re->ind[i1];
ap = pind[i0];
bp = pind[i1];
bPrint = FALSE; /* too noisy */
- delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); /* calculate the energy difference */
+ /* calculate the energy difference */
+ /* if the code changes to flip the STATES, rather than the configurations,
+ use the commented version of the code */
+ /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+ delta = calc_delta(fplog,bPrint,re,ap,bp,a,b);
- /* we actually only use the first space, since there are actually many switches between pairs. */
+ /* we actually only use the first space in the prob and bEx array,
+ since there are actually many switches between pairs. */
if (delta <= 0)
{
re->nmoves[re->ind[i]][pind[i]] +=1;
re->nmoves[pind[i]][re->ind[i]] +=1;
}
+ fflush(fplog); /* make sure we can see what the last exchange was */
}
static void write_debug_x(t_state *state)
/* There will be only one swap cycle with standard replica
* exchange, but there may be multiple swap cycles if we
* allow multiple swaps. */
+
for (j = 0; j < maxswap; j++)
{
exchange_partner = re->order[replica_id][j];
mda->deviceOptions=deviceOptions;
mda->Flags=Flags;
- fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
- fflush(stderr);
/* now spawn new threads that start mdrunner_start_fn(), while
the main thread returns */
ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
gmx_mtop_remove_chargegroups(mtop);
}
-/* Check the process affinity mask and if it is found to be non-zero,
- * will honor it and disable mdrun internal affinity setting.
- * This function should be called first before the OpenMP library gets
- * initialized with the last argument FALSE (which will detect affinity
- * set by external tools like taskset), and later, after the OpenMP
- * initialization, with the last argument TRUE to detect affinity changes
- * made by the OpenMP library.
+/* Check the process affinity mask. If it is non-zero, something
+ * else has set the affinity, and mdrun should honor that and
+ * not attempt to do its own thread pinning.
+ *
+ * This function should be called twice. Once before the OpenMP
+ * library gets initialized with bAfterOpenMPInit=FALSE (which will
+ * detect affinity set by external tools like taskset), and again
+ * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
+ * (which will detect affinity changes made by the OpenMP library).
*
- * Note that this will only work on Linux as we use a GNU feature. */
+ * Note that this will only work on Linux, because we use a GNU
+ * feature. */
static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
gmx_hw_opt_t *hw_opt, int ncpus,
gmx_bool bAfterOpenmpInit)
if (!bAfterOpenmpInit)
{
md_print_warn(cr, fplog,
- "Non-default process affinity set, disabling internal affinity");
+ "%s detected a non-default process affinity, "
+ "so it will not attempt to pin its threads", ShortProgram());
}
else
{
md_print_warn(cr, fplog,
- "Non-default process affinity set probably by the OpenMP library, "
- "disabling internal affinity");
+ "%s detected a non-default process affinity, "
+ "probably set by the OpenMP library, "
+ "so it will not attempt to pin its threads", ShortProgram());
}
hw_opt->bThreadPinning = FALSE;
if (debug)
{
- fprintf(debug, "Non-default affinity mask found\n");
+ fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
}
}
else
}
else
{
- /* check if some threads failed to set their affinities */
+ /* check & warn if some threads failed to set their affinities */
if (nth_affinity_set != nthread_local)
{
- char sbuf[STRLEN];
- sbuf[0] = '\0';
+ char sbuf1[STRLEN], sbuf2[STRLEN];
+
+            /* sbuf1 contains rank info, while sbuf2 contains OpenMP thread info */
+ sbuf1[0] = sbuf2[0] = '\0';
#ifdef GMX_MPI
#ifdef GMX_THREAD_MPI
- sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
+ sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
#else /* GMX_LIB_MPI */
+ sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
- sprintf(sbuf, "In MPI process #%d", cr->nodeid);
#endif /* GMX_MPI */
+
+ if (nthread_local > 1)
+ {
+ sprintf(sbuf2, "of %d/%d thread%s ",
+ nthread_local - nth_affinity_set, nthread_local,
+ (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+ }
+
md_print_warn(NULL, fplog,
- "%s%d/%d thread%s failed to set their affinities. "
- "This can cause performance degradation!",
- sbuf, nthread_local - nth_affinity_set, nthread_local,
- (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+ "NOTE: %sAffinity setting %sfailed.\n"
+ " This can cause performance degradation!",
+ sbuf1, sbuf2);
}
}
}
/* now make sure the state is initialized and propagated */
set_state_entries(state,inputrec,cr->nnodes);
- /* remove when vv and rerun works correctly! */
- if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
- {
- gmx_fatal(FARGS,
- "Currently can't do velocity verlet with rerun in parallel.");
- }
-
/* A parallel command line option consistency check that we can
only do after any threads have started. */
if (!PAR(cr) &&
if (opt2bSet("-ei",nfile,fnm))
{
/* Open input and output files, allocate space for ED data structure */
- ed = ed_open(nfile,fnm,Flags,cr);
+ ed = ed_open(mtop->natoms,&state->edsamstate,nfile,fnm,Flags,oenv,cr);
}
if (PAR(cr) && !((Flags & MD_PARTDEC) ||
cr->nnodes==1 ? "process" : "processes"
#endif
);
+ fflush(stderr);
#endif
gmx_omp_nthreads_init(fplog, cr,
/* A histidine residue exists that requires automated assignment, so
* doing the analysis of donors and acceptors is worthwhile. */
fprintf(stderr,
- "Analysing hydrogen-bonding network for automated assigment of histidine\n"
+ "Analysing hydrogen-bonding network for automated assignment of histidine\n"
" protonation.");
snew(donor,natom);
{
nn = rr[i].main;
}
+
if (nn[0] == '-')
{
- gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? " as a starting terminus" : (bEnd ? " as an ending terminus" : ""));
+ gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
}
}
gmx_analyze.c gmx_anaeig.c gmx_angle.c gmx_bond.c
gmx_bundle.c gmx_chi.c gmx_cluster.c gmx_confrms.c
gmx_covar.c gmx_current.c
- gmx_density.c gmx_densmap.c gmx_dih.c
+ gmx_density.c gmx_densmap.c
gmx_dielectric.c
gmx_kinetics.c gmx_spatial.c gmx_tune_pme.c
gmx_dipoles.c gmx_disre.c gmx_dist.c gmx_dyndom.c
{
static const char *desc[] = {
"[TT]g_angle[tt] computes the angle distribution for a number of angles",
- "or dihedrals. This way you can check whether your simulation",
- "is correct. With option [TT]-ov[tt] you can plot the average angle of",
- "a group of angles as a function of time. With the [TT]-all[tt] option",
- "the first graph is the average, the rest are the individual angles.[PAR]",
+ "or dihedrals.[PAR]",
+ "With option [TT]-ov[tt], you can plot the average angle of",
+ "a group of angles as a function of time. With the [TT]-all[tt] option,",
+ "the first graph is the average and the rest are the individual angles.[PAR]",
"With the [TT]-of[tt] option, [TT]g_angle[tt] also calculates the fraction of trans",
"dihedrals (only for dihedrals) as function of time, but this is",
- "probably only fun for a selected few.[PAR]",
- "With option [TT]-oc[tt] a dihedral correlation function is calculated.[PAR]",
- "It should be noted that the index file should contain",
- "atom-triples for angles or atom-quadruplets for dihedrals.",
+ "probably only fun for a select few.[PAR]",
+ "With option [TT]-oc[tt], a dihedral correlation function is calculated.[PAR]",
+ "It should be noted that the index file must contain",
+ "atom triplets for angles or atom quadruplets for dihedrals.",
"If this is not the case, the program will crash.[PAR]",
- "With option [TT]-or[tt] a trajectory file is dumped containing cos and",
- "sin of selected dihedral angles which subsequently can be used as",
- "input for a PCA analysis using [TT]g_covar[tt].[PAR]",
+ "With option [TT]-or[tt], a trajectory file is dumped containing cos and",
+ "sin of selected dihedral angles, which subsequently can be used as",
+ "input for a principal components analysis using [TT]g_covar[tt].[PAR]",
"Option [TT]-ot[tt] plots when transitions occur between",
"dihedral rotamers of multiplicity 3 and [TT]-oh[tt]",
"records a histogram of the times between such transitions,",
+++ /dev/null
-/*
- *
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
- *
- * For more info, check our website at http://www.gromacs.org
- *
- * And Hey:
- * Green Red Orange Magenta Azure Cyan Skyblue
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <math.h>
-
-#include "sysstuff.h"
-#include "string2.h"
-#include "copyrite.h"
-#include "futil.h"
-#include "smalloc.h"
-#include "statutil.h"
-#include "nrama.h"
-#include "physics.h"
-#include "macros.h"
-#include "xvgr.h"
-#include "vec.h"
-#include "gmx_ana.h"
-
-
-#define NOMIN 'X'
-
-static void ana_dih(FILE *out,char *index,int nframes,real dih[],t_dih *dd)
-{
- int i;
- real mind,maxd,sum,av,var,prev,width;
- gmx_bool bTrans;
-
- mind=5400,maxd=-5400,sum=0,av=0,var=0;
-
- prev=dih[0];
- for(i=0; (i<nframes); i++) {
- if ((dih[i]-prev) > 180) {
- /* PBC.. */
- dih[i]-=360;
- }
- else if ((dih[i]-prev) < -180)
- dih[i]+=360;
- prev=dih[i];
-
- sum+=dih[i];
- mind=min(mind,dih[i]);
- maxd=max(maxd,dih[i]);
- }
- av=sum/nframes;
- for(i=0; (i<nframes); i++)
- var+=sqr(dih[i]-av);
- var/=nframes;
- width=(360.0/dd->mult);
- bTrans=((maxd - mind) > width);
-
- fprintf(out,"%-10s %10.3f %10.3f %10.3f %10.3f %10.3f %-10s%3.0f\n",
- index,mind,av,maxd,var,sqrt(var),
- bTrans ? "Yep" : "",width);
-}
-
-static int find_min(real phi,int ntab,real phitab[])
-{
- int i,imin;
- real mind,mm;
- real width;
-
- /* Set closest minimum to the first one */
- width=360.0/ntab;
- mind=fabs(phi-phitab[0]);
- imin=0;
- for(i=1; (i<ntab); i++) {
- mm=fabs(phi-phitab[i]);
- if (mm < mind) {
- imin=i;
- mind=mm;
- }
- }
- if (mind < width*0.5 )
- return imin;
- else
- return -1;
-}
-
-static int vphi(t_dih *dih,real phi,int mult)
-{
- static real m2[] = { 90, 270 };
- static real m3[] = { 60, 180, 300 };
- static real m4[] = { 45, 135, 225, 315 };
- static real m6[] = { 30, 90, 150, 210, 270, 330 };
-
- real phiref;
- int vpp=0;
-
- phiref=RAD2DEG*(phi-dih->phi0);
- while (phiref < 0)
- phiref+=360;
- while (phiref > 360)
- phiref-=360;
-
- switch(mult) {
- case 2:
- vpp=find_min(phiref,2,m2);
- break;
- case 3:
- vpp=find_min(phiref,3,m3);
- break;
- case 4:
- vpp=find_min(phiref,4,m4);
- break;
- case 6:
- vpp=find_min(phiref,6,m6);
- break;
- default:
- gmx_fatal(FARGS,"No such multiplicity %d",dih->mult);
- }
-
- if (vpp == -1)
- return NOMIN;
- else
- return vpp+'0';
-}
-
-typedef struct t_cluster {
- int ndih;
- int freq;
- char *minimum;
- struct t_cluster *next;
-} t_cluster;
-
-static t_cluster *search_cluster(t_cluster *cl,char *minimum)
-{
- t_cluster *ccl=cl;
-
- while (ccl != NULL) {
- if (strcmp(minimum,ccl->minimum)==0)
- return ccl;
- ccl=ccl->next;
- }
- return NULL;
-}
-
-static void add_cluster(t_cluster **cl,int ndih,char *minimum)
-{
- t_cluster *loper;
- t_cluster *ccl;
-
- snew(ccl,1);
- ccl->ndih=ndih;
- ccl->freq=1;
- ccl->minimum=strdup(minimum);
- ccl->next=NULL;
-
- if (*cl == NULL)
- *cl=ccl;
- else {
- loper=*cl;
- while (loper->next != NULL)
- loper=loper->next;
- loper->next=ccl;
- }
-}
-
-static void p_cluster(FILE *out,t_cluster *cl)
-{
- t_cluster *loper;
-
- fprintf(out,"* * * C L U S T E R A N A L Y S I S * * *\n\n");
- fprintf(out," Frequency Dihedral minima\n");
- loper=cl;
- while (loper != NULL) {
- fprintf(out,"%10d %s\n",loper->freq,loper->minimum);
- loper=loper->next;
- }
-}
-
-static void ana_cluster(FILE *out, t_xrama *xr,real **dih,real time[],
- t_topology *top,int nframes,int mult)
-{
- t_cluster *cl=NULL,*scl;
- char *minimum;
- int i,j,nx;
-
- /* Number of dihedrals + terminating NULL
- * this allows for using string routines
- */
- snew(minimum,xr->ndih+1);
-
- for(i=0; (i<nframes); i++) {
- nx=0;
- for(j=0; (j<xr->ndih); j++) {
- minimum[j] = vphi(&xr->dih[j],dih[j][i],
- mult == -1 ? xr->dih[j].mult : mult);
- if (minimum[j] == NOMIN)
- nx++;
- }
- if (nx == 0) {
- if ((scl=search_cluster(cl,minimum)) == NULL)
- add_cluster(&cl,xr->ndih,minimum);
- else
- scl->freq++;
- }
- }
- p_cluster(out,cl);
-
- sfree(minimum);
-}
-
-static void ana_trans(FILE *out, t_xrama *xr,real **dih,real time[],
- t_topology *top,int nframes, const output_env_t oenv)
-{
- FILE *outd;
- real prev_phi,prev_psi;
- int i,j,phi,psi;
- char buf[10];
-
- fprintf(out,"\n\t* * * D I H E D R A L S T A T I S T I C S * * *\n\n");
- fprintf(out,"%-10s %10s %10s %10s %10s %10s %10s\n",
- "index","minimum","average","maximum","variance","std.dev",
- "transition");
- for(i=0; (i<xr->ndih); i++) {
- sprintf(buf,"dih-%d",i);
- ana_dih(out,buf,nframes,dih[i],&(xr->dih[i]));
- }
- for(i=0; (i<xr->npp); i++) {
- sprintf(buf,"%s",xr->pp[i].label);
- outd=xvgropen(buf,"Dihedral Angles","Time (ps)","Degrees",oenv);
-
- phi=xr->pp[i].iphi;
- psi=xr->pp[i].ipsi;
- prev_phi=dih[phi][0];
- prev_psi=dih[psi][0];
- for(j=0; (j<nframes); j++) {
- /* PBC.. */
- if ((dih[phi][j]-prev_phi) > 180)
- dih[phi][j]-=360;
- else if ((dih[phi][j]-prev_phi) < -180)
- dih[phi][j]+=360;
- prev_phi=dih[phi][j];
- if ((dih[psi][j]-prev_psi) > 180)
- dih[psi][j]-=360;
- else if ((dih[psi][j]-prev_psi) < -180)
- dih[psi][j]+=360;
- prev_psi=dih[psi][j];
- fprintf(outd,"%10g %10g %10g\n",time[j],prev_phi,prev_psi);
- }
- ffclose(outd);
- }
-}
-
-int gmx_dih(int argc,char *argv[])
-{
- const char *desc[] = {
- "[TT]g_dih[tt] can do two things. The default is to analyze dihedral transitions",
- "by merely computing all the dihedral angles defined in your topology",
- "for the whole trajectory. When a dihedral flips over to another minimum",
- "an angle/time plot is made.[PAR]",
- "The opther option is to discretize the dihedral space into a number of",
- "bins, and group each conformation in dihedral space in the",
- "appropriate bin. The output is then given as a number of dihedral",
- "conformations sorted according to occupancy."
- };
- static int mult = -1;
- static gmx_bool bSA = FALSE;
- t_pargs pa[] = {
- { "-sa", FALSE, etBOOL, {&bSA},
- "Perform cluster analysis in dihedral space instead of analysing dihedral transitions." },
- { "-mult", FALSE, etINT, {&mult},
- "mulitiplicity for dihedral angles (by default read from topology)" }
- };
- FILE *out;
- t_xrama *xr;
- t_topology *top;
- real **dih,*time;
- real dd;
- int i,nframes,maxframes=1000;
- output_env_t oenv;
- t_filenm fnm[] = {
- { efTRX, "-f", NULL, ffREAD },
- { efTPX, NULL, NULL, ffREAD },
- { efOUT, NULL, NULL, ffWRITE }
- };
-#define NFILE asize(fnm)
-
- parse_common_args(&argc,argv,PCA_CAN_VIEW | PCA_CAN_TIME | PCA_BE_NICE,
- NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL,&oenv);
-
- if (mult != -1)
- fprintf(stderr,"Using %d for dihedral multiplicity rather than topology values\n",mult);
-
- snew(xr,1);
- init_rama(oenv,ftp2fn(efTRX,NFILE,fnm),
- ftp2fn(efTPX,NFILE,fnm),xr,3);
- top=read_top(ftp2fn(efTPX,NFILE,fnm),NULL);
-
- /* Brute force malloc, may be too big... */
- snew(dih,xr->ndih);
- for(i=0; (i<xr->ndih); i++)
- snew(dih[i],maxframes);
- snew(time,maxframes);
-
- fprintf(stderr,"\n");
- nframes = 0;
- while (new_data(xr)) {
- for(i=0; (i<xr->ndih); i++) {
- dd=xr->dih[i].ang*RAD2DEG;
- while (dd < 0)
- dd+=360;
- while (dd > 360)
- dd-=360;
- dih[i][nframes]=dd;
- }
- time[nframes]=xr->t;
- nframes++;
- if (nframes > maxframes) {
- maxframes += 1000;
- for(i=0; (i<xr->ndih); i++)
- srenew(dih[i],maxframes);
- srenew(time,maxframes);
- }
- }
-
- fprintf(stderr,"\nCalculated all dihedrals, now analysing...\n");
-
- out=ftp2FILE(efOUT,NFILE,fnm,"w");
-
- if (bSA) {
- /* Cluster and structure analysis */
- ana_cluster(out,xr,dih,time,top,nframes,mult);
- }
- else {
- /* Analyse transitions... */
- ana_trans(out,xr,dih,time,top,nframes,oenv);
- }
- ffclose(out);
-
- thanx(stderr);
-
- return 0;
-}
nfn = opt2fn_null("-n",NFILE,fnm);
if (( nfn == NULL ) && ( xfn == NULL))
- gmx_fatal(FARGS,"no index file and no structure file suplied");
+ gmx_fatal(FARGS,"no index file and no structure file supplied");
if ((disre_frac < 0) || (disre_frac >= 1))
gmx_fatal(FARGS,"disre_frac should be between 0 and 1");
"of the",
"helix in nm. This is simply the average rise (see above) times the",
"number of helical residues (see below).[BR]",
- "[BB]5.[bb] Number of helical residues (file [TT]n-ahx.xvg[tt]). The title says",
- "it all.[BR]",
- "[BB]6.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
- "[BB]7.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
+ "[BB]5.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
+ "[BB]6.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
"atoms only (file [TT]rms-ahx.xvg[tt]).[BR]",
- "[BB]8.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
- "[BB]9.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
- "[BB]10.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
+ "[BB]7.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
+ "[BB]8.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
+ "[BB]9.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
"[PAR]"
};
static const char *ppp[efhNR+2] = {
static const char *desc[] = {
"[TT]make_edi[tt] generates an essential dynamics (ED) sampling input file to be used with [TT]mdrun[tt]",
"based on eigenvectors of a covariance matrix ([TT]g_covar[tt]) or from a",
- "normal modes anaysis ([TT]g_nmeig[tt]).",
+ "normal modes analysis ([TT]g_nmeig[tt]).",
"ED sampling can be used to manipulate the position along collective coordinates",
"(eigenvectors) of (biological) macromolecules during a simulation. Particularly,",
"it may be used to enhance the sampling efficiency of MD simulations by stimulating",
"[TT]-radcon[tt]: perform acceptance radius contraction along selected eigenvectors",
"towards a target structure specified with [TT]-tar[tt].[PAR]",
"NOTE: each eigenvector can be selected only once. [PAR]",
- "[TT]-outfrq[tt]: frequency (in steps) of writing out projections etc. to [TT].edo[tt] file[PAR]",
+ "[TT]-outfrq[tt]: frequency (in steps) of writing out projections etc. to [TT].xvg[tt] file[PAR]",
"[TT]-slope[tt]: minimal slope in acceptance radius expansion. A new expansion",
"cycle will be started if the spontaneous increase of the radius (in nm/step)",
"is less than the value specified.[PAR]",
"before a new cycle is started.[PAR]",
"Note on the parallel implementation: since ED sampling is a 'global' thing",
"(collective coordinates etc.), at least on the 'protein' side, ED sampling",
- "is not very parallel-friendly from an implentation point of view. Because",
+ "is not very parallel-friendly from an implementation point of view. Because",
"parallel ED requires some extra communication, expect the performance to be",
- "lower as in a free MD simulation, especially on a large number of nodes. [PAR]",
- "All output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a .edo file. In the output",
- "file, per OUTFRQ step the following information is present: [PAR]",
- "[TT]*[tt] the step number[BR]",
- "[TT]*[tt] the number of the ED dataset. ([BB]Note[bb] that you can impose multiple ED constraints in",
+ "lower as in a free MD simulation, especially on a large number of nodes and/or",
+ "when the ED group contains a lot of atoms. [PAR]",
+ "Please also note that if your ED group contains more than a single protein,",
+ "then the [TT].tpr[tt] file must contain the correct PBC representation of the ED group.",
+        "Take a look at the initial RMSD from the reference structure, which is printed"
+ "out at the start of the simulation; if this is much higher than expected, one",
+ "of the ED molecules might be shifted by a box vector. [PAR]",
+ "All ED-related output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a [TT].xvg[tt] file",
+ "as a function of time in intervals of OUTFRQ steps.[PAR]",
+ "[BB]Note[bb] that you can impose multiple ED constraints and flooding potentials in",
"a single simulation (on different molecules) if several [TT].edi[tt] files were concatenated",
- "first. The constraints are applied in the order they appear in the [TT].edi[tt] file.) [BR]",
- "[TT]*[tt] RMSD (for atoms involved in fitting prior to calculating the ED constraints)[BR]",
- "* projections of the positions onto selected eigenvectors[BR]",
+ "first. The constraints are applied in the order they appear in the [TT].edi[tt] file. ",
+ "Depending on what was specified in the [TT].edi[tt] input file, the output file contains for each ED dataset[PAR]",
+ "[TT]*[tt] the RMSD of the fitted molecule to the reference structure (for atoms involved in fitting prior to calculating the ED constraints)[BR]",
+ "[TT]*[tt] projections of the positions onto selected eigenvectors[BR]",
"[PAR][PAR]",
"FLOODING:[PAR]",
"with [TT]-flood[tt], you can specify which eigenvectors are used to compute a flooding potential,",
"is kept in that region.",
"[PAR]",
"The origin is normally the average structure stored in the [TT]eigvec.trr[tt] file.",
- "It can be changed with [TT]-ori[tt] to an arbitrary position in configurational space.",
+ "It can be changed with [TT]-ori[tt] to an arbitrary position in configuration space.",
"With [TT]-tau[tt], [TT]-deltaF0[tt], and [TT]-Eflnull[tt] you control the flooding behaviour.",
"Efl is the flooding strength, it is updated according to the rule of adaptive flooding.",
"Tau is the time constant of adaptive flooding, high [GRK]tau[grk] means slow adaption (i.e. growth). ",
{ "-flood", FALSE, etSTR, {&evSelections[2]},
"Indices of eigenvectors for flooding"},
{ "-outfrq", FALSE, etINT, {&edi_params.outfrq},
- "Freqency (in steps) of writing output in [TT].edo[tt] file" },
+        "Frequency (in steps) of writing output in [TT].xvg[tt] file" },
{ "-slope", FALSE, etREAL, { &edi_params.slope},
"Minimal slope in acceptance radius expansion"},
{ "-linstep", FALSE, etSTR, {&evParams[0]},
*(top.atoms.atomname[index[i]]));
fprintf(fp,"%5d %10.5f %10.5f\n",
- bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,rmsf[i]*bfac,
+ bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,rmsf[i]*bfac,
pdb_bfac);
}
}
if (!bRes || i+1==isize ||
top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
fprintf(fp,"%5d %8.4f\n",
- bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+ bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
ffclose(fp);
}
if (!bRes || i+1==isize ||
top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
fprintf(fp,"%5d %8.4f\n",
- bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+ bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
ffclose(fp);
}
/* Returns TRUE when "opt" is needed at launch time */
static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
{
- /* Apart from the input .tpr we need all options that were set
+ /* Apart from the input .tpr and the error log we need all options that were set
* on the command line and that do not start with -b */
- if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2))
+ if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2) || 0 == strncmp(opt,"-err", 4))
+ {
return FALSE;
+ }
- if (bSet)
- return TRUE;
- else
- return FALSE;
+ return bSet;
}
{ efXVG, "-tpi", "tpi", ffOPTWR },
{ efXVG, "-tpid", "tpidist", ffOPTWR },
{ efEDI, "-ei", "sam", ffOPTRD },
- { efEDO, "-eo", "sam", ffOPTWR },
+ { efXVG, "-eo", "edsam", ffOPTWR },
{ efGCT, "-j", "wham", ffOPTRD },
{ efGCT, "-jo", "bam", ffOPTWR },
{ efXVG, "-ffout", "gct", ffOPTWR },
{ efSTO, "-bc", "bench", ffWRITE },
{ efEDR, "-be", "bench", ffWRITE },
{ efLOG, "-bg", "bench", ffWRITE },
- { efEDO, "-beo", "bench", ffOPTWR },
+ { efXVG, "-beo", "benchedo", ffOPTWR },
{ efXVG, "-bdhdl", "benchdhdl",ffOPTWR },
{ efXVG, "-bfield", "benchfld" ,ffOPTWR },
{ efXVG, "-btpi", "benchtpi", ffOPTWR },