set(CPACK_COMPONENT_GROUP_TOOLS_DESCRIPTION "All GROMACS executable tools")
set(CPACK_COMPONENT_GROUP_MDRUN_DESCRIPTION "GROMACS executable for running simulations")
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+# This needs to be set before project() in order to pick up toolchain files
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Platform)
+
project(Gromacs C)
include(Dart)
mark_as_advanced(DART_ROOT)
# provide backward compatibility for software written against the Gromacs API.
set(API_VERSION ${NUM_VERSION})
-# Cmake modules/macros are in a subdirectory to keep this file cleaner
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
endif()
option(GMX_FORCE_CXX "Enable C++ compilation even if not necessary" OFF)
mark_as_advanced(GMX_FORCE_CXX)
-option(GMX_NO_QUOTES "Disable Gromacs cool quotes" OFF)
+option(GMX_COOL_QUOTES "Enable Gromacs cool quotes" ON)
+mark_as_advanced(GMX_COOL_QUOTES)
if(GMX_GPU OR GMX_OPENMM OR GMX_FORCE_CXX)
enable_language(CXX)
endif()
set(CMAKE_PREFIX_PATH "" CACHE STRING "Extra locations to search for external libraries and tools (give directory without lib, bin, or include)")
-########################################################################
-# Fix stupid flags on Windows
-########################################################################
-SET(SHARED_LIBS_DEFAULT ON)
-IF( WIN32 AND NOT CYGWIN)
- if (NOT BUILD_SHARED_LIBS)
- option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
- mark_as_advanced(GMX_PREFER_STATIC_LIBS)
- SET(SHARED_LIBS_DEFAULT OFF)
- else()
- add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
- endif()
-
- IF (GMX_PREFER_STATIC_LIBS)
- #Only setting Debug and Release flags. Others configurations current not used.
- STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
- SET(CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE} CACHE STRING "" FORCE)
- STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
- SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
- if(CMAKE_CXX_COMPILER_LOADED)
- STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
- SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
- STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
- SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
- endif()
- ENDIF()
-
- #Workaround for cmake bug 13174. Replace deprecated options.
- IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
- if(BUILD_SHARED_LIBS)
- STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
- SET(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} CACHE STRING "" FORCE)
- endif()
- STRING(REPLACE /GZ /RTC1 CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
- SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
- ENDIF()
- IF( CMAKE_CXX_COMPILER_ID MATCHES "Intel" AND CMAKE_CXX_COMPILER_LOADED)
- STRING(REPLACE /GZ /RTC1 CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
- STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
- SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
-
- STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
- SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
- ENDIF()
-ENDIF()
-
-
########################################################################
# User input options #
########################################################################
option(GMX_THREAD_MPI "Build a thread-MPI-based multithreaded version of GROMACS (not compatible with MPI)" ON)
option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
mark_as_advanced(GMX_SOFTWARE_INVSQRT)
-option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" OFF)
-mark_as_advanced(GMX_POWERPC_INVSQRT)
option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
mark_as_advanced(GMX_FAHCORE)
option(GMX_CYCLE_SUBCOUNTERS "Enable cycle subcounters to get more detailed cycle timings" OFF)
mark_as_advanced(GMX_CYCLE_SUBCOUNTERS)
+option(GMX_SKIP_DEFAULT_CFLAGS "Don't automatically add suggested/required compiler flags." OFF)
+mark_as_advanced(GMX_SKIP_DEFAULT_CFLAGS)
+
######################################################################
# Compiler tests
# These need to be done early (before further tests).
if(GMX_SOFTWARE_INVSQRT)
set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
endif(GMX_SOFTWARE_INVSQRT)
-if(GMX_POWERPC_INVSQRT)
- set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
-endif(GMX_POWERPC_INVSQRT)
########################################################################
#Process MPI settings
########################################################################
# Find external packages #
########################################################################
+SET(SHARED_LIBS_DEFAULT ON)
if(UNIX)
if(GMX_PREFER_STATIC_LIBS)
# On Linux .a is the static library suffix, on Mac OS X .lib can also
# be used, so we'll add both to the preference list.
SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib;.a" ${CMAKE_FIND_LIBRARY_SUFFIXES})
- if(SHARED_LIBS_DEFAULT)
- if(BUILD_SHARED_LIBS) #Warn the user about the combination. But don't overwrite the request.
- message(WARNING "Static libraries requested, and shared Gromacs libraries requested.")
- elseif(NOT DEFINED BUILD_SHARED_LIBS) #Change default to OFF. Don't warn if it's already off.
- message(WARNING "Static libraries requested, the GROMACS libraries will also be build static (BUILD_SHARED_LIBS=OFF)")
- set(SHARED_LIBS_DEFAULT OFF)
- endif()
+ if(BUILD_SHARED_LIBS) #Warn the user about the combination. But don't overwrite the request.
+ message(WARNING "Static libraries requested, and shared Gromacs libraries requested.")
+ elseif(NOT DEFINED BUILD_SHARED_LIBS) #Change default to OFF. Don't warn if it's already off.
+ message(WARNING "Static libraries requested, the GROMACS libraries will also be build static (BUILD_SHARED_LIBS=OFF)")
+ set(SHARED_LIBS_DEFAULT OFF)
endif()
endif()
endif()
-option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic with MPI, Windows)" ${SHARED_LIBS_DEFAULT})
+
+IF( WIN32 AND NOT CYGWIN)
+  SET(SHARED_LIBS_DEFAULT OFF) #because shared libs on Windows are still new - turning them off by default
+ if (NOT BUILD_SHARED_LIBS)
+ option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
+ if(NOT GMX_PREFER_STATIC_LIBS)
+ message(WARNING "Shared system libraries requested, and static Gromacs libraries requested.")
+ endif()
+ else()
+ option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" OFF)
+ if(GMX_PREFER_STATIC_LIBS)
+      #this combination segfaults (illegal passing of file handles)
+ message(FATAL_ERROR "Static system libraries requested, and shared Gromacs libraries requested.")
+ endif()
+ add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
+    set(PKG_CFLAGS "${PKG_CFLAGS} -DUSE_VISIBILITY -DTMPI_USE_VISIBILITY")
+ endif()
+ mark_as_advanced(GMX_PREFER_STATIC_LIBS)
+
+ IF (GMX_PREFER_STATIC_LIBS)
+    #Only setting Debug and Release flags. Other configurations are currently not used.
+ STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+ STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+ if(CMAKE_CXX_COMPILER_LOADED)
+ STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+ STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+ endif()
+ ENDIF()
+ IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
+ if(BUILD_SHARED_LIBS) #not sure why incremental building with shared libs doesn't work
+ STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+ endif()
+ ENDIF()
+ENDIF()
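+# Configuration sketch (hypothetical command line, not part of this
+# patch): a fully static Windows build combines the two requests as
+#   cmake .. -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON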
+
+option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic e.g. with MPI)" ${SHARED_LIBS_DEFAULT})
option(GMX_GSL "Add support for gsl" OFF)
if (GMX_GSL)
set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_INTERNAL_XDR")
endif(NOT GMX_SYSTEM_XDR)
+# include avx test source, used if the AVX flags are set below
+include(gmxTestAVXMaskload)
+
# Process nonbonded accelerated kernels settings
string(TOUPPER ${GMX_CPU_ACCELERATION} GMX_CPU_ACCELERATION)
if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
# nothing to do
elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
- GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" ACCELERATION_C_FLAGS)
if(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
endif(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
if (CMAKE_CXX_COMPILER_LOADED)
- GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" ACCELERATION_CXX_FLAGS)
if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
endif()
    # We don't warn about lacking SSE2 flag support, since that is probably standard today.
# Only test the include after we have tried to add the correct flag for SSE2 support
- check_include_file(emmintrin.h HAVE_EMMINTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(emmintrin.h HAVE_EMMINTRIN_H ${ACCELERATION_C_FLAGS})
if(NOT HAVE_EMMINTRIN_H)
message(FATAL_ERROR "Cannot find emmintrin.h, which is required for SSE2 intrinsics support.")
elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
- GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" ACCELERATION_C_FLAGS)
if (NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" ACCELERATION_C_FLAGS)
endif(NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
- message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
        # Not surprising if we end up here! MSVC currently does not support the SSE4.1 flag. However, it appears to accept SSE4.1
- # intrinsics when SSE2 support is enabled, so we try that instead.
+ # intrinsics when SSE2 support is enabled, so we try that instead first.
if (GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
+ message(WARNING "Neither SSE4.1 or SSE2 seems to be supported by your Windows compiler. Something is likely broken.")
+ else()
+ message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance")
endif()
endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
if (CMAKE_CXX_COMPILER_LOADED)
-        GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAG)
+        GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" ACCELERATION_CXX_FLAGS)
if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
            # Not surprising if we end up here! MSVC currently does not support the SSE4.1 flag. However, it appears to accept SSE4.1
# intrinsics when SSE2 support is enabled, so we try that instead.
if (GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
endif()
endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
endif()
# This must come after we have added the -msse4.1 flag on some platforms.
- check_include_file(smmintrin.h HAVE_SMMINTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(smmintrin.h HAVE_SMMINTRIN_H ${ACCELERATION_C_FLAGS})
if(NOT HAVE_SMMINTRIN_H)
message(FATAL_ERROR "Cannot find smmintrin.h, which is required for SSE4.1 intrinsics support.")
# Set the AVX compiler flag for both these choices!
- GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" ACCELERATION_C_FLAGS)
if (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" ACCELERATION_C_FLAGS)
endif (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
endif (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
if (CMAKE_CXX_COMPILER_LOADED)
- GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
- GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
# Set the FMA4 flags (MSVC doesn't require any)
if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
- GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
+    if (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+ message(FATAL_ERROR "Clang up to at least version 3.2 produces incorrect code for AVX_128_FMA. Sorry, but you will have to select a different compiler or acceleration.")
+ endif()
+ GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" ACCELERATION_C_FLAGS)
if (NOT GNU_FMA_CFLAG)
message(WARNING "No C FMA4 flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
endif(NOT GNU_FMA_CFLAG)
- GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" GROMACS_C_FLAGS)
+ GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" ACCELERATION_C_FLAGS)
# No big deal if we do not have xop, so no point yelling warnings about it.
if (CMAKE_CXX_COMPILER_LOADED)
- GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" ACCELERATION_CXX_FLAGS)
if (NOT GNU_FMA_CXXFLAG)
message(WARNING "No C++ FMA flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
endif (NOT GNU_FMA_CXXFLAG)
- GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" GROMACS_CXX_FLAGS)
+ GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" ACCELERATION_CXX_FLAGS)
# No big deal if we do not have xop, so no point yelling warnings about it.
endif()
endif()
# Only test the header after we have tried to add the flag for AVX support
- check_include_file(immintrin.h HAVE_IMMINTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(immintrin.h HAVE_IMMINTRIN_H ${ACCELERATION_C_FLAGS})
if(NOT HAVE_IMMINTRIN_H)
message(FATAL_ERROR "Cannot find immintrin.h, which is required for AVX intrinsics support. Consider switching compiler.")
if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_256")
try_compile(TEST_AVX ${CMAKE_BINARY_DIR}
"${CMAKE_SOURCE_DIR}/cmake/TestAVX.c"
- COMPILE_DEFINITIONS "${GROMACS_C_FLAGS}")
+ COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}")
if(NOT TEST_AVX)
message(FATAL_ERROR "Cannot compile AVX intrinsics. Consider switching compiler.")
endif()
endif()
# GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
- check_include_file(x86intrin.h HAVE_X86INTRIN_H ${GROMACS_C_FLAGS})
- check_include_file(intrin.h HAVE_INTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(x86intrin.h HAVE_X86INTRIN_H ${ACCELERATION_C_FLAGS})
+ check_include_file(intrin.h HAVE_INTRIN_H ${ACCELERATION_C_FLAGS})
# The user should not be able to set this orthogonally to the acceleration
set(GMX_X86_SSE4_1 1)
endif()
endif()
+    # Unfortunately gcc-4.5.2 and gcc-4.6.0 have a bug where they use the wrong datatype for the formal
+    # parameter of the mask for maskload/maskstore arguments. Check if it is present, since we can work around it.
+ gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG)
+
elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
# GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
if (NOT ACCELERATION_QUIETLY)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
- set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI not compatible with BlueGene, disabled!" FORCE)
set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
# The automatic testing for endianness does not work for the BlueGene cross-compiler
set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
- set(GMX_POWER6 1)
- set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
- set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
set(COREWRAP_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/../corewrap" CACHE STRING
"Path to swindirect.h")
include_directories(${COREWRAP_INCLUDE_DIR})
+  set_property(CACHE GMX_COOL_QUOTES PROPERTY VALUE OFF)
endif(GMX_FAHCORE)
# # # # # # # # # # NO MORE TESTS AFTER THIS LINE! # # # # # # # # # # #
# these are set after everything else
-if (NOT DEFINED GROMACS_C_FLAGS_SET)
- set(GROMACS_C_FLAGS_SET true CACHE INTERNAL "Whether to reset the C flags"
- FORCE)
- set(CMAKE_C_FLAGS "${GROMACS_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING
- "Flags used by the compiler during all build types" FORCE)
- if (CMAKE_CXX_COMPILER_LOADED)
- set(CMAKE_CXX_FLAGS "${GROMACS_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING
- "Flags used by the compiler during all build types" FORCE)
+if (NOT GMX_SKIP_DEFAULT_CFLAGS)
+ set(CMAKE_C_FLAGS "${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
+ set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_CXX_FLAGS}")
+ set(CMAKE_EXE_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+ set(CMAKE_SHARED_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+else()
+ message("Recommended flags which are not added because GMX_SKIP_DEFAULT_CFLAGS=yes:")
+ message("CMAKE_C_FLAGS: ${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CFLAGS}")
+ message("CMAKE_C_FLAGS_RELEASE: ${GMXC_CFLAGS_RELEASE}")
+ message("CMAKE_C_FLAGS_DEBUG: ${GMXC_CFLAGS_DEBUG}")
+ if(CMAKE_CXX_COMPILER_LOADED)
+ message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CXXFLAGS}")
+ message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
+ message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
endif()
- set(CMAKE_EXE_LINKER_FLAGS
- "${GROMACS_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}"
- CACHE STRING "Linker flags for creating executables" FORCE)
- set(CMAKE_SHARED_LINKER_FLAGS
- "${GROMACS_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}"
- CACHE STRING "Linker flags for creating shared libraries" FORCE)
-endif (NOT DEFINED GROMACS_C_FLAGS_SET)
+ message("CMAKE_EXE_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+ message("CMAKE_SHARED_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+endif()
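+# Usage sketch (hypothetical flag values): when skipping the defaults,
+# the flags printed above have to be passed in manually, e.g.
+#   cmake .. -DGMX_SKIP_DEFAULT_CFLAGS=ON -DCMAKE_C_FLAGS="-msse2 -Wall"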
if(NOT GMX_OPENMP)
    #Unset all OpenMP flags in case OpenMP was disabled either by the user or because it was not supported
unset(OpenMP_LINKER_FLAGS CACHE)
unset(OpenMP_SHARED_LINKER_FLAGS)
endif()
+set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
######################################
# Output compiler and CFLAGS used
HEAD|Analyzing bonded interactions
g_angle|calculates distributions and correlations for angles and dihedrals
g_bond|calculates bond length distributions
-g_dih|analyzes dihedral transitions
mk_angndx|generates index files for g_angle
END
--- /dev/null
+#include <immintrin.h>
+int main()
+{
+ __m256d a;
+ __m256i mask;
+ double d[4]={1,2,3,4};
+
+ a = _mm256_setzero_pd();
+ mask = _mm256_castpd_si256(a);
+
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+ a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
+#else
+ a = _mm256_maskload_pd(d,mask);
+#endif
+    return 0;
+}
+
GMX_TEST_CFLAG(CFLAGS_WARN "-Wall" GMXC_CFLAGS)
GMX_TEST_CFLAG(CFLAGS_STDGNU "-std=gnu99" GMXC_CFLAGS)
GMX_TEST_CFLAG(CFLAGS_OPT "-ip -funroll-all-loops" GMXC_CFLAGS_RELEASE)
- GMX_TEST_CFLAG(CFLAGS_SSE2 "-msse2" GMXC_CFLAGS_RELEASE)
GMX_TEST_CFLAG(CFLAGS_X86 "-mtune=core2" GMXC_CFLAGS_RELEASE)
GMX_TEST_CFLAG(CFLAGS_IA64 "-mtune=itanium2" GMXC_CFLAGS_RELEASE)
else()
GMX_TEST_CFLAG(CFLAGS_WARN "/W2" GMXC_CFLAGS)
- GMX_TEST_CFLAG(CFLAGS_SSE2 "/arch:SSE2" GMXC_CFLAGS_RELEASE)
GMX_TEST_CFLAG(CFLAGS_X86 "/Qip" GMXC_CFLAGS_RELEASE)
endif()
endif()
endif()
GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wall" GMXC_CXXFLAGS)
GMX_TEST_CXXFLAG(CXXFLAGS_OPT "-ip -funroll-all-loops" GMXC_CXXFLAGS_RELEASE)
- GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "-msse2" GMXC_CXXFLAGS_RELEASE)
GMX_TEST_CXXFLAG(CXXFLAGS_X86 "-mtune=core2" GMXC_CXXFLAGS_RELEASE)
GMX_TEST_CXXFLAG(CXXFLAGS_IA64 "-mtune=itanium2"
GMXC_CXXFLAGS_RELEASE)
else()
GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W2" GMXC_CXXFLAGS)
- GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "/arch:SSE2" GMXC_CXXFLAGS_RELEASE)
GMX_TEST_CXXFLAG(CXXFLAGS_X86 "/Qip" GMXC_CXXFLAGS_RELEASE)
endif()
endif()
if (MSVC)
# disable warnings for:
# inconsistent dll linkage
+ # forcing value to bool (for C++)
GMX_TEST_CFLAG(CFLAGS_WARN "/wd4273" GMXC_CFLAGS)
- GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/wd4273" GMXC_CXXFLAGS)
+ GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/wd4273 /wd4800" GMXC_CXXFLAGS)
endif()
if (CMAKE_C_COMPILER_ID MATCHES "Clang")
# now actually set the flags:
# C
- if ( NOT DEFINED GMXCFLAGS_SET AND NOT DEFINED ENV{CFLAGS} )
- set(GMXCFLAGS_SET true CACHE INTERNAL "Whether to reset the C flags"
- FORCE)
-
- set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}"
- CACHE STRING "Flags used by the compiler during all build types."
- FORCE)
- set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}"
- CACHE STRING "Flags used by the compiler during release builds."
- FORCE)
- set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}"
- CACHE STRING "Flags used by the compiler during debug builds."
- FORCE)
+ if ( NOT GMX_SKIP_DEFAULT_CFLAGS )
+ set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}")
+ set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}")
+ set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}")
endif()
# C++
- if ( NOT DEFINED GMXCXXFLAGS_SET AND NOT DEFINED ENV{CXXFLAGS} AND CMAKE_CXX_COMPILER_LOADED)
- set(GMXCXXFLAGS_SET true CACHE INTERNAL "Whether to reset the C++ flags"
- FORCE)
- set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}"
- CACHE STRING "Flags used by the compiler during all build types."
- FORCE)
+ if ( NOT GMX_SKIP_DEFAULT_CFLAGS)
+ set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE
- "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}"
- CACHE STRING "Flags used by the compiler during release builds."
- FORCE)
+ "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_DEBUG
- "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}"
- CACHE STRING "Flags used by the compiler during debug builds."
- FORCE)
+ "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}")
endif()
ENDMACRO(gmx_c_flags)
endif()
find_package(MPI)
if(${${MPI_PREFIX}_FOUND})
- set(GROMACS_C_FLAGS ${GROMACS_C_FLAGS} ${${MPI_PREFIX}_COMPILE_FLAGS})
- set(GROMACS_LINKER_FLAGS ${GROMACS_LINKER_FLAGS} ${${MPI_PREFIX}_LINK_FLAGS})
+ set(MPI_COMPILE_FLAGS ${${MPI_PREFIX}_COMPILE_FLAGS})
+ set(MPI_LINKER_FLAGS ${${MPI_PREFIX}_LINK_FLAGS})
include_directories(${${MPI_PREFIX}_INCLUDE_PATH})
list(APPEND GMX_EXTRA_LIBRARIES ${${MPI_PREFIX}_LIBRARIES})
endif()
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+# GMX_TEST_AVX_GCC_MASKLOAD_BUG(AVX_CFLAGS VARIABLE)
+#
+# VARIABLE will be set if the compiler is a buggy version
+# of GCC (prior to 4.5.3, and maybe 4.6) that has an incorrect second
+# argument to the AVX _mm256_maskload_ps() intrinsic.
+#
+# You need to use this variable in a cmakedefine, and then handle
+# the case separately in your code - no automatic cure, unfortunately.
+#
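+# Usage sketch, mirroring the call and the cmakedefine that appear
+# elsewhere in this patch:
+#   gmx_test_avx_gcc_maskload_bug("${ACCELERATION_C_FLAGS}" GMX_X86_AVX_GCC_MASKLOAD_BUG)
+#   #cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG   (in the config header template)
+#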
+MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG AVX_CFLAGS VARIABLE)
+ IF(NOT DEFINED ${VARIABLE})
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug")
+ # some compilers like clang accept both cases,
+ # so first try a normal compile to avoid flagging those as buggy.
+ TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+ "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+ COMPILE_DEFINITIONS "${AVX_CFLAGS}" )
+ IF(${VARIABLE}_COMPILEOK)
+ SET(${VARIABLE} 0 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+ ELSE()
+ TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+ "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+ COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" )
+ IF(${VARIABLE}_COMPILEOK)
+ SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around")
+ ELSE()
+ MESSAGE(WARNING "Cannot compile AVX code - assuming gcc AVX maskload bug not present." )
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+ ENDIF()
+ ENDIF()
+ ENDIF(NOT DEFINED ${VARIABLE})
+ENDMACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE)
+
+
+
+
/* Sets the ED input/output filenames, opens output (.edo) file */
void init_edsam(gmx_mtop_t *mtop,t_inputrec *ir,t_commrec *cr,
- gmx_edsam_t ed, rvec x[], matrix box);
+ gmx_edsam_t ed, rvec x[], matrix box, edsamstate_t *edsamstate);
/* Init routine for ED and flooding. Calls init_edi in a loop for every .edi-cycle
* contained in the input file, creates a NULL terminated list of t_edpar structures */
int
gmx_dielectric(int argc,char *argv[]);
-GMX_LIBGMXANA_EXPORT
-int
-gmx_dih(int argc,char *argv[]);
-
GMX_LIBGMXANA_EXPORT
int
gmx_dipoles(int argc,char *argv[]);
*/
#ifndef GMX_CPUID_H_
#define GMX_CPUID_H_
+
+#include <stdio.h>
+
#include "visibility.h"
+
#ifdef __cplusplus
extern "C" {
#endif
#ifndef _gmx_math_x86_avx_128_fma_double_h_
#define _gmx_math_x86_avx_128_fma_double_h_
+#include <immintrin.h> /* AVX */
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h> /* FMA */
+#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
#include <math.h>
#include "gmx_x86_avx_128_fma.h"
#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h> /* FMA */
#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
#include <stdio.h>
return sse_overflow;
}
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
+
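+/* Usage sketch (illustrative only; the function name is hypothetical):
+ * load the first three floats of a coordinate triplet through the
+ * wrapper, which the work-around above makes independent of the gcc
+ * maskload mask-argument type. */
+static __m128 gmx_mm_load_xyz_example(const float *p)
+{
+    /* High bit set in elements 0..2 selects them; element 3 is zeroed */
+    __m128i mask = _mm_set_epi32(0, -1, -1, -1);
+    return gmx_mm_maskload_ps(p, mask);
+}
+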
#endif /* _gmx_x86_avx_128_fma_h_ */
return sse_overflow;
}
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
#endif /* _gmx_x86_avx_256_h_ */
unsigned excl; /* The exclusion (interaction) bits */
} nbnxn_cj_t;
+/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ * The upper bits contain information for non-bonded kernel optimization.
+ * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ * But three flags can be used to skip interactions, currently only for subc=0
+ * !(shift & NBNXN_CI_DO_LJ(subc)) => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc) => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
#define NBNXN_CI_SHIFT 127
#define NBNXN_CI_DO_LJ(subc) (1<<(7+3*(subc)))
#define NBNXN_CI_HALF_LJ(subc) (1<<(8+3*(subc)))
/* Simple pair-list i-unit */
typedef struct {
int ci; /* i-cluster */
- int shift; /* Shift vector index plus possible flags */
+ int shift; /* Shift vector index plus possible flags, see above */
int cj_ind_start; /* Start index into cj */
int cj_ind_end; /* End index into cj */
} nbnxn_ci_t;
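+/* Illustrative sketch (not part of the kernels): how a kernel could
+ * test the flag bits described above for subc=0. NBNXN_CI_DO_COUL is
+ * assumed to be defined analogously to the two macros above. */
+static void nbnxn_ci_flags_example(const nbnxn_ci_t *ci)
+{
+    int shift_index = ci->shift & NBNXN_CI_SHIFT;  /* lower 7 bits */
+
+    if (!(ci->shift & NBNXN_CI_DO_LJ(0)))
+    {
+        /* skip LJ for all pairs in this i-unit */
+    }
+    else if (ci->shift & NBNXN_CI_HALF_LJ(0))
+    {
+        /* compute LJ only for the first half of the i-atoms */
+    }
+    (void)shift_index;
+}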
}
energyhistory_t;
+typedef struct
+{
+ /* If one uses essential dynamics or flooding on a group of atoms from
+ * more than one molecule, we cannot make this group whole with
+ * do_pbc_first_mtop(). We assume that the ED group has the correct PBC
+ * representation at the beginning of the simulation and keep track
+ * of the shifts to always get it into that representation.
+ * For proper restarts from a checkpoint we store the positions of the
+ * reference group at the time of checkpoint writing */
+ gmx_bool bFromCpt; /* Did we start from a checkpoint file? */
+ int nED; /* No. of ED/Flooding data sets, if <1 no ED */
+ int *nref; /* No. of atoms in i'th reference structure */
+ int *nav; /* Same for average structure */
+ rvec **old_sref; /* Positions of the reference atoms
+ at the last time step (with correct PBC
+ representation) */
+ rvec **old_sref_p; /* Pointer to these positions */
+ rvec **old_sav; /* Same for the average positions */
+ rvec **old_sav_p;
+}
+edsamstate_t;
+
typedef struct
{
int natoms;
energyhistory_t enerhist; /* Energy history for statistics */
df_history_t dfhist; /*Free energy history for free energy analysis */
+ edsamstate_t edsamstate; /* Essential dynamics / flooding history */
int ddp_count; /* The DD partitioning count for this state */
int ddp_count_cg_gl; /* The DD part. count for index_gl */
#define INVSQRT_DONE
#endif /* gmx_invsqrt */
-#ifdef GMX_POWERPC_SQRT
-static real gmx_powerpc_invsqrt(real x)
-{
- const real half=0.5;
- const real three=3.0;
- t_convert result,bit_pattern;
- unsigned int exp,fract;
- real lu;
- real y;
-#ifdef GMX_DOUBLE
- real y2;
-#endif
-
- lu = __frsqrte((double)x);
-
- y=(half*lu*(three-((x*lu)*lu)));
-
-#if (GMX_POWERPC_SQRT==2)
- /* Extra iteration required */
- y=(half*y*(three-((x*y)*y)));
-#endif
-
-#ifdef GMX_DOUBLE
- y2=(half*y*(three-((x*y)*y)));
-
- return y2; /* 10 Flops */
-#else
- return y; /* 5 Flops */
-#endif
-}
-#define gmx_invsqrt(x) gmx_powerpc_invsqrt(x)
-#define INVSQRT_DONE
-#endif /* powerpc_invsqrt */
-
#ifndef INVSQRT_DONE
# ifdef GMX_DOUBLE
# ifdef HAVE_RSQRT
# If you only use one shell you can copy that GMXRC.* instead.
-# only csh/tcsh understand 'set'
-set is_csh = 123
-test "$is_csh" = 123 && goto CSH
+# only csh/tcsh set the variable $shell (note: lower case!)
+test $shell && goto CSH
# if we got here, shell is bsh/bash/zsh/ksh
# bsh cannot remove part of a variable with %%
<br><a href=online/g_density.html>g_density</a>
<br><a href=online/g_densmap.html>g_densmap</a>
<br><a href=online/g_dielectric.html>g_dielectric</a>
-<br><a href=online/g_dih.html>g_dih</a>
<br><a href=online/g_dipoles.html>g_dipoles</a>
<br><a href=online/g_disre.html>g_disre</a>
<br><a href=online/g_dist.html>g_dist</a>
<TR><TD><A HREF="online/g_bond.html">g_bond</A></TD><TD>calculates bond length distributions</TD>
<TR><TD><A HREF="online/mk_angndx.html">mk_angndx</A></TD><TD>generates index files for g_angle</TD>
<TR><TD><A HREF="online/g_angle.html">g_angle</A></TD><TD>calculates distributions and correlations for angles and dihedrals</TD>
-<TR><TD><A HREF="online/g_dih.html">g_dih</A></TD><TD>analyzes dihedral transitions</TD>
</TABLE>
<A NAME="HNR11">
+++ /dev/null
-<HTML>
-<HEAD>
-<TITLE>g_dih</TITLE>
-<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
-<TABLE WIDTH="98%" NOBORDER >
-<TR><TD WIDTH=400>
-<TABLE WIDTH=400 NOBORDER>
-<TD WIDTH=116>
-<a href="http://www.gromacs.org/"><img SRC="../images/gmxlogo_small.png"BORDER=0 </a></td>
-<td ALIGN=LEFT VALIGN=TOP WIDTH=280><br><h2>g_dih</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
-</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p><B>VERSION 4.5<br>
-Thu 26 Aug 2010</B></td></tr></TABLE>
-<HR>
-<H3>Description</H3>
-<p>
-g_dih can do two things. The default is to analyze dihedral transitions
-by merely computing all the dihedral angles defined in your topology
-for the whole trajectory. When a dihedral flips over to another minimum
-an angle/time plot is made.<p>
-The opther option is to discretize the dihedral space into a number of
-bins, and group each conformation in dihedral space in the
-appropriate bin. The output is then given as a number of dihedral
-conformations sorted according to occupancy.
-<P>
-<H3>Files</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>filename</TH><TH>type</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-f</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> traj.xtc</a></tt> </TD><TD> Input </TD><TD> Trajectory: <a href="xtc.html">xtc</a> <a href="trr.html">trr</a> <a href="trj.html">trj</a> <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> cpt </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-s</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> topol.tpr</a></tt> </TD><TD> Input </TD><TD> Run input file: <a href="tpr.html">tpr</a> <a href="tpb.html">tpb</a> <a href="tpa.html">tpa</a> </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-o</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="out.html"> hello.out</a></tt> </TD><TD> Output </TD><TD> Generic output file </TD></TR>
-</TABLE>
-<P>
-<H3>Other options</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>type</TH><TH>default</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> Print help info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]version</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> Print version info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0 </tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0 </tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0 </tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]sa</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no </tt> </TD><TD> Perform cluster analysis in dihedral space instead of analysing dihedral transitions. </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-mult</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> mulitiplicity for dihedral angles (by default read from topology) </TD></TD>
-</TABLE>
-<P>
-<hr>
-<div ALIGN=RIGHT>
-<font size="-1"><a href="http://www.gromacs.org">http://www.gromacs.org</a></font><br>
-<font size="-1"><a href="mailto:gromacs@gromacs.org">gromacs@gromacs.org</a></font><br>
-</div>
-</BODY>
<dl>
<dt>C format
-<dd><tt>"%5d%5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
+<dd><tt>"%5d%-5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
<dt>Fortran format
<dd><tt>(i5,2a5,i5,3f8.3,3f8.4)</tt>
<dt>Pascal format
use <b>verlet-buffer-drift</b>=-1 and set <b>rlist</b> manually.</dd>
<dt><b>rlist: (1) [nm]</b></dt>
-<dd>Cut-off distance for the short-range neighbor list, should be ≥ 0.
+<dd>Cut-off distance for the short-range neighbor list.
With <b>cutoff-scheme</b>=<b>Verlet</b>, this is by default set by the
<b>verlet-buffer-drift</b> option and the value of <b>rlist</b> is ignored.</dd>
<dt><b>rcoulomb-switch: (0) [nm]</b></dt>
<dd>where to start switching the Coulomb potential</dd>
-<dt><b>rcoulomb: (-1) [nm]</b></dt>
-<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx-->, should be ≥ 0</dd>
+<dt><b>rcoulomb: (1) [nm]</b></dt>
+<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx--></dd>
<dt><b>epsilon-r: (1)</b></dt>
<dd>The relative <!--Idx-->dielectric constant<!--EIdx-->.
<dt><b>rvdw-switch: (0) [nm]</b></dt>
<dd>where to start switching the LJ potential</dd>
-<dt><b>rvdw: (-1) [nm]</b></dt>
-<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx-->, should be ≥ 0</dd>
+<dt><b>rvdw: (1) [nm]</b></dt>
+<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx--></dd>
<dt><b>DispCorr:</b></dt>
<dd><dl compact></dd>
DEPENDS ${GROMACS_HEADERS})
add_custom_target(gromacs_include_links DEPENDS gromacs)
-add_executable(template template.c)
-remove_definitions( -DHAVE_CONFIG_H )
-add_definitions("${PKG_CFLAGS}")
-target_link_libraries(template gmx)
-include_directories("${CMAKE_CURRENT_BINARY_DIR}")
-add_dependencies(template gromacs_include_links)
+option(GMX_BUILD_TEMPLATE "Build gromacs template program" ON)
+mark_as_advanced(GMX_BUILD_TEMPLATE)
+# GMX_PREFER_STATIC_OPENMP=yes is a special case for building binaries
+# to distribute, and as the template is not installed it can be
+# ignored.
+# The template is built in a user-like environment, hence we use
+# flags from PKG_CFLAGS. Again, GMX_PREFER_STATIC_OPENMP=yes would
+# need special link flags (OpenMP_LINKER_FLAGS), which are not
+# very user-like.
+if (GMX_BUILD_TEMPLATE AND NOT GMX_PREFER_STATIC_OPENMP)
+ add_executable(template template.c)
+ remove_definitions( -DHAVE_CONFIG_H )
+ add_definitions("${PKG_CFLAGS}")
+ target_link_libraries(template gmx)
+ include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+ add_dependencies(template gromacs_include_links)
+endif()
install(FILES README template.c Makefile.pkg
DESTINATION ${DATA_INSTALL_DIR}/template
g_covar
g_density
g_dielectric
-g_dih
g_dipoles
g_disre
g_dist
/* AVX 256-bit instructions available */
#cmakedefine GMX_X86_AVX_256
+/* GCC bug in AVX maskload/maskstore arguments - worked around internally */
+#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG
+
/* SSE2 was selected as CPU acceleration level */
#cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
/* Use the GROMACS software 1/sqrt(x) */
#cmakedefine GMX_SOFTWARE_INVSQRT
-/* Use the PowerPC hardware 1/sqrt(x) */
-#cmakedefine GMX_POWERPC_INVSQRT
-
/* Use sub-counters */
#cmakedefine GMX_CYCLE_SUBCOUNTERS
/* Build special-purpose mdrun library */
#cmakedefine GMX_FAHCORE
-/* Disable gromacs quotes */
-#cmakedefine GMX_NO_QUOTES
+/* Enable gromacs quotes */
+#cmakedefine GMX_COOL_QUOTES
#ifdef GMX_FAHCORE
#define FULLINDIRECT 1
ap->bAvail[i] = FALSE;
}
}
- upstring(atomnm);
- upstring(resnm);
ap->atomnm[ap->nprop] = strdup(atomnm);
ap->resnm[ap->nprop] = strdup(resnm);
j = ap->nprop;
else {
strncpy(atomname,atomnm,MAXQ-1);
}
- upstring(atomname);
strncpy(resname,resnm,MAXQ-1);
- upstring(resname);
j = get_prop_index(&(ap->prop[eprop]),ap->restype,resname,
atomname,&bExact);
* But old code can not read a new entry that is present in the file
* (but can read a new format when new entries are not present).
*/
-static const int cpt_version = 14;
+static const int cpt_version = 15;
const char *est_names[estNR]=
}
}
+static void do_cpt_real_err(XDR *xd,const char *desc,real *f)
+{
+ bool_t res=0;
+
+#ifdef GMX_DOUBLE
+ res = xdr_double(xd,f);
+#else
+ res = xdr_float(xd,f);
+#endif
+ if (res == 0)
+ {
+ cp_error();
+ }
+}
+
+static void do_cpt_n_rvecs_err(XDR *xd,const char *desc,int n, rvec f[],FILE *list)
+{
+ int i,j;
+
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<DIM; j++)
+ {
+ do_cpt_real_err(xd, desc, &f[i][j]);
+ }
+ }
+
+ if (list)
+ {
+ pr_rvecs(list,0,desc,f,n);
+ }
+}
+
/* If nval >= 0, nval is used; on read this should match the passed value.
 * If nval < 0, *nptr is used; on read the value is stored in *nptr.
*/
int *natoms,int *ngtc, int *nnhpres, int *nhchainlength,
int *nlambda, int *flags_state,
int *flags_eks,int *flags_enh, int *flags_dfh,
+ int *nED,
FILE *list)
{
bool_t res=0;
} else {
*flags_dfh = 0;
}
+
+ if (*file_version >= 15)
+ {
+ do_cpt_int_err(xd,"ED data sets",nED,list);
+ }
+ else
+ {
+ *nED = 0;
+ }
}
static int do_cpt_footer(XDR *xd,gmx_bool bRead,int file_version)
return ret;
}
+
+/* This function stores the last whole configuration of the reference and
+ * average structure in the .cpt file
+ */
+static int do_cpt_EDstate(XDR *xd,gmx_bool bRead,
+ edsamstate_t *EDstate, FILE *list)
+{
+ int i,j;
+ int ret=0;
+ char buf[STRLEN];
+
+
+ EDstate->bFromCpt = bRead;
+
+ if (EDstate->nED <= 0)
+ {
+ return ret;
+ }
+
+ /* When reading, init_edsam has not been called yet,
+ * so we have to allocate memory first. */
+ if (bRead)
+ {
+ snew(EDstate->nref , EDstate->nED);
+ snew(EDstate->old_sref, EDstate->nED);
+ snew(EDstate->nav , EDstate->nED);
+ snew(EDstate->old_sav , EDstate->nED);
+ }
+
+ /* Read/write the last whole conformation of SREF and SAV for each ED dataset (usually only one) */
+ for (i=0; i< EDstate->nED; i++)
+ {
+ /* Reference structure SREF */
+ sprintf(buf, "ED%d # of atoms in reference structure", i+1);
+ do_cpt_int_err(xd, buf, &EDstate->nref[i],list);
+ sprintf(buf, "ED%d x_ref", i+1);
+ if (bRead)
+ {
+ snew(EDstate->old_sref[i], EDstate->nref[i]);
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref[i], list);
+ }
+ else
+ {
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref_p[i], list);
+ }
+
+ /* Average structure SAV */
+ sprintf(buf, "ED%d # of atoms in average structure", i+1);
+ do_cpt_int_err(xd, buf, &EDstate->nav[i] ,list);
+ sprintf(buf, "ED%d x_av", i+1);
+ if (bRead)
+ {
+ snew(EDstate->old_sav[i], EDstate->nav[i]);
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav[i], list);
+ }
+ else
+ {
+ do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav_p[i], list);
+ }
+ }
+
+ return ret;
+}
+
+
static int do_cpt_files(XDR *xd, gmx_bool bRead,
gmx_file_position_t **p_outputfiles, int *nfiles,
FILE *list, int file_version)
DOMAINDECOMP(cr) ? cr->dd->nc : NULL,&npmenodes,
&state->natoms,&state->ngtc,&state->nnhpres,
&state->nhchainlength,&(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+ &state->edsamstate.nED,
NULL);
sfree(version);
(do_cpt_ekinstate(gmx_fio_getxdr(fp),FALSE,flags_eks,&state->ekinstate,NULL) < 0)||
(do_cpt_enerhist(gmx_fio_getxdr(fp),FALSE,flags_enh,&state->enerhist,NULL) < 0) ||
(do_cpt_df_hist(gmx_fio_getxdr(fp),FALSE,flags_dfh,&state->dfhist,NULL) < 0) ||
+ (do_cpt_EDstate(gmx_fio_getxdr(fp),FALSE,&state->edsamstate,NULL) < 0) ||
(do_cpt_files(gmx_fio_getxdr(fp),FALSE,&outputfiles,&noutputfiles,NULL,
file_version) < 0))
{
&eIntegrator_f,simulation_part,step,t,
&nppnodes_f,dd_nc_f,&npmenodes_f,
&natoms,&ngtc,&nnhpres,&nhchainlength,&nlambda,
- &fflags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+ &fflags,&flags_eks,&flags_enh,&flags_dfh,
+ &state->edsamstate.nED,NULL);
if (bAppendOutputFiles &&
file_version >= 13 && double_prec != GMX_CPT_BUILD_DP)
cp_error();
}
+ ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+ if (ret)
+ {
+ cp_error();
+ }
+
if (file_version < 6)
{
const char *warn="Reading checkpoint file in old format, assuming that the run that generated this file started at step 0, if this is not the case the averages stored in the energy file will be incorrect.";
&version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
&eIntegrator,simulation_part,step,t,&nppnodes,dd_nc,&npme,
&state->natoms,&state->ngtc,&state->nnhpres,&state->nhchainlength,
- &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+ &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+ &state->edsamstate.nED,NULL);
ret =
do_cpt_state(gmx_fio_getxdr(fp),TRUE,state->flags,state,bReadRNG,NULL);
if (ret)
cp_error();
}
+ ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+ if (ret)
+ {
+ cp_error();
+ }
+
ret = do_cpt_files(gmx_fio_getxdr(fp),TRUE,
outputfiles != NULL ? outputfiles : &files_loc,
outputfiles != NULL ? nfiles : &nfiles_loc,
&eIntegrator,&simulation_part,&step,&t,&nppnodes,dd_nc,&npme,
&state.natoms,&state.ngtc,&state.nnhpres,&state.nhchainlength,
&(state.dfhist.nlambda),&state.flags,
- &flags_eks,&flags_enh,&flags_dfh,out);
+ &flags_eks,&flags_enh,&flags_dfh,&state.edsamstate.nED,out);
ret = do_cpt_state(gmx_fio_getxdr(fp),TRUE,state.flags,&state,TRUE,out);
if (ret)
{
ret = do_cpt_df_hist(gmx_fio_getxdr(fp),TRUE,
flags_dfh,&state.dfhist,out);
}
+
+ if (ret == 0)
+ {
+ ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state.edsamstate,out);
+ }
+
if (ret == 0)
{
do_cpt_files(gmx_fio_getxdr(fp),TRUE,&outputfiles,&nfiles,out,file_version);
 * but we don't call this routine often, and it avoids using
* a mutex for locking the variable...
*/
-#if defined(GMX_FAHCORE) || defined(GMX_NO_QUOTES)
+#ifdef GMX_COOL_QUOTES
+ return (getenv("GMX_NO_QUOTES") == NULL);
+#else
/*be uncool*/
return FALSE;
-#else
- return (getenv("GMX_NO_QUOTES") == NULL);
#endif
}
#else
fprintf(fp, "Precision: single\n");
#endif
+ fprintf(fp, "Memory model: %lu bit\n",8*sizeof(void *));
#ifdef GMX_THREAD_MPI
fprintf(fp, "MPI library: thread_mpi\n");
def_bonded ("RBDIHS", "Ryckaert-Bell.", 4, 6, 6, eNR_RB, rbdihs ),
def_bonded ("FOURDIHS", "Fourier Dih.", 4, 4, 4, eNR_FOURDIH, rbdihs ),
def_bonded ("IDIHS", "Improper Dih.", 4, 2, 2, eNR_IMPROPER,idihs ),
- def_bonded ("PIDIHS", "Improper Dih.", 4, 3, 3, eNR_PROPER, pdihs ),
+ def_bonded ("PIDIHS", "Improper Dih.", 4, 3, 3, eNR_IMPROPER, pdihs ),
def_bondedt ("TABDIHS", "Tab. Dih.", 4, 2, 2, eNR_TABDIHS, tab_dihs ),
def_bonded ("CMAP", "CMAP Dih.", 5, -1, -1, eNR_CMAP, unimplemented ),
def_bonded ("GB12", "GB 1-2 Pol.", 2, 4, 0, eNR_GB, unimplemented ),
URL: http://www.gromacs.org
Version: @PROJECT_VERSION@
Requires:
-Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@
+Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
Libs: -L${libdir} -lgmx@GMX_LIBS_SUFFIX@ -lm
Cflags: -I${includedir} @PKG_CFLAGS@
* written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
* a full list of developers and information, check out http://www.gromacs.org
*
- * This program is free software; you can redistribute it and/or modify it under
- * the terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option) any
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
* later version.
* As a special exception, you may use this file as part of a free software
* library without restriction. Specifically, if other files instantiate
* templates or use macros or inline functions from this file, or you compile
* this file and link it with other files to produce an executable, this
* file does not by itself cause the resulting executable to be covered by
- * the GNU Lesser General Public License.
+ * the GNU Lesser General Public License.
*
* In plain-speak: do not worry about classes/macros/templates either - only
* changes to the library have to be LGPL, not an application linking with it.
__m128d xmm1)
{
__m128d t2;
-
+
t2 = _mm_unpackhi_pd(xmm1,xmm1);
- _mm_store_sd(ptrA,xmm1);
- _mm_store_sd(ptrB,t2);
+ _mm_store_sd(ptrA,xmm1);
+ _mm_store_sd(ptrB,t2);
}
static void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
- _mm_store_sd(ptrA,xmm1);
+ _mm_store_sd(ptrA,xmm1);
}
double * gmx_restrict ptrB, __m128d xmm1)
{
__m128d t1;
-
+
t1 = _mm_unpackhi_pd(xmm1,xmm1);
xmm1 = _mm_add_sd(xmm1,_mm_load_sd(ptrA));
t1 = _mm_add_sd(t1,_mm_load_sd(ptrB));
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
-
+
tmp = gmx_mm_load_1real_pd(ptrA);
tmp = _mm_add_sd(tmp,xmm1);
gmx_mm_store_1real_pd(ptrA,tmp);
__m128d * gmx_restrict c12)
{
__m128d t1,t2,t3;
-
+
/* The c6/c12 array should be aligned */
t1 = _mm_loadu_pd(p1);
t2 = _mm_loadu_pd(p2);
- *c6 = _mm_unpacklo_pd(t1,t2);
- *c12 = _mm_unpackhi_pd(t1,t2);
+ *c6 = _mm_unpacklo_pd(t1,t2);
+ *c12 = _mm_unpackhi_pd(t1,t2);
}
static gmx_inline void
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1,
- __m128d * gmx_restrict y1,
- __m128d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1,
+ __m128d * gmx_restrict y1,
+ __m128d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz;
-
+
mem_xy = _mm_loadu_pd(xyz);
mem_z = _mm_load_sd(xyz+2);
mem_sxy = _mm_loadu_pd(xyz_shift);
mem_sz = _mm_load_sd(xyz_shift+2);
-
+
mem_xy = _mm_add_pd(mem_xy,mem_sxy);
mem_z = _mm_add_pd(mem_z,mem_sz);
-
+
*x1 = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
*y1 = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
*z1 = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
-
+
t1 = _mm_loadu_pd(xyz);
t2 = _mm_loadu_pd(xyz+2);
t3 = _mm_loadu_pd(xyz+4);
t4 = _mm_loadu_pd(xyz+6);
t5 = _mm_load_sd(xyz+8);
-
+
sxy = _mm_loadu_pd(xyz_shift);
sz = _mm_load_sd(xyz_shift+2);
szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-
+
t1 = _mm_add_pd(t1,sxy);
t2 = _mm_add_pd(t2,szx);
t3 = _mm_add_pd(t3,syz);
t4 = _mm_add_pd(t4,sxy);
t5 = _mm_add_sd(t5,sz);
-
+
*x1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
*y1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
*z1 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
- __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+ __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
-
+
t1 = _mm_loadu_pd(xyz);
t2 = _mm_loadu_pd(xyz+2);
t3 = _mm_loadu_pd(xyz+4);
t4 = _mm_loadu_pd(xyz+6);
t5 = _mm_loadu_pd(xyz+8);
t6 = _mm_loadu_pd(xyz+10);
-
+
sxy = _mm_loadu_pd(xyz_shift);
sz = _mm_load_sd(xyz_shift+2);
szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-
+
t1 = _mm_add_pd(t1,sxy);
t2 = _mm_add_pd(t2,szx);
t3 = _mm_add_pd(t3,syz);
t4 = _mm_add_pd(t4,sxy);
t5 = _mm_add_pd(t5,szx);
t6 = _mm_add_pd(t6,syz);
-
+
*x1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
*y1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
*z1 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
- *x = _mm_load_sd(p1);
- *y = _mm_load_sd(p1+1);
- *z = _mm_load_sd(p1+2);
+ *x = _mm_load_sd(p1);
+ *y = _mm_load_sd(p1+1);
+ *z = _mm_load_sd(p1+2);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
- *x1 = _mm_load_sd(p1);
- *y1 = _mm_load_sd(p1+1);
- *z1 = _mm_load_sd(p1+2);
- *x2 = _mm_load_sd(p1+3);
- *y2 = _mm_load_sd(p1+4);
- *z2 = _mm_load_sd(p1+5);
- *x3 = _mm_load_sd(p1+6);
- *y3 = _mm_load_sd(p1+7);
- *z3 = _mm_load_sd(p1+8);
+ *x1 = _mm_load_sd(p1);
+ *y1 = _mm_load_sd(p1+1);
+ *z1 = _mm_load_sd(p1+2);
+ *x2 = _mm_load_sd(p1+3);
+ *y2 = _mm_load_sd(p1+4);
+ *z2 = _mm_load_sd(p1+5);
+ *x3 = _mm_load_sd(p1+6);
+ *y3 = _mm_load_sd(p1+7);
+ *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
-__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+ __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrB);
t3 = _mm_loadu_pd(ptrA+2);
/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy, __m128d z)
-{
- __m128d t1,t2;
-
- t1 = _mm_loadu_pd(ptrA);
- t2 = _mm_load_sd(ptrA+2);
-
- t1 = _mm_sub_pd(t1,xy);
- t2 = _mm_sub_sd(t2,z);
-
- _mm_storeu_pd(ptrA,t1);
- _mm_store_sd(ptrA+2,t2);
-}
-
-
-static void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3)
-{
- __m128d t1,t2;
- __m128d tA,tB,tC,tD,tE;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_load_sd(ptrA+8);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_sd(tE,z3);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_store_sd(ptrA+8,tE);
-}
-
-static void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3,
- __m128d xy4, __m128d z4)
-{
- __m128d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_loadu_pd(ptrA+8);
- tF = _mm_loadu_pd(ptrA+10);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
- t3 = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
- t4 = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_pd(tE,t3);
- tF = _mm_sub_pd(tF,t4);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_storeu_pd(ptrA+8,tE);
- _mm_storeu_pd(ptrA+10,tF);
-}
-
-
static void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
__m128d t1,t2,t3;
-
+
t1 = _mm_load_sd(ptrA);
t2 = _mm_load_sd(ptrA+1);
t3 = _mm_load_sd(ptrA+2);
-
+
t1 = _mm_sub_sd(t1,x1);
t2 = _mm_sub_sd(t2,y1);
t3 = _mm_sub_sd(t3,z1);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_t1 = _mm_sub_pd(_t1,_x1);\
+_t2 = _mm_sub_pd(_t2,_z1);\
+_t3 = _mm_sub_pd(_t3,_y2);\
+_t4 = _mm_sub_pd(_t4,_x3);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
- __m128d x3, __m128d y3, __m128d z3)
+ __m128d x3, __m128d y3, __m128d z3)
{
__m128d t1,t2,t3,t4,t5;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t4 = _mm_loadu_pd(ptrA+6);
t5 = _mm_load_sd(ptrA+8);
-
+
x1 = _mm_unpacklo_pd(x1,y1);
z1 = _mm_unpacklo_pd(z1,x2);
y2 = _mm_unpacklo_pd(y2,z2);
x3 = _mm_unpacklo_pd(x3,y3);
/* nothing to be done for z3 */
-
+
t1 = _mm_sub_pd(t1,x1);
t2 = _mm_sub_pd(t2,z1);
t3 = _mm_sub_pd(t3,y2);
_mm_storeu_pd(ptrA+6,t4);
_mm_store_sd(ptrA+8,t5);
}
-
-
+#endif
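/*
 * Background for the work-around above, hedged as a sketch: 32-bit MSVC
 * passes the first three __m128/__m128d arguments in xmm registers, but
 * cannot guarantee 16-byte alignment for further ones on the stack, so a
 * declaration like the hypothetical one below is rejected (error C2719).
 * Providing the many-argument helpers as macros on that platform avoids
 * by-value SIMD parameters entirely.
 */
#if 0   /* illustration only; does not compile with 32-bit MSVC */
static void
too_many_xmm_args(__m128d a, __m128d b, __m128d c, __m128d d);
#endif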
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_z3 = _mm_unpacklo_pd(_z3,_x4);\
+_y4 = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA, _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
__m128d x3, __m128d y3, __m128d z3,
- __m128d x4, __m128d y4, __m128d z4)
+ __m128d x4, __m128d y4, __m128d z4)
{
__m128d t1,t2,t3,t4,t5,t6;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t4 = _mm_loadu_pd(ptrA+6);
t5 = _mm_loadu_pd(ptrA+8);
t6 = _mm_loadu_pd(ptrA+10);
-
+
x1 = _mm_unpacklo_pd(x1,y1);
z1 = _mm_unpacklo_pd(z1,x2);
y2 = _mm_unpacklo_pd(y2,z2);
x3 = _mm_unpacklo_pd(x3,y3);
z3 = _mm_unpacklo_pd(z3,x4);
y4 = _mm_unpacklo_pd(y4,z4);
-
+
_mm_storeu_pd(ptrA, _mm_sub_pd( t1,x1 ));
_mm_storeu_pd(ptrA+2, _mm_sub_pd( t2,z1 ));
_mm_storeu_pd(ptrA+4, _mm_sub_pd( t3,y2 ));
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5,z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
}
+#endif
+
static void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
__m128d t1,t2,t3,t4,t5,t6,t7;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_load_sd(ptrA+2);
t3 = _mm_loadu_pd(ptrB);
t4 = _mm_load_sd(ptrB+2);
-
+
t5 = _mm_unpacklo_pd(x1,y1);
t6 = _mm_unpackhi_pd(x1,y1);
t7 = _mm_unpackhi_pd(z1,z1);
-
+
t1 = _mm_sub_pd(t1,t5);
t2 = _mm_sub_sd(t2,z1);
-
+
t3 = _mm_sub_pd(t3,t6);
t4 = _mm_sub_sd(t4,t7);
-
+
_mm_storeu_pd(ptrA,t1);
_mm_store_sd(ptrA+2,t2);
_mm_storeu_pd(ptrB,t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrB);\
+_t7 = _mm_loadu_pd(ptrB+2);\
+_t8 = _mm_loadu_pd(ptrB+4);\
+_t9 = _mm_loadu_pd(ptrB+6);\
+_t10 = _mm_load_sd(ptrB+8);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpackhi_pd(_z3,_z3);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_t6 = _mm_sub_pd(_t6,_tB);\
+_t7 = _mm_sub_pd(_t7,_tD);\
+_t8 = _mm_sub_pd(_t8,_tF);\
+_t9 = _mm_sub_pd(_t9,_tH);\
+_t10 = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
- __m128d x3, __m128d y3, __m128d z3)
+ __m128d x3, __m128d y3, __m128d z3)
{
__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128d tA,tB,tC,tD,tE,tF,tG,tH,tI;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t8 = _mm_loadu_pd(ptrB+4);
t9 = _mm_loadu_pd(ptrB+6);
t10 = _mm_load_sd(ptrB+8);
-
+
tA = _mm_unpacklo_pd(x1,y1);
tB = _mm_unpackhi_pd(x1,y1);
tC = _mm_unpacklo_pd(z1,x2);
tG = _mm_unpacklo_pd(x3,y3);
tH = _mm_unpackhi_pd(x3,y3);
tI = _mm_unpackhi_pd(z3,z3);
-
+
t1 = _mm_sub_pd(t1,tA);
t2 = _mm_sub_pd(t2,tC);
t3 = _mm_sub_pd(t3,tE);
t4 = _mm_sub_pd(t4,tG);
t5 = _mm_sub_sd(t5,z3);
-
+
t6 = _mm_sub_pd(t6,tB);
t7 = _mm_sub_pd(t7,tD);
t8 = _mm_sub_pd(t8,tF);
t9 = _mm_sub_pd(t9,tH);
t10 = _mm_sub_sd(t10,tI);
-
+
_mm_storeu_pd(ptrA,t1);
_mm_storeu_pd(ptrA+2,t2);
_mm_storeu_pd(ptrA+4,t3);
_mm_storeu_pd(ptrB+6,t9);
_mm_store_sd(ptrB+8,t10);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_t7 = _mm_loadu_pd(ptrB);\
+_t8 = _mm_loadu_pd(ptrB+2);\
+_t9 = _mm_loadu_pd(ptrB+4);\
+_t10 = _mm_loadu_pd(ptrB+6);\
+_t11 = _mm_loadu_pd(ptrB+8);\
+_t12 = _mm_loadu_pd(ptrB+10);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpacklo_pd(_z3,_x4);\
+_tJ = _mm_unpackhi_pd(_z3,_x4);\
+_tK = _mm_unpacklo_pd(_y4,_z4);\
+_tL = _mm_unpackhi_pd(_y4,_z4);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_pd(_t5,_tI);\
+_t6 = _mm_sub_pd(_t6,_tK);\
+_t7 = _mm_sub_pd(_t7,_tB);\
+_t8 = _mm_sub_pd(_t8,_tD);\
+_t9 = _mm_sub_pd(_t9,_tF);\
+_t10 = _mm_sub_pd(_t10,_tH);\
+_t11 = _mm_sub_pd(_t11,_tJ);\
+_t12 = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA, _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB, _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
__m128d x3, __m128d y3, __m128d z3,
- __m128d x4, __m128d y4, __m128d z4)
+ __m128d x4, __m128d y4, __m128d z4)
{
__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
__m128d tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
-
+
t1 = _mm_loadu_pd(ptrA);
t2 = _mm_loadu_pd(ptrA+2);
t3 = _mm_loadu_pd(ptrA+4);
t10 = _mm_loadu_pd(ptrB+6);
t11 = _mm_loadu_pd(ptrB+8);
t12 = _mm_loadu_pd(ptrB+10);
-
+
tA = _mm_unpacklo_pd(x1,y1);
tB = _mm_unpackhi_pd(x1,y1);
tC = _mm_unpacklo_pd(z1,x2);
tJ = _mm_unpackhi_pd(z3,x4);
tK = _mm_unpacklo_pd(y4,z4);
tL = _mm_unpackhi_pd(y4,z4);
-
+
t1 = _mm_sub_pd(t1,tA);
t2 = _mm_sub_pd(t2,tC);
t3 = _mm_sub_pd(t3,tE);
t4 = _mm_sub_pd(t4,tG);
t5 = _mm_sub_pd(t5,tI);
t6 = _mm_sub_pd(t6,tK);
-
+
t7 = _mm_sub_pd(t7,tB);
t8 = _mm_sub_pd(t8,tD);
t9 = _mm_sub_pd(t9,tF);
t10 = _mm_sub_pd(t10,tH);
t11 = _mm_sub_pd(t11,tJ);
t12 = _mm_sub_pd(t12,tL);
-
+
_mm_storeu_pd(ptrA, t1);
_mm_storeu_pd(ptrA+2,t2);
_mm_storeu_pd(ptrA+4,t3);
_mm_storeu_pd(ptrB+8,t11);
_mm_storeu_pd(ptrB+10,t12);
}
-
+#endif
static gmx_inline void
{
fix1 = _mm_hadd_pd(fix1,fiy1);
fiz1 = _mm_hadd_pd(fiz1,fiz1);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));
-
+
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
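/*
 * Note on the reduction idiom above: _mm_hadd_pd(a,b) returns
 * { a[0]+a[1], b[0]+b[1] }, so one hadd sums the two SIMD lanes of both
 * fix1 and fiy1 and leaves {sum_x, sum_y} packed for a single _mm_add_pd
 * into fptr and fshiftptr. A minimal self-contained check, guarded by a
 * hypothetical macro that the build does not define:
 */
#ifdef GMX_KERNELUTIL_SELFTEST
static int
check_hadd_pd(void)
{
    __m128d a = _mm_set_pd(2.0, 1.0);    /* a = {1.0, 2.0} */
    __m128d b = _mm_set_pd(4.0, 3.0);    /* b = {3.0, 4.0} */
    double  r[2];
    _mm_storeu_pd(r, _mm_hadd_pd(a, b)); /* r = {3.0, 7.0} */
    return (r[0] == 3.0) && (r[1] == 7.0);
}
#endif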
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+_mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+fix1 = _mm_add_pd(fix1,fix3);\
+_t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+fiz1 = _mm_add_sd(fiz1,_t2);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
double * gmx_restrict fshiftptr)
{
__m128d t1,t2;
-
+
fix1 = _mm_hadd_pd(fix1,fiy1);
fiz1 = _mm_hadd_pd(fiz1,fix2);
fiy2 = _mm_hadd_pd(fiy2,fiz2);
fix3 = _mm_hadd_pd(fix3,fiy3);
fiz3 = _mm_hadd_pd(fiz3,fiz3);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
_mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));
-
+
fix1 = _mm_add_pd(fix1,fix3);
t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
fix1 = _mm_add_pd(fix1,t1); /* x and y sums */
-
+
t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));
fiz1 = _mm_add_sd(fiz1,fiz3);
fiz1 = _mm_add_sd(fiz1,t2); /* z sum */
-
+
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
+
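/*
 * For reference, the scalar equivalent of the 3-atom i-force update
 * (illustrative only): once the lane reduction has produced the nine
 * summed force components f[0..8] = {x1,y1,z1,x2,...,z3}, they are
 * accumulated into the force output, and the x/y/z totals over all
 * three atoms go into the shift force.
 */
static void
scalar_update_iforce_3atom(const double f[9],
                           double * gmx_restrict fptr,
                           double * gmx_restrict fshiftptr)
{
    int i;
    for (i = 0; i < 9; i++)
    {
        fptr[i]          += f[i];
        fshiftptr[i % 3] += f[i];
    }
}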
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fix4);\
+fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+_mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));\
+_mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+_t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+fix3 = _mm_add_pd(fix3,_t2);\
+fix1 = _mm_add_pd(fix1,fix3);\
+fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
double * gmx_restrict fshiftptr)
{
__m128d t1,t2;
-
+
fix1 = _mm_hadd_pd(fix1,fiy1);
fiz1 = _mm_hadd_pd(fiz1,fix2);
fiy2 = _mm_hadd_pd(fiy2,fiz2);
fix3 = _mm_hadd_pd(fix3,fiy3);
fiz3 = _mm_hadd_pd(fiz3,fix4);
fiy4 = _mm_hadd_pd(fiy4,fiz4);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
_mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));
_mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));
-
+
t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
fix1 = _mm_add_pd(fix1,t1);
t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));
fix3 = _mm_add_pd(fix3,t2);
fix1 = _mm_add_pd(fix1,fix3); /* x and y sums */
-
+
fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));
fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));
fiz1 = _mm_add_sd(fiz1,fiz3); /* z sum */
-
+
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
+#endif
static gmx_inline void
{
pot1 = _mm_hadd_pd(pot1,pot2);
pot2 = _mm_unpackhi_pd(pot1,pot1);
-
+
_mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA)));
_mm_store_sd(ptrB,_mm_add_sd(pot2,_mm_load_sd(ptrB)));
}
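/*
 * The potential update above reduces two accumulators at once: the hadd
 * packs {sum(pot1), sum(pot2)} into one register, and unpackhi moves the
 * second sum into the low element for the ptrB store. Scalar sketch
 * (illustrative only):
 */
static void
scalar_update_2pot(double pot1_lo, double pot1_hi, double * gmx_restrict ptrA,
                   double pot2_lo, double pot2_hi, double * gmx_restrict ptrB)
{
    *ptrA += pot1_lo + pot1_hi;
    *ptrB += pot2_lo + pot2_hi;
}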
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1,
- __m128 * gmx_restrict y1,
- __m128 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
{
__m128 t1,t2,t3,t4;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = _mm_maskload_ps(ptrA,mask);
- t2 = _mm_maskload_ps(ptrB,mask);
- t3 = _mm_maskload_ps(ptrC,mask);
- t4 = _mm_maskload_ps(ptrD,mask);
+ t1 = gmx_mm_maskload_ps(ptrA,mask);
+ t2 = gmx_mm_maskload_ps(ptrB,mask);
+ t3 = gmx_mm_maskload_ps(ptrC,mask);
+ t4 = gmx_mm_maskload_ps(ptrD,mask);
_MM_TRANSPOSE4_PS(t1,t2,t3,t4);
*x1 = t1;
*y1 = t2;
}
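/*
 * The mask used above, _mm_set_epi32(0,-1,-1,-1), enables only the three
 * low lanes, so an rvec of three floats can be read without touching the
 * element past it; masked-out lanes come back as zero. gmx_mm_maskload_ps
 * is presumably a thin wrapper absorbing the differing mask-argument
 * types of early AVX compilers; with the documented intrinsic signature
 * the operation looks like this sketch:
 */
static void
maskload_rvec_sketch(const float * gmx_restrict xyz, float out[4])
{
    __m128i mask = _mm_set_epi32(0, -1, -1, -1); /* lane 3 disabled */
    __m128  v    = _mm_maskload_ps(xyz, mask);   /* loads x,y,z; lane 3 = 0 */
    _mm_storeu_ps(out, v);
}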
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+ __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+ __m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+ __m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+ _t13 = _mm_unpackhi_ps(_x1,_y1);\
+ _x1 = _mm_unpacklo_ps(_x1,_y1);\
+ _t14 = _mm_unpackhi_ps(_z1,_x2);\
+ _z1 = _mm_unpacklo_ps(_z1,_x2);\
+ _t15 = _mm_unpackhi_ps(_y2,_z2);\
+ _y2 = _mm_unpacklo_ps(_y2,_z2);\
+ _t16 = _mm_unpackhi_ps(_x3,_y3);\
+ _x3 = _mm_unpacklo_ps(_x3,_y3);\
+ _t17 = _mm_permute_ps(_z3,_MM_SHUFFLE(0,0,0,1));\
+ _t18 = _mm_movehl_ps(_z3,_z3);\
+ _t19 = _mm_permute_ps(_t18,_MM_SHUFFLE(0,0,0,1));\
+ _t20 = _mm_movelh_ps(_x1,_z1);\
+ _t21 = _mm_movehl_ps(_z1,_x1);\
+ _t22 = _mm_movelh_ps(_t13,_t14);\
+ _t14 = _mm_movehl_ps(_t14,_t13);\
+ _t23 = _mm_movelh_ps(_y2,_x3);\
+ _t24 = _mm_movehl_ps(_x3,_y2);\
+ _t25 = _mm_movelh_ps(_t15,_t16);\
+ _t16 = _mm_movehl_ps(_t16,_t15);\
+ _t1 = _mm_loadu_ps(ptrA);\
+ _t2 = _mm_loadu_ps(ptrA+4);\
+ _t3 = _mm_load_ss(ptrA+8);\
+ _t1 = _mm_sub_ps(_t1,_t20);\
+ _t2 = _mm_sub_ps(_t2,_t23);\
+ _t3 = _mm_sub_ss(_t3,_z3);\
+ _mm_storeu_ps(ptrA,_t1);\
+ _mm_storeu_ps(ptrA+4,_t2);\
+ _mm_store_ss(ptrA+8,_t3);\
+ _t4 = _mm_loadu_ps(ptrB);\
+ _t5 = _mm_loadu_ps(ptrB+4);\
+ _t6 = _mm_load_ss(ptrB+8);\
+ _t4 = _mm_sub_ps(_t4,_t21);\
+ _t5 = _mm_sub_ps(_t5,_t24);\
+ _t6 = _mm_sub_ss(_t6,_t17);\
+ _mm_storeu_ps(ptrB,_t4);\
+ _mm_storeu_ps(ptrB+4,_t5);\
+ _mm_store_ss(ptrB+8,_t6);\
+ _t7 = _mm_loadu_ps(ptrC);\
+ _t8 = _mm_loadu_ps(ptrC+4);\
+ _t9 = _mm_load_ss(ptrC+8);\
+ _t7 = _mm_sub_ps(_t7,_t22);\
+ _t8 = _mm_sub_ps(_t8,_t25);\
+ _t9 = _mm_sub_ss(_t9,_t18);\
+ _mm_storeu_ps(ptrC,_t7);\
+ _mm_storeu_ps(ptrC+4,_t8);\
+ _mm_store_ss(ptrC+8,_t9);\
+ _t10 = _mm_loadu_ps(ptrD);\
+ _t11 = _mm_loadu_ps(ptrD+4);\
+ _t12 = _mm_load_ss(ptrD+8);\
+ _t10 = _mm_sub_ps(_t10,_t14);\
+ _t11 = _mm_sub_ps(_t11,_t16);\
+ _t12 = _mm_sub_ss(_t12,_t19);\
+ _mm_storeu_ps(ptrD,_t10);\
+ _mm_storeu_ps(ptrD+4,_t11);\
+ _mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
_mm_storeu_ps(ptrD+4,t11);
_mm_store_ss(ptrD+8,t12);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+ __m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+ __m128 _t23,_t24;\
+ _t13 = _mm_unpackhi_ps(_x1,_y1);\
+ _x1 = _mm_unpacklo_ps(_x1,_y1);\
+ _t14 = _mm_unpackhi_ps(_z1,_x2);\
+ _z1 = _mm_unpacklo_ps(_z1,_x2);\
+ _t15 = _mm_unpackhi_ps(_y2,_z2);\
+ _y2 = _mm_unpacklo_ps(_y2,_z2);\
+ _t16 = _mm_unpackhi_ps(_x3,_y3);\
+ _x3 = _mm_unpacklo_ps(_x3,_y3);\
+ _t17 = _mm_unpackhi_ps(_z3,_x4);\
+ _z3 = _mm_unpacklo_ps(_z3,_x4);\
+ _t18 = _mm_unpackhi_ps(_y4,_z4);\
+ _y4 = _mm_unpacklo_ps(_y4,_z4);\
+ _t19 = _mm_movelh_ps(_x1,_z1);\
+ _z1 = _mm_movehl_ps(_z1,_x1);\
+ _t20 = _mm_movelh_ps(_t13,_t14);\
+ _t14 = _mm_movehl_ps(_t14,_t13);\
+ _t21 = _mm_movelh_ps(_y2,_x3);\
+ _x3 = _mm_movehl_ps(_x3,_y2);\
+ _t22 = _mm_movelh_ps(_t15,_t16);\
+ _t16 = _mm_movehl_ps(_t16,_t15);\
+ _t23 = _mm_movelh_ps(_z3,_y4);\
+ _y4 = _mm_movehl_ps(_y4,_z3);\
+ _t24 = _mm_movelh_ps(_t17,_t18);\
+ _t18 = _mm_movehl_ps(_t18,_t17);\
+ _t1 = _mm_loadu_ps(ptrA);\
+ _t2 = _mm_loadu_ps(ptrA+4);\
+ _t3 = _mm_loadu_ps(ptrA+8);\
+ _t1 = _mm_sub_ps(_t1,_t19);\
+ _t2 = _mm_sub_ps(_t2,_t21);\
+ _t3 = _mm_sub_ps(_t3,_t23);\
+ _mm_storeu_ps(ptrA,_t1);\
+ _mm_storeu_ps(ptrA+4,_t2);\
+ _mm_storeu_ps(ptrA+8,_t3);\
+ _t4 = _mm_loadu_ps(ptrB);\
+ _t5 = _mm_loadu_ps(ptrB+4);\
+ _t6 = _mm_loadu_ps(ptrB+8);\
+ _t4 = _mm_sub_ps(_t4,_z1);\
+ _t5 = _mm_sub_ps(_t5,_x3);\
+ _t6 = _mm_sub_ps(_t6,_y4);\
+ _mm_storeu_ps(ptrB,_t4);\
+ _mm_storeu_ps(ptrB+4,_t5);\
+ _mm_storeu_ps(ptrB+8,_t6);\
+ _t7 = _mm_loadu_ps(ptrC);\
+ _t8 = _mm_loadu_ps(ptrC+4);\
+ _t9 = _mm_loadu_ps(ptrC+8);\
+ _t7 = _mm_sub_ps(_t7,_t20);\
+ _t8 = _mm_sub_ps(_t8,_t22);\
+ _t9 = _mm_sub_ps(_t9,_t24);\
+ _mm_storeu_ps(ptrC,_t7);\
+ _mm_storeu_ps(ptrC+4,_t8);\
+ _mm_storeu_ps(ptrC+8,_t9);\
+ _t10 = _mm_loadu_ps(ptrD);\
+ _t11 = _mm_loadu_ps(ptrD+4);\
+ _t12 = _mm_loadu_ps(ptrD+8);\
+ _t10 = _mm_sub_ps(_t10,_t14);\
+ _t11 = _mm_sub_ps(_t11,_t16);\
+ _t12 = _mm_sub_ps(_t12,_t18);\
+ _mm_storeu_ps(ptrD,_t10);\
+ _mm_storeu_ps(ptrD+4,_t11);\
+ _mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
_mm_storeu_ps(ptrD+4,t11);
_mm_storeu_ps(ptrD+8,t12);
}
-
+#endif
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
_mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{\
+ __m128 _t1,_t2,_t3,_t4;\
+\
+ fix1 = _mm_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+ fix1 = _mm_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+ _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+ _t4 = _mm_load_ss(fshiftptr+2);\
+ _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+ _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+ _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+ _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+ _t3 = _mm_permute_ps(_t3 ,_MM_SHUFFLE(1,2,0,0));\
+ _t1 = _mm_add_ps(_t1,_t2);\
+ _t3 = _mm_add_ps(_t3,_t4);\
+ _t1 = _mm_add_ps(_t1,_t3);\
+ _mm_store_ss(fshiftptr+2,_t1);\
+ _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t1);
_mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m128 _t1,_t2,_t3,_t4,_t5;\
+\
+ fix1 = _mm_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm_hadd_ps(fiz3,fix4);\
+ fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+ fix1 = _mm_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+ _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+ _t5 = _mm_load_ss(fshiftptr+2);\
+ _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+ _t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));\
+ _t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));\
+ _t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));\
+ _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+ _t4 = _mm_shuffle_ps(fiz3,_t4 ,_MM_SHUFFLE(2,0,3,3));\
+ _t1 = _mm_add_ps(_t1,_t2);\
+ _t3 = _mm_add_ps(_t3,_t4);\
+ _t1 = _mm_add_ps(_t1,_t3);\
+ _t5 = _mm_add_ps(_t5,_t1);\
+ _mm_store_ss(fshiftptr+2,_t5);\
+ _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t5);
_mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
-
+#endif
static gmx_inline void
}
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
- __m128 pot2, float * gmx_restrict ptrB,
- __m128 pot3, float * gmx_restrict ptrC,
- __m128 pot4, float * gmx_restrict ptrD)
-{
- _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
- pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
- pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(1,1,1,1));
- pot3 = _mm_permute_ps(pot1,_MM_SHUFFLE(2,2,2,2));
- pot4 = _mm_permute_ps(pot1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
- _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
- _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
#endif /* _kernelutil_x86_avx_128_fma_single_h_ */
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m256d * gmx_restrict x1,
- __m256d * gmx_restrict y1,
- __m256d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m256d * gmx_restrict x1,
+ __m256d * gmx_restrict y1,
+ __m256d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+ __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+ __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;
static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
- __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+ __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+ __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
+ __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;
}
-static void
-gmx_mm256_load_2rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
- __m256d t1,t2,t3;
-
- t1 = _mm256_loadu_pd(p1); /* x2 z1 | y1 x1 */
- t2 = _mm256_castpd128_pd256(_mm_loadu_pd(p1+4)); /* - - | z2 y2 */
-
- *x1 = t1;
- *y2 = t2;
-
- t3 = gmx_mm256_unpack128hi_pd(t1,t1);
-
- *z1 = t3;
- *y1 = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
- *z2 = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
- *x2 = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
-}
-
static void
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
t1 = _mm256_loadu_pd(p1);
t2 = _mm256_loadu_pd(p1+4);
t3 = _mm256_loadu_pd(p1+8);
-
+
t4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
t5 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
t6 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
*z1 = t4;
*x3 = t5;
*y4 = t6;
-
+
*y1 = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
*z2 = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
*x4 = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
}
-static void
-gmx_mm256_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
-{
- __m256d tA,tB,tC;
-
- tA = _mm256_loadu_pd(ptrA); /* - z1 | y1 x1 */
- tB = _mm256_loadu_pd(ptrB); /* - z2 | y2 x2 */
-
- tC = _mm256_unpacklo_pd(tA,tB); /* z2 z1 | x2 x1 */
-
- *x1 = tC;
- *y1 = _mm256_unpackhi_pd(tA,tB);
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(tC,0x1));
-}
-
-
-static void
-gmx_mm256_load_2rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
- __m256d t1,t2,t3,t4,t5;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4)); /* - - | z2a y2a */
- t4 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4)); /* - - | z2b y2b */
-
- t5 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t1 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
- *y2 = _mm256_unpacklo_pd(t3,t4); /* - - | y2b y2a */
- *z2 = _mm256_unpackhi_pd(t3,t4); /* - - | z2b z2a */
- *x1 = t5;
- *y1 = t1;
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));;
- *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-}
-
-
-static void
-gmx_mm256_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_loadu_pd(ptrA+4); /* y3a x3a | z2a y2a */
- t4 = _mm256_loadu_pd(ptrB+4); /* y3b x3b | z2b y2b */
- t5 = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /* - - | - z3a */
- t6 = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /* - - | - z3b */
-
- t7 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t1 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
-
- t2 = _mm256_unpacklo_pd(t3,t4); /* x3b x3a | y2b y2a */
- t3 = _mm256_unpackhi_pd(t3,t4); /* y3b y3a | z2b z2a */
-
- *z3 = _mm256_unpacklo_pd(t5,t6); /* - - | z3b z3a */
-
- *x1 = t7;
- *y1 = t1;
- *y2 = t2;
- *z2 = t3;
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
- *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
- *x3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
- *y3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-}
-
-
-static void
-gmx_mm256_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
- __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
- __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_loadu_pd(ptrA+4); /* y3a x3a | z2a y2a */
- t4 = _mm256_loadu_pd(ptrB+4); /* y3b x3b | z2b y2b */
- t5 = _mm256_loadu_pd(ptrA+8); /* z4a y4a | x4a z3a */
- t6 = _mm256_loadu_pd(ptrB+8); /* z4b y4b | x4b z3b */
-
- t7 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t1 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
-
- t2 = _mm256_unpacklo_pd(t3,t4); /* x3b x3a | y2b y2a */
- t3 = _mm256_unpackhi_pd(t3,t4); /* y3b y3a | z2b z2a */
-
- t4 = _mm256_unpacklo_pd(t5,t6); /* y4b y4a | z3b z3a */
- t5 = _mm256_unpackhi_pd(t5,t6); /* z4b z4a | x4b x4a */
-
- *x1 = t7;
- *y1 = t1;
- *y2 = t2;
- *z2 = t3;
- *z3 = t4;
- *x4 = t5;
-
- *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
- *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
- *x3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
- *y3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
- *y4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t4,0x1));;
- *z4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
-}
-
-
-
static void
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
- __m256d t1,t2,t3,t4,t5,t6;
+ __m256d t1,t2,t3,t4,t5,t6;
t1 = _mm256_loadu_pd(ptrA); /* - z1a | y1a x1a */
t2 = _mm256_loadu_pd(ptrB); /* - z1b | y1b x1b */
*z1 = gmx_mm256_unpack128hi_pd(t5,t1);
}
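/*
 * gmx_mm256_unpack128lo_pd/gmx_mm256_unpack128hi_pd used above are
 * assumed to combine the low (resp. high) 128-bit halves of two ymm
 * registers; in plain AVX intrinsics that is a single permute, e.g.:
 */
static __m256d
unpack128lo_pd_sketch(__m256d a, __m256d b)
{
    /* result = { low128(a) in the low half, low128(b) in the high half } */
    return _mm256_permute2f128_pd(a, b, 0x20);
}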
-static void
-gmx_mm256_load_2rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
- const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
- __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
- __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
-
- t1 = _mm256_loadu_pd(ptrA); /* x2a z1a | y1a x1a */
- t2 = _mm256_loadu_pd(ptrB); /* x2b z1b | y1b x1b */
- t3 = _mm256_loadu_pd(ptrC); /* x2c z1c | y1c x1c */
- t4 = _mm256_loadu_pd(ptrD); /* x2d z1d | y1d x1d */
- t5 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4)); /* - - | z2a y2a */
- t6 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4)); /* - - | z2b y2b */
- t7 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrC+4)); /* - - | z2c y2c */
- t8 = _mm256_castpd128_pd256(_mm_loadu_pd(ptrD+4)); /* - - | z2d y2d */
-
- t9 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
- t10 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
- t1 = _mm256_unpacklo_pd(t3,t4); /* z1d z1c | x1d x1c */
- t2 = _mm256_unpackhi_pd(t3,t4); /* x2d x2c | y1d y1c */
- t3 = _mm256_unpacklo_pd(t5,t6); /* - - | y2b y2a */
- t4 = _mm256_unpackhi_pd(t5,t6); /* - - | z2b z2a */
- t5 = _mm256_unpacklo_pd(t7,t8); /* - - | y2d y2c */
- t6 = _mm256_unpackhi_pd(t7,t8); /* - - | z2d z2c */
-
- *x1 = gmx_mm256_unpack128lo_pd(t9,t1);
- *y1 = gmx_mm256_unpack128lo_pd(t10,t2);
- *z1 = gmx_mm256_unpack128hi_pd(t9,t1);
-
- *x2 = gmx_mm256_unpack128hi_pd(t10,t2);
- *y2 = gmx_mm256_unpack128lo_pd(t3,t5);
- *z2 = gmx_mm256_unpack128lo_pd(t4,t6);
-}
static void
-/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
-{
- __m256d t1,t2;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
- t1 = _mm256_sub_pd(t1,t2);
- /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
- _mm256_storeu_pd(ptrA,t1);
-}
-
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m256d xyz1, __m256d xyz2, __m256d xyz3)
-{
- __m256d t1,t2;
- __m256d tA,tB;
- __m128d tC;
-
- tA = _mm256_loadu_pd(ptrA);
- tB = _mm256_loadu_pd(ptrA+4);
- tC = _mm_load_sd(ptrA+8);
-
- /* xyz1: - z1 | y1 x1 */
- /* xyz2: - z2 | y2 x2 */
- /* xyz3: - z3 | y3 x3 */
-
- xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 - | x2 y2 */
- t1 = _mm256_permute2f128_pd(xyz2,xyz2,0x21); /* x2 y2 | z2 - | */
- xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
- xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /* - - | z2 y2 */
- t2 = _mm256_permute2f128_pd(xyz3,xyz3,0x21); /* y3 x3 | - z3 | */
- xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /* y3 x3 | z2 y2 */
-
- tA = _mm256_sub_pd(tA,xyz1);
- tB = _mm256_sub_pd(tB,xyz2);
- tC = _mm_sub_sd(tC, _mm256_castpd256_pd128(t2));
-
- _mm256_storeu_pd(ptrA,tA);
- _mm256_storeu_pd(ptrA+4,tB);
- _mm_store_sd(ptrA+8,tC);
-}
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
-{
- __m256d t1,t2,t3;
- __m256d tA,tB,tC;
-
- tA = _mm256_loadu_pd(ptrA);
- tB = _mm256_loadu_pd(ptrA+4);
- tC = _mm256_loadu_pd(ptrA+8);
-
- /* xyz1: - z1 | y1 x1 */
- /* xyz2: - z2 | y2 x2 */
- /* xyz3: - z3 | y3 x3 */
- /* xyz4: - z4 | y4 x4 */
-
- xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 - | x2 y2 */
- t1 = _mm256_permute2f128_pd(xyz2,xyz2,0x21); /* x2 y2 | z2 - | */
- xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
- xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /* - - | z2 y2 */
- t2 = _mm256_permute2f128_pd(xyz3,xyz3,0x21); /* y3 x3 | - z3 | */
- xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /* y3 x3 | z2 y2 */
- xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z4 - | x4 y4 */
- t3 = _mm256_permute2f128_pd(xyz4,xyz4,0x21); /* x4 y4 | z4 - */
- t3 = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0)); /* z4 y4| x4 - */
- xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1)); /* xz y4 | x4 z3 */
-
- tA = _mm256_sub_pd(tA,xyz1);
- tB = _mm256_sub_pd(tB,xyz2);
- tC = _mm256_sub_pd(tC,xyz4);
-
- _mm256_storeu_pd(ptrA,tA);
- _mm256_storeu_pd(ptrA+4,tB);
- _mm256_storeu_pd(ptrA+8,tC);
-}
-
-
-
-static void
-gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1)
-{
- __m128d t1,t2,t3;
-
- t1 = _mm_sub_sd(_mm256_castpd256_pd128(x1),_mm_load_sd(ptrA));
- t2 = _mm_sub_sd(_mm256_castpd256_pd128(y1),_mm_load_sd(ptrA+1));
- t3 = _mm_sub_sd(_mm256_castpd256_pd128(z1),_mm_load_sd(ptrA+2));
- _mm_store_sd(ptrA,t1);
- _mm_store_sd(ptrA+1,t2);
- _mm_store_sd(ptrA+2,t3);
-}
-
-
-static void
-gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1;
- __m128d tA;
- t1 = _mm256_loadu_pd(ptrA);
- tA = _mm_loadu_pd(ptrA+4);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-
- t1 = _mm256_sub_pd(x1,t1);
- tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm_storeu_pd(ptrA+4,tA);
-}
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3)
-{
- __m256d t1,t2;
- __m128d tA;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrA+4);
- tA = _mm_load_sd(ptrA+8);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
- y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
- t1 = _mm256_sub_pd(t1,x1);
- t2 = _mm256_sub_pd(t2,y2);
- tA = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrA+4,t2);
- _mm_store_sd(ptrA+8,tA);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
-{
- __m256d t1,t2,t3;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrA+4);
- t3 = _mm256_loadu_pd(ptrA+8);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- z3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
- y4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
- y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
- z3 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */
-
- t1 = _mm256_sub_pd(t1,x1);
- t2 = _mm256_sub_pd(t2,y2);
- t3 = _mm256_sub_pd(t3,z3);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrA+4,t2);
- _mm256_storeu_pd(ptrA+8,t3);
-}
-
-static void
-gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
- double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1)
-{
- __m256d t1,t2,t3,t4;
- __m256i mask;
-
- t3 = _mm256_loadu_pd(ptrA);
- t4 = _mm256_loadu_pd(ptrB);
-
- t1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- t2 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- t1 = gmx_mm256_unpack128lo_pd(t1,z1); /* - z1a | y1a x1a */
- z1 = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
- t2 = gmx_mm256_unpack128lo_pd(t2,z1); /* z1b z1a | y1b x1b */
-
- /* Construct a mask without executing any data loads */
- mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
- _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
-
- t3 = _mm256_sub_pd(t3,t1);
- t4 = _mm256_sub_pd(t4,t2);
-
- /* Careful with potentially overlapping stores, need to be masked */
- _mm256_maskstore_pd(ptrA,mask,t3);
- _mm256_maskstore_pd(ptrB,mask,t4);
-}
-
-static void
-gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1,t2,t5;
- __m128d t3,t4;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm_loadu_pd(ptrA+4);
- t4 = _mm_loadu_pd(ptrB+4);
-
- t5 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t1 = _mm256_sub_pd(t1,z2);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
- t4 = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));
-
- /* Careful with potentially overlapping stores, need to be masked */
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm_storeu_pd(ptrA+4,t3);
- _mm_storeu_pd(ptrB+4,t4);
-}
-
-static void
-gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3)
-{
- __m256d t1,t2,t3,t4,t5,t6;
- __m128d tA,tB;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm256_loadu_pd(ptrA+4);
- t4 = _mm256_loadu_pd(ptrB+4);
- tA = _mm_load_sd(ptrA+8);
- tB = _mm_load_sd(ptrB+8);
-
- t5 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- x3 = _mm256_unpackhi_pd(x3,y3); /* - - | y3b x3b */
-
- t6 = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* - - | - z3b */
-
- y3 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t5 = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
- x1 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
- t1 = _mm256_sub_pd(t1,y3);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm256_sub_pd(t3,t5);
- t4 = _mm256_sub_pd(t4,x1);
- tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
- tB = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm256_storeu_pd(ptrA+4,t3);
- _mm256_storeu_pd(ptrB+4,t4);
- _mm_store_sd(ptrA+8,tA);
- _mm_store_sd(ptrB+8,tB);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm256_loadu_pd(ptrA+4);
- t4 = _mm256_loadu_pd(ptrB+4);
- t5 = _mm256_loadu_pd(ptrA+8);
- t6 = _mm256_loadu_pd(ptrB+8);
-
- t7 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- x3 = _mm256_unpackhi_pd(x3,y3); /* - - | y3b x3b */
-
- y3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
- z3 = _mm256_unpackhi_pd(z3,x4); /* - - | x4b z3b */
- x4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */
- y4 = _mm256_unpackhi_pd(y4,z4); /* - - | z4b y4b */
-
- z4 = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t7 = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
- x1 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
- x2 = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
- y2 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */
-
- t1 = _mm256_sub_pd(t1,z4);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm256_sub_pd(t3,t7);
- t4 = _mm256_sub_pd(t4,x1);
- t5 = _mm256_sub_pd(t5,x2);
- t6 = _mm256_sub_pd(t6,y2);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm256_storeu_pd(ptrA+4,t3);
- _mm256_storeu_pd(ptrB+4,t4);
- _mm256_storeu_pd(ptrA+8,t5);
- _mm256_storeu_pd(ptrB+8,t6);
-}
-
-
-
static void
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1)
+ double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+ __m256d x1, __m256d y1, __m256d z1)
{
__m256d t1,t2,tA,tB,tC,tD;
__m256i mask;
/* Construct a mask without executing any data loads */
mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
- _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
+ _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
tA = _mm256_loadu_pd(ptrA);
tB = _mm256_loadu_pd(ptrB);
_mm256_maskstore_pd(ptrD,mask,tD);
}
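/*
 * The mask construction above is worth a note: comparing a register with
 * itself for ordered equality yields all-ones bits without any memory
 * access, and the blend keeps them only in the three low lanes, producing
 * the {-1,-1,-1,0} element mask an rvec maskstore needs. Equivalent
 * sketch (illustrative only):
 */
static __m256i
make_rvec_store_mask(void)
{
    __m256d zero = _mm256_setzero_pd();
    __m256d ones = _mm256_cmp_pd(zero, zero, _CMP_EQ_OQ); /* all bits set */
    return _mm256_castpd_si256(_mm256_blend_pd(zero, ones, 0x7));
}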
-static void
-gmx_mm256_decrement_2rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1,t2,t3,t4,t5,t6;
- __m128d tA,tB,tC,tD,tE,tF;
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm256_loadu_pd(ptrC);
- t4 = _mm256_loadu_pd(ptrD);
- tA = _mm_loadu_pd(ptrA+4);
- tB = _mm_loadu_pd(ptrB+4);
- tC = _mm_loadu_pd(ptrC+4);
- tD = _mm_loadu_pd(ptrD+4);
-
- t5 = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
- y1 = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
- x2 = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
- t6 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
- z2 = gmx_mm256_unpack128hi_pd(t5,y1); /* x2c z1c | y1c x1c */
- t5 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
- y1 = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */
-
- tE = _mm256_extractf128_pd(x2,0x1); /* z2c y2c */
- tF = _mm256_extractf128_pd(y2,0x1); /* z2d y2d */
-
- t1 = _mm256_sub_pd(t1,t6);
- t2 = _mm256_sub_pd(t2,t5);
- t3 = _mm256_sub_pd(t3,z2);
- t4 = _mm256_sub_pd(t4,y1);
- tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(x2));
- tB = _mm_sub_pd(tB,_mm256_castpd256_pd128(y2));
- tC = _mm_sub_pd(tC,tE);
- tD = _mm_sub_pd(tD,tF);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm256_storeu_pd(ptrC,t3);
- _mm256_storeu_pd(ptrD,t4);
- _mm_storeu_pd(ptrA+4,tA);
- _mm_storeu_pd(ptrB+4,tB);
- _mm_storeu_pd(ptrC+4,tC);
- _mm_storeu_pd(ptrD+4,tD);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+ __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+ __m128d _tA,_tB,_tC,_tD,_tE;\
+ _t1 = _mm256_loadu_pd(ptrA);\
+ _t2 = _mm256_loadu_pd(ptrB);\
+ _t3 = _mm256_loadu_pd(ptrC);\
+ _t4 = _mm256_loadu_pd(ptrD);\
+ _t5 = _mm256_loadu_pd(ptrA+4);\
+ _t6 = _mm256_loadu_pd(ptrB+4);\
+ _t7 = _mm256_loadu_pd(ptrC+4);\
+ _t8 = _mm256_loadu_pd(ptrD+4);\
+ _tA = _mm_load_sd(ptrA+8);\
+ _tB = _mm_load_sd(ptrB+8);\
+ _tC = _mm_load_sd(ptrC+8);\
+ _tD = _mm_load_sd(ptrD+8);\
+ _t9 = _mm256_unpacklo_pd(_x1,_y1);\
+ _x1 = _mm256_unpackhi_pd(_x1,_y1);\
+ _y1 = _mm256_unpacklo_pd(_z1,_x2);\
+ _z1 = _mm256_unpackhi_pd(_z1,_x2);\
+ _x2 = _mm256_unpacklo_pd(_y2,_z2);\
+ _y2 = _mm256_unpackhi_pd(_y2,_z2);\
+ _z2 = _mm256_unpacklo_pd(_x3,_y3);\
+ _x3 = _mm256_unpackhi_pd(_x3,_y3);\
+ _t10 = gmx_mm256_unpack128lo_pd(_t9,_y1);\
+ _y3 = gmx_mm256_unpack128hi_pd(_t9,_y1);\
+ _t9 = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+ _y1 = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+ _x1 = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+ _z1 = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+ _x2 = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+ _z2 = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+ _t1 = _mm256_sub_pd(_t1,_t10);\
+ _t2 = _mm256_sub_pd(_t2,_t9);\
+ _t3 = _mm256_sub_pd(_t3,_y3);\
+ _t4 = _mm256_sub_pd(_t4,_y1);\
+ _t5 = _mm256_sub_pd(_t5,_x1);\
+ _t6 = _mm256_sub_pd(_t6,_x2);\
+ _t7 = _mm256_sub_pd(_t7,_z1);\
+ _t8 = _mm256_sub_pd(_t8,_z2);\
+ _tA = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3));\
+ _tB = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3),_GMX_MM_PERMUTE128D(1,1)));\
+ _tE = _mm256_extractf128_pd(_z3,0x1);\
+ _tC = _mm_sub_sd(_tC, _tE);\
+ _tD = _mm_sub_sd(_tD, _mm_permute_pd(_tE,_GMX_MM_PERMUTE128D(1,1)));\
+ _mm256_storeu_pd(ptrA,_t1);\
+ _mm256_storeu_pd(ptrB,_t2);\
+ _mm256_storeu_pd(ptrC,_t3);\
+ _mm256_storeu_pd(ptrD,_t4);\
+ _mm256_storeu_pd(ptrA+4,_t5);\
+ _mm256_storeu_pd(ptrB+4,_t6);\
+ _mm256_storeu_pd(ptrC+4,_t7);\
+ _mm256_storeu_pd(ptrD+4,_t8);\
+ _mm_store_sd(ptrA+8,_tA);\
+ _mm_store_sd(ptrB+8,_tB);\
+ _mm_store_sd(ptrC+8,_tC);\
+ _mm_store_sd(ptrD+8,_tD);\
}
-
-
+#else
+/* Real function for sane compilers */
static void
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3)
+ double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+ __m256d x1, __m256d y1, __m256d z1,
+ __m256d x2, __m256d y2, __m256d z2,
+ __m256d x3, __m256d y3, __m256d z3)
{
__m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128d tA,tB,tC,tD,tE;
_mm_store_sd(ptrC+8,tC);
_mm_store_sd(ptrD+8,tD);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{ \
+ __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14;\
+ __m128d _tA,_tB,_tC,_tD,_tE;\
+ _t1 = _mm256_loadu_pd(ptrA);\
+ _t2 = _mm256_loadu_pd(ptrB);\
+ _t3 = _mm256_loadu_pd(ptrC);\
+ _t4 = _mm256_loadu_pd(ptrD);\
+ _t5 = _mm256_loadu_pd(ptrA+4);\
+ _t6 = _mm256_loadu_pd(ptrB+4);\
+ _t7 = _mm256_loadu_pd(ptrC+4);\
+ _t8 = _mm256_loadu_pd(ptrD+4);\
+ _t9 = _mm256_loadu_pd(ptrA+8);\
+ _t10 = _mm256_loadu_pd(ptrB+8);\
+ _t11 = _mm256_loadu_pd(ptrC+8);\
+ _t12 = _mm256_loadu_pd(ptrD+8);\
+ _t13 = _mm256_unpacklo_pd(_x1,_y1);\
+ _x1 = _mm256_unpackhi_pd(_x1,_y1);\
+ _y1 = _mm256_unpacklo_pd(_z1,_x2);\
+ _z1 = _mm256_unpackhi_pd(_z1,_x2);\
+ _x2 = _mm256_unpacklo_pd(_y2,_z2);\
+ _y2 = _mm256_unpackhi_pd(_y2,_z2);\
+ _z2 = _mm256_unpacklo_pd(_x3,_y3);\
+ _x3 = _mm256_unpackhi_pd(_x3,_y3);\
+ _y3 = _mm256_unpacklo_pd(_z3,_x4);\
+ _z3 = _mm256_unpackhi_pd(_z3,_x4);\
+ _x4 = _mm256_unpacklo_pd(_y4,_z4);\
+ _y4 = _mm256_unpackhi_pd(_y4,_z4);\
+ _z4 = gmx_mm256_unpack128lo_pd(_t13,_y1);\
+ _t13 = gmx_mm256_unpack128hi_pd(_t13,_y1);\
+ _y1 = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+ _x1 = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+ _z1 = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+ _x2 = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+ _z2 = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+ _y2 = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+ _x3 = gmx_mm256_unpack128lo_pd(_y3,_x4);\
+ _y3 = gmx_mm256_unpack128hi_pd(_y3,_x4);\
+ _x4 = gmx_mm256_unpack128lo_pd(_z3,_y4);\
+ _z3 = gmx_mm256_unpack128hi_pd(_z3,_y4);\
+ _t1 = _mm256_sub_pd(_t1,_z4);\
+ _t2 = _mm256_sub_pd(_t2,_y1);\
+ _t3 = _mm256_sub_pd(_t3,_t13);\
+ _t4 = _mm256_sub_pd(_t4,_x1);\
+ _t5 = _mm256_sub_pd(_t5,_z1);\
+ _t6 = _mm256_sub_pd(_t6,_z2);\
+ _t7 = _mm256_sub_pd(_t7,_x2);\
+ _t8 = _mm256_sub_pd(_t8,_y2);\
+ _t9 = _mm256_sub_pd(_t9,_x3);\
+ _t10 = _mm256_sub_pd(_t10,_x4);\
+ _t11 = _mm256_sub_pd(_t11,_y3);\
+ _t12 = _mm256_sub_pd(_t12,_z3);\
+ _mm256_storeu_pd(ptrA,_t1);\
+ _mm256_storeu_pd(ptrB,_t2);\
+ _mm256_storeu_pd(ptrC,_t3);\
+ _mm256_storeu_pd(ptrD,_t4);\
+ _mm256_storeu_pd(ptrA+4,_t5);\
+ _mm256_storeu_pd(ptrB+4,_t6);\
+ _mm256_storeu_pd(ptrC+4,_t7);\
+ _mm256_storeu_pd(ptrD+4,_t8);\
+ _mm256_storeu_pd(ptrA+8,_t9);\
+ _mm256_storeu_pd(ptrB+8,_t10);\
+ _mm256_storeu_pd(ptrC+8,_t11);\
+ _mm256_storeu_pd(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- double * gmx_restrict ptrC, double * gmx_restrict ptrD,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
+ double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+ __m256d x1, __m256d y1, __m256d z1,
+ __m256d x2, __m256d y2, __m256d z2,
+ __m256d x3, __m256d y3, __m256d z3,
+ __m256d x4, __m256d y4, __m256d z4)
{
__m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
__m128d tA,tB,tC,tD,tE;
_mm256_storeu_pd(ptrC+8,t11);
_mm256_storeu_pd(ptrD+8,t12);
}
+#endif
static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
+ double * gmx_restrict fptr,
+ double * gmx_restrict fshiftptr)
{
__m256d t1,t2;
__m128d tA,tB;
_mm256_storeu_pd(fshiftptr,t2);
}
-static gmx_inline void
-gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- __m256d fix2, __m256d fiy2, __m256d fiz2,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
-{
- __m256d t1,t2,t3;
- __m128d tA,tB,tC,tD,tE;
- fix1 = _mm256_hadd_pd(fix1,fiy1);
- fiz1 = _mm256_hadd_pd(fiz1,fix2);
- fiy2 = _mm256_hadd_pd(fiy2,fiz2);
- /* Add across the two lanes by swapping and adding back */
- tA = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1)); /* fiy1 fix1 */
- tB = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
- tC = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
-
- t1 = gmx_mm256_set_m128d(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
-
- t2 = _mm256_loadu_pd(fptr);
- tD = _mm_loadu_pd(fptr+4);
-
- t2 = _mm256_add_pd(t2,t1);
- tD = _mm_add_pd(tD,tC);
- _mm256_storeu_pd(fptr,t2);
- _mm_storeu_pd(fptr+4,tD);
-
- /* Add up shift force */
- /* t1: fix2 fiz1 | fiy1 fix1 */
- /* tC: fiz2 fiy2 */
-
- tA = _mm256_extractf128_pd(t1,0x1); /* fix2 fiz1 */
- tB = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1)); /* fiy2 fix2 */
- tC = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1)); /* - fiz2 */
-
- tB = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));
- tC = _mm_add_sd(tC,tA);
-
- tD = _mm_loadu_pd(fshiftptr);
- tE = _mm_load_sd(fshiftptr+2);
-
- tD = _mm_add_pd(tD,tB);
- tE = _mm_add_pd(tE,tC);
-
- _mm_storeu_pd(fshiftptr,tD);
- _mm_store_sd(fshiftptr+2,tE);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{ \
+ __m256d _t1,_t2,_t3,_t4;\
+ __m128d _tz3,_tA,_tB,_tC,_tD;\
+ fix1 = _mm256_hadd_pd(fix1,fiy1);\
+ fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+ fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+ fix3 = _mm256_hadd_pd(fix3,fiy3);\
+ fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd());\
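+    /* hadd with zero leaves the two partial z3 sums in element 0 of each 128-bit lane; they are folded into _tz3 below */\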
+ _t1 = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+ _t2 = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+ _t1 = _mm256_add_pd(_t1,_t2);\
+ _t3 = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+ _t4 = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+ _t3 = _mm256_add_pd(_t3,_t4);\
+ _tz3 = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1));\
+ _t2 = _mm256_loadu_pd(fptr);\
+ _t4 = _mm256_loadu_pd(fptr+4);\
+ _tA = _mm_load_sd(fptr+8);\
+ _t2 = _mm256_add_pd(_t2,_t1);\
+ _t4 = _mm256_add_pd(_t4,_t3);\
+ _tA = _mm_add_sd(_tA,_tz3);\
+ _mm256_storeu_pd(fptr,_t2);\
+ _mm256_storeu_pd(fptr+4,_t4);\
+ _mm_store_sd(fptr+8,_tA);\
+ _tB = _mm256_extractf128_pd(_t1,0x1);\
+ _tC = _mm256_extractf128_pd(_t3,0x1);\
+ _tz3 = _mm_add_sd(_tz3,_tB);\
+ _tD = _mm_permute_pd(_mm256_castpd256_pd128(_t3),_GMX_MM_PERMUTE128D(1,1));\
+ _tz3 = _mm_add_sd(_tz3,_tD);\
+ _tC = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t1));\
+ _tD = _mm_shuffle_pd(_tB,_mm256_castpd256_pd128(_t3),_MM_SHUFFLE2(0,1));\
+ _tC = _mm_add_pd(_tC,_tD);\
+ _tA = _mm_loadu_pd(fshiftptr);\
+ _tB = _mm_load_sd(fshiftptr+2);\
+ _tA = _mm_add_pd(_tA,_tC);\
+ _tB = _mm_add_sd(_tB,_tz3);\
+ _mm_storeu_pd(fshiftptr,_tA);\
+ _mm_store_sd(fshiftptr+2,_tB);\
}
-
-
-
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- __m256d fix2, __m256d fiy2, __m256d fiz2,
- __m256d fix3, __m256d fiy3, __m256d fiz3,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
+ __m256d fix2, __m256d fiy2, __m256d fiz2,
+ __m256d fix3, __m256d fiy3, __m256d fiz3,
+ double * gmx_restrict fptr,
+ double * gmx_restrict fshiftptr)
{
__m256d t1,t2,t3,t4;
__m128d tz3,tA,tB,tC,tD;
_mm_storeu_pd(fshiftptr,tA);
_mm_store_sd(fshiftptr+2,tB);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m256d _t1,_t2,_t3,_t4,_t5,_t6;\
+ __m128d _tA,_tB,_tC,_tD;\
+ fix1 = _mm256_hadd_pd(fix1,fiy1);\
+ fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+ fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+ fix3 = _mm256_hadd_pd(fix3,fiy3);\
+ fiz3 = _mm256_hadd_pd(fiz3,fix4);\
+ fiy4 = _mm256_hadd_pd(fiy4,fiz4);\
+ _t1 = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+ _t2 = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+ _t1 = _mm256_add_pd(_t1,_t2);\
+ _t3 = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+ _t4 = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+ _t3 = _mm256_add_pd(_t3,_t4);\
+ _t5 = gmx_mm256_unpack128lo_pd(fiz3,fiy4);\
+ _t6 = gmx_mm256_unpack128hi_pd(fiz3,fiy4);\
+ _t5 = _mm256_add_pd(_t5,_t6);\
+ _t2 = _mm256_loadu_pd(fptr);\
+ _t4 = _mm256_loadu_pd(fptr+4);\
+ _t6 = _mm256_loadu_pd(fptr+8);\
+ _t2 = _mm256_add_pd(_t2,_t1);\
+ _t4 = _mm256_add_pd(_t4,_t3);\
+ _t6 = _mm256_add_pd(_t6,_t5);\
+ _mm256_storeu_pd(fptr,_t2);\
+ _mm256_storeu_pd(fptr+4,_t4);\
+ _mm256_storeu_pd(fptr+8,_t6);\
+ _tA = _mm256_extractf128_pd(_t1,0x1);\
+ _tB = _mm256_extractf128_pd(_t3,0x1);\
+ _tC = _mm256_extractf128_pd(_t5,0x1);\
+ _tB = _mm_add_pd(_tB,_mm256_castpd256_pd128(_t1));\
+ _tA = _mm_add_pd(_tA,_mm256_castpd256_pd128(_t5));\
+ _tC = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t3));\
+ _tD = _mm_shuffle_pd(_tA,_tC,_MM_SHUFFLE2(0,1));\
+ _tB = _mm_add_pd(_tB,_tD);\
+ _tC = _mm_permute_pd(_tC,_GMX_MM_PERMUTE128D(1,1));\
+ _tC = _mm_add_sd(_tC,_tA);\
+ _tA = _mm_loadu_pd(fshiftptr);\
+ _tD = _mm_load_sd(fshiftptr+2);\
+ _tA = _mm_add_pd(_tA,_tB);\
+ _tD = _mm_add_sd(_tD,_tC);\
+ _mm_storeu_pd(fshiftptr,_tA);\
+ _mm_store_sd(fshiftptr+2,_tD);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
- __m256d fix2, __m256d fiy2, __m256d fiz2,
- __m256d fix3, __m256d fiy3, __m256d fiz3,
- __m256d fix4, __m256d fiy4, __m256d fiz4,
- double * gmx_restrict fptr,
- double * gmx_restrict fshiftptr)
+ __m256d fix2, __m256d fiy2, __m256d fiz2,
+ __m256d fix3, __m256d fiy3, __m256d fiz3,
+ __m256d fix4, __m256d fiy4, __m256d fiz4,
+ double * gmx_restrict fptr,
+ double * gmx_restrict fshiftptr)
{
__m256d t1,t2,t3,t4,t5,t6;
__m128d tA,tB,tC,tD;
_mm_storeu_pd(fshiftptr,tA);
_mm_store_sd(fshiftptr+2,tD);
}
+#endif
static void
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
- __m256d pot2, double * gmx_restrict ptrB)
+ __m256d pot2, double * gmx_restrict ptrB)
{
__m128d t1,t2;
}
-static void
-gmx_mm256_update_4pot_pd(__m256d pot1, double * gmx_restrict ptrA,
- __m256d pot2, double * gmx_restrict ptrB,
- __m256d pot3, double * gmx_restrict ptrC,
- __m256d pot4, double * gmx_restrict ptrD)
-{
- __m256d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF,tG,tH;
-
- tA = _mm_load_sd(ptrA);
- tB = _mm_load_sd(ptrB);
- tC = _mm_load_sd(ptrC);
- tD = _mm_load_sd(ptrD);
-
- /* do a transpose */
- t1 = _mm256_unpacklo_pd(pot1, pot2); /* p2c p1c | p2a p1a */
- t2 = _mm256_unpackhi_pd(pot1, pot2); /* p2d p1d | p2b p1b */
- t3 = _mm256_unpacklo_pd(pot3, pot4); /* p4c p3c | p4a p3a */
- t4 = _mm256_unpackhi_pd(pot3, pot4); /* p4d p3d | p4b p3b */
- pot1 = _mm256_permute2f128_pd(t1, t3, 0x20); /* p4a p3a | p2a p1a */
- pot2 = _mm256_permute2f128_pd(t2, t4, 0x20); /* p4b p3b | p2b p1b */
- pot3 = _mm256_permute2f128_pd(t1, t3, 0x31); /* p4c p3c | p2c p1c */
- pot4 = _mm256_permute2f128_pd(t2, t4, 0x31); /* p4d p3d | p2d p1d */
-
- pot1 = _mm256_add_pd(pot1,pot2);
- pot3 = _mm256_add_pd(pot3,pot4);
- pot1 = _mm256_add_pd(pot1,pot3); /* Sum in the four elements */
-
- tE = _mm256_castpd256_pd128(pot1);
- tF = _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1));
- tG = _mm256_extractf128_pd(pot1,0x1);
- tH = _mm_permute_pd(tG,_GMX_MM_PERMUTE128D(1,1));
-
- tA = _mm_add_sd(tA,tE);
- tB = _mm_add_sd(tB,tF);
- tC = _mm_add_sd(tC,tG);
- tD = _mm_add_sd(tD,tH);
-
- _mm_store_sd(ptrA,tA);
- _mm_store_sd(ptrB,tB);
- _mm_store_sd(ptrC,tC);
- _mm_store_sd(ptrD,tD);
-}
-
-
#endif /* _kernelutil_x86_avx_256_double_h_ */
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m256 * gmx_restrict x1,
- __m256 * gmx_restrict y1,
- __m256 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m256 * gmx_restrict x1,
+ __m256 * gmx_restrict y1,
+ __m256 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
- __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
- __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+ __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+ __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9;
static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
- __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
- __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
- __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+ __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+ __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
+ __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
{
__m128 t1,t2,t3,t4;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = _mm_maskload_ps(ptrA,mask);
- t2 = _mm_maskload_ps(ptrB,mask);
- t3 = _mm_maskload_ps(ptrC,mask);
- t4 = _mm_maskload_ps(ptrD,mask);
+ t1 = gmx_mm_maskload_ps(ptrA,mask);
+ t2 = gmx_mm_maskload_ps(ptrB,mask);
+ t3 = gmx_mm_maskload_ps(ptrC,mask);
+ t4 = gmx_mm_maskload_ps(ptrD,mask);
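+    /* gmx_mm_maskload_ps wraps _mm_maskload_ps, presumably to absorb the mask-argument type difference (__m128 vs __m128i) between compiler header versions */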
_MM_TRANSPOSE4_PS(t1,t2,t3,t4);
*x1 = _mm256_castps128_ps256(t1);
*y1 = _mm256_castps128_ps256(t2);
__m256 t1,t2,t3,t4,t5,t6,t7,t8;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask)); /* - zE yE xE | - zA yA xA */
- t2 = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask)); /* - zF yF xF | - zB yB xB */
- t3 = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask)); /* - zG yG xG | - zC yC xC */
- t4 = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask)); /* - zH yH xH | - zD yD xD */
+ t1 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask)); /* - zE yE xE | - zA yA xA */
+ t2 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask)); /* - zF yF xF | - zB yB xB */
+ t3 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask)); /* - zG yG xG | - zC yC xC */
+ t4 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask)); /* - zH yH xH | - zD yD xD */
t5 = _mm256_unpacklo_ps(t1,t2); /* yF yE xF xE | yB yA xB xA */
t6 = _mm256_unpacklo_ps(t3,t4); /* yH yG xH xG | yD yC xD xC */
t1 = _mm256_unpacklo_ps(t1,t3); /* - - z3g z3e | - - z3c z3a */
t2 = _mm256_unpacklo_ps(t2,t4); /* - - z3h z3f | - - z3d z3b */
-
+
*z3 = _mm256_unpacklo_ps(t1,t2);
}
t6 = _mm256_unpackhi_ps(t1,t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
t7 = _mm256_unpacklo_ps(t3,t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
t8 = _mm256_unpackhi_ps(t3,t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */
-
+
*z3 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
*x4 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
*y4 = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC,float * gmx_restrict ptrD,
- __m256 x1, __m256 y1, __m256 z1)
+ float * gmx_restrict ptrC,float * gmx_restrict ptrD,
+ __m256 x1, __m256 y1, __m256 z1)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8;
__m128i mask;
t3 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0)); /* - z1c y1c x1c */
t4 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2)); /* - z1d y1d x1d */
- t5 = _mm_maskload_ps(ptrA,mask);
- t6 = _mm_maskload_ps(ptrB,mask);
- t7 = _mm_maskload_ps(ptrC,mask);
- t8 = _mm_maskload_ps(ptrD,mask);
+ t5 = gmx_mm_maskload_ps(ptrA,mask);
+ t6 = gmx_mm_maskload_ps(ptrB,mask);
+ t7 = gmx_mm_maskload_ps(ptrC,mask);
+ t8 = gmx_mm_maskload_ps(ptrD,mask);
t5 = _mm_sub_ps(t5,t1);
t6 = _mm_sub_ps(t6,t2);
t7 = _mm_sub_ps(t7,t3);
t8 = _mm_sub_ps(t8,t4);
- _mm_maskstore_ps(ptrA,mask,t5);
- _mm_maskstore_ps(ptrB,mask,t6);
- _mm_maskstore_ps(ptrC,mask,t7);
- _mm_maskstore_ps(ptrD,mask,t8);
+ gmx_mm_maskstore_ps(ptrA,mask,t5);
+ gmx_mm_maskstore_ps(ptrB,mask,t6);
+ gmx_mm_maskstore_ps(ptrC,mask,t7);
+ gmx_mm_maskstore_ps(ptrD,mask,t8);
}
-
-
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ x1,y1,z1,x2,y2,z2,x3,y3,z3) \
+{\
+ __m256 _t1,_t2,_t3,_t4,_t5,_t6;\
+ __m128 _tA,_tB,_tC,_tD;\
+\
+ _t1 = _mm256_loadu_ps(ptrA);\
+ _t2 = _mm256_loadu_ps(ptrB);\
+ _t3 = _mm256_loadu_ps(ptrC);\
+ _t4 = _mm256_loadu_ps(ptrD);\
+ _tA = _mm_load_ss(ptrA+8);\
+ _tB = _mm_load_ss(ptrB+8);\
+ _tC = _mm_load_ss(ptrC+8);\
+ _tD = _mm_load_ss(ptrD+8);\
+ _t5 = _mm256_unpacklo_ps(x1,y1);\
+ x1 = _mm256_unpackhi_ps(x1,y1);\
+ y1 = _mm256_unpacklo_ps(z1,x2);\
+ z1 = _mm256_unpackhi_ps(z1,x2);\
+ x2 = _mm256_unpacklo_ps(y2,z2);\
+ y2 = _mm256_unpackhi_ps(y2,z2);\
+ _t6 = _mm256_unpacklo_ps(x3,y3);\
+ x3 = _mm256_unpackhi_ps(x3,y3);\
+ _t5 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+ x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+ y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1);\
+ z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
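+    /* insertf128 gathers the second group's low lanes into the upper halves, so each 256-bit shuffle below yields one pointer's full 8-float row */\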
+ z2 = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(1,0,1,0));\
+ _t5 = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(3,2,3,2));\
+ y1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+ x1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_sub_ps(_t1,z2);\
+ _t2 = _mm256_sub_ps(_t2,_t5);\
+ _t3 = _mm256_sub_ps(_t3,y1);\
+ _t4 = _mm256_sub_ps(_t4,x1);\
+ _tA = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3));\
+ _tB = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(1,1,1,1)));\
+ _tC = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(2,2,2,2)));\
+ _tD = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(3,3,3,3)));\
+ _mm256_storeu_ps(ptrA,_t1);\
+ _mm256_storeu_ps(ptrB,_t2);\
+ _mm256_storeu_ps(ptrC,_t3);\
+ _mm256_storeu_ps(ptrD,_t4);\
+ _mm_store_ss(ptrA+8,_tA);\
+ _mm_store_ss(ptrB+8,_tB);\
+ _mm_store_ss(ptrC+8,_tC);\
+ _mm_store_ss(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3)
{
__m256 t1,t2,t3,t4,t5,t6;
__m128 tA,tB,tC,tD;
_mm_store_ss(ptrC+8,tC);
_mm_store_ss(ptrD+8,tD);
}
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+ x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4) \
+{\
+ __m256 _t1,_t2,_t3,_t4,_t5;\
+ __m128 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH;\
+\
+ _t1 = _mm256_loadu_ps(ptrA);\
+ _t2 = _mm256_loadu_ps(ptrB);\
+ _t3 = _mm256_loadu_ps(ptrC);\
+ _t4 = _mm256_loadu_ps(ptrD);\
+ _tA = _mm_loadu_ps(ptrA+8);\
+ _tB = _mm_loadu_ps(ptrB+8);\
+ _tC = _mm_loadu_ps(ptrC+8);\
+ _tD = _mm_loadu_ps(ptrD+8);\
+ _t5 = _mm256_unpacklo_ps(x1,y1);\
+ x1 = _mm256_unpackhi_ps(x1,y1);\
+ y1 = _mm256_unpacklo_ps(z1,x2);\
+ z1 = _mm256_unpackhi_ps(z1,x2);\
+ x2 = _mm256_unpacklo_ps(y2,z2);\
+ y2 = _mm256_unpackhi_ps(y2,z2);\
+ z2 = _mm256_unpacklo_ps(x3,y3);\
+ x3 = _mm256_unpackhi_ps(x3,y3);\
+ y3 = _mm256_unpacklo_ps(z3,x4);\
+ z3 = _mm256_unpackhi_ps(z3,x4);\
+ x4 = _mm256_unpacklo_ps(y4,z4);\
+ y4 = _mm256_unpackhi_ps(y4,z4);\
+ x2 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+ x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+ y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);\
+ z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+ z2 = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(1,0,1,0));\
+ _t5 = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(3,2,3,2));\
+ y1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+ x1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+ _tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(1,0,1,0));\
+ _tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(3,2,3,2));\
+ _tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(1,0,1,0));\
+ _tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_sub_ps(_t1,z2);\
+ _t2 = _mm256_sub_ps(_t2,_t5);\
+ _t3 = _mm256_sub_ps(_t3,y1);\
+ _t4 = _mm256_sub_ps(_t4,x1);\
+ _tA = _mm_sub_ps(_tA,_tE);\
+ _tB = _mm_sub_ps(_tB,_tF);\
+ _tC = _mm_sub_ps(_tC,_tG);\
+ _tD = _mm_sub_ps(_tD,_tH);\
+ _mm256_storeu_ps(ptrA,_t1);\
+ _mm256_storeu_ps(ptrB,_t2);\
+ _mm256_storeu_ps(ptrC,_t3);\
+ _mm256_storeu_ps(ptrD,_t4);\
+ _mm_storeu_ps(ptrA+8,_tA);\
+ _mm_storeu_ps(ptrB+8,_tB);\
+ _mm_storeu_ps(ptrC+8,_tC);\
+ _mm_storeu_ps(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3,
- __m256 x4, __m256 y4, __m256 z4)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3,
+ __m256 x4, __m256 y4, __m256 z4)
{
__m256 t1,t2,t3,t4,t5;
__m128 tA,tB,tC,tD,tE,tF,tG,tH;
_mm_storeu_ps(ptrC+8,tC);
_mm_storeu_ps(ptrD+8,tD);
}
-
+#endif
static gmx_inline void
gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- float * gmx_restrict ptrE, float * gmx_restrict ptrF,
- float * gmx_restrict ptrG, float * gmx_restrict ptrH,
- __m256 x1, __m256 y1, __m256 z1)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+ float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+ __m256 x1, __m256 y1, __m256 z1)
{
__m256 t1,t2,t3,t4,t5,t6;
__m256 tA,tB,tC,tD;
/* Construct a mask without executing any data loads */
mask = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
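+    /* cmpeq(0,0) gives all-ones; blending zero over the top two 16-bit words clears element 3, leaving a -1,-1,-1,0 mask covering x,y,z only */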
- tA = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask));
- tB = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask));
- tC = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask));
- tD = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask));
+ tA = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
+ tB = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
+ tC = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
+ tD = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
t1 = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
t2 = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
tC = _mm256_sub_ps(tC,t5);
tD = _mm256_sub_ps(tD,t6);
- _mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
- _mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
- _mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
- _mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
- _mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
- _mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
- _mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
- _mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
+ gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
+ gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
+ gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
+ gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
+ gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
+ gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
+ gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
+ gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+ __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+ __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+ _tA = _mm256_loadu_ps(ptrA);\
+ _tB = _mm256_loadu_ps(ptrB);\
+ _tC = _mm256_loadu_ps(ptrC);\
+ _tD = _mm256_loadu_ps(ptrD);\
+ _tE = _mm256_loadu_ps(ptrE);\
+ _tF = _mm256_loadu_ps(ptrF);\
+ _tG = _mm256_loadu_ps(ptrG);\
+ _tH = _mm256_loadu_ps(ptrH);\
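+    /* The first eight floats of each nine-float triple go through full 256-bit swizzles; the ninth (z3) is handled separately at the end */\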
+ _t1 = _mm256_unpacklo_ps(_x1,_y1);\
+ _t2 = _mm256_unpackhi_ps(_x1,_y1);\
+ _t3 = _mm256_unpacklo_ps(_z1,_x2);\
+ _t4 = _mm256_unpackhi_ps(_z1,_x2);\
+ _t5 = _mm256_unpacklo_ps(_y2,_z2);\
+ _t6 = _mm256_unpackhi_ps(_y2,_z2);\
+ _t7 = _mm256_unpacklo_ps(_x3,_y3);\
+ _t8 = _mm256_unpackhi_ps(_x3,_y3);\
+ _t9 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+ _t10 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+ _t11 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+ _t12 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+ _t2 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+ _t3 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+ _t4 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+ _t5 = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+ _t6 = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+ _t7 = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+ _t8 = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+ _t1 = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+ _t2 = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+ _t9 = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+ _t10 = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+ _tA = _mm256_sub_ps(_tA,_t5);\
+ _tB = _mm256_sub_ps(_tB,_t7);\
+ _tC = _mm256_sub_ps(_tC,_t1);\
+ _tD = _mm256_sub_ps(_tD,_t9);\
+ _tE = _mm256_sub_ps(_tE,_t6);\
+ _tF = _mm256_sub_ps(_tF,_t8);\
+ _tG = _mm256_sub_ps(_tG,_t2);\
+ _tH = _mm256_sub_ps(_tH,_t10);\
+ _mm256_storeu_ps(ptrA,_tA);\
+ _mm256_storeu_ps(ptrB,_tB);\
+ _mm256_storeu_ps(ptrC,_tC);\
+ _mm256_storeu_ps(ptrD,_tD);\
+ _mm256_storeu_ps(ptrE,_tE);\
+ _mm256_storeu_ps(ptrF,_tF);\
+ _mm256_storeu_ps(ptrG,_tG);\
+ _mm256_storeu_ps(ptrH,_tH);\
+ _tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));\
+ _tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));\
+ _tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));\
+ _tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));\
+ _tI = _mm256_unpacklo_ps(_tI,_tK);\
+ _tJ = _mm256_unpacklo_ps(_tJ,_tL);\
+ _tI = _mm256_unpacklo_ps(_tI,_tJ);\
+ _tI = _mm256_sub_ps(_tI,_z3);\
+ _tJ = _mm256_permute_ps(_tI,_MM_SHUFFLE(1,1,1,1));\
+ _tK = _mm256_permute_ps(_tI,_MM_SHUFFLE(2,2,2,2));\
+ _tL = _mm256_permute_ps(_tI,_MM_SHUFFLE(3,3,3,3));\
+ _mm_store_ss(ptrA+8,_mm256_castps256_ps128(_tI));\
+ _mm_store_ss(ptrB+8,_mm256_castps256_ps128(_tJ));\
+ _mm_store_ss(ptrC+8,_mm256_castps256_ps128(_tK));\
+ _mm_store_ss(ptrD+8,_mm256_castps256_ps128(_tL));\
+ _mm_store_ss(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+ _mm_store_ss(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+ _mm_store_ss(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+ _mm_store_ss(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- float * gmx_restrict ptrE, float * gmx_restrict ptrF,
- float * gmx_restrict ptrG, float * gmx_restrict ptrH,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+ float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3)
{
__m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
__m256 tA,tB,tC,tD,tE,tF,tG,tH;
_mm256_storeu_ps(ptrF,tF);
_mm256_storeu_ps(ptrG,tG);
_mm256_storeu_ps(ptrH,tH);
-
+
tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));
-
+
tI = _mm256_unpacklo_ps(tI,tK); /* - - zG zE | - - zC zA */
tJ = _mm256_unpacklo_ps(tJ,tL); /* - - zH zF | - - zD zB */
tI = _mm256_unpacklo_ps(tI,tJ); /* zH zG zF zE | zD zC zB zA */
_mm_store_ss(ptrG+8,_mm256_extractf128_ps(tK,0x1));
_mm_store_ss(ptrH+8,_mm256_extractf128_ps(tL,0x1));
}
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH, \
+ _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+ __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+ _tA = _mm256_loadu_ps(ptrA);\
+ _tB = _mm256_loadu_ps(ptrB);\
+ _tC = _mm256_loadu_ps(ptrC);\
+ _tD = _mm256_loadu_ps(ptrD);\
+ _tE = _mm256_loadu_ps(ptrE);\
+ _tF = _mm256_loadu_ps(ptrF);\
+ _tG = _mm256_loadu_ps(ptrG);\
+ _tH = _mm256_loadu_ps(ptrH);\
+ _t1 = _mm256_unpacklo_ps(_x1,_y1);\
+ _t2 = _mm256_unpackhi_ps(_x1,_y1);\
+ _t3 = _mm256_unpacklo_ps(_z1,_x2);\
+ _t4 = _mm256_unpackhi_ps(_z1,_x2);\
+ _t5 = _mm256_unpacklo_ps(_y2,_z2);\
+ _t6 = _mm256_unpackhi_ps(_y2,_z2);\
+ _t7 = _mm256_unpacklo_ps(_x3,_y3);\
+ _t8 = _mm256_unpackhi_ps(_x3,_y3);\
+ _t9 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+ _t10 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+ _t11 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+ _t12 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+ _t1 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+ _t2 = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+ _t3 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+ _t4 = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+ _t5 = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+ _t6 = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+ _t7 = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+ _t8 = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+ _t1 = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+ _t2 = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+ _t9 = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+ _t10 = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+ _tA = _mm256_sub_ps(_tA,_t5);\
+ _tB = _mm256_sub_ps(_tB,_t7);\
+ _tC = _mm256_sub_ps(_tC,_t1);\
+ _tD = _mm256_sub_ps(_tD,_t9);\
+ _tE = _mm256_sub_ps(_tE,_t6);\
+ _tF = _mm256_sub_ps(_tF,_t8);\
+ _tG = _mm256_sub_ps(_tG,_t2);\
+ _tH = _mm256_sub_ps(_tH,_t10);\
+ _mm256_storeu_ps(ptrA,_tA);\
+ _mm256_storeu_ps(ptrB,_tB);\
+ _mm256_storeu_ps(ptrC,_tC);\
+ _mm256_storeu_ps(ptrD,_tD);\
+ _mm256_storeu_ps(ptrE,_tE);\
+ _mm256_storeu_ps(ptrF,_tF);\
+ _mm256_storeu_ps(ptrG,_tG);\
+ _mm256_storeu_ps(ptrH,_tH);\
+ _tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8));\
+ _tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));\
+ _tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));\
+ _tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));\
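+    /* Floats 8-11 (z3,x4,y4,z4) of each pointer form a second 128-bit block; matching rows are built from _z3.._z4 below */\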
+ _t1 = _mm256_unpacklo_ps(_z3,_x4);\
+ _t2 = _mm256_unpackhi_ps(_z3,_x4);\
+ _t3 = _mm256_unpacklo_ps(_y4,_z4);\
+ _t4 = _mm256_unpackhi_ps(_y4,_z4);\
+ _t5 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+ _t6 = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+ _t7 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+ _t8 = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+ _tI = _mm256_sub_ps(_tI,_t5);\
+ _tJ = _mm256_sub_ps(_tJ,_t6);\
+ _tK = _mm256_sub_ps(_tK,_t7);\
+ _tL = _mm256_sub_ps(_tL,_t8);\
+ _mm_storeu_ps(ptrA+8,_mm256_castps256_ps128(_tI));\
+ _mm_storeu_ps(ptrB+8,_mm256_castps256_ps128(_tJ));\
+ _mm_storeu_ps(ptrC+8,_mm256_castps256_ps128(_tK));\
+ _mm_storeu_ps(ptrD+8,_mm256_castps256_ps128(_tL));\
+ _mm_storeu_ps(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+ _mm_storeu_ps(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+ _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+ _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- float * gmx_restrict ptrE, float * gmx_restrict ptrF,
- float * gmx_restrict ptrG, float * gmx_restrict ptrH,
- __m256 x1, __m256 y1, __m256 z1,
- __m256 x2, __m256 y2, __m256 z2,
- __m256 x3, __m256 y3, __m256 z3,
- __m256 x4, __m256 y4, __m256 z4)
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+ float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+ float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+ __m256 x1, __m256 y1, __m256 z1,
+ __m256 x2, __m256 y2, __m256 z2,
+ __m256 x3, __m256 y3, __m256 z3,
+ __m256 x4, __m256 y4, __m256 z4)
{
__m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
__m256 tA,tB,tC,tD,tE,tF,tG,tH;
tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));
tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));
tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));
-
+
t1 = _mm256_unpacklo_ps(z3,x4); /* x4f z3f x4e z3e | x4b z3b x4a z3a */
t2 = _mm256_unpackhi_ps(z3,x4); /* x4h z3h x4g z3g | x4d z3d x4c z3c */
t3 = _mm256_unpacklo_ps(y4,z4); /* z4f y4f z4e y4e | z4b y4b z4a y4a */
_mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(tK,0x1));
_mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(tL,0x1));
}
-
+#endif
static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
- float * gmx_restrict fptr,
- float * gmx_restrict fshiftptr)
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m128 t1,t2,t3;
/* Add across the two lanes */
t1 = _mm_add_ps(_mm256_castps256_ps128(fix1),_mm256_extractf128_ps(fix1,0x1));
-
+
t2 = _mm_load_ss(fptr);
t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
t3 = _mm_load_ss(fshiftptr);
_mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{ \
+ __m256 _t1,_t2,_t3;\
+ __m128 _tA,_tB,_tC;\
+\
+ fix1 = _mm256_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm256_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+ fix1 = _mm256_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
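+    /* After the second hadd round each lane of fix1/fiy2 holds four of the nine component sums and fiz3 holds z3; the two lanes are folded below */\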
+\
+ _t1 = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+ _t2 = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+ _t1 = _mm256_add_ps(_t1,_t2);\
+ _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+ _t3 = _mm256_loadu_ps(fptr);\
+ _t3 = _mm256_add_ps(_t3,_t1);\
+ _mm256_storeu_ps(fptr,_t3);\
+ _tB = _mm_load_ss(fptr+8);\
+ _tB = _mm_add_ss(_tB,_tA);\
+ _mm_store_ss(fptr+8,_tB);\
+\
+ _tB = _mm256_extractf128_ps(_t1,0x1);\
+ _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+ _tB = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+ _tC = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+ _tB = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+ _tA = _mm_add_ps(_tB,_tC);\
+ _tA = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+ _tC = _mm_loadu_ps(fshiftptr);\
+ _tC = _mm_add_ps(_tC,_tA);\
+ _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
- __m256 fix2, __m256 fiy2, __m256 fiz2,
- __m256 fix3, __m256 fiy3, __m256 fiz3,
- float * gmx_restrict fptr,
- float * gmx_restrict fshiftptr)
+ __m256 fix2, __m256 fiy2, __m256 fiz2,
+ __m256 fix3, __m256 fiy3, __m256 fiz3,
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m256 t1,t2,t3;
__m128 tA,tB,tC;
tB = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
tA = _mm_add_ps(tB,tC); /* - z y x */
-
+
tA = _mm_blend_ps(_mm_setzero_ps(),tA,0x7); /* 0 z y x */
tC = _mm_loadu_ps(fshiftptr);
tC = _mm_add_ps(tC,tA);
_mm_storeu_ps(fshiftptr,tC);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{ \
+ __m256 _t1,_t2,_t3; \
+ __m128 _tA,_tB,_tC; \
+\
+ fix1 = _mm256_hadd_ps(fix1,fiy1);\
+ fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+ fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+ fix3 = _mm256_hadd_ps(fix3,fiy3);\
+ fiz3 = _mm256_hadd_ps(fiz3,fix4);\
+ fiy4 = _mm256_hadd_ps(fiy4,fiz4);\
+\
+ fix1 = _mm256_hadd_ps(fix1,fiz1);\
+ fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+ fiz3 = _mm256_hadd_ps(fiz3,fiy4);\
+\
+ _t1 = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+ _t2 = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+ _t1 = _mm256_add_ps(_t1,_t2);\
+ _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+ _t3 = _mm256_loadu_ps(fptr);\
+ _t3 = _mm256_add_ps(_t3,_t1);\
+ _mm256_storeu_ps(fptr,_t3);\
+ _tB = _mm_loadu_ps(fptr+8);\
+ _tB = _mm_add_ps(_tB,_tA);\
+ _mm_storeu_ps(fptr+8,_tB);\
+\
+ _tB = _mm256_extractf128_ps(_t1,0x1);\
+ _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+ _tB = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+ _tC = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+ _tA = _mm_permute_ps(_tA,_MM_SHUFFLE(0,3,2,1));\
+ _tB = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+ _tA = _mm_add_ps(_tA,_tC);\
+ _tA = _mm_add_ps(_tA,_tB);\
+ _tA = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+ _tC = _mm_loadu_ps(fshiftptr);\
+ _tC = _mm_add_ps(_tC,_tA);\
+ _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
- __m256 fix2, __m256 fiy2, __m256 fiz2,
- __m256 fix3, __m256 fiy3, __m256 fiz3,
- __m256 fix4, __m256 fiy4, __m256 fiz4,
- float * gmx_restrict fptr,
- float * gmx_restrict fshiftptr)
+ __m256 fix2, __m256 fiy2, __m256 fiz2,
+ __m256 fix3, __m256 fiy3, __m256 fiz3,
+ __m256 fix4, __m256 fiy4, __m256 fiz4,
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m256 t1,t2,t3;
__m128 tA,tB,tC;
tC = _mm_add_ps(tC,tA);
_mm_storeu_ps(fshiftptr,tC);
}
+#endif
}
-static gmx_inline void
-gmx_mm256_update_4pot_ps(__m256 pot1, float * gmx_restrict ptrA,
- __m256 pot2, float * gmx_restrict ptrB,
- __m256 pot3, float * gmx_restrict ptrC,
- __m256 pot4, float * gmx_restrict ptrD)
-{
- __m128 t1,t2,t3,t4;
-
- pot1 = _mm256_hadd_ps(pot1,pot2);
- pot3 = _mm256_hadd_ps(pot3,pot4);
- pot1 = _mm256_hadd_ps(pot1,pot3);
- t1 = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
- t2 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
- t3 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
- t4 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
- _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
- _mm_store_ss(ptrC,_mm_add_ss(_mm_load_ss(ptrC),t3));
- _mm_store_ss(ptrD,_mm_add_ss(_mm_load_ss(ptrD),t4));
-}
-
-
#endif /* _kernelutil_x86_avx_256_single_h_ */
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1,
- __m128d * gmx_restrict y1,
- __m128d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1,
+ __m128d * gmx_restrict y1,
+ __m128d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz;
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
- __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+ __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
- *x = _mm_load_sd(p1);
- *y = _mm_load_sd(p1+1);
- *z = _mm_load_sd(p1+2);
+ *x = _mm_load_sd(p1);
+ *y = _mm_load_sd(p1+1);
+ *z = _mm_load_sd(p1+2);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
- *x1 = _mm_load_sd(p1);
- *y1 = _mm_load_sd(p1+1);
- *z1 = _mm_load_sd(p1+2);
- *x2 = _mm_load_sd(p1+3);
- *y2 = _mm_load_sd(p1+4);
- *z2 = _mm_load_sd(p1+5);
- *x3 = _mm_load_sd(p1+6);
- *y3 = _mm_load_sd(p1+7);
- *z3 = _mm_load_sd(p1+8);
+ *x1 = _mm_load_sd(p1);
+ *y1 = _mm_load_sd(p1+1);
+ *z1 = _mm_load_sd(p1+2);
+ *x2 = _mm_load_sd(p1+3);
+ *y2 = _mm_load_sd(p1+4);
+ *z2 = _mm_load_sd(p1+5);
+ *x3 = _mm_load_sd(p1+6);
+ *y3 = _mm_load_sd(p1+7);
+ *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void
/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy, __m128d z)
+ __m128d xy, __m128d z)
{
__m128d t1,t2;
_mm_store_sd(ptrA+2,t2);
}
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3)
-{
- __m128d t1,t2;
- __m128d tA,tB,tC,tD,tE;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_load_sd(ptrA+8);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_sd(tE,z3);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3,
- __m128d xy4, __m128d z4)
-{
- __m128d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_loadu_pd(ptrA+8);
- tF = _mm_loadu_pd(ptrA+10);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
- t3 = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
- t4 = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_pd(tE,t3);
- tF = _mm_sub_pd(tF,t4);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_storeu_pd(ptrA+8,tE);
- _mm_storeu_pd(ptrA+10,tF);
-}
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_load_sd(ptrA+8);\
+ _x1 = _mm_unpacklo_pd(_x1,_y1);\
+ _z1 = _mm_unpacklo_pd(_z1,_x2);\
+ _y2 = _mm_unpacklo_pd(_y2,_z2);\
+ _x3 = _mm_unpacklo_pd(_x3,_y3);\
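+    /* the unpacklo pairs now mirror the rvec memory layout x1 y1 | z1 x2 | y2 z2 | x3 y3 at ptrA */\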
+ _t1 = _mm_sub_pd(_t1,_x1);\
+ _t2 = _mm_sub_pd(_t2,_z1);\
+ _t3 = _mm_sub_pd(_t3,_y2);\
+ _t4 = _mm_sub_pd(_t4,_x3);\
+ _t5 = _mm_sub_sd(_t5,_z3);\
+ _mm_storeu_pd(ptrA,_t1);\
+ _mm_storeu_pd(ptrA+2,_t2);\
+ _mm_storeu_pd(ptrA+4,_t3);\
+ _mm_storeu_pd(ptrA+6,_t4);\
+ _mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+6,t4);
_mm_store_sd(ptrA+8,t5);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_loadu_pd(ptrA+8);\
+ _t6 = _mm_loadu_pd(ptrA+10);\
+ _x1 = _mm_unpacklo_pd(_x1,_y1);\
+ _z1 = _mm_unpacklo_pd(_z1,_x2);\
+ _y2 = _mm_unpacklo_pd(_y2,_z2);\
+ _x3 = _mm_unpacklo_pd(_x3,_y3);\
+ _z3 = _mm_unpacklo_pd(_z3,_x4);\
+ _y4 = _mm_unpacklo_pd(_y4,_z4);\
+ _mm_storeu_pd(ptrA, _mm_sub_pd( _t1,_x1 ));\
+ _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2,_z1 ));\
+ _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3,_y2 ));\
+ _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4,_x3 ));\
+ _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5,_z3 ));\
+ _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5,z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
}
+#endif
+
static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
_mm_store_sd(ptrB+2,t4);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+ __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_load_sd(ptrA+8);\
+ _t6 = _mm_loadu_pd(ptrB);\
+ _t7 = _mm_loadu_pd(ptrB+2);\
+ _t8 = _mm_loadu_pd(ptrB+4);\
+ _t9 = _mm_loadu_pd(ptrB+6);\
+ _t10 = _mm_load_sd(ptrB+8);\
+ _tA = _mm_unpacklo_pd(_x1,_y1);\
+ _tB = _mm_unpackhi_pd(_x1,_y1);\
+ _tC = _mm_unpacklo_pd(_z1,_x2);\
+ _tD = _mm_unpackhi_pd(_z1,_x2);\
+ _tE = _mm_unpacklo_pd(_y2,_z2);\
+ _tF = _mm_unpackhi_pd(_y2,_z2);\
+ _tG = _mm_unpacklo_pd(_x3,_y3);\
+ _tH = _mm_unpackhi_pd(_x3,_y3);\
+ _tI = _mm_unpackhi_pd(_z3,_z3);\
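+    /* lo halves of each unpack update ptrA, hi halves update ptrB */\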
+ _t1 = _mm_sub_pd(_t1,_tA);\
+ _t2 = _mm_sub_pd(_t2,_tC);\
+ _t3 = _mm_sub_pd(_t3,_tE);\
+ _t4 = _mm_sub_pd(_t4,_tG);\
+ _t5 = _mm_sub_sd(_t5,_z3);\
+ _t6 = _mm_sub_pd(_t6,_tB);\
+ _t7 = _mm_sub_pd(_t7,_tD);\
+ _t8 = _mm_sub_pd(_t8,_tF);\
+ _t9 = _mm_sub_pd(_t9,_tH);\
+ _t10 = _mm_sub_sd(_t10,_tI);\
+ _mm_storeu_pd(ptrA,_t1);\
+ _mm_storeu_pd(ptrA+2,_t2);\
+ _mm_storeu_pd(ptrA+4,_t3);\
+ _mm_storeu_pd(ptrA+6,_t4);\
+ _mm_store_sd(ptrA+8,_t5);\
+ _mm_storeu_pd(ptrB,_t6);\
+ _mm_storeu_pd(ptrB+2,_t7);\
+ _mm_storeu_pd(ptrB+4,_t8);\
+ _mm_storeu_pd(ptrB+6,_t9);\
+ _mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+6,t9);
_mm_store_sd(ptrB+8,t10);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+ __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+ __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+ _t1 = _mm_loadu_pd(ptrA);\
+ _t2 = _mm_loadu_pd(ptrA+2);\
+ _t3 = _mm_loadu_pd(ptrA+4);\
+ _t4 = _mm_loadu_pd(ptrA+6);\
+ _t5 = _mm_loadu_pd(ptrA+8);\
+ _t6 = _mm_loadu_pd(ptrA+10);\
+ _t7 = _mm_loadu_pd(ptrB);\
+ _t8 = _mm_loadu_pd(ptrB+2);\
+ _t9 = _mm_loadu_pd(ptrB+4);\
+ _t10 = _mm_loadu_pd(ptrB+6);\
+ _t11 = _mm_loadu_pd(ptrB+8);\
+ _t12 = _mm_loadu_pd(ptrB+10);\
+ _tA = _mm_unpacklo_pd(_x1,_y1);\
+ _tB = _mm_unpackhi_pd(_x1,_y1);\
+ _tC = _mm_unpacklo_pd(_z1,_x2);\
+ _tD = _mm_unpackhi_pd(_z1,_x2);\
+ _tE = _mm_unpacklo_pd(_y2,_z2);\
+ _tF = _mm_unpackhi_pd(_y2,_z2);\
+ _tG = _mm_unpacklo_pd(_x3,_y3);\
+ _tH = _mm_unpackhi_pd(_x3,_y3);\
+ _tI = _mm_unpacklo_pd(_z3,_x4);\
+ _tJ = _mm_unpackhi_pd(_z3,_x4);\
+ _tK = _mm_unpacklo_pd(_y4,_z4);\
+ _tL = _mm_unpackhi_pd(_y4,_z4);\
+ _t1 = _mm_sub_pd(_t1,_tA);\
+ _t2 = _mm_sub_pd(_t2,_tC);\
+ _t3 = _mm_sub_pd(_t3,_tE);\
+ _t4 = _mm_sub_pd(_t4,_tG);\
+ _t5 = _mm_sub_pd(_t5,_tI);\
+ _t6 = _mm_sub_pd(_t6,_tK);\
+ _t7 = _mm_sub_pd(_t7,_tB);\
+ _t8 = _mm_sub_pd(_t8,_tD);\
+ _t9 = _mm_sub_pd(_t9,_tF);\
+ _t10 = _mm_sub_pd(_t10,_tH);\
+ _t11 = _mm_sub_pd(_t11,_tJ);\
+ _t12 = _mm_sub_pd(_t12,_tL);\
+ _mm_storeu_pd(ptrA, _t1);\
+ _mm_storeu_pd(ptrA+2,_t2);\
+ _mm_storeu_pd(ptrA+4,_t3);\
+ _mm_storeu_pd(ptrA+6,_t4);\
+ _mm_storeu_pd(ptrA+8,_t5);\
+ _mm_storeu_pd(ptrA+10,_t6);\
+ _mm_storeu_pd(ptrB, _t7);\
+ _mm_storeu_pd(ptrB+2,_t8);\
+ _mm_storeu_pd(ptrB+4,_t9);\
+ _mm_storeu_pd(ptrB+6,_t10);\
+ _mm_storeu_pd(ptrB+8,_t11);\
+ _mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+8,t11);
_mm_storeu_pd(ptrB+10,t12);
}
+#endif
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{\
+ __m128d _t1,_t2;\
+ GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+ GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+ GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
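+    /* the 2x2 transposes (and the manual unpack of fix3/fiy3 below) interleave the two SIMD lanes so a single add reduces two force components at once */\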
+ _t1 = fix3;\
+ fix3 = _mm_unpacklo_pd(fix3,fiy3);\
+ fiy3 = _mm_unpackhi_pd(_t1,fiy3);\
+ fix1 = _mm_add_pd(fix1,fiy1);\
+ fiz1 = _mm_add_pd(fiz1,fix2);\
+ fiy2 = _mm_add_pd(fiy2,fiz2);\
+ fix3 = _mm_add_pd(fix3,fiy3);\
+ fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3));\
+ _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+ _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+ _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+ _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+ _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+ fix1 = _mm_add_pd(fix1,fix3);\
+ _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+ fix1 = _mm_add_pd(fix1,_t1);\
+ _t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+ fiz1 = _mm_add_sd(fiz1,fiz3);\
+ fiz1 = _mm_add_sd(fiz1,_t2);\
+ _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+ _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m128d _t1,_t2;\
+ GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+ GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+ GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+ GMX_MM_TRANSPOSE2_PD(fix3,fiy3);\
+ GMX_MM_TRANSPOSE2_PD(fiz3,fix4);\
+ GMX_MM_TRANSPOSE2_PD(fiy4,fiz4);\
+ fix1 = _mm_add_pd(fix1,fiy1);\
+ fiz1 = _mm_add_pd(fiz1,fix2);\
+ fiy2 = _mm_add_pd(fiy2,fiz2);\
+ fix3 = _mm_add_pd(fix3,fiy3);\
+ fiz3 = _mm_add_pd(fiz3,fix4);\
+ fiy4 = _mm_add_pd(fiy4,fiz4);\
+ _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+ _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+ _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+ _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+ _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));\
+ _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+ _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+ fix1 = _mm_add_pd(fix1,_t1);\
+ _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+ fix3 = _mm_add_pd(fix3,_t2);\
+ fix1 = _mm_add_pd(fix1,fix3);\
+ fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+ fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+ fiz1 = _mm_add_sd(fiz1,fiz3);\
+ _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+ _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
fix3 = _mm_add_pd(fix3,fiy3);
fiz3 = _mm_add_pd(fiz3,fix4);
fiy4 = _mm_add_pd(fiy4,fiz4);
-
+
_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
+#endif
static gmx_inline void
/* We require SSE2 now! */
-#include <math.h>
+#include <math.h>
#include "gmx_x86_sse2.h"
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1,
- __m128 * gmx_restrict y1,
- __m128 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
-
+
t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
t3 = _mm_load_ss(xyz_shift+2);
t4 = _mm_load_ss(xyz+2);
t1 = _mm_add_ps(t1,t2);
t3 = _mm_add_ss(t3,t4);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
tB = _mm_load_ss(xyz_shift+2);
-
+
t1 = _mm_loadu_ps(xyz);
t2 = _mm_loadu_ps(xyz+4);
t3 = _mm_load_ss(xyz+8);
-
+
tA = _mm_movelh_ps(tA,tB);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ss(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
tB = _mm_load_ss(xyz_shift+2);
-
+
t1 = _mm_loadu_ps(xyz);
t2 = _mm_loadu_ps(xyz+4);
t3 = _mm_loadu_ps(xyz+8);
-
+
tA = _mm_movelh_ps(tA,tB);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ps(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 t1,t2,t3,t4;
t1 = _mm_loadu_ps(ptrA);
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
__m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 t1,t2,t3,t4;
t1 = _mm_loadu_ps(ptrA);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
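+/* (Passing more than three __m128 arguments by value fails to compile on that platform, hence the macro form below.) */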
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18 = _mm_movehl_ps(_z3,_z3);\
+_t19 = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20 = _mm_movelh_ps(_x1,_z1);\
+_t21 = _mm_movehl_ps(_z1,_x1);\
+_t22 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t23 = _mm_movelh_ps(_y2,_x3);\
+_t24 = _mm_movehl_ps(_x3,_y2);\
+_t25 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_load_ss(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t20);\
+_t2 = _mm_sub_ps(_t2,_t23);\
+_t3 = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_load_ss(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_t21);\
+_t5 = _mm_sub_ps(_t5,_t24);\
+_t6 = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_load_ss(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t22);\
+_t8 = _mm_sub_ps(_t8,_t25);\
+_t9 = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_load_ss(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3)
+ __m128 x3, __m128 y3, __m128 z3)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
_mm_storeu_ps(ptrD+4,t11);
_mm_store_ss(ptrD+8,t12);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_unpackhi_ps(_z3,_x4);\
+_z3 = _mm_unpacklo_ps(_z3,_x4);\
+_t18 = _mm_unpackhi_ps(_y4,_z4);\
+_y4 = _mm_unpacklo_ps(_y4,_z4);\
+_t19 = _mm_movelh_ps(_x1,_z1);\
+_z1 = _mm_movehl_ps(_z1,_x1);\
+_t20 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t21 = _mm_movelh_ps(_y2,_x3);\
+_x3 = _mm_movehl_ps(_x3,_y2);\
+_t22 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t23 = _mm_movelh_ps(_z3,_y4);\
+_y4 = _mm_movehl_ps(_y4,_z3);\
+_t24 = _mm_movelh_ps(_t17,_t18);\
+_t18 = _mm_movehl_ps(_t18,_t17);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_loadu_ps(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t19);\
+_t2 = _mm_sub_ps(_t2,_t21);\
+_t3 = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_loadu_ps(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_z1);\
+_t5 = _mm_sub_ps(_t5,_x3);\
+_t6 = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_loadu_ps(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t20);\
+_t8 = _mm_sub_ps(_t8,_t22);\
+_t9 = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_loadu_ps(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
__m128 x3, __m128 y3, __m128 z3,
- __m128 x4, __m128 y4, __m128 z4)
+ __m128 x4, __m128 y4, __m128 z4)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
__m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
_mm_storeu_ps(ptrD+4,t11);
_mm_storeu_ps(ptrD+8,t12);
}
-
+#endif
static gmx_inline void
_mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{\
+ __m128 _t1,_t2,_t3,_t4;\
+\
+ _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+ _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+ _t2 = _mm_movehl_ps(_mm_setzero_ps(),fiz3);\
+ _t1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));\
+ _t3 = _mm_shuffle_ps(_t2,_t2,_MM_SHUFFLE(0,0,0,1));\
+ fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+ fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+ fiz3 = _mm_add_ss(_mm_add_ps(fiz3,_t1) , _mm_add_ps(_t2,_t3));\
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+ _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+ _t4 = _mm_load_ss(fshiftptr+2);\
+ _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+ _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+ _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+ _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+ _t3 = _mm_shuffle_ps(_t3 ,_t3 ,_MM_SHUFFLE(1,2,0,0));\
+ _t1 = _mm_add_ps(_t1,_t2);\
+ _t3 = _mm_add_ps(_t3,_t4);\
+ _t1 = _mm_add_ps(_t1,_t3);\
+ _mm_store_ss(fshiftptr+2,_t1);\
+ _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t1);
_mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m128 _t1,_t2,_t3,_t4,_t5;\
+ _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+ _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+ _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);\
+ fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+ fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+ fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));\
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+ _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+ _t5 = _mm_load_ss(fshiftptr+2);\
+ _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+ _t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+ _t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+ _t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+ _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+ _t4 = _mm_shuffle_ps(fiz3,_t4 ,_MM_SHUFFLE(2,0,3,3));\
+ _t1 = _mm_add_ps(_t1,_t2);\
+ _t3 = _mm_add_ps(_t3,_t4);\
+ _t1 = _mm_add_ps(_t1,_t3);\
+ _t5 = _mm_add_ps(_t5,_t1);\
+ _mm_store_ss(fshiftptr+2,_t5);\
+ _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
_mm_store_ss(fshiftptr+2,t5);
_mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
-
+#endif
static void
}
-static void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
- __m128 pot2, float * gmx_restrict ptrB,
- __m128 pot3, float * gmx_restrict ptrC,
- __m128 pot4, float * gmx_restrict ptrD)
-{
- _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
- pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
- pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
- pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
- pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
- _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
- _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
#endif /* _kernelutil_x86_sse2_single_h_ */
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1,
- __m128d * gmx_restrict y1,
- __m128d * gmx_restrict z1)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1,
+ __m128d * gmx_restrict y1,
+ __m128d * gmx_restrict z1)
{
__m128d mem_xy,mem_z,mem_sxy,mem_sz;
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
__m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
- const double * gmx_restrict xyz,
- __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
- __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
- __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
- __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+ const double * gmx_restrict xyz,
+ __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+ __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+ __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+ __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
__m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
- *x = _mm_load_sd(p1);
- *y = _mm_load_sd(p1+1);
- *z = _mm_load_sd(p1+2);
+ *x = _mm_load_sd(p1);
+ *y = _mm_load_sd(p1+1);
+ *z = _mm_load_sd(p1+2);
}
static gmx_inline void
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
__m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
- *x1 = _mm_load_sd(p1);
- *y1 = _mm_load_sd(p1+1);
- *z1 = _mm_load_sd(p1+2);
- *x2 = _mm_load_sd(p1+3);
- *y2 = _mm_load_sd(p1+4);
- *z2 = _mm_load_sd(p1+5);
- *x3 = _mm_load_sd(p1+6);
- *y3 = _mm_load_sd(p1+7);
- *z3 = _mm_load_sd(p1+8);
+ *x1 = _mm_load_sd(p1);
+ *y1 = _mm_load_sd(p1+1);
+ *z1 = _mm_load_sd(p1+2);
+ *x2 = _mm_load_sd(p1+3);
+ *y2 = _mm_load_sd(p1+4);
+ *z2 = _mm_load_sd(p1+5);
+ *x3 = _mm_load_sd(p1+6);
+ *y3 = _mm_load_sd(p1+7);
+ *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void
/* Routines to decrement rvec in memory, typically use for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy, __m128d z)
+ __m128d xy, __m128d z)
{
__m128d t1,t2;
_mm_store_sd(ptrA+2,t2);
}
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3)
-{
- __m128d t1,t2;
- __m128d tA,tB,tC,tD,tE;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_load_sd(ptrA+8);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_sd(tE,z3);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
- __m128d xy1, __m128d z1,
- __m128d xy2, __m128d z2,
- __m128d xy3, __m128d z3,
- __m128d xy4, __m128d z4)
-{
- __m128d t1,t2,t3,t4;
- __m128d tA,tB,tC,tD,tE,tF;
-
- tA = _mm_loadu_pd(ptrA);
- tB = _mm_loadu_pd(ptrA+2);
- tC = _mm_loadu_pd(ptrA+4);
- tD = _mm_loadu_pd(ptrA+6);
- tE = _mm_loadu_pd(ptrA+8);
- tF = _mm_loadu_pd(ptrA+10);
-
- /* xy1: y1 x1 */
- t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
- t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
- /* xy3: y3 x3 */
- t3 = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
- t4 = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
- tA = _mm_sub_pd(tA,xy1);
- tB = _mm_sub_pd(tB,t1);
- tC = _mm_sub_pd(tC,t2);
- tD = _mm_sub_pd(tD,xy3);
- tE = _mm_sub_pd(tE,t3);
- tF = _mm_sub_pd(tF,t4);
-
- _mm_storeu_pd(ptrA,tA);
- _mm_storeu_pd(ptrA+2,tB);
- _mm_storeu_pd(ptrA+4,tC);
- _mm_storeu_pd(ptrA+6,tD);
- _mm_storeu_pd(ptrA+8,tE);
- _mm_storeu_pd(ptrA+10,tF);
-}
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_t1 = _mm_sub_pd(_t1,_x1);\
+_t2 = _mm_sub_pd(_t2,_z1);\
+_t3 = _mm_sub_pd(_t3,_y2);\
+_t4 = _mm_sub_pd(_t4,_x3);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+6,t4);
_mm_store_sd(ptrA+8,t5);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_x1 = _mm_unpacklo_pd(_x1,_y1);\
+_z1 = _mm_unpacklo_pd(_z1,_x2);\
+_y2 = _mm_unpacklo_pd(_y2,_z2);\
+_x3 = _mm_unpacklo_pd(_x3,_y3);\
+_z3 = _mm_unpacklo_pd(_z3,_x4);\
+_y4 = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA, _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5,z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
}
+#endif
+
static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
_mm_store_sd(ptrB+2,t4);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_load_sd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrB);\
+_t7 = _mm_loadu_pd(ptrB+2);\
+_t8 = _mm_loadu_pd(ptrB+4);\
+_t9 = _mm_loadu_pd(ptrB+6);\
+_t10 = _mm_load_sd(ptrB+8);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpackhi_pd(_z3,_z3);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_sd(_t5,_z3);\
+_t6 = _mm_sub_pd(_t6,_tB);\
+_t7 = _mm_sub_pd(_t7,_tD);\
+_t8 = _mm_sub_pd(_t8,_tF);\
+_t9 = _mm_sub_pd(_t9,_tH);\
+_t10 = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+6,t9);
_mm_store_sd(ptrB+8,t10);
}
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1 = _mm_loadu_pd(ptrA);\
+_t2 = _mm_loadu_pd(ptrA+2);\
+_t3 = _mm_loadu_pd(ptrA+4);\
+_t4 = _mm_loadu_pd(ptrA+6);\
+_t5 = _mm_loadu_pd(ptrA+8);\
+_t6 = _mm_loadu_pd(ptrA+10);\
+_t7 = _mm_loadu_pd(ptrB);\
+_t8 = _mm_loadu_pd(ptrB+2);\
+_t9 = _mm_loadu_pd(ptrB+4);\
+_t10 = _mm_loadu_pd(ptrB+6);\
+_t11 = _mm_loadu_pd(ptrB+8);\
+_t12 = _mm_loadu_pd(ptrB+10);\
+_tA = _mm_unpacklo_pd(_x1,_y1);\
+_tB = _mm_unpackhi_pd(_x1,_y1);\
+_tC = _mm_unpacklo_pd(_z1,_x2);\
+_tD = _mm_unpackhi_pd(_z1,_x2);\
+_tE = _mm_unpacklo_pd(_y2,_z2);\
+_tF = _mm_unpackhi_pd(_y2,_z2);\
+_tG = _mm_unpacklo_pd(_x3,_y3);\
+_tH = _mm_unpackhi_pd(_x3,_y3);\
+_tI = _mm_unpacklo_pd(_z3,_x4);\
+_tJ = _mm_unpackhi_pd(_z3,_x4);\
+_tK = _mm_unpacklo_pd(_y4,_z4);\
+_tL = _mm_unpackhi_pd(_y4,_z4);\
+_t1 = _mm_sub_pd(_t1,_tA);\
+_t2 = _mm_sub_pd(_t2,_tC);\
+_t3 = _mm_sub_pd(_t3,_tE);\
+_t4 = _mm_sub_pd(_t4,_tG);\
+_t5 = _mm_sub_pd(_t5,_tI);\
+_t6 = _mm_sub_pd(_t6,_tK);\
+_t7 = _mm_sub_pd(_t7,_tB);\
+_t8 = _mm_sub_pd(_t8,_tD);\
+_t9 = _mm_sub_pd(_t9,_tF);\
+_t10 = _mm_sub_pd(_t10,_tH);\
+_t11 = _mm_sub_pd(_t11,_tJ);\
+_t12 = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA, _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB, _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
_mm_storeu_pd(ptrB+8,t11);
_mm_storeu_pd(ptrB+10,t12);
}
-
+#endif
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+ fptr,fshiftptr) \
+{\
+ __m128d _t1,_t2;\
+ fix1 = _mm_hadd_pd(fix1,fiy1);\
+ fiz1 = _mm_hadd_pd(fiz1,fix2);\
+ fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+ fix3 = _mm_hadd_pd(fix3,fiy3);\
+ fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+ _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+ _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+ _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+ _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+ _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+ fix1 = _mm_add_pd(fix1,fix3);\
+ _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+ fix1 = _mm_add_pd(fix1,_t1);\
+ _t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+ fiz1 = _mm_add_sd(fiz1,fiz3);\
+ fiz1 = _mm_add_sd(fiz1,_t2);\
+ _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+ _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+ fptr,fshiftptr) \
+{\
+ __m128d _t1,_t2;\
+ fix1 = _mm_hadd_pd(fix1,fiy1);\
+ fiz1 = _mm_hadd_pd(fiz1,fix2);\
+ fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+ fix3 = _mm_hadd_pd(fix3,fiy3);\
+ fiz3 = _mm_hadd_pd(fiz3,fix4);\
+ fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+ _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+ _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+ _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+ _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+ _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));\
+ _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+ _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+ fix1 = _mm_add_pd(fix1,_t1);\
+ _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+ fix3 = _mm_add_pd(fix3,_t2);\
+ fix1 = _mm_add_pd(fix1,fix3);\
+ fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+ fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+ fiz1 = _mm_add_sd(fiz1,fiz3);\
+ _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+ _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-
-
+#endif
static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
#ifndef _kernelutil_x86_sse4_1_single_h_
#define _kernelutil_x86_sse4_1_single_h_
-#include <math.h>
+#include <math.h>
#include "gmx_x86_sse4_1.h"
#undef gmx_restrict
-#define gmx_restrict
+#define gmx_restrict
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0,t1,t2,t3) _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
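/* e.g. gmx_mm_sum4_ps(a,b,c,d) evaluates (a+b)+(c+d); pairing the additions keeps the dependency chain at two adds instead of three */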
const float * gmx_restrict ptrD)
{
__m128 t1,t2;
-
+
t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
return _mm_unpacklo_ps(t1,t2);
__m128 xmm1)
{
__m128 t2,t3,t4;
-
- t3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
- t2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
- t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
- _mm_store_ss(ptrA,xmm1);
- _mm_store_ss(ptrB,t2);
- _mm_store_ss(ptrC,t3);
- _mm_store_ss(ptrD,t4);
+
+ t3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
+ t2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+ t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+ _mm_store_ss(ptrA,xmm1);
+ _mm_store_ss(ptrB,t2);
+ _mm_store_ss(ptrC,t3);
+ _mm_store_ss(ptrD,t4);
}
/* Similar to store, but increments value in memory */
float * gmx_restrict ptrD, __m128 xmm1)
{
__m128 tmp;
-
+
tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
tmp = _mm_add_ps(tmp,xmm1);
gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
__m128 * gmx_restrict c12)
{
__m128 t1,t2,t3,t4;
-
+
t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1); /* - - c12a c6a */
t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2); /* - - c12b c6b */
t3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3); /* - - c12c c6c */
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1,
- __m128 * gmx_restrict y1,
- __m128 * gmx_restrict z1)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
__m128 t1,t2,t3,t4;
-
+
t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
t3 = _mm_load_ss(xyz_shift+2);
t4 = _mm_load_ss(xyz+2);
t1 = _mm_add_ps(t1,t2);
t3 = _mm_add_ss(t3,t4);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
tB = _mm_load_ss(xyz_shift+2);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ss(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
- const float * gmx_restrict xyz,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 tA,tB;
__m128 t1,t2,t3,t4,t5,t6;
-
+
tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
tB = _mm_load_ss(xyz_shift+2);
-
+
t1 = _mm_loadu_ps(xyz);
t2 = _mm_loadu_ps(xyz+4);
t3 = _mm_loadu_ps(xyz+8);
-
+
tA = _mm_movelh_ps(tA,tB);
t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-
+
t1 = _mm_add_ps(t1,t4);
t2 = _mm_add_ps(t2,t5);
t3 = _mm_add_ps(t3,t6);
-
+
*x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
*y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
*z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 t1,t2,t3,t4;
t1 = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrA ) );
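/* _mm_lddqu_si128 (SSE3) is used for the unaligned loads here; it can avoid cache-line-split penalties on some processors */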
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
__m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
__m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 t1,t2,t3,t4;
t1 = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA) ) );
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18 = _mm_movehl_ps(_z3,_z3);\
+_t19 = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20 = _mm_movelh_ps(_x1,_z1);\
+_t21 = _mm_movehl_ps(_z1,_x1);\
+_t22 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t23 = _mm_movelh_ps(_y2,_x3);\
+_t24 = _mm_movehl_ps(_x3,_y2);\
+_t25 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_load_ss(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t20);\
+_t2 = _mm_sub_ps(_t2,_t23);\
+_t3 = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_load_ss(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_t21);\
+_t5 = _mm_sub_ps(_t5,_t24);\
+_t6 = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_load_ss(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t22);\
+_t8 = _mm_sub_ps(_t8,_t25);\
+_t9 = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_load_ss(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3)
+ __m128 x3, __m128 y3, __m128 z3)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
t10 = _mm_loadu_ps(ptrD);
t11 = _mm_loadu_ps(ptrD+4);
t12 = _mm_load_ss(ptrD+8);
-
+
t1 = _mm_sub_ps(t1,t20);
t2 = _mm_sub_ps(t2,t23);
t3 = _mm_sub_ss(t3,z3);
_mm_storeu_ps(ptrD+4,t11);
_mm_store_ss(ptrD+8,t12);
}
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13 = _mm_unpackhi_ps(_x1,_y1);\
+_x1 = _mm_unpacklo_ps(_x1,_y1);\
+_t14 = _mm_unpackhi_ps(_z1,_x2);\
+_z1 = _mm_unpacklo_ps(_z1,_x2);\
+_t15 = _mm_unpackhi_ps(_y2,_z2);\
+_y2 = _mm_unpacklo_ps(_y2,_z2);\
+_t16 = _mm_unpackhi_ps(_x3,_y3);\
+_x3 = _mm_unpacklo_ps(_x3,_y3);\
+_t17 = _mm_unpackhi_ps(_z3,_x4);\
+_z3 = _mm_unpacklo_ps(_z3,_x4);\
+_t18 = _mm_unpackhi_ps(_y4,_z4);\
+_y4 = _mm_unpacklo_ps(_y4,_z4);\
+_t19 = _mm_movelh_ps(_x1,_z1);\
+_z1 = _mm_movehl_ps(_z1,_x1);\
+_t20 = _mm_movelh_ps(_t13,_t14);\
+_t14 = _mm_movehl_ps(_t14,_t13);\
+_t21 = _mm_movelh_ps(_y2,_x3);\
+_x3 = _mm_movehl_ps(_x3,_y2);\
+_t22 = _mm_movelh_ps(_t15,_t16);\
+_t16 = _mm_movehl_ps(_t16,_t15);\
+_t23 = _mm_movelh_ps(_z3,_y4);\
+_y4 = _mm_movehl_ps(_y4,_z3);\
+_t24 = _mm_movelh_ps(_t17,_t18);\
+_t18 = _mm_movehl_ps(_t18,_t17);\
+_t1 = _mm_loadu_ps(ptrA);\
+_t2 = _mm_loadu_ps(ptrA+4);\
+_t3 = _mm_loadu_ps(ptrA+8);\
+_t1 = _mm_sub_ps(_t1,_t19);\
+_t2 = _mm_sub_ps(_t2,_t21);\
+_t3 = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4 = _mm_loadu_ps(ptrB);\
+_t5 = _mm_loadu_ps(ptrB+4);\
+_t6 = _mm_loadu_ps(ptrB+8);\
+_t4 = _mm_sub_ps(_t4,_z1);\
+_t5 = _mm_sub_ps(_t5,_x3);\
+_t6 = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7 = _mm_loadu_ps(ptrC);\
+_t8 = _mm_loadu_ps(ptrC+4);\
+_t9 = _mm_loadu_ps(ptrC+8);\
+_t7 = _mm_sub_ps(_t7,_t20);\
+_t8 = _mm_sub_ps(_t8,_t22);\
+_t9 = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10 = _mm_loadu_ps(ptrD);\
+_t11 = _mm_loadu_ps(ptrD+4);\
+_t12 = _mm_loadu_ps(ptrD+8);\
+_t10 = _mm_sub_ps(_t10,_t14);\
+_t11 = _mm_sub_ps(_t11,_t16);\
+_t12 = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
__m128 x3, __m128 y3, __m128 z3,
- __m128 x4, __m128 y4, __m128 z4)
+ __m128 x4, __m128 y4, __m128 z4)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
__m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
_mm_storeu_ps(ptrD+4,t11);
_mm_storeu_ps(ptrD+8,t12);
}
-
+#endif
static gmx_inline void
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
{
- __m128 t2,t3;
-
+ __m128 t2,t3;
+
fix1 = _mm_hadd_ps(fix1,fix1);
- fiy1 = _mm_hadd_ps(fiy1,fiz1);
-
- fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
-
- t2 = _mm_load_ss(fptr);
- t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
- t3 = _mm_load_ss(fshiftptr);
- t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
-
- t2 = _mm_add_ps(t2,fix1);
- t3 = _mm_add_ps(t3,fix1);
-
- _mm_store_ss(fptr,t2);
- _mm_storeh_pi((__m64 *)(fptr+1),t2);
- _mm_store_ss(fshiftptr,t3);
- _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
+ fiy1 = _mm_hadd_ps(fiy1,fiz1);
+
+ fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
+
+ t2 = _mm_load_ss(fptr);
+ t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
+ t3 = _mm_load_ss(fshiftptr);
+ t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
+
+ t2 = _mm_add_ps(t2,fix1);
+ t3 = _mm_add_ps(t3,fix1);
+
+ _mm_store_ss(fptr,t2);
+ _mm_storeh_pi((__m64 *)(fptr+1),t2);
+ _mm_store_ss(fshiftptr,t3);
+ _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+_mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+_t4 = _mm_load_ss(fshiftptr+2);\
+_t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+_t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+_t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+_t3 = _mm_shuffle_ps(_t3,_t3,_MM_SHUFFLE(1,2,0,0));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_mm_store_ss(fshiftptr+2,_t1);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
{
- __m128 t1,t2,t3,t4;
-
- fix1 = _mm_hadd_ps(fix1,fiy1);
- fiz1 = _mm_hadd_ps(fiz1,fix2);
- fiy2 = _mm_hadd_ps(fiy2,fiz2);
- fix3 = _mm_hadd_ps(fix3,fiy3);
- fiz3 = _mm_hadd_ps(fiz3,fiz3);
-
- fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
- fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
- fiz3 = _mm_hadd_ps(fiz3,fiz3); /* - - - fiz3 */
-
- _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
- _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
- _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
-
- t4 = _mm_load_ss(fshiftptr+2);
- t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-
- t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
- t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
- t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
- t3 = _mm_shuffle_ps(t3 ,t3 ,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
-
- t1 = _mm_add_ps(t1,t2);
- t3 = _mm_add_ps(t3,t4);
- t1 = _mm_add_ps(t1,t3); /* y x - z */
-
- _mm_store_ss(fshiftptr+2,t1);
- _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
+ __m128 t1,t2,t3,t4;
+
+ fix1 = _mm_hadd_ps(fix1,fiy1);
+ fiz1 = _mm_hadd_ps(fiz1,fix2);
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);
+ fix3 = _mm_hadd_ps(fix3,fiy3);
+ fiz3 = _mm_hadd_ps(fiz3,fiz3);
+
+ fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+ fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+ fiz3 = _mm_hadd_ps(fiz3,fiz3); /* - - - fiz3 */
+
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+ _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
+
+ t4 = _mm_load_ss(fshiftptr+2);
+ t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+ t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
+ t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
+ t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
+ t3 = _mm_shuffle_ps(t3 ,t3 ,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
+ t1 = _mm_add_ps(t1,t2);
+ t3 = _mm_add_ps(t3,t4);
+ t1 = _mm_add_ps(t1,t3); /* y x - z */
+ _mm_store_ss(fshiftptr+2,t1);
+ _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fix4);\
+fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+_mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+_t5 = _mm_load_ss(fshiftptr+2);\
+_t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+_t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+_t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+_t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+_t4 = _mm_shuffle_ps(fiz3,_t4 ,_MM_SHUFFLE(2,0,3,3));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_t5 = _mm_add_ps(_t5,_t1);\
+_mm_store_ss(fshiftptr+2,_t5);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
{
- __m128 t1,t2,t3,t4,t5;
-
- fix1 = _mm_hadd_ps(fix1,fiy1);
- fiz1 = _mm_hadd_ps(fiz1,fix2);
- fiy2 = _mm_hadd_ps(fiy2,fiz2);
- fix3 = _mm_hadd_ps(fix3,fiy3);
- fiz3 = _mm_hadd_ps(fiz3,fix4);
- fiy4 = _mm_hadd_ps(fiy4,fiz4);
-
- fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
- fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
- fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
-
- _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
- _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
- _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
-
- t5 = _mm_load_ss(fshiftptr+2);
- t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
-
- t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
- t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
- t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
- t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
- t4 = _mm_shuffle_ps(fiz3,t4 ,_MM_SHUFFLE(2,0,3,3));
-
- t1 = _mm_add_ps(t1,t2);
- t3 = _mm_add_ps(t3,t4);
- t1 = _mm_add_ps(t1,t3);
- t5 = _mm_add_ps(t5,t1);
-
- _mm_store_ss(fshiftptr+2,t5);
- _mm_storeh_pi((__m64 *)(fshiftptr),t5);
+ __m128 t1,t2,t3,t4,t5;
+
+ fix1 = _mm_hadd_ps(fix1,fiy1);
+ fiz1 = _mm_hadd_ps(fiz1,fix2);
+ fiy2 = _mm_hadd_ps(fiy2,fiz2);
+ fix3 = _mm_hadd_ps(fix3,fiy3);
+ fiz3 = _mm_hadd_ps(fiz3,fix4);
+ fiy4 = _mm_hadd_ps(fiy4,fiz4);
+
+ fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+ fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+ fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
+
+ _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
+ _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+ _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
+
+ t5 = _mm_load_ss(fshiftptr+2);
+ t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
+
+ t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
+ t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
+ t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
+ t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
+ t4 = _mm_shuffle_ps(fiz3,t4 ,_MM_SHUFFLE(2,0,3,3));
+
+ t1 = _mm_add_ps(t1,t2);
+ t3 = _mm_add_ps(t3,t4);
+ t1 = _mm_add_ps(t1,t3);
+ t5 = _mm_add_ps(t5,t1);
+
+ _mm_store_ss(fshiftptr+2,t5);
+ _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
-
+#endif
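/* Reduce two 4-lane potential accumulators horizontally and add each scalar sum into *ptrA and *ptrB, respectively */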
static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
- __m128 t1,t2;
- t1 = _mm_movehl_ps(pot2,pot1);
- t2 = _mm_movelh_ps(pot1,pot2);
- t1 = _mm_add_ps(t1,t2);
- t2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
- pot1 = _mm_add_ps(t1,t2);
- pot2 = _mm_movehl_ps(t2,pot1);
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-}
-
-
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
- __m128 pot2, float * gmx_restrict ptrB,
- __m128 pot3, float * gmx_restrict ptrC,
- __m128 pot4, float * gmx_restrict ptrD)
-{
- _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
- pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
- pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
- pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
- pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
- _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
- _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
- _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
- _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
+ __m128 t1,t2;
+ t1 = _mm_movehl_ps(pot2,pot1);
+ t2 = _mm_movelh_ps(pot1,pot2);
+ t1 = _mm_add_ps(t1,t2);
+ t2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
+ pot1 = _mm_add_ps(t1,t2);
+ pot2 = _mm_movehl_ps(t2,pot1);
+ _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+ _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
}
case SEL_EXPRESSION:
case SEL_MODIFIER:
- assert(g);
rc = _gmx_sel_evaluate_method_params(data, sel, g);
if (rc != 0)
{
t_complex ***t;
int i,j;
- snew(t,x);
t = (t_complex ***)calloc(x,sizeof(t_complex**));
if(!t) exit(fprintf(stderr,"\nallocation error"));
t[0] = (t_complex **)calloc(x*y,sizeof(t_complex*));
}
}
-static void assign_param(t_functype ftype,t_iparams *newparam,
+/* A return value of 0 means the parameters were assigned successfully;
+ * -1 means this is an all-zero interaction that should not be added.
+ */
+static int
+assign_param(t_functype ftype,t_iparams *newparam,
real old[MAXFORCEPARAM],int comb,double reppow)
{
int i,j;
real tmp;
+ gmx_bool all_param_zero=TRUE;
/* Set to zero */
for(j=0; (j<MAXFORCEPARAM); j++)
- {
+ {
newparam->generic.buf[j]=0.0;
- }
+ /* If all parameters are zero we skip adding certain interaction types (selected below).
+ * We cannot apply this to ALL interactions, since many have valid reasons for having
+ * zero parameters (e.g. an index to a Cmap interaction, or LJ parameters), but
+ * we use it for angles and torsions that are typically generated automatically.
+ */
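+ /* A parameter smaller in magnitude than GMX_REAL_MIN (the smallest normalized real) is treated as zero here */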
+ all_param_zero = (all_param_zero==TRUE) && fabs(old[j])<GMX_REAL_MIN;
+ }
+
+ if(all_param_zero==TRUE)
+ {
+ if(IS_ANGLE(ftype) || IS_RESTRAINT_TYPE(ftype) || ftype==F_IDIHS ||
+ ftype==F_PDIHS || ftype==F_PIDIHS || ftype==F_RBDIHS || ftype==F_FOURDIHS)
+ {
+ return -1;
+ }
+ }
+
switch (ftype) {
case F_G96ANGLES:
/* Post processing of input data: store cosine iso angle itself */
case F_PIDIHS:
case F_ANGRES:
case F_ANGRESZ:
- newparam->pdihs.phiA = old[0];
- newparam->pdihs.cpA = old[1];
-
- /* Dont do any checks if all parameters are zero (such interactions will be removed).
- * Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
- * so I have changed the lower limit to -99 /EL
- *
- * Second, if the force constant is zero in both A and B states, we set the phase
- * and multiplicity to zero too so the interaction gets removed during clean-up.
- */
- newparam->pdihs.phiB = old[3];
- newparam->pdihs.cpB = old[4];
-
- if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
- {
- newparam->pdihs.phiA = 0.0;
- newparam->pdihs.phiB = 0.0;
- newparam->pdihs.mult = 0;
- }
- else
- {
- newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
- }
+ newparam->pdihs.phiA = old[0];
+ newparam->pdihs.cpA = old[1];
+
+ /* Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
+ * so I have changed the lower limit to -99 /EL
+ */
+ newparam->pdihs.phiB = old[3];
+ newparam->pdihs.cpB = old[4];
+ /* If both force constants are zero there is no interaction. Return -1 to signal
+ * this entry should NOT be added.
+ */
+ if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
+ {
+ return -1;
+ }
+
+ newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
break;
case F_POSRES:
newparam->rbdihs.rbcB[3]=-2.0*old[NR_FOURDIHS+2];
newparam->rbdihs.rbcB[4]=-4.0*old[NR_FOURDIHS+3];
newparam->rbdihs.rbcB[5]=0.0;
- break;
+ break;
case F_CONSTR:
case F_CONSTRNC:
newparam->constr.dA = old[0];
gmx_fatal(FARGS,"unknown function type %d in %s line %d",
ftype,__FILE__,__LINE__);
}
+ return 0;
}
static int enter_params(gmx_ffparams_t *ffparams, t_functype ftype,
{
t_iparams newparam;
int type;
-
- assign_param(ftype,&newparam,forceparams,comb,reppow);
+ int rc;
+
+ if( (rc=assign_param(ftype,&newparam,forceparams,comb,reppow))<0 )
+ {
+ /* -1 means this interaction is all-zero and should not be added */
+ return rc;
+ }
+
if (!bAppend) {
for (type=start; (type<ffparams->ntypes); type++) {
if (ffparams->functype[type]==ftype) {
__FILE__,__LINE__,*maxtypes);
}
type = enter_params(ffparams,ftype,p->param[k].c,comb,reppow,start,bAppend);
- if (!bNB) {
+ /* type==-1 signals that this interaction is all-zero and should not be added. */
+ if (!bNB && type>=0) {
nral = NRAL(ftype);
delta = nr*(nral+1);
srenew(il->iatoms,il->nr+delta);
b0 = 0;
switch (ftype) {
case F_BONDS:
- case F_G96BONDS:
b0 = idef->iparams[type].harmonic.rA;
+ break;
+ case F_G96BONDS:
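+ /* G96 bonds store the squared bond length as the parameter, so take the square root to recover b0 */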
+ b0 = sqrt(idef->iparams[type].harmonic.rA);
break;
case F_MORSE:
b0 = idef->iparams[type].morse.b0A;
/* A histidine residue exists that requires automated assignment, so
* doing the analysis of donors and acceptors is worthwhile. */
fprintf(stderr,
- "Analysing hydrogen-bonding network for automated assigment of histidine\n"
+ "Analysing hydrogen-bonding network for automated assignment of histidine\n"
" protonation.");
snew(donor,natom);
{
nn = rr[i].main;
}
+
if (nn[0] == '-')
{
- gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? " as a starting terminus" : (bEnd ? " as an ending terminus" : ""));
+ gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
}
}
CTYPE ("a value of -1 means: use rlist");
RTYPE("verlet-buffer-drift", ir->verletbuf_drift, 0.005);
CTYPE ("nblist cut-off");
- RTYPE ("rlist", ir->rlist, -1);
+ RTYPE ("rlist", ir->rlist, 1.0);
CTYPE ("long-range cut-off for switched potentials");
RTYPE ("rlistlong", ir->rlistlong, -1);
ITYPE ("nstcalclr", ir->nstcalclr, -1);
EETYPE("coulomb-modifier", ir->coulomb_modifier, eintmod_names);
CTYPE ("cut-off lengths");
RTYPE ("rcoulomb-switch", ir->rcoulomb_switch, 0.0);
- RTYPE ("rcoulomb", ir->rcoulomb, -1);
+ RTYPE ("rcoulomb", ir->rcoulomb, 1.0);
CTYPE ("Relative dielectric constant for the medium and the reaction field");
RTYPE ("epsilon-r", ir->epsilon_r, 1.0);
RTYPE ("epsilon-rf", ir->epsilon_rf, 0.0);
EETYPE("vdw-modifier", ir->vdw_modifier, eintmod_names);
CTYPE ("cut-off lengths");
RTYPE ("rvdw-switch", ir->rvdw_switch, 0.0);
- RTYPE ("rvdw", ir->rvdw, -1);
+ RTYPE ("rvdw", ir->rvdw, 1.0);
CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
EETYPE("DispCorr", ir->eDispCorr, edispc_names);
CTYPE ("Extension of the potential lookup tables beyond the cut-off");
RTYPE ("table-extension", ir->tabext, 1.0);
- CTYPE ("Seperate tables between energy group pairs");
+ CTYPE ("Separate tables between energy group pairs");
STYPE ("energygrp-table", egptable, NULL);
CTYPE ("Spacing for the PME/PPPM FFT grid");
RTYPE ("fourierspacing", ir->fourier_spacing,0.12);
}
fprintf(fplog,"\n");
+ /* the "Order After Exchange" is the state label corresponding to the configuration that
+ started in state listed in order, i.e.
+
+ 3 0 1 2
+
+ means that the:
+ configuration starting in simulation 3 is now in simulation 0,
+ configuration starting in simulation 0 is now in simulation 1,
+ configuration starting in simulation 1 is now in simulation 2,
+ configuration starting in simulation 2 is now in simulation 3
+ */
fprintf(fplog,"Order After Exchange: ");
for (i=0;i<n;i++)
{
= [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
= [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
= de[b][a] + de[a][b] */
+
/* permuted:
ediff = E_new - E_old
= [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
= [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
= [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
= (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */
+ /* but, in the current code implementation, we flip configurations, not indices,
+ so let's examine that:
+ = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+ = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_bp)]
+ = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+ So if we exchange b <=> bp and a <=> ap, we return to the same result.
+ The simple solution is therefore to flip the position of the perturbed
+ and original indices in the tests.
+ */
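+ /* Concretely: the caller passes calc_delta(fplog,bPrint,re,ap,bp,a,b),
+ i.e. a <=> ap and b <=> bp swapped, so the expression below evaluates
+ (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp]) as derived above. */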
+
ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
delta = ediff*beta[a]; /* assume all same temperature in this case */
break;
gmx_bool bPrint,bMultiEx;
gmx_bool *bEx = re->bEx;
real *prob = re->prob;
- int *pind = re->destinations;
+ int *pind = re->destinations; /* permuted index */
gmx_bool bEpot=FALSE;
gmx_bool bDLambda=FALSE;
gmx_bool bVol=FALSE;
for (i=0;i<re->nex;i++)
{
/* randomly select a pair */
- /* find out which state it is from, and what label that state currently has */
+ /* in theory, we could reduce this by identifying only which switches have a
+ non-negligible probability of occurring (log p > -100) and only operating
+ on those switches */
+ /* find out which state it is from, and what label that state currently has.
+ Likely more work than it is worth. */
i0 = (int)(re->nrepl*rando(&(re->seed)));
i1 = (int)(re->nrepl*rando(&(re->seed)));
if (i0==i1)
{
i--;
- continue; /* got the same pair, back up and do it again */
+ continue; /* self-exchange, back up and do it again */
}
- a = re->ind[i0];
+ a = re->ind[i0]; /* what are the indices of these states? */
b = re->ind[i1];
ap = pind[i0];
bp = pind[i1];
bPrint = FALSE; /* too noisy */
- delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); /* calculate the energy difference */
+ /* calculate the energy difference */
+ /* if the code changes to flip the STATES, rather than the configurations,
+ use the commented version of the code */
+ /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+ delta = calc_delta(fplog,bPrint,re,ap,bp,a,b);
- /* we actually only use the first space, since there are actually many switches between pairs. */
+ /* we only use the first entry of the prob and bEx arrays,
+ since there can be many switches between pairs. */
if (delta <= 0)
{
re->nmoves[re->ind[i]][pind[i]] +=1;
re->nmoves[pind[i]][re->ind[i]] +=1;
}
+ fflush(fplog); /* make sure we can see what the last exchange was */
}
static void write_debug_x(t_state *state)
/* There will be only one swap cycle with standard replica
* exchange, but there may be multiple swap cycles if we
* allow multiple swaps. */
+
for (j = 0; j < maxswap; j++)
{
exchange_partner = re->order[replica_id][j];
}
else
{
- /* check if some threads failed to set their affinities */
+ /* check & warn if some threads failed to set their affinities */
if (nth_affinity_set != nthread_local)
{
- char sbuf[STRLEN];
- sbuf[0] = '\0';
+ char sbuf1[STRLEN], sbuf2[STRLEN];
+
+ /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
+ sbuf1[0] = sbuf2[0] = '\0';
#ifdef GMX_MPI
#ifdef GMX_THREAD_MPI
- sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
+ sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
#else /* GMX_LIB_MPI */
+ sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
- sprintf(sbuf, "In MPI process #%d", cr->nodeid);
#endif /* GMX_MPI */
+
+ if (nthread_local > 1)
+ {
+ sprintf(sbuf2, "of %d/%d thread%s ",
+ nthread_local - nth_affinity_set, nthread_local,
+ (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+ }
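+
+ /* With the format strings above the assembled note reads, e.g.:
+ "NOTE: In MPI process #3: Affinity setting of 2/8 threads failed." */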
+
md_print_warn(NULL, fplog,
- "%s%d/%d thread%s failed to set their affinities. "
- "This can cause performance degradation!",
- sbuf, nthread_local - nth_affinity_set, nthread_local,
- (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+ "NOTE: %sAffinity setting %sfailed.\n"
+ " This can cause performance degradation!",
+ sbuf1, sbuf2);
}
}
}
/* Initialize the essential dynamics sampling.
* Put the pointer to the ED struct in constr */
constr->ed = ed;
- if (ed != NULL)
+ if (ed != NULL || state->edsamstate.nED > 0)
{
- init_edsam(mtop,ir,cr,ed,state->x,state->box);
+ init_edsam(mtop,ir,cr,ed,state->x,state->box,&state->edsamstate);
}
constr->warn_mtop = mtop;
* with respect to the collective
* anrs[0...nr-1] array */
rvec *x; /* positions for this structure */
- rvec *x_old; /* used to keep track of the shift vectors
- such that the ED molecule can always be
- made whole in the parallel case */
+ rvec *x_old; /* Last positions which have the correct PBC
+ representation of the ED group. In
+ combination with keeping track of the
+ shift vectors, the ED group can always
+ be made whole */
real *m; /* masses */
real mtot; /* total mass (only used in sref) */
real *sqrtm; /* sqrt of the masses used for mass-
FILE *edo; /* output file pointer */
t_edpar *edpar;
gmx_bool bFirst;
- gmx_bool bStartFromCpt;
} t_gmx_edsam;
for (i=0; i<edi->sav.nr; i++)
+ {
proj += edi->sav.sqrtm[i]*iprod(vec[i], xcoll[i]);
+ }
return proj;
}
/* Subtract average positions */
for (i = 0; i < edi->sav.nr; i++)
+ {
rvec_dec(x[i], edi->sav.x[i]);
+ }
for (i = 0; i < vec->neig; i++)
{
/* Add average positions */
for (i = 0; i < edi->sav.nr; i++)
+ {
rvec_inc(x[i], edi->sav.x[i]);
+ }
}
/* Subtract average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_dec(x[i], edi->sav.x[i]);
+ }
for (i=0; i<vec->neig; i++)
+ {
vec->xproj[i] = projectx(edi, x, vec->vec[i]);
+ }
/* Add average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_inc(x[i], edi->sav.x[i]);
+ }
}
for (i=0; i<vec->neig; i++)
+ {
rad += pow((vec->refproj[i]-vec->xproj[i]),2);
+ }
return sqrt(rad);
}
fp = fopen(fn, "w");
for (i=0; i<edi->sav.nr; i++)
+ {
fprintf(fp, "%d %9.5f %9.5f %9.5f %d %d %d %d %d %d\n",
edi->sav.anrs[i]+1,
xcoll[i][XX] , xcoll[i][YY] , xcoll[i][ZZ],
shifts[i][XX] , shifts[i][YY] , shifts[i][ZZ],
eshifts[i][XX], eshifts[i][YY], eshifts[i][ZZ]);
+ }
fclose(fp);
}
fprintf(out, "#%s positions:\n%d\n", name, s->nr);
if (s->nr == 0)
+ {
return;
+ }
fprintf(out, "#index, x, y, z");
if (s->sqrtm)
+ {
fprintf(out, ", sqrt(m)");
+ }
for (i=0; i<s->nr; i++)
{
fprintf(out, "\n%6d %11.6f %11.6f %11.6f",s->anrs[i], s->x[i][XX], s->x[i][YY], s->x[i][ZZ]);
if (s->sqrtm)
+ {
fprintf(out,"%9.3f",s->sqrtm[i]);
+ }
}
fprintf(out, "\n");
}
fprintf(out, "EV %4d\ncomponents %d\nstepsize %f\nxproj %f\nfproj %f\nrefproj %f\nradius %f\nComponents:\n",
ev->ieig[i], length, ev->stpsz[i], ev->xproj[i], ev->fproj[i], ev->refproj[i], ev->radius);
for (j=0; j<length; j++)
+ {
fprintf(out, "%11.6f %11.6f %11.6f\n", ev->vec[i][j][XX], ev->vec[i][j][YY], ev->vec[i][j][ZZ]);
+ }
}
}
for (i=0; i<dim; i++)
+ {
fprintf(out,"%4d %f %f %f\n",i,x[i][XX],x[i][YY],x[i][ZZ]);
+ }
}
for (i=0;i<dim;i++)
{
for (j=0;j<dim;j++)
+ {
fprintf(out,"%f ",mat[i][j]);
+ }
fprintf(out,"\n");
}
}
gmx_bool bFirst;
if(edi->buf->do_edfit != NULL)
+ {
bFirst = FALSE;
+ }
else
{
bFirst = TRUE;
/* construct loc->omega */
/* loc->omega is symmetric -> loc->omega==loc->omega' */
for(r=0;(r<6);r++)
+ {
for(c=0;(c<=r);c++)
+ {
if ((r>=3) && (c<3))
{
loc->omega[r][c]=u[r-3][c];
loc->omega[c][r]=u[r-3][c];
}
else
{
loc->omega[r][c]=0;
loc->omega[c][r]=0;
}
+ }
+ }
/* determine h and k */
#ifdef DEBUG
int i;
dump_mat(stderr,2*DIM,loc->omega);
for (i=0; i<6; i++)
+ {
fprintf(stderr,"d[%d] = %f\n",i,d[i]);
+ }
}
#endif
jacobi(loc->omega,6,d,loc->om,&irot);
if (irot==0)
+ {
fprintf(stderr,"IROT=0\n");
+ }
index=0; /* For the compiler only */
{
max_d=-1000;
for(i=0;(i<6);i++)
+ {
if (d[i]>max_d)
{
max_d=d[i];
index=i;
}
+ }
d[index]=-10000;
for(i=0;(i<3);i++)
{
/* determine R */
for(c=0;(c<3);c++)
+ {
for(r=0;(r<3);r++)
+ {
R[c][r]=vk[0][r]*vh[0][c]+
- vk[1][r]*vh[1][c]+
- vk[2][r]*vh[2][c];
+ vk[1][r]*vh[1][c]+
+ vk[2][r]*vh[2][c];
+ }
+ }
if (det(R) < 0)
+ {
for(c=0;(c<3);c++)
+ {
for(r=0;(r<3);r++)
+ {
R[c][r]=vk[0][r]*vh[0][c]+
- vk[1][r]*vh[1][c]-
- vk[2][r]*vh[2][c];
+ vk[1][r]*vh[1][c]-
+ vk[2][r]*vh[2][c];
+ }
+ }
+ }
}
for (i = 0; i < edi->flood.vecs.neig; i++)
{
if (edi->flood.vecs.refprojslope[i] != 0.0)
+ {
bOutputRef=TRUE;
+ }
}
if (bOutputRef)
{
fprintf(fp,"FL_FORCES: ");
for (i=0; i<edi->flood.vecs.neig; i++)
+ {
fprintf(fp," %12.5e",edi->flood.vecs.fproj[i]);
+ }
fprintf(fp,"\n");
}
if (edi->flood.bHarmonic)
+ {
for (i=0; i<edi->flood.vecs.neig; i++)
{
edi->flood.vecs.fproj[i] = edi->flood.Efl* edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]);
}
+ }
else
+ {
for (i=0; i<edi->flood.vecs.neig; i++)
{
/* if Efl is zero the forces are zero; if not, use the formula */
edi->flood.vecs.fproj[i] = edi->flood.Efl!=0 ? edi->flood.kT/edi->flood.Efl/edi->flood.alpha2*energy*edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]) : 0;
}
+ }
}
/* Clear forces first */
for (j=0; j<edi->sav.nr_loc; j++)
+ {
clear_rvec(forces_cart[j]);
+ }
/* Now compute atomwise */
for (j=0; j<edi->sav.nr_loc; j++)
edi->flood.Efl = edi->flood.Efl+edi->flood.dt/edi->flood.tau*(edi->flood.deltaF0-edi->flood.deltaF);
/* check if restrain (inverted flooding) -> don't let EFL become positive */
if (edi->flood.alpha2<0 && edi->flood.Efl>-0.00000001)
+ {
edi->flood.Efl = 0;
+ }
edi->flood.deltaF = (1-edi->flood.dt/edi->flood.tau)*edi->flood.deltaF+edi->flood.dt/edi->flood.tau*edi->flood.Vfl;
}
/* Only assemble REFERENCE positions if their indices differ from the average ones */
if (!edi->bRefEqAv)
+ {
communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, bNS, x,
edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+ }
/* If bUpdateShifts was TRUE, the shifts have just been updated in get_positions.
* We do not need to update the shifts until the next NS step */
/* Fit the reference indices to the reference structure */
if (edi->bRefEqAv)
+ {
fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+ }
else
+ {
fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+ }
/* Now apply the translation and rotation to the ED structure */
translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
/* Finally add forces to the main force variable */
for (i=0; i<edi->sav.nr_loc; i++)
+ {
rvec_inc(force[edi->sav.anrs_loc[i]],edi->flood.forces_cartesian[i]);
+ }
/* Output is written by the master process */
if (do_per_step(step,edi->outfrq) && MASTER(cr))
+ {
write_edo_flood(edi,edo,step);
+ }
}
{
/* Call flooding for one matrix */
if (edi->flood.vecs.neig)
+ {
do_single_flood(ed->edo,x,force,edi,step,box,cr,bNS);
+ }
edi = edi->next_edi;
}
}
/* Called by init_edi; configures some flooding-related variables and
 * structures, and prints headers to the output files */
-static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr)
+static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr, gmx_bool bPrintheader)
{
int i;
edi->flood.vecs.ieig[i], edi->flood.vecs.fproj[i]);
}
}
- fprintf(ed->edo,"FL_HEADER: Flooding of matrix %d is switched on! The flooding output will have the following format:\n",
- edi->flood.flood_id);
- fprintf(ed->edo,"FL_HEADER: Step Efl Vfl deltaF\n");
+
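+ /* init_flood() is called with bPrintheader = !EDstate->bFromCpt (see
+ * init_edsam below), so on checkpoint continuation runs that append to
+ * an existing output file the header is not printed a second time. */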
+ if (bPrintheader)
+ {
+ fprintf(ed->edo,"FL_HEADER: Flooding of matrix %d is switched on! The flooding output will have the following format:\n",
+ edi->flood.flood_id);
+ fprintf(ed->edo,"FL_HEADER: Step Efl Vfl deltaF\n");
+ }
}
}
count++;
}
if (nnames!=count-1)
+ {
gmx_fatal(FARGS,"Number of energies is not consistent with t_edi structure");
+ }
}
/************* END of FLOODING IMPLEMENTATION ****************************/
#endif
fprintf(stderr,"ED sampling will be performed!\n");
ed->edonam = ftp2fn(efEDO,nfile,fnm);
ed->edo = gmx_fio_fopen(ed->edonam,(Flags & MD_APPENDFILES)? "a+" : "w+");
- ed->bStartFromCpt = Flags & MD_STARTFROMCPT;
}
return ed;
}
static void check(const char *line, const char *label)
{
if (!strstr(line,label))
+ {
gmx_fatal(FARGS,"Could not find input parameter %s at expected position in edsam input-file (.edi)\nline read instead is %s",label,line);
+ }
}
sscanf (line,"%d%lf%lf%lf",&anrs[i],&d[0],&d[1],&d[2]);
anrs[i]--; /* we are reading FORTRAN indices */
for(j=0; j<3; j++)
+ {
x[i][j]=d[j]; /* always read as double and convert to single */
+ }
}
}
{
nscan = sscanf(line,"%d%lf",&idum,&rdum);
if (nscan != 2)
+ {
gmx_fatal(FARGS,"Expected 2 values for flooding vec: <nr> <stpsz>\n");
+ }
}
tvec->ieig[i]=idum;
tvec->stpsz[i]=rdum;
/* If the number of atoms differs between the two structures,
* they cannot be identical */
if (sref.nr != sav.nr)
+ {
return FALSE;
+ }
/* Now that we know that both structures have the same number of atoms,
* check if also the indices are identical */
for (i=0; i < sav.nr; i++)
{
if (sref.anrs[i] != sav.anrs[i])
+ {
return FALSE;
+ }
}
fprintf(stderr, "ED: Note: Reference and average structure are composed of the same atom indices.\n");
readmagic=read_edint(in,&bEOF);
/* Check whether we have reached the end of the input file */
if (bEOF)
+ {
return 0;
+ }
if (readmagic != magic)
{
if (readmagic==666 || readmagic==667 || readmagic==668)
+ {
gmx_fatal(FARGS,"Wrong magic number: Use newest version of make_edi to produce edi file");
+ }
else if (readmagic != 669)
+ {
gmx_fatal(FARGS,"Wrong magic number %d in %s",readmagic,ed->edinam);
+ }
}
/* check the number of atoms */
edi->nini=read_edint(in,&bEOF);
if (edi->nini != nr_mdatoms)
+ {
gmx_fatal(FARGS,"Nr of atoms in %s (%d) does not match nr of md atoms (%d)",
ed->edinam,edi->nini,nr_mdatoms);
+ }
/* Done checking. For the rest we blindly trust the input */
edi->fitmas = read_checked_edint(in,"FITMAS");
edi->flood.kT = read_checked_edreal(in,"KT");
edi->flood.bHarmonic = read_checked_edint(in,"HARMONIC");
if (readmagic > 669)
+ {
edi->flood.bConstForce = read_checked_edint(in,"CONST_FORCE_FLOODING");
+ }
else
+ {
edi->flood.bConstForce = FALSE;
+ }
edi->flood.flood_id = edi_nr;
edi->sref.nr = read_checked_edint(in,"NREF");
edi->sori.nr=read_edint(in,&bEOF);
if (edi->sori.nr > 0)
{
- if (bHaveReference)
- {
- /* Both an -ori structure and a at least one manual reference point have been
- * specified. That's ambiguous and probably not intentional. */
- gmx_fatal(FARGS, "ED: An origin structure has been provided and a at least one (moving) reference\n"
- " point was manually specified in the edi file. That is ambiguous. Aborting.\n");
- }
+ if (bHaveReference)
+ {
+ /* Both an -ori structure and at least one manual reference point have been
+ * specified. That's ambiguous and probably not intentional. */
+ gmx_fatal(FARGS, "ED: An origin structure has been provided and at least one (moving) reference\n"
+ " point was manually specified in the edi file. That is ambiguous. Aborting.\n");
+ }
snew(edi->sori.anrs,edi->sori.nr);
snew(edi->sori.x ,edi->sori.nr);
edi->sori.sqrtm =NULL;
/* Read in the edi input file. Note that it may contain several ED data sets which were
* achieved by concatenating multiple edi files. The standard case would be a single ED
* data set, though. */
-static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commrec *cr)
+static int read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commrec *cr)
{
FILE *in;
t_edpar *curr_edi,*last_edi;
edi_nr++;
/* Make sure that the number of atoms in each dataset is the same as in the tpr file */
if (edi->nini != nr_mdatoms)
+ {
gmx_fatal(FARGS,"edi file %s (dataset #%d) was made for %d atoms, but the simulation contains %d atoms.",
ed->edinam, edi_nr, edi->nini, nr_mdatoms);
+ }
/* Since we arrived within this while loop we know that there is still another data set to be read in */
/* We need to allocate space for the data: */
snew(edi_read,1);
curr_edi = edi_read;
}
if (edi_nr == 0)
+ {
gmx_fatal(FARGS, "No complete ED data set found in edi file %s.", ed->edinam);
+ }
/* Terminate the edi dataset list with a NULL pointer: */
last_edi->next_edi = NULL;
/* Close the .edi file again */
gmx_fio_fclose(in);
+
+ return edi_nr;
}
/* We do not touch the original positions but work on a copy. */
for (i=0; i<edi->sref.nr; i++)
+ {
copy_rvec(xcoll[i], loc->xcopy[i]);
+ }
/* Calculate the center of mass */
get_center(loc->xcopy, edi->sref.m, edi->sref.nr, com);
for (i=0; i < s->nr; i++)
+ {
rmsd += distance2(s->x[i], x[i]);
+ }
rmsd /= (real) s->nr;
rmsd = sqrt(rmsd);
/* Local atoms of the reference structure (for fitting), need only be assembled
* if their indices differ from the average ones */
if (!edi->bRefEqAv)
+ {
dd_make_local_group_indices(dd->ga2la, edi->sref.nr, edi->sref.anrs,
&edi->sref.nr_loc, &edi->sref.anrs_loc, &edi->sref.nalloc_loc, edi->sref.c_ind);
+ }
/* Local atoms of the average structure (on these ED will be performed) */
dd_make_local_group_indices(dd->ga2la, edi->sav.nr, edi->sav.anrs,
xu[XX] = x[XX]-tx*box[XX][XX]-ty*box[YY][XX]-tz*box[ZZ][XX];
xu[YY] = x[YY]-ty*box[YY][YY]-tz*box[ZZ][YY];
xu[ZZ] = x[ZZ]-tz*box[ZZ][ZZ];
- } else
+ }
+ else
{
xu[XX] = x[XX]-tx*box[XX][XX];
xu[YY] = x[YY]-ty*box[YY][YY];
if (edi->vecs.linacc.stpsz[i] > 0.0)
{
if ((proj-edi->vecs.linacc.refproj[i]) < 0.0)
+ {
add = edi->vecs.linacc.refproj[i] - proj;
+ }
}
if (edi->vecs.linacc.stpsz[i] < 0.0)
{
if ((proj-edi->vecs.linacc.refproj[i]) > 0.0)
+ {
add = edi->vecs.linacc.refproj[i] - proj;
+ }
}
/* apply the correction */
/* apply the correction */
proj[i] /= edi->sav.sqrtm[i];
proj[i] *= ratio;
- for (j=0; j<edi->sav.nr; j++) {
+ for (j=0; j<edi->sav.nr; j++)
+ {
svmul(proj[i], edi->vecs.radfix.vec[i][j], vec_dum);
rvec_inc(xcoll[j], vec_dum);
}
/* subtract the average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_dec(xcoll[i], edi->sav.x[i]);
+ }
/* apply the constraints */
if (step >= 0)
+ {
do_linfix(xcoll, edi, step, cr);
+ }
do_linacc(xcoll, edi, cr);
if (step >= 0)
+ {
do_radfix(xcoll, edi, step, cr);
+ }
do_radacc(xcoll, edi, cr);
do_radcon(xcoll, edi, cr);
/* add back the average positions */
for (i=0; i<edi->sav.nr; i++)
+ {
rvec_inc(xcoll[i], edi->sav.x[i]);
+ }
GMX_MPE_LOG(ev_ed_apply_cons_finish);
}
if (edi->bNeedDoEdsam)
{
if (step == -1)
+ {
fprintf(ed->edo, "Initial projections:\n");
+ }
else
{
fprintf(ed->edo,"Step %s, ED #%d ", gmx_step_str(step, buf), nr_edi);
{
fprintf(ed->edo," Monitor eigenvectors");
for (i=0; i<edi->vecs.mon.neig; i++)
+ {
fprintf(ed->edo," %d: %12.5e ",edi->vecs.mon.ieig[i],edi->vecs.mon.xproj[i]);
+ }
fprintf(ed->edo,"\n");
}
if (edi->vecs.linfix.neig)
{
fprintf(ed->edo," Linfix eigenvectors");
for (i=0; i<edi->vecs.linfix.neig; i++)
+ {
fprintf(ed->edo," %d: %12.5e ",edi->vecs.linfix.ieig[i],edi->vecs.linfix.xproj[i]);
+ }
fprintf(ed->edo,"\n");
}
if (edi->vecs.linacc.neig)
{
fprintf(ed->edo," Linacc eigenvectors");
for (i=0; i<edi->vecs.linacc.neig; i++)
+ {
fprintf(ed->edo," %d: %12.5e ",edi->vecs.linacc.ieig[i],edi->vecs.linacc.xproj[i]);
+ }
fprintf(ed->edo,"\n");
}
if (edi->vecs.radfix.neig)
{
fprintf(ed->edo," Radfix eigenvectors");
for (i=0; i<edi->vecs.radfix.neig; i++)
+ {
fprintf(ed->edo," %d: %12.5e ",edi->vecs.radfix.ieig[i],edi->vecs.radfix.xproj[i]);
+ }
fprintf(ed->edo,"\n");
fprintf(ed->edo," fixed increment radius = %f\n", calc_radius(&edi->vecs.radfix));
}
{
fprintf(ed->edo," Radacc eigenvectors");
for (i=0; i<edi->vecs.radacc.neig; i++)
+ {
fprintf(ed->edo," %d: %12.5e ",edi->vecs.radacc.ieig[i],edi->vecs.radacc.xproj[i]);
+ }
fprintf(ed->edo,"\n");
fprintf(ed->edo," acceptance radius = %f\n", calc_radius(&edi->vecs.radacc));
}
{
fprintf(ed->edo," Radcon eigenvectors");
for (i=0; i<edi->vecs.radcon.neig; i++)
+ {
fprintf(ed->edo," %d: %12.5e ",edi->vecs.radcon.ieig[i],edi->vecs.radcon.xproj[i]);
+ }
fprintf(ed->edo,"\n");
fprintf(ed->edo," contracting radius = %f\n", calc_radius(&edi->vecs.radcon));
}
if (NULL==floodvecs->refproj0)
+ {
snew(floodvecs->refproj0, floodvecs->neig);
+ }
for (i=0; i<floodvecs->neig; i++)
{
}
+/* Call on MASTER only. Check whether the essential dynamics / flooding
+ * datasets of the checkpoint file are consistent with the provided .edi file. */
+static void crosscheck_edi_file_vs_checkpoint(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+ t_edpar *edi = NULL; /* points to a single edi data set */
+ int i, edinum;
+
+
+ if (NULL == EDstate->nref || NULL == EDstate->nav)
+ {
+ gmx_fatal(FARGS, "Essential dynamics and flooding can only be switched on (or off) at the\n"
+ "start of a new simulation. If a simulation runs with/without ED constraints,\n"
+ "it must also continue with/without ED constraints when checkpointing.\n"
+ "To switch on (or off) ED constraints, please prepare a new .tpr to start\n"
+ "from without a checkpoint.\n");
+ }
+
+ edi=ed->edpar;
+ edinum = 0;
+ while(edi != NULL)
+ {
+ /* Check number of atoms in the reference and average structures */
+ if (EDstate->nref[edinum] != edi->sref.nr)
+ {
+ gmx_fatal(FARGS, "The number of reference structure atoms in ED dataset #%d is\n"
+ "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+ edinum+1, EDstate->nref[edinum], edi->sref.nr);
+ }
+ if (EDstate->nav[edinum] != edi->sav.nr)
+ {
+ gmx_fatal(FARGS, "The number of average structure atoms in ED dataset #%d is\n"
+ "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+ edinum+1, EDstate->nav[edinum], edi->sav.nr);
+ }
+ edi=edi->next_edi;
+ edinum++;
+ }
+
+ if (edinum != EDstate->nED)
+ {
+ gmx_fatal(FARGS, "The number of essential dynamics / flooding datasets is not consistent.\n"
+ "There are %d ED datasets in .cpt file, but %d in .edi file!\n"
+ "Are you shure this is the correct .edi file?\n", EDstate->nED, edinum);
+ }
+}
+
+
+/* The edsamstate struct stores the information we need to make the ED group
+ * whole again after restarts from a checkpoint file. Here we do the following:
+ * a) If we did not start from .cpt, we prepare the struct for proper .cpt writing,
+ * b) if we did start from .cpt, we copy over the last whole structures from .cpt,
+ * c) in any case, for subsequent checkpoint writing, we set the pointers in
+ * edsamstate to the x_old arrays, which contain the correct PBC representation of
+ * all ED structures at the last time step. */
+static void init_edsamstate(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+ int i, nr_edi;
+ t_edpar *edi;
+
+
+ snew(EDstate->old_sref_p, EDstate->nED);
+ snew(EDstate->old_sav_p , EDstate->nED);
+
+ /* If we did not read in a .cpt file, these arrays are not yet allocated */
+ if (!EDstate->bFromCpt)
+ {
+ snew(EDstate->nref, EDstate->nED);
+ snew(EDstate->nav , EDstate->nED);
+ }
+
+ /* Loop over all ED/flooding data sets (usually only one, though) */
+ edi = ed->edpar;
+ for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
+ {
+ /* We always need the last reference and average positions such that
+ * in the next time step we can make the ED group whole again
+ * if the atoms do not have the correct PBC representation */
+ if (EDstate->bFromCpt)
+ {
+ /* Copy the last whole positions of reference and average group from .cpt */
+ for (i=0; i<edi->sref.nr; i++)
+ {
+ copy_rvec(EDstate->old_sref[nr_edi-1][i], edi->sref.x_old[i]);
+ }
+ for (i=0; i<edi->sav.nr ; i++)
+ {
+ copy_rvec(EDstate->old_sav [nr_edi-1][i], edi->sav.x_old [i]);
+ }
+ }
+ else
+ {
+ EDstate->nref[nr_edi-1] = edi->sref.nr;
+ EDstate->nav [nr_edi-1] = edi->sav.nr;
+ }
+
+ /* For subsequent checkpoint writing, set the edsamstate pointers to the edi arrays: */
+ EDstate->old_sref_p[nr_edi-1] = edi->sref.x_old;
+ EDstate->old_sav_p [nr_edi-1] = edi->sav.x_old ;
+
+ edi = edi->next_edi;
+ }
+}
+
+
void init_edsam(gmx_mtop_t *mtop, /* global topology */
t_inputrec *ir, /* input record */
t_commrec *cr, /* communication record */
gmx_edsam_t ed, /* contains all ED data */
rvec x[], /* positions of the whole MD system */
- matrix box) /* the box */
+ matrix box, /* the box */
+ edsamstate_t *EDstate)
{
t_edpar *edi = NULL; /* points to a single edi data set */
- int numedis=0; /* keep track of the number of ED data sets in edi file */
int i,nr_edi,avindex;
rvec *x_pbc = NULL; /* positions of the whole MD system with pbc removed */
- rvec *xfit = NULL; /* the positions which will be fitted to the reference structure */
- rvec *xstart = NULL; /* the positions which are subject to ED sampling */
+ rvec *xfit=NULL, *xstart=NULL; /* dummy arrays to determine initial RMSDs */
rvec fit_transvec; /* translation ... */
matrix fit_rotmat; /* ... and rotation from fit to reference structure */
if (!DOMAINDECOMP(cr) && PAR(cr) && MASTER(cr))
+ {
gmx_fatal(FARGS, "Please switch on domain decomposition to use essential dynamics in parallel.");
+ }
GMX_MPE_LOG(ev_edsam_start);
if (MASTER(cr))
+ {
fprintf(stderr, "ED: Initializing essential dynamics constraints.\n");
+ if (NULL == ed)
+ {
+ gmx_fatal(FARGS, "The checkpoint file you provided is from an essential dynamics or\n"
+ "flooding simulation. Please also provide the correct .edi file with -ei.\n");
+ }
+ }
+
/* Needed for initializing radacc radius in do_edsam */
ed->bFirst = 1;
{
snew(ed->edpar,1);
/* Read the whole edi file at once: */
- read_edi_file(ed,ed->edpar,mtop->natoms,cr);
+ EDstate->nED = read_edi_file(ed,ed->edpar,mtop->natoms,cr);
+
+ /* Make sure the checkpoint was produced in a run using this .edi file */
+ if (EDstate->bFromCpt)
+ {
+ crosscheck_edi_file_vs_checkpoint(ed, EDstate);
+ }
+ init_edsamstate(ed, EDstate);
/* Initialization for every ED/flooding dataset. Flooding uses one edi dataset per
* flooding vector, Essential dynamics can be applied to more than one structure
init_edi(mtop,ir,cr,ed,edi);
/* Init flooding parameters if needed */
- init_flood(edi,ed,ir->delta_t,cr);
+ init_flood(edi,ed,ir->delta_t,cr,!EDstate->bFromCpt);
edi=edi->next_edi;
- numedis++;
}
}
edi=ed->edpar;
/* Loop over all ED/flooding data sets (usually only one, though) */
- for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+ for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
{
- /* We use srenew to allocate memory since the size of the buffers
- * is likely to change with every ED dataset */
- srenew(xfit , edi->sref.nr );
- srenew(xstart, edi->sav.nr );
-
- /* Extract the positions of the atoms to which will be fitted */
- for (i=0; i < edi->sref.nr; i++)
+ /* Extract the initial reference and average positions. When starting
+ * from .cpt, these have already been read into sref.x_old
+ * in init_edsamstate() */
+ if (!EDstate->bFromCpt)
{
- copy_rvec(x_pbc[edi->sref.anrs[i]], xfit[i]);
+ /* If this is the first run (i.e. no checkpoint present) we assume
+ * that the starting positions give us the correct PBC representation */
+ for (i=0; i < edi->sref.nr; i++)
+ {
+ copy_rvec(x_pbc[edi->sref.anrs[i]], edi->sref.x_old[i]);
+ }
- /* Save the sref positions such that in the next time step we can make the ED group whole
- * in case any of the atoms do not have the correct PBC representation */
- copy_rvec(xfit[i], edi->sref.x_old[i]);
+ for (i=0; i < edi->sav.nr; i++)
+ {
+ copy_rvec(x_pbc[edi->sav.anrs[i]], edi->sav.x_old[i]);
+ }
}
- /* Extract the positions of the atoms subject to ED sampling */
- for (i=0; i < edi->sav.nr; i++)
- {
- copy_rvec(x_pbc[edi->sav.anrs[i]], xstart[i]);
-
- /* Save the sav positions such that in the next time step we can make the ED group whole
- * in case any of the atoms do not have the correct PBC representation */
- copy_rvec(xstart[i], edi->sav.x_old[i]);
- }
+ /* Now we have the PBC-correct start positions of the reference and
+ average structure. We copy them over to dummy arrays on which we
+ can apply fitting to print out the RMSD. We srenew the memory since
+ the size of the buffers is likely different for every ED dataset */
+ srenew(xfit , edi->sref.nr );
+ srenew(xstart, edi->sav.nr );
+ copy_rvecn(edi->sref.x_old, xfit, 0, edi->sref.nr);
+ copy_rvecn(edi->sav.x_old, xstart, 0, edi->sav.nr);
/* Make the fit to the REFERENCE structure, get translation and rotation */
fit_to_reference(xfit, fit_transvec, fit_rotmat, edi);
avindex = edi->star.nr - edi->sav.nr;
}
rad_project(edi, &edi->star.x[avindex], &edi->vecs.radcon, cr);
- } else
+ }
+ else
+ {
rad_project(edi, xstart, &edi->vecs.radcon, cr);
+ }
/* process structure that will serve as origin of expansion circle */
if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
+ {
fprintf(stderr, "ED: Setting center of flooding potential (0 = average structure)\n");
+ }
if (edi->sori.nr > 0)
{
{
fprintf(stderr, "ED: A (possibly changing) ref. projection will define the flooding potential center.\n");
for (i=0; i<edi->flood.vecs.neig; i++)
+ {
edi->flood.vecs.refproj[i] = edi->flood.vecs.refproj0[i];
+ }
}
else
{
/* Set center of flooding potential to the center of the covariance matrix,
* i.e. the average structure, i.e. zero in the projected system */
for (i=0; i<edi->flood.vecs.neig; i++)
+ {
edi->flood.vecs.refproj[i] = 0.0;
+ }
}
}
}
{
for (i=0; i<edi->flood.vecs.neig; i++)
{
- fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", i, edi->flood.vecs.refproj[i]);
+ fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", edi->flood.vecs.ieig[i], edi->flood.vecs.refproj[i]);
if (edi->flood.bHarmonic)
+ {
fprintf(stdout, " (adding %11.4e/timestep)", edi->flood.vecs.refprojslope[i]);
+ }
fprintf(stdout, "\n");
}
}
rad_project(edi, xstart, &edi->vecs.linfix, cr);
/* Output to file, set the step to -1 so that write_edo knows it was called from init_edsam */
- if (ed->edo && !(ed->bStartFromCpt))
+ if (ed->edo && !(EDstate->bFromCpt))
+ {
write_edo(nr_edi, edi, ed, -1, 0);
+ }
/* Prepare for the next edi data set: */
edi=edi->next_edi;
if (PAR(cr))
{
/* First let everybody know how many ED data sets to expect */
- gmx_bcast(sizeof(numedis), &numedis, cr);
+ gmx_bcast(sizeof(EDstate->nED), &EDstate->nED, cr);
/* Broadcast the essential dynamics / flooding data to all nodes */
- broadcast_ed_data(cr, ed, numedis);
+ broadcast_ed_data(cr, ed, EDstate->nED);
}
else
{
/* Loop over all ED data sets (usually only one, though) */
edi=ed->edpar;
- for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+ for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
{
edi->sref.anrs_loc = edi->sref.anrs;
edi->sav.anrs_loc = edi->sav.anrs;
snew(edi->sav.c_ind, edi->sav.nr);
/* Initialize the array */
for (i=0; i<edi->sav.nr; i++)
+ {
edi->sav.c_ind[i] = i;
+ }
/* In the general case we will need a different-sized array for the reference indices: */
if (!edi->bRefEqAv)
{
snew(edi->sref.c_ind, edi->sref.nr);
for (i=0; i<edi->sref.nr; i++)
+ {
edi->sref.c_ind[i] = i;
+ }
}
/* Point to the very same array in case of other structures: */
edi->star.c_ind = edi->sav.c_ind;
/* Allocate space for ED buffer variables */
/* Again, loop over ED data sets */
edi=ed->edpar;
- for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+ for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
{
/* Allocate space for ED buffer */
snew(edi->buf, 1);
/* Flush the edo file so that the user can check some things
* when the simulation has started */
if (ed->edo)
+ {
fflush(ed->edo);
+ }
GMX_MPE_LOG(ev_edsam_finish);
}
/* Check if ED sampling has to be performed */
if ( ed->eEDtype==eEDnone )
+ {
return;
+ }
/* Suppress output on first call of do_edsam if
* two-step sd2 integrator is used */
if ( (ir->eI==eiSD2) && (v != NULL) )
+ {
bSuppress = TRUE;
+ }
dt_1 = 1.0/ir->delta_t;
buf=edi->buf->do_edsam;
if (ed->bFirst)
+ {
/* initialise radacc radius for slope criterion */
buf->oldrad=calc_radius(&edi->vecs.radacc);
+ }
/* Copy the positions into buf->xc* arrays and after ED
* feed back corrections to the official positions */
#endif
/* Only assemble reference positions if their indices differ from the average ones */
if (!edi->bRefEqAv)
+ {
communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+ }
/* If bUpdateShifts was TRUE then the shifts have just been updated in communicate_group_positions.
* We do not need to update the shifts until the next NS step. Note that dd_make_local_ed_indices
/* Fit the reference indices to the reference structure */
if (edi->bRefEqAv)
+ {
fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+ }
else
+ {
fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+ }
/* Now apply the translation and rotation to the ED structure */
translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
project(buf->xcoll, edi);
rad_project(edi, buf->xcoll, &edi->vecs.radacc, cr);
buf->oldrad = 0.0;
- } else
+ }
+ else
+ {
buf->oldrad = edi->vecs.radacc.radius;
+ }
}
/* apply the constraints */
{
project(buf->xcoll, edi);
if (MASTER(cr) && !bSuppress)
+ {
write_edo(edinr, edi, ed, step, rmsdev);
+ }
}
/* Copy back the positions unless monitoring only */
#endif
if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
{
-#ifdef GMX_NBNXN_SIMD_2XNN
+#ifdef GMX_NBNXN_SIMD_4XN
*kernel_type = nbnxnk4xN_SIMD_4xN;
#else
gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
real *nbfp_i;
int n,ci,ci_sh;
int ish,ishf;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int cjind0,cjind1,cjind;
int ip,jp;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
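+ /* In effect: do_coul && !half_LJ selects the full-LJ + C inner loop,
+ * do_coul && half_LJ the half-LJ + C loop (which also covers no-LJ + C),
+ * and !do_coul the LJ-only loop, where full LJ is computed even for
+ * half-LJ pairs, since specializing that case is not worth it. */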
#ifdef CALC_ENERGIES
#ifndef ENERGY_GROUPS
}
}
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
#ifdef CALC_ENERGIES
real Vc_sub_self;
#ifdef CALC_LJ
#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+ load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE0,c12_SSE0);
#ifndef HALF_LJ
- load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+ load_lj_pair_params2(nbfp2,nbfp3,type,aj,c6_SSE2,c12_SSE2);
#endif
#endif /* not defined any LJ rule */
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+/* GMX_MM256_HERE should be set before including this file */
#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE 4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#if defined GMX_MM256_HERE
#define STRIDE 4
#endif
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
-#else
-/* SSE double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
-#endif
-#endif
-
#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 2x(4+4) kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
-/* AVX double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+#error "unsupported kernel configuration"
#endif
#endif
int nbfp_stride;
int n,ci,ci_sh;
int ish,ish3;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int sci,scix,sciy,sciz,sci2;
int cjind0,cjind1,cjind;
int ip,jp;
gmx_mm_pr diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
gmx_mm_pr diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
-#ifndef GMX_MM256_HERE
- __m128i zeroi_SSE = _mm_setzero_si128();
-#endif
#ifdef GMX_X86_SSE4_1
gmx_mm_pr zero_SSE = gmx_set1_pr(0);
#endif
const real *tab_coul_V;
#endif
#ifdef GMX_MM256_HERE
- int ti0_array[2*UNROLLJ-1],*ti0;
- int ti2_array[2*UNROLLJ-1],*ti2;
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
#endif
#ifdef CALC_ENERGIES
gmx_mm_pr mhalfsp_SSE;
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
- /* Generate aligned table pointers */
- ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+ /* Generate aligned table index pointers */
+ ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
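+ /* Example: with GMX_SIMD_WIDTH_HERE==8 each ti array holds 15 ints and
+ * is thus guaranteed to contain a stretch of 8 ints aligned to
+ * 8*sizeof(int) = 32 bytes; adding width-1 and masking the low bits
+ * rounds the pointer down to that boundary. */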
#endif
invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
egps_jshift = 2*nbat->neg_2log;
egps_jmask = (1<<egps_jshift) - 1;
egps_jstride = (UNROLLJ>>1)*UNROLLJ;
- /* Major division is over i-particles: divide nVS by 4 for i-stride */
+ /* Major division is over i-particle energy groups, determine the stride */
Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
ish = (nbln->shift & NBNXN_CI_SHIFT);
ish3 = ish*3;
- cjind0 = nbln->cj_ind_start;
- cjind1 = nbln->cj_ind_end;
- /* Currently only works super-cells equal to sub-cells */
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
sci += (ci & 1)*(STRIDE>>1);
#endif
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
egps_i = nbat->energrp[ci];
iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz) ,shZ_SSE);
iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
gmx_mm_pr facel_SSE;
#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#define TAB_FDV0
#else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])
#endif
#endif
#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#endif
#endif
int nbfp_stride;
int n,ci,ci_sh;
int ish,ish3;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int sci,scix,sciy,sciz,sci2;
int cjind0,cjind1,cjind;
int ip,jp;
__m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
__m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
__m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
__m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
__m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
#endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
/* AVX: use floating point masks, as there are no integer instructions */
#ifndef GMX_DOUBLE
gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
#endif
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
__m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
__m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
__m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
__m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#endif
-#else /* GMX_MM256_HERE */
+#endif
+#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
#endif
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
__m128i zeroi_SSE = _mm_setzero_si128();
#endif
#ifdef GMX_X86_SSE4_1
const real *tab_coul_V;
#endif
#ifdef GMX_MM256_HERE
- int ti0_array[2*UNROLLJ-1],*ti0;
- int ti1_array[2*UNROLLJ-1],*ti1;
- int ti2_array[2*UNROLLJ-1],*ti2;
- int ti3_array[2*UNROLLJ-1],*ti3;
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+ int ti1_array[2*GMX_SIMD_WIDTH_HERE-1],*ti1;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
+ int ti3_array[2*GMX_SIMD_WIDTH_HERE-1],*ti3;
#endif
#ifdef CALC_ENERGIES
gmx_mm_pr mhalfsp_SSE;
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
- /* Generate aligned table pointers */
- ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+ /* Generate aligned table index pointers */
+ ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
#endif
invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
egps_jshift = 2*nbat->neg_2log;
egps_jmask = (1<<egps_jshift) - 1;
egps_jstride = (UNROLLJ>>1)*UNROLLJ;
- /* Major division is over i-particles: divide nVS by 4 for i-stride */
+ /* Major division is over i-particle energy groups, determine the stride */
Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
ish = (nbln->shift & NBNXN_CI_SHIFT);
ish3 = ish*3;
- cjind0 = nbln->cj_ind_start;
- cjind1 = nbln->cj_ind_end;
- /* Currently only works super-cells equal to sub-cells */
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
sci += (ci & 1)*(STRIDE>>1);
#endif
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
egps_i = nbat->energrp[ci];
iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
iq_SSE0 = gmx_set1_pr(facel*q[sci]);
iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
* energy group pair energy storage
*/
+/* Transpose 2 double precision registers */
#define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1) \
{ \
- out0 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(0,0)); \
- out1 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(1,1)); \
+ out0 = _mm_unpacklo_pd(in0,in1); \
+ out1 = _mm_unpackhi_pd(in0,in1); \
}
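+/* i.e. out0 = {in0[0],in1[0]} and out1 = {in0[1],in1[1]}; the unpack
+ * instructions select the same elements as the former shuffles. */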
#if defined GMX_MM128_HERE || !defined GMX_DOUBLE
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1) \
{ \
__m128 _c01,_c23; \
- _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(1,0,1,0)); \
- _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(1,0,1,0)); \
+ _c01 = _mm_movelh_ps(in0,in1); \
+ _c23 = _mm_movelh_ps(in2,in3); \
out0 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0)); \
out1 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(3,1,3,1)); \
}
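+/* i.e. out0 = {in0[0],in1[0],in2[0],in3[0]} and
+ * out1 = {in0[1],in1[1],in2[1],in3[1]} */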
#else
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1) \
{ \
__m256d _c01,_c23; \
}
#endif
+/* Collect element 2 of the 4 inputs to out */
#define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0,in1,in2,in3,out) \
{ \
__m128 _c01,_c23; \
#ifndef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out) \
{ \
- _MM_TRANSPOSE4_PS(i_SSE0,i_SSE1,i_SSE2,i_SSE3); \
- i_SSE0 = _mm_add_ps(i_SSE0,i_SSE1); \
- i_SSE2 = _mm_add_ps(i_SSE2,i_SSE3); \
- o_SSE = _mm_add_ps(i_SSE0,i_SSE2); \
+ _MM_TRANSPOSE4_PS(in0,in1,in2,in3); \
+ in0 = _mm_add_ps(in0,in1); \
+ in2 = _mm_add_ps(in2,in3); \
+ out = _mm_add_ps(in0,in2); \
}
#else
-#define GMX_MM_TRANSPOSE_SUM2_PD(i_SSE0,i_SSE1,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM2_PD(in0,in1,out) \
{ \
- GMX_MM_TRANSPOSE2_PD(i_SSE0,i_SSE1); \
- o_SSE = _mm_add_pd(i_SSE0,i_SSE1); \
+ GMX_MM_TRANSPOSE2_PD(in0,in1); \
+ out = _mm_add_pd(in0,in1); \
}
#endif
#else
#ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out) \
{ \
- i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE1); \
- i_SSE2 = _mm256_hadd_ps(i_SSE2,i_SSE3); \
- i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
- o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
+ in0 = _mm256_hadd_ps(in0,in1); \
+ in2 = _mm256_hadd_ps(in2,in3); \
+ in1 = _mm256_hadd_ps(in0,in2); \
+ out = _mm_add_ps(_mm256_castps256_ps128(in1),_mm256_extractf128_ps(in1,1)); \
}
-#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE) \
+/* Sum the elements of halfs of each input register and store sums in out */
+#define GMX_MM_TRANSPOSE_SUM4H_PR(in0,in2,out) \
{ \
- i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps()); \
- i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps()); \
- i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2); \
- i_SSE2 = _mm256_permute_ps(i_SSE0,0b10110001); \
- o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+ in0 = _mm256_hadd_ps(in0,_mm256_setzero_ps()); \
+ in2 = _mm256_hadd_ps(in2,_mm256_setzero_ps()); \
+ in0 = _mm256_hadd_ps(in0,in2); \
+ in2 = _mm256_permute_ps(in0,_MM_SHUFFLE(2,3,0,1)); \
+ out = _mm_add_ps(_mm256_castps256_ps128(in0),_mm256_extractf128_ps(in2,1)); \
}
#else
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out) \
{ \
- i_SSE0 = _mm256_hadd_pd(i_SSE0,i_SSE1); \
- i_SSE2 = _mm256_hadd_pd(i_SSE2,i_SSE3); \
- o_SSE = _mm256_add_pd(_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x20),_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x31)); \
+ in0 = _mm256_hadd_pd(in0,in1); \
+ in2 = _mm256_hadd_pd(in2,in3); \
+ out = _mm256_add_pd(_mm256_permute2f128_pd(in0,in2,0x20),_mm256_permute2f128_pd(in0,in2,0x31)); \
}
#endif
#endif
return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
-/* Do 2/4 double precision invsqrt operations.
- * Doing the SSE rsqrt and the first Newton Raphson iteration
+/* Do 2 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
* in single precision gives full double precision accuracy.
- * The speed is more than twice as fast as two gmx_mm_invsqrt_pd calls.
+ * The speed is more than double that of two gmx_mm_invsqrt_pd calls.
*/
-#define GMX_MM128_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1) \
+#define GMX_MM128_INVSQRT2_PD(in0,in1,out0,out1) \
{ \
const __m128d half = _mm_set1_pd(0.5); \
const __m128d three = _mm_set1_pd(3.0); \
- __m128 s_SSE,ir_SSE; \
+ __m128 s,ir; \
__m128d lu0,lu1; \
\
- s_SSE = _mm_movelh_ps(_mm_cvtpd_ps(i_SSE0),_mm_cvtpd_ps(i_SSE1)); \
- ir_SSE = gmx_mm128_invsqrt_ps_single(s_SSE); \
- lu0 = _mm_cvtps_pd(ir_SSE); \
- lu1 = _mm_cvtps_pd(_mm_movehl_ps(ir_SSE,ir_SSE)); \
- o_SSE0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
- o_SSE1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+ s = _mm_movelh_ps(_mm_cvtpd_ps(in0),_mm_cvtpd_ps(in1)); \
+ ir = gmx_mm128_invsqrt_ps_single(s); \
+ lu0 = _mm_cvtps_pd(ir); \
+ lu1 = _mm_cvtps_pd(_mm_movehl_ps(ir,ir)); \
+ out0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),in0)),lu0)); \
+ out1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),in1)),lu1)); \
}
#define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
}
-#define GMX_MM256_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1) \
+/* Do 4 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
+ * in single precision gives full double precision accuracy.
+ */
+#define GMX_MM256_INVSQRT2_PD(in0,in1,out0,out1) \
{ \
const __m256d half = _mm256_set1_pd(0.5); \
const __m256d three = _mm256_set1_pd(3.0); \
- __m256 s_SSE,ir_SSE; \
+ __m256 s,ir; \
__m256d lu0,lu1; \
\
- s_SSE = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(i_SSE0)),_mm256_cvtpd_ps(i_SSE1),1); \
- ir_SSE = gmx_mm256_invsqrt_ps_single(s_SSE); \
- lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir_SSE)); \
- lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir_SSE,1)); \
- o_SSE0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
- o_SSE1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+ s = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(in0)),_mm256_cvtpd_ps(in1),1); \
+ ir = gmx_mm256_invsqrt_ps_single(s); \
+ lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir)); \
+ lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir,1)); \
+ out0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),in0)),lu0)); \
+ out1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),in1)),lu1)); \
}
#define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
}
-#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE) \
+#define load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE,c12_SSE) \
{ \
- __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2]; \
+ __m128 clj_SSE0[UNROLLJ],clj_SSE1[UNROLLJ],c6t_SSE[2],c12t_SSE[2]; \
int p; \
\
- for(p=0; p<2*UNROLLJ; p++) \
+ for(p=0; p<UNROLLJ; p++) \
{ \
/* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE); \
+ clj_SSE0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE); \
} \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+ for(p=0; p<UNROLLJ; p++) \
+ { \
+ /* Here we load 4 aligned floats, but we need just 2 */ \
+ clj_SSE1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE); \
+ } \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE0[0],clj_SSE0[1],clj_SSE0[2],clj_SSE0[3],c6t_SSE[0],c12t_SSE[0]); \
+ GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE1[0],clj_SSE1[1],clj_SSE1[2],clj_SSE1[3],c6t_SSE[1],c12t_SSE[1]); \
\
GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE); \
GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE); \
* But AMD CPUs perform significantly worse with gcc than with icc.
* Performance is improved a bit by using the extract function UNROLLJ times,
* instead of doing an _mm_store_si128 for every i-particle.
- * With AVX this significantly deteriorates performance (8 extracts iso 4).
+ * This is only faster when we use FDV0 formatted tables, where we also need
+ * to multiply the index by 4, which can be done by a SIMD bit shift.
+ * With single precision AVX, 8 extracts are much slower than 1 store.
* Because of this, the load_table_f macro always takes the ti parameter,
* but it is only used with AVX.
*/
static int set_grid_size_xy(const nbnxn_search_t nbs,
nbnxn_grid_t *grid,
+ int dd_zone,
int n,rvec corner0,rvec corner1,
real atom_density,
int XFormat)
grid->ncy = 1;
}
+ grid->sx = size[XX]/grid->ncx;
+ grid->sy = size[YY]/grid->ncy;
+ grid->inv_sx = 1/grid->sx;
+ grid->inv_sy = 1/grid->sy;
+
+ if (dd_zone > 0)
+ {
+ /* This is a non-home zone, add an extra row of cells
+ * for particles communicated for bonded interactions.
+ * These can be beyond the cut-off. It doesn't matter where
+ * they end up on the grid, but for performance it's better
+ * if they don't end up in cells that can be within cut-off range.
+ */
+ grid->ncx++;
+ grid->ncy++;
+ }
+
/* We need one additional cell entry for particles moved by DD */
if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
{
copy_rvec(corner0,grid->c0);
copy_rvec(corner1,grid->c1);
- grid->sx = size[XX]/grid->ncx;
- grid->sy = size[YY]/grid->ncy;
- grid->inv_sx = 1/grid->sx;
- grid->inv_sy = 1/grid->sy;
return nc_max;
}
-#define SORT_GRID_OVERSIZE 2
+/* We need to sort particles in grid columns on the z-coordinate.
+ * As particles are very often distributed homogeneously, we use a sorting
+ * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ * by a factor, cast to an int and try to store in that hole. If the hole
+ * is full, we move this or another particle. A second pass is needed to make
+ * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
+ * 4 is the optimal value for a homogeneous particle distribution and allows
+ * for an O(#particles) sort up to distributions where all particles are
+ * concentrated in 1/4 of the space. No NlogN fallback is implemented,
+ * as it can be expensive to detect inhomogeneous particle distributions.
+ * SGSF is the maximum ratio of holes used; in the worst case all particles
+ * end up in the last hole and we need #particles extra holes at the end.
+ */
+#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
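+/* Example: a column of n particles sorted over a range of height h thus
+ * gets nsort = SORT_GRID_OVERSIZE*n holes, i.e. a hole spacing of h/(4n),
+ * so a homogeneous distribution fills about one in four holes and
+ * collisions stay rare. */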
+/* Sort particle index a on coordinates x along dim.
+ * Backwards tells if we want decreasing instead of increasing coordinates.
+ * h0 is the minimum of the coordinate range.
+ * invh is the inverse hole spacing.
+ * nsort, the theoretical hole limit, is only used for debugging.
+ * sort is the sorting work array.
+ */
static void sort_atoms(int dim,gmx_bool Backwards,
int *a,int n,rvec *x,
real h0,real invh,int nsort,int *sort)
{
int i,c;
- int zi,zim;
+ int zi,zim,zi_min,zi_max;
int cp,tmp;
if (n <= 1)
return;
}
- /* For small oversize factors clearing the whole area is fastest.
- * For large oversize we should clear the used elements after use.
- */
- for(i=0; i<nsort; i++)
- {
- sort[i] = -1;
- }
+ /* Determine the index range used, so we can limit it for the second pass */
+ zi_min = INT_MAX;
+ zi_max = -1;
+
/* Sort the particles using a simple index sort */
for(i=0; i<n; i++)
{
if (sort[zi] < 0)
{
sort[zi] = a[i];
+ zi_min = min(zi_min,zi);
+ zi_max = max(zi_max,zi);
}
else
{
zim++;
}
sort[zim] = cp;
+ zi_max = max(zi_max,zim);
}
sort[zi] = a[i];
+ zi_max = max(zi_max,zi);
}
}
if (sort[zi] >= 0)
{
a[c++] = sort[zi];
+ sort[zi] = -1;
}
}
}
else
{
- for(zi=nsort-1; zi>=0; zi--)
+ for(zi=zi_max; zi>=zi_min; zi--)
{
if (sort[zi] >= 0)
{
a[c++] = sort[zi];
+ sort[zi] = -1;
}
}
}
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
int a0,int a1,
- rvec *x,const int *move,
+ rvec *x,
+ int dd_zone,const int *move,
int thread,int nthread,
int *cell,
int *cxy_na)
n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
- for(i=n0; i<n1; i++)
+ if (dd_zone == 0)
{
- if (move == NULL || move[i] >= 0)
+ /* Home zone */
+ for(i=n0; i<n1; i++)
{
- /* We need to be careful with rounding,
- * particles might be a few bits outside the local box.
- * The int cast takes care of the lower bound,
- * we need to explicitly take care of the upper bound.
- */
- cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
- if (cx == grid->ncx)
- {
- cx = grid->ncx - 1;
- }
- cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
- if (cy == grid->ncy)
+ if (move == NULL || move[i] >= 0)
{
- cy = grid->ncy - 1;
- }
- /* For the moment cell contains only the, grid local,
- * x and y indices, not z.
- */
- cell[i] = cx*grid->ncy + cy;
+ /* We need to be careful with rounding,
+ * particles might be a few bits outside the local zone.
+ * The int cast takes care of the lower bound,
+ * we will explicitly take care of the upper bound.
+ */
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
#ifdef DEBUG_NBNXN_GRIDDING
- if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
+ if (cx < 0 || cx >= grid->ncx ||
+ cy < 0 || cy >= grid->ncy)
+ {
+ gmx_fatal(FARGS,
+ "grid cell cx %d cy %d out of range (max %d %d)\n"
+ "atom %f %f %f, grid->c0 %f %f",
+ cx,cy,grid->ncx,grid->ncy,
+ x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+ }
+#endif
+ /* Take care of potential rounding issues */
+ cx = min(cx,grid->ncx - 1);
+ cy = min(cy,grid->ncy - 1);
+
+ /* For the moment cell will contain only the grid-local
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+ }
+ else
{
- gmx_fatal(FARGS,
- "grid cell cx %d cy %d out of range (max %d %d)\n"
- "atom %f %f %f, grid->c0 %f %f",
- cx,cy,grid->ncx,grid->ncy,
- x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+ /* Put this moved particle after the end of the grid,
+ * so we can process it later without using conditionals.
+ */
+ cell[i] = grid->ncx*grid->ncy;
}
-#endif
+
+ cxy_na[cell[i]]++;
}
- else
+ }
+ else
+ {
+ /* Non-home zone */
+ for(i=n0; i<n1; i++)
{
- /* Put this moved particle after the end of the grid,
- * so we can process it later without using conditionals.
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+ /* For non-home zones there could be particles outside
+ * the non-bonded cut-off range, which have been communicated
+ * for bonded interactions only. For the result it doesn't
+ * matter where these end up on the grid. For performance
+ * we put them in an extra row at the border.
*/
- cell[i] = grid->ncx*grid->ncy;
- }
+ cx = max(cx,0);
+ cx = min(cx,grid->ncx - 1);
+ cy = max(cy,0);
+ cy = min(cy,grid->ncy - 1);
- cxy_na[cell[i]]++;
+ /* For the moment cell will contain only the grid-local
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+
+ cxy_na[cell[i]]++;
+ }
}
}
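
As a compact illustration of the mapping both branches above perform, here is a standalone sketch with assumed names and plain doubles in place of the GROMACS types:

/* Map an (x,y) coordinate to a grid column index. The int cast handles
 * the lower bound for home-zone particles; the explicit clamp handles
 * the rounding case where a coordinate sits exactly on the upper edge
 * (cx == ncx), which the debug check above would otherwise reject.
 */
static int grid_column(double xx, double yy,
                       double c0x, double c0y,
                       double inv_sx, double inv_sy,
                       int ncx, int ncy)
{
    int cx = (int)((xx - c0x)*inv_sx);
    int cy = (int)((yy - c0y)*inv_sy);

    cx = (cx > ncx - 1) ? ncx - 1 : cx;
    cy = (cy > ncy - 1) ? ncy - 1 : cy;

    return cx*ncy + cy; /* z is handled in a later sorting step */
}

In the patch, the home zone relies on the int cast for the lower bound and clamps only the upper edge, while non-home zones clamp on both sides because communicated atoms may legitimately lie outside the zone; moved home-zone particles instead get the sentinel column ncx*ncy, one past the end of the grid.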
#pragma omp parallel for num_threads(nthread) schedule(static)
for(thread=0; thread<nthread; thread++)
{
- calc_column_indices(grid,a0,a1,x,move,thread,nthread,
+ calc_column_indices(grid,a0,a1,x,dd_zone,move,thread,nthread,
nbs->cell,nbs->work[thread].cxy_na);
}
over_alloc_large(ncz_max*grid->na_sc*SGSF);
srenew(nbs->work[thread].sort_work,
nbs->work[thread].sort_work_nalloc);
+ /* When not in use, all elements should be -1 */
+ for(i=0; i<nbs->work[thread].sort_work_nalloc; i++)
+ {
+ nbs->work[thread].sort_work[i] = -1;
+ }
}
}
nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
}
- /* Set the cell indices for the moved particles */
- n0 = grid->nc*grid->na_sc;
- n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
- for(i=n0; i<n1; i++)
+ if (dd_zone == 0)
{
- nbs->cell[nbs->a[i]] = i;
+ /* Set the cell indices for the moved particles */
+ n0 = grid->nc*grid->na_sc;
+ n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+ for(i=n0; i<n1; i++)
+ {
+ nbs->cell[nbs->a[i]] = i;
+ }
}
/* Sort the super-cell columns along z into the sub-cells. */
nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
}
- nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
+ nc_max_grid = set_grid_size_xy(nbs,grid,
+ dd_zone,n-nmoved,corner0,corner1,
nbs->grid[0].atom_density,
nbat->XFormat);
{
sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
- if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+ /* The counts below are used for non-bonded pair/flop counts
+ * and should therefore match the available kernel setups.
+ */
+ if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
{
- nbl->work->ncj_hlj += jlen;
+ nbl->work->ncj_noq += jlen;
}
- else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+ else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+ !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
{
- nbl->work->ncj_noq += jlen;
+ nbl->work->ncj_hlj += jlen;
}
nbl->nci++;
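
The reordered tests above assign each j-list to exactly one flop-count bucket, checking the no-Coulomb case first. A minimal sketch of the same precedence, using hypothetical single-bit flags in place of the NBNXN_CI_* macros (which are indexed in the real code):

enum {
    CI_DO_LJ   = 1 << 0,
    CI_HALF_LJ = 1 << 1,
    CI_DO_COUL = 1 << 2
};

/* Returns which count a j-list contributes to:
 * 0 = ncj_noq (no Coulomb), 1 = ncj_hlj (half or no LJ), 2 = full kernel.
 */
static int flop_bucket(unsigned int shift)
{
    if (!(shift & CI_DO_COUL))
    {
        return 0;
    }
    else if ((shift & CI_HALF_LJ) || !(shift & CI_DO_LJ))
    {
        return 1;
    }
    return 2;
}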
gmx_analyze.c gmx_anaeig.c gmx_angle.c gmx_bond.c
gmx_bundle.c gmx_chi.c gmx_cluster.c gmx_confrms.c
gmx_covar.c gmx_current.c
- gmx_density.c gmx_densmap.c gmx_dih.c
+ gmx_density.c gmx_densmap.c
gmx_dielectric.c
gmx_kinetics.c gmx_spatial.c gmx_tune_pme.c
gmx_dipoles.c gmx_disre.c gmx_dist.c gmx_dyndom.c
make_ndx mk_angndx trjcat trjconv trjorder g_wheel
xpm2ps genion g_anadock make_edi g_analyze g_anaeig
g_angle g_bond g_bundle g_chi g_cluster g_confrms g_covar
- g_current g_density g_densmap g_dih g_dielectric
+ g_current g_density g_densmap g_dielectric
g_helixorient g_principal g_dipoles g_disre g_dist
g_dyndom g_enemat g_energy g_lie g_filter g_gyrate
g_h2order g_hbond g_helix g_mindist g_msd g_morph g_nmeig
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
- * Copyright (c) 2012, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <gmx_ana.h>
-
-
-/* This is just a wrapper binary.
-* The code that used to be in g_dih.c is now in gmx_dih.c,
-* where the old main function is called gmx_dih().
-*/
-int
-main(int argc, char *argv[])
-{
- gmx_dih(argc,argv);
- return 0;
-}
-
-
-
{
static const char *desc[] = {
"[TT]g_angle[tt] computes the angle distribution for a number of angles",
- "or dihedrals. This way you can check whether your simulation",
- "is correct. With option [TT]-ov[tt] you can plot the average angle of",
- "a group of angles as a function of time. With the [TT]-all[tt] option",
- "the first graph is the average, the rest are the individual angles.[PAR]",
+ "or dihedrals.[PAR]",
+ "With option [TT]-ov[tt], you can plot the average angle of",
+ "a group of angles as a function of time. With the [TT]-all[tt] option,",
+ "the first graph is the average and the rest are the individual angles.[PAR]",
"With the [TT]-of[tt] option, [TT]g_angle[tt] also calculates the fraction of trans",
"dihedrals (only for dihedrals) as function of time, but this is",
- "probably only fun for a selected few.[PAR]",
- "With option [TT]-oc[tt] a dihedral correlation function is calculated.[PAR]",
- "It should be noted that the index file should contain",
- "atom-triples for angles or atom-quadruplets for dihedrals.",
+ "probably only fun for a select few.[PAR]",
+ "With option [TT]-oc[tt], a dihedral correlation function is calculated.[PAR]",
+ "It should be noted that the index file must contain",
+ "atom triplets for angles or atom quadruplets for dihedrals.",
"If this is not the case, the program will crash.[PAR]",
- "With option [TT]-or[tt] a trajectory file is dumped containing cos and",
- "sin of selected dihedral angles which subsequently can be used as",
- "input for a PCA analysis using [TT]g_covar[tt].[PAR]",
+ "With option [TT]-or[tt], a trajectory file is dumped containing cos and",
+ "sin of selected dihedral angles, which subsequently can be used as",
+ "input for a principal components analysis using [TT]g_covar[tt].[PAR]",
"Option [TT]-ot[tt] plots when transitions occur between",
"dihedral rotamers of multiplicity 3 and [TT]-oh[tt]",
"records a histogram of the times between such transitions,",
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
- * Copyright (c) 2012, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <math.h>
-
-#include "sysstuff.h"
-#include "string2.h"
-#include "copyrite.h"
-#include "futil.h"
-#include "smalloc.h"
-#include "statutil.h"
-#include "nrama.h"
-#include "physics.h"
-#include "macros.h"
-#include "xvgr.h"
-#include "vec.h"
-#include "gmx_ana.h"
-
-
-#define NOMIN 'X'
-
-static void ana_dih(FILE *out,char *index,int nframes,real dih[],t_dih *dd)
-{
- int i;
- real mind,maxd,sum,av,var,prev,width;
- gmx_bool bTrans;
-
- mind=5400,maxd=-5400,sum=0,av=0,var=0;
-
- prev=dih[0];
- for(i=0; (i<nframes); i++) {
- if ((dih[i]-prev) > 180) {
- /* PBC.. */
- dih[i]-=360;
- }
- else if ((dih[i]-prev) < -180)
- dih[i]+=360;
- prev=dih[i];
-
- sum+=dih[i];
- mind=min(mind,dih[i]);
- maxd=max(maxd,dih[i]);
- }
- av=sum/nframes;
- for(i=0; (i<nframes); i++)
- var+=sqr(dih[i]-av);
- var/=nframes;
- width=(360.0/dd->mult);
- bTrans=((maxd - mind) > width);
-
- fprintf(out,"%-10s %10.3f %10.3f %10.3f %10.3f %10.3f %-10s%3.0f\n",
- index,mind,av,maxd,var,sqrt(var),
- bTrans ? "Yep" : "",width);
-}
-
-static int find_min(real phi,int ntab,real phitab[])
-{
- int i,imin;
- real mind,mm;
- real width;
-
- /* Set closest minimum to the first one */
- width=360.0/ntab;
- mind=fabs(phi-phitab[0]);
- imin=0;
- for(i=1; (i<ntab); i++) {
- mm=fabs(phi-phitab[i]);
- if (mm < mind) {
- imin=i;
- mind=mm;
- }
- }
- if (mind < width*0.5 )
- return imin;
- else
- return -1;
-}
-
-static int vphi(t_dih *dih,real phi,int mult)
-{
- static real m2[] = { 90, 270 };
- static real m3[] = { 60, 180, 300 };
- static real m4[] = { 45, 135, 225, 315 };
- static real m6[] = { 30, 90, 150, 210, 270, 330 };
-
- real phiref;
- int vpp=0;
-
- phiref=RAD2DEG*(phi-dih->phi0);
- while (phiref < 0)
- phiref+=360;
- while (phiref > 360)
- phiref-=360;
-
- switch(mult) {
- case 2:
- vpp=find_min(phiref,2,m2);
- break;
- case 3:
- vpp=find_min(phiref,3,m3);
- break;
- case 4:
- vpp=find_min(phiref,4,m4);
- break;
- case 6:
- vpp=find_min(phiref,6,m6);
- break;
- default:
- gmx_fatal(FARGS,"No such multiplicity %d",dih->mult);
- }
-
- if (vpp == -1)
- return NOMIN;
- else
- return vpp+'0';
-}
-
-typedef struct t_cluster {
- int ndih;
- int freq;
- char *minimum;
- struct t_cluster *next;
-} t_cluster;
-
-static t_cluster *search_cluster(t_cluster *cl,char *minimum)
-{
- t_cluster *ccl=cl;
-
- while (ccl != NULL) {
- if (strcmp(minimum,ccl->minimum)==0)
- return ccl;
- ccl=ccl->next;
- }
- return NULL;
-}
-
-static void add_cluster(t_cluster **cl,int ndih,char *minimum)
-{
- t_cluster *loper;
- t_cluster *ccl;
-
- snew(ccl,1);
- ccl->ndih=ndih;
- ccl->freq=1;
- ccl->minimum=strdup(minimum);
- ccl->next=NULL;
-
- if (*cl == NULL)
- *cl=ccl;
- else {
- loper=*cl;
- while (loper->next != NULL)
- loper=loper->next;
- loper->next=ccl;
- }
-}
-
-static void p_cluster(FILE *out,t_cluster *cl)
-{
- t_cluster *loper;
-
- fprintf(out,"* * * C L U S T E R A N A L Y S I S * * *\n\n");
- fprintf(out," Frequency Dihedral minima\n");
- loper=cl;
- while (loper != NULL) {
- fprintf(out,"%10d %s\n",loper->freq,loper->minimum);
- loper=loper->next;
- }
-}
-
-static void ana_cluster(FILE *out, t_xrama *xr,real **dih,real time[],
- t_topology *top,int nframes,int mult)
-{
- t_cluster *cl=NULL,*scl;
- char *minimum;
- int i,j,nx;
-
- /* Number of dihedrals + terminating NULL
- * this allows for using string routines
- */
- snew(minimum,xr->ndih+1);
-
- for(i=0; (i<nframes); i++) {
- nx=0;
- for(j=0; (j<xr->ndih); j++) {
- minimum[j] = vphi(&xr->dih[j],dih[j][i],
- mult == -1 ? xr->dih[j].mult : mult);
- if (minimum[j] == NOMIN)
- nx++;
- }
- if (nx == 0) {
- if ((scl=search_cluster(cl,minimum)) == NULL)
- add_cluster(&cl,xr->ndih,minimum);
- else
- scl->freq++;
- }
- }
- p_cluster(out,cl);
-
- sfree(minimum);
-}
-
-static void ana_trans(FILE *out, t_xrama *xr,real **dih,real time[],
- t_topology *top,int nframes, const output_env_t oenv)
-{
- FILE *outd;
- real prev_phi,prev_psi;
- int i,j,phi,psi;
- char buf[10];
-
- fprintf(out,"\n\t* * * D I H E D R A L S T A T I S T I C S * * *\n\n");
- fprintf(out,"%-10s %10s %10s %10s %10s %10s %10s\n",
- "index","minimum","average","maximum","variance","std.dev",
- "transition");
- for(i=0; (i<xr->ndih); i++) {
- sprintf(buf,"dih-%d",i);
- ana_dih(out,buf,nframes,dih[i],&(xr->dih[i]));
- }
- for(i=0; (i<xr->npp); i++) {
- sprintf(buf,"%s",xr->pp[i].label);
- outd=xvgropen(buf,"Dihedral Angles","Time (ps)","Degrees",oenv);
-
- phi=xr->pp[i].iphi;
- psi=xr->pp[i].ipsi;
- prev_phi=dih[phi][0];
- prev_psi=dih[psi][0];
- for(j=0; (j<nframes); j++) {
- /* PBC.. */
- if ((dih[phi][j]-prev_phi) > 180)
- dih[phi][j]-=360;
- else if ((dih[phi][j]-prev_phi) < -180)
- dih[phi][j]+=360;
- prev_phi=dih[phi][j];
- if ((dih[psi][j]-prev_psi) > 180)
- dih[psi][j]-=360;
- else if ((dih[psi][j]-prev_psi) < -180)
- dih[psi][j]+=360;
- prev_psi=dih[psi][j];
- fprintf(outd,"%10g %10g %10g\n",time[j],prev_phi,prev_psi);
- }
- ffclose(outd);
- }
-}
-
-int gmx_dih(int argc,char *argv[])
-{
- const char *desc[] = {
- "[TT]g_dih[tt] can do two things. The default is to analyze dihedral transitions",
- "by merely computing all the dihedral angles defined in your topology",
- "for the whole trajectory. When a dihedral flips over to another minimum",
- "an angle/time plot is made.[PAR]",
- "The opther option is to discretize the dihedral space into a number of",
- "bins, and group each conformation in dihedral space in the",
- "appropriate bin. The output is then given as a number of dihedral",
- "conformations sorted according to occupancy."
- };
- static int mult = -1;
- static gmx_bool bSA = FALSE;
- t_pargs pa[] = {
- { "-sa", FALSE, etBOOL, {&bSA},
- "Perform cluster analysis in dihedral space instead of analysing dihedral transitions." },
- { "-mult", FALSE, etINT, {&mult},
- "mulitiplicity for dihedral angles (by default read from topology)" }
- };
- FILE *out;
- t_xrama *xr;
- t_topology *top;
- real **dih,*time;
- real dd;
- int i,nframes,maxframes=1000;
- output_env_t oenv;
- t_filenm fnm[] = {
- { efTRX, "-f", NULL, ffREAD },
- { efTPX, NULL, NULL, ffREAD },
- { efOUT, NULL, NULL, ffWRITE }
- };
-#define NFILE asize(fnm)
-
- CopyRight(stderr,argv[0]);
- parse_common_args(&argc,argv,PCA_CAN_VIEW | PCA_CAN_TIME | PCA_BE_NICE,
- NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL,&oenv);
-
- if (mult != -1)
- fprintf(stderr,"Using %d for dihedral multiplicity rather than topology values\n",mult);
-
- snew(xr,1);
- init_rama(oenv,ftp2fn(efTRX,NFILE,fnm),
- ftp2fn(efTPX,NFILE,fnm),xr,3);
- top=read_top(ftp2fn(efTPX,NFILE,fnm),NULL);
-
- /* Brute force malloc, may be too big... */
- snew(dih,xr->ndih);
- for(i=0; (i<xr->ndih); i++)
- snew(dih[i],maxframes);
- snew(time,maxframes);
-
- fprintf(stderr,"\n");
- nframes = 0;
- while (new_data(xr)) {
- for(i=0; (i<xr->ndih); i++) {
- dd=xr->dih[i].ang*RAD2DEG;
- while (dd < 0)
- dd+=360;
- while (dd > 360)
- dd-=360;
- dih[i][nframes]=dd;
- }
- time[nframes]=xr->t;
- nframes++;
- if (nframes > maxframes) {
- maxframes += 1000;
- for(i=0; (i<xr->ndih); i++)
- srenew(dih[i],maxframes);
- srenew(time,maxframes);
- }
- }
-
- fprintf(stderr,"\nCalculated all dihedrals, now analysing...\n");
-
- out=ftp2FILE(efOUT,NFILE,fnm,"w");
-
- if (bSA) {
- /* Cluster and structure analysis */
- ana_cluster(out,xr,dih,time,top,nframes,mult);
- }
- else {
- /* Analyse transitions... */
- ana_trans(out,xr,dih,time,top,nframes,oenv);
- }
- ffclose(out);
-
- thanx(stderr);
-
- return 0;
-}
nfn = opt2fn_null("-n",NFILE,fnm);
if (( nfn == NULL ) && ( xfn == NULL))
- gmx_fatal(FARGS,"no index file and no structure file suplied");
+ gmx_fatal(FARGS,"no index file and no structure file supplied");
if ((disre_frac < 0) || (disre_frac >= 1))
gmx_fatal(FARGS,"disre_frac should be between 0 and 1");
"of the",
"helix in nm. This is simply the average rise (see above) times the",
"number of helical residues (see below).[BR]",
- "[BB]5.[bb] Number of helical residues (file [TT]n-ahx.xvg[tt]). The title says",
- "it all.[BR]",
- "[BB]6.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
- "[BB]7.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
+ "[BB]5.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
+ "[BB]6.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
"atoms only (file [TT]rms-ahx.xvg[tt]).[BR]",
- "[BB]8.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
- "[BB]9.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
- "[BB]10.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
+ "[BB]7.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
+ "[BB]8.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
+ "[BB]9.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
"[PAR]"
};
static const char *ppp[efhNR+2] = {
static const char *desc[] = {
"[TT]make_edi[tt] generates an essential dynamics (ED) sampling input file to be used with [TT]mdrun[tt]",
"based on eigenvectors of a covariance matrix ([TT]g_covar[tt]) or from a",
- "normal modes anaysis ([TT]g_nmeig[tt]).",
+ "normal modes analysis ([TT]g_nmeig[tt]).",
"ED sampling can be used to manipulate the position along collective coordinates",
"(eigenvectors) of (biological) macromolecules during a simulation. Particularly,",
"it may be used to enhance the sampling efficiency of MD simulations by stimulating",
"before a new cycle is started.[PAR]",
"Note on the parallel implementation: since ED sampling is a 'global' thing",
"(collective coordinates etc.), at least on the 'protein' side, ED sampling",
- "is not very parallel-friendly from an implentation point of view. Because",
+ "is not very parallel-friendly from an implementation point of view. Because",
"parallel ED requires some extra communication, expect the performance to be",
- "lower as in a free MD simulation, especially on a large number of nodes. [PAR]",
- "All output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a .edo file. In the output",
+ "lower as in a free MD simulation, especially on a large number of nodes and/or",
+ "when the ED group contains a lot of atoms. [PAR]",
+ "Please also note that if your ED group contains more than a single protein,",
+ "then the [TT].tpr[tt] file must contain the correct PBC representation of the ED group.",
+ "Take a look on the initial RMSD from the reference structure, which is printed",
+ "out at the start of the simulation; if this is much higher than expected, one",
+ "of the ED molecules might be shifted by a box vector. [PAR]",
+ "All output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a [TT].edo[tt] file. In the output",
"file, per OUTFRQ step the following information is present: [PAR]",
"[TT]*[tt] the step number[BR]",
"[TT]*[tt] the number of the ED dataset. ([BB]Note[bb] that you can impose multiple ED constraints in",
"is kept in that region.",
"[PAR]",
"The origin is normally the average structure stored in the [TT]eigvec.trr[tt] file.",
- "It can be changed with [TT]-ori[tt] to an arbitrary position in configurational space.",
+ "It can be changed with [TT]-ori[tt] to an arbitrary position in configuration space.",
"With [TT]-tau[tt], [TT]-deltaF0[tt], and [TT]-Eflnull[tt] you control the flooding behaviour.",
"Efl is the flooding strength, it is updated according to the rule of adaptive flooding.",
"Tau is the time constant of adaptive flooding, high [GRK]tau[grk] means slow adaption (i.e. growth). ",
*(top.atoms.atomname[index[i]]));
fprintf(fp,"%5d %10.5f %10.5f\n",
- bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,rmsf[i]*bfac,
+ bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,rmsf[i]*bfac,
pdb_bfac);
}
}
if (!bRes || i+1==isize ||
top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
fprintf(fp,"%5d %8.4f\n",
- bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+ bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
ffclose(fp);
}
if (!bRes || i+1==isize ||
top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
fprintf(fp,"%5d %8.4f\n",
- bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+ bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
ffclose(fp);
}
"This is simple tool to compute SANS spectra using Debye formula",
"It currently uses topology file (since it need to assigne element for each atom)",
"[PAR]",
- "[TT]-pr[tt] Computes normalized g(r) function",
- "[PAR]",
- "[TT]-sq[tt] Computes SANS intensity curve for needed q diapason",
- "[PAR]",
- "[TT]-startq[tt] Starting q value in nm",
- "[PAR]",
- "[TT]-endq[tt] Ending q value in nm",
- "[PAR]",
- "[TT]-qstep[tt] Stepping in q space",
- "[PAR]",
+ "Parameters:[PAR]"
+ "[TT]-pr[tt] Computes normalized g(r) function averaged over trajectory[PAR]",
+ "[TT]-prframe[tt] Computes normalized g(r) function for each frame[PAR]",
+ "[TT]-sq[tt] Computes SANS intensity curve averaged over trajectory[PAR]",
+ "[TT]-sqframe[tt] Computes SANS intensity curve for each frame[PAR]",
+ "[TT]-startq[tt] Starting q value in nm[PAR]",
+ "[TT]-endq[tt] Ending q value in nm[PAR]",
+ "[TT]-qstep[tt] Stepping in q space[PAR]",
"Note: When using Debye direct method computational cost increases as",
- "1/2 * N * (N - 1) where N is atom number in group of interest"
+ "1/2 * N * (N - 1) where N is atom number in group of interest",
+ "[PAR]",
+ "WARNING: If sq or pr specified this tool can produce large number of files! Up to two times larger than number of frames!"
};
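
The N*(N-1)/2 cost quoted in the note above is that of the direct Debye sum over all atom pairs. A minimal sketch of that sum, with illustrative names (debye_intensity, b[] for per-atom scattering lengths) that are not the GROMACS API:

#include <math.h>

/* Direct Debye formula: I(q) = sum_ij b_i*b_j*sin(q*r_ij)/(q*r_ij).
 * The pair loop below visits each unordered pair once, hence the
 * N*(N-1)/2 scaling noted in the tool description.
 */
static double debye_intensity(double q, int n,
                              const double (*xyz)[3], const double *b)
{
    double intensity = 0.0;
    int    i, j;

    for (i = 0; i < n; i++)
    {
        intensity += b[i]*b[i]; /* self term: sin(qr)/(qr) -> 1 as r -> 0 */
        for (j = i + 1; j < n; j++)
        {
            double dx = xyz[i][0] - xyz[j][0];
            double dy = xyz[i][1] - xyz[j][1];
            double dz = xyz[i][2] - xyz[j][2];
            double r  = sqrt(dx*dx + dy*dy + dz*dz);
            double qr = q*r;

            /* factor 2: each unordered pair stands for (i,j) and (j,i) */
            intensity += 2.0*b[i]*b[j]*(qr > 0 ? sin(qr)/qr : 1.0);
        }
    }
    return intensity;
}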
static gmx_bool bPBC=TRUE;
- static real binwidth=0.2,grid=0.05; /* bins shouldnt be smaller then bond (~0.1nm) length */
+ static gmx_bool bNORM=FALSE;
+ static real binwidth=0.2,grid=0.05; /* bins shouldn't be smaller than the smallest bond length (~0.1nm) */
static real start_q=0.0, end_q=2.0, q_step=0.01;
static real mcover=-1;
static unsigned int seed=0;
static const char *emode[]= { NULL, "direct", "mc", NULL };
static const char *emethod[]={ NULL, "debye", "fft", NULL };
- gmx_nentron_atomic_structurefactors_t *gnsf;
+ gmx_neutron_atomic_structurefactors_t *gnsf;
gmx_sans_t *gsans;
#define NPA asize(pa)
#endif
};
FILE *fp;
- const char *fnTPX,*fnNDX,*fnDAT=NULL;
+ const char *fnTPX,*fnNDX,*fnTRX,*fnDAT=NULL;
t_trxstatus *status;
t_topology *top=NULL;
t_atom *atom=NULL;
atom_id *index=NULL;
int isize;
int i,j;
- gmx_radial_distribution_histogram_t *pr=NULL;
- gmx_static_structurefator_t *sq=NULL;
+ char *hdr=NULL;
+ char *suffix=NULL;
+ t_filenm *fnmdup=NULL;
+ gmx_radial_distribution_histogram_t *prframecurrent=NULL, *pr=NULL;
+ gmx_static_structurefactor_t *sqframecurrent=NULL, *sq=NULL;
output_env_t oenv;
#define NFILE asize(fnm)
t_filenm fnm[] = {
- { efTPX, "-s", NULL, ffREAD },
- { efNDX, NULL, NULL, ffOPTRD },
- { efDAT, "-d", "nsfactor", ffOPTRD },
- { efXVG, "-sq", "sq", ffWRITE },
- { efXVG, "-pr", "pr", ffWRITE }
+ { efTPX, "-s", NULL, ffREAD },
+ { efTRX, "-f", NULL, ffREAD },
+ { efNDX, NULL, NULL, ffOPTRD },
+ { efDAT, "-d", "nsfactor", ffOPTRD },
+ { efXVG, "-pr", "pr", ffWRITE },
+ { efXVG, "-sq", "sq", ffWRITE },
+ { efXVG, "-prframe", "prframe", ffOPTWR },
+ { efXVG, "-sqframe", "sqframe", ffOPTWR }
};
nthreads = gmx_omp_get_max_threads();
CopyRight(stderr,argv[0]);
- parse_common_args(&argc,argv,PCA_BE_NICE,
+ parse_common_args(&argc,argv,PCA_CAN_TIME | PCA_TIME_UNIT | PCA_BE_NICE,
NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL,&oenv);
/* check that binwidth is not smaller than the smallest distance */
break;
}
- if (!bDEBYE && !bFFT)
- gmx_fatal(FARGS,"Unknown method. Set pr or fft!\n");
+ if (bDEBYE) {
+ if (bMC) {
+ fprintf(stderr,"Using Monte Carlo Debye method to calculate spectrum\n");
+ } else {
+ fprintf(stderr,"Using direct Debye method to calculate spectrum\n");
+ }
+ } else if (bFFT) {
+ gmx_fatal(FARGS,"FFT method not implemented!");
+ } else {
+ gmx_fatal(FARGS,"Unknown combination for mode and method!");
+ }
+
/* Try to read files */
fnDAT = ftp2fn(efDAT,NFILE,fnm);
fnTPX = ftp2fn(efTPX,NFILE,fnm);
+ fnTRX = ftp2fn(efTRX,NFILE,fnm);
gnsf = gmx_neutronstructurefactors_init(fnDAT);
fprintf(stderr,"Read %d atom names from %s with neutron scattering parameters\n\n",gnsf->nratoms,fnDAT);
gmx_rmpbc(gpbc,top->atoms.nr,box,x);
}
- natoms=top->atoms.nr;
+ natoms=read_first_x(oenv,&status,fnTRX,&t,&x,box);
+ if (natoms != top->atoms.nr) {
+ fprintf(stderr,"\nWARNING: number of atoms in tpx (%d) and trajectory (%d) do not match\n",top->atoms.nr,natoms);
+ }
- if (bDEBYE) {
- if (bMC) {
- fprintf(stderr,"Using Monte Carlo Debye method to calculate spectrum\n");
+ do {
+ if (bPBC) {
+ gmx_rmpbc(gpbc,top->atoms.nr,box,x);
+ }
+ /* allocate memory for pr */
+ if (pr == NULL) {
+ /* in case it's the first frame read */
+ snew(pr,1);
+ }
+ /* actually calculate p(r) for this frame */
+ prframecurrent = calc_radial_distribution_histogram(gsans,x,box,index,isize,binwidth,bMC,bNORM,mcover,seed);
+ /* copy prframecurrent -> pr and sum up pr->gr[i] */
+ /* allocate and/or resize memory for pr->gr[i] and pr->r[i] */
+ if (pr->gr == NULL) {
+ /* check if this is the first use of pr->gr */
+ snew(pr->gr,prframecurrent->grn);
+ snew(pr->r,prframecurrent->grn);
} else {
- fprintf(stderr,"Using direct Debye method to calculate spectrum\n");
+ /* resize pr->gr and pr->r if needed to prevent overruns */
+ if(prframecurrent->grn > pr->grn) {
+ srenew(pr->gr,prframecurrent->grn);
+ srenew(pr->r,prframecurrent->grn);
+ }
}
- } else if (bFFT) {
- gmx_fatal(FARGS,"Not implented!");
- } else {
- gmx_fatal(FARGS,"Whats this!");
- }
-
- /* realy calc p(r) */
- pr = calc_radial_distribution_histogram(gsans,x,box,index,isize,binwidth,bMC,mcover,seed);
+ pr->grn = prframecurrent->grn;
+ pr->binwidth = prframecurrent->binwidth;
+ /* sum up gr and fill r */
+ for(i=0;i<prframecurrent->grn;i++) {
+ pr->gr[i] += prframecurrent->gr[i];
+ pr->r[i] = prframecurrent->r[i];
+ }
+ /* normalize histo */
+ normalize_probability(prframecurrent->grn,prframecurrent->gr);
+ /* convert p(r) to sq */
+ sqframecurrent = convert_histogram_to_intensity_curve(prframecurrent,start_q,end_q,q_step);
+ /* print frame data if needed */
+ if(opt2fn_null("-prframe",NFILE,fnm)) {
+ snew(hdr,25);
+ snew(suffix,GMX_PATH_MAX);
+ /* prepare header */
+ sprintf(hdr,"g(r), t = %f",t);
+ /* prepare output filename */
+ fnmdup = dup_tfn(NFILE,fnm);
+ sprintf(suffix,"-t%.2f",t);
+ add_suffix_to_output_names(fnmdup,NFILE,suffix);
+ fp = xvgropen(opt2fn_null("-prframe",NFILE,fnmdup),hdr,"Distance (nm)","Probability",oenv);
+ for(i=0;i<prframecurrent->grn;i++) {
+ fprintf(fp,"%10.6f%10.6f\n",prframecurrent->r[i],prframecurrent->gr[i]);
+ }
+ done_filenms(NFILE,fnmdup);
+ fclose(fp);
+ sfree(hdr);
+ sfree(suffix);
+ sfree(fnmdup);
+ }
+ if(opt2fn_null("-sqframe",NFILE,fnm)) {
+ snew(hdr,25);
+ snew(suffix,GMX_PATH_MAX);
+ /* prepare header */
+ sprintf(hdr,"I(q), t = %f",t);
+ /* prepare output filename */
+ fnmdup = dup_tfn(NFILE,fnm);
+ sprintf(suffix,"-t%.2f",t);
+ add_suffix_to_output_names(fnmdup,NFILE,suffix);
+ fp = xvgropen(opt2fn_null("-sqframe",NFILE,fnmdup),hdr,"q (nm^-1)","s(q)/s(0)",oenv);
+ for(i=0;i<sqframecurrent->qn;i++) {
+ fprintf(fp,"%10.6f%10.6f\n",sqframecurrent->q[i],sqframecurrent->s[i]);
+ }
+ done_filenms(NFILE,fnmdup);
+ fclose(fp);
+ sfree(hdr);
+ sfree(suffix);
+ sfree(fnmdup);
+ }
+ /* free the per-frame p(r) structure */
+ sfree(prframecurrent->gr);
+ sfree(prframecurrent->r);
+ sfree(prframecurrent);
+ /* free the per-frame s(q) structure */
+ sfree(sqframecurrent->q);
+ sfree(sqframecurrent->s);
+ sfree(sqframecurrent);
+ } while (read_next_x(oenv,status,&t,natoms,x,box));
+ close_trj(status);
+ /* normalize histo */
+ normalize_probability(pr->grn,pr->gr);
+ sq = convert_histogram_to_intensity_curve(pr,start_q,end_q,q_step);
/* prepare pr.xvg */
fp = xvgropen(opt2fn_null("-pr",NFILE,fnm),"G(r)","Distance (nm)","Probability",oenv);
for(i=0;i<pr->grn;i++)
- fprintf(fp,"%10.6lf%10.6lf\n",pr->r[i],pr->gr[i]);
+ fprintf(fp,"%10.6f%10.6f\n",pr->r[i],pr->gr[i]);
xvgrclose(fp);
/* prepare sq.xvg */
- sq = convert_histogram_to_intensity_curve(pr,start_q,end_q,q_step);
fp = xvgropen(opt2fn_null("-sq",NFILE,fnm),"I(q)","q (nm^-1)","s(q)/s(0)",oenv);
for(i=0;i<sq->qn;i++) {
- fprintf(fp,"%10.6lf%10.6lf\n",sq->q[i],sq->s[i]);
+ fprintf(fp,"%10.6f%10.6f\n",sq->q[i],sq->s[i]);
}
xvgrclose(fp);
-
+ /*
+ * Clean up memory
+ */
+ sfree(pr->gr);
+ sfree(pr->r);
sfree(pr);
+ sfree(sq->q);
+ sfree(sq->s);
+ sfree(sq);
please_cite(stdout,"Garmay2012");
thanx(stderr);
}
}
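
convert_histogram_to_intensity_curve() itself is not shown in this patch; as a hedged sketch of what such a conversion plausibly does (assumed, not the actual GROMACS routine), the same Debye kernel can be applied to the binned distances instead of to all pairs, which is why p(r) is accumulated first:

#include <math.h>

/* Sketch: for each q value, I(q) = sum_bins gr[j]*sin(q*r[j])/(q*r[j]).
 * With grn bins and qn q values this costs O(grn*qn), instead of
 * O(N^2) per q value for the direct pair sum.
 */
static void histogram_to_intensity(int grn, const double *r, const double *gr,
                                   int qn, const double *q, double *s)
{
    int i, j;

    for (i = 0; i < qn; i++)
    {
        s[i] = 0.0;
        for (j = 0; j < grn; j++)
        {
            double qr = q[i]*r[j];

            s[i] += gr[j]*(qr > 0 ? sin(qr)/qr : 1.0);
        }
    }
}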
-void normalize_probability(int n,double *a){
+void normalize_probability(int n,double *a) {
int i;
double norm=0.0;
for (i=0;i<n;i++) norm +=a[i];
for (i=0;i<n;i++) a[i]/=norm;
}
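
A tiny usage example for normalize_probability() as it is applied to the accumulated histogram above; the values are illustrative:

#include <stdio.h>

int main(void)
{
    double g[4] = { 1.0, 3.0, 4.0, 2.0 }; /* bins summing to 10 */

    normalize_probability(4, g); /* as defined above */
    /* prints: 0.1 0.3 0.4 0.2 */
    printf("%g %g %g %g\n", g[0], g[1], g[2], g[3]);
    return 0;
}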
-gmx_nentron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn) {
+gmx_neutron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn) {
/* read nsfactor.dat */
FILE *fp;
char line[STRLEN];
int i, line_no;
char atomnm[8];
double slength;
- gmx_nentron_atomic_structurefactors_t *gnsf;
+ gmx_neutron_atomic_structurefactors_t *gnsf;
fp=libopen(datfn);
line_no = 0;
fclose(fp);
- return (gmx_nentron_atomic_structurefactors_t *) gnsf;
+ return (gmx_neutron_atomic_structurefactors_t *) gnsf;
}
-gmx_sans_t *gmx_sans_init (t_topology *top, gmx_nentron_atomic_structurefactors_t *gnsf) {
+gmx_sans_t *gmx_sans_init (t_topology *top, gmx_neutron_atomic_structurefactors_t *gnsf) {
gmx_sans_t *gsans=NULL;
int i,j;
/* Try to assign scattering lengths from nsfactor.dat */
int isize,
double binwidth,
gmx_bool bMC,
+ gmx_bool bNORM,
real mcover,
unsigned int seed) {
gmx_radial_distribution_histogram_t *pr=NULL;
#endif
}
- /* normalize */
- normalize_probability(pr->grn,pr->gr);
+ /* normalize if needed */
+ if (bNORM) {
+ normalize_probability(pr->grn,pr->gr);
+ }
+
snew(pr->r,pr->grn);
for(i=0;i<pr->grn;i++)
pr->r[i]=(pr->binwidth*i+pr->binwidth*0.5);
return (gmx_radial_distribution_histogram_t *) pr;
}
-gmx_static_structurefator_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step) {
- gmx_static_structurefator_t *sq=NULL;
+gmx_static_structurefactor_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step) {
+ gmx_static_structurefactor_t *sq=NULL;
int i,j;
/* init data */
snew(sq,1);
}
}
- return (gmx_static_structurefator_t *) sq;
+ return (gmx_static_structurefactor_t *) sq;
}
extern "C" {
#endif
-typedef struct gmx_nentron_atomic_structurefactors_t {
+typedef struct gmx_neutron_atomic_structurefactors_t {
int nratoms;
int *p; /* proton number */
int *n; /* neutron number */
double *slength; /* scattering length in fm */
char **atomnm; /* atom symbol */
-} gmx_nentron_atomic_structurefactors_t;
+} gmx_neutron_atomic_structurefactors_t;
typedef struct gmx_sans_t {
t_topology *top; /* topology */
double *gr; /* Probability */
} gmx_radial_distribution_histogram_t;
-typedef struct gmx_static_structurefator_t {
+typedef struct gmx_static_structurefactor_t {
int qn; /* number of items */
double *s; /* scattering */
double *q; /* q vectors */
double qstep; /* q increment */
-} gmx_static_structurefator_t;
+} gmx_static_structurefactor_t;
void check_binwidth(real binwidth);
void normalize_probability(int n, double *a);
-gmx_nentron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn);
+gmx_neutron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn);
-gmx_sans_t *gmx_sans_init(t_topology *top, gmx_nentron_atomic_structurefactors_t *gnsf);
+gmx_sans_t *gmx_sans_init(t_topology *top, gmx_neutron_atomic_structurefactors_t *gnsf);
gmx_radial_distribution_histogram_t *calc_radial_distribution_histogram (gmx_sans_t *gsans,
rvec *x,
int isize,
double binwidth,
gmx_bool bMC,
+ gmx_bool bNORM,
real mcover,
unsigned int seed);
-gmx_static_structurefator_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step);
+gmx_static_structurefactor_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step);
#ifdef __cplusplus