Merge "Fix CUDA being quiet and mark all CUDA_* as advanced" into release-4-6
author Christoph Junghans <junghans@votca.org>
Fri, 11 Jan 2013 20:35:09 +0000 (21:35 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Fri, 11 Jan 2013 20:35:09 +0000 (21:35 +0100)
65 files changed:
CMakeLists.txt
admin/programs.txt
cmake/TestAVXMaskload.c [new file with mode: 0644]
cmake/gmxCFlags.cmake
cmake/gmxManageMPI.cmake
cmake/gmxTestAVXMaskload.cmake [new file with mode: 0644]
include/edsam.h
include/gmx_ana.h
include/gmx_cpuid.h
include/gmx_math_x86_avx_128_fma_double.h
include/gmx_x86_avx_128_fma.h
include/gmx_x86_avx_256.h
include/types/nbnxn_pairlist.h
include/types/state.h
include/vec.h
scripts/GMXRC.cmakein
share/html/online.html
share/html/online/g_dih.html [deleted file]
share/html/online/gro.html
share/html/online/mdp_opt.html
share/template/CMakeLists.txt
share/top/links.dat
src/config.h.cmakein
src/gmxlib/atomprop.c
src/gmxlib/checkpoint.c
src/gmxlib/copyrite.c
src/gmxlib/ifunc.c
src/gmxlib/libgmx.pc.cmakein
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
src/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
src/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
src/gmxlib/selection/compiler.c
src/gmxlib/sfactor.c
src/kernel/convparm.c
src/kernel/gmxcheck.c
src/kernel/hizzie.c
src/kernel/pdb2gmx.c
src/kernel/readir.c
src/kernel/repl_ex.c
src/kernel/runner.c
src/mdlib/constr.c
src/mdlib/edsam.c
src/mdlib/forcerec.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
src/mdlib/nbnxn_search.c
src/tools/CMakeLists.txt
src/tools/g_dih.c [deleted file]
src/tools/gmx_angle.c
src/tools/gmx_dih.c [deleted file]
src/tools/gmx_genpr.c
src/tools/gmx_helix.c
src/tools/gmx_make_edi.c
src/tools/gmx_rmsf.c
src/tools/gmx_sans.c
src/tools/nsfactor.c
src/tools/nsfactor.h

index 8cffe794125826a1706c7e6a13207e16041ad524..6e2f8fb15692046af7f82ad720dd95568de6bc18 100644 (file)
@@ -40,6 +40,10 @@ set(CMAKE_LEGACY_CYGWIN_WIN32 0) # Remove when CMake >= 2.8.4 is required
 set(CPACK_COMPONENT_GROUP_TOOLS_DESCRIPTION "All GROMACS executable tools")
 set(CPACK_COMPONENT_GROUP_MDRUN_DESCRIPTION "GROMACS executable for running simulations")
 
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+# This needs to be set before project() in order to pick up toolchain files
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Platform)
+
 project(Gromacs C)
 include(Dart)
 mark_as_advanced(DART_ROOT)
@@ -81,9 +85,6 @@ endif()
 # provide backward compatibility of software written against the Gromacs API.
 set(API_VERSION ${NUM_VERSION})
 
-# Cmake modules/macros are in a subdirectory to keep this file cleaner
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
 set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
 endif()
@@ -148,60 +149,14 @@ mark_as_advanced(GMX_OPENMM)
 option(GMX_FORCE_CXX "Enable C++ compilation even if not necessary" OFF)
 mark_as_advanced(GMX_FORCE_CXX)
 
-option(GMX_NO_QUOTES "Disable Gromacs cool quotes" OFF)
+option(GMX_COOL_QUOTES "Enable Gromacs cool quotes" ON)
+mark_as_advanced(GMX_COOL_QUOTES)
 
 if(GMX_GPU OR GMX_OPENMM OR GMX_FORCE_CXX)
     enable_language(CXX)
 endif()
 set(CMAKE_PREFIX_PATH "" CACHE STRING "Extra locations to search for external libraries and tools (give directory without lib, bin, or include)")
 
-########################################################################
-# Fix stupid flags on Windows
-########################################################################
-SET(SHARED_LIBS_DEFAULT ON) 
-IF( WIN32 AND NOT CYGWIN)
-  if (NOT BUILD_SHARED_LIBS)
-    option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
-    mark_as_advanced(GMX_PREFER_STATIC_LIBS)
-    SET(SHARED_LIBS_DEFAULT OFF)
-  else()
-    add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
-  endif()
-
-  IF (GMX_PREFER_STATIC_LIBS)
-    #Only setting Debug and Release flags. Others configurations current not used.
-    STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
-    SET(CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE} CACHE STRING "" FORCE)
-    STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
-    if(CMAKE_CXX_COMPILER_LOADED)
-        STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-        SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
-        STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-        SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
-    endif()
-  ENDIF()
-
-  #Workaround for cmake bug 13174. Replace deprecated options.
-  IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
-    if(BUILD_SHARED_LIBS)
-        STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
-        SET(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} CACHE STRING "" FORCE)
-    endif()
-    STRING(REPLACE /GZ /RTC1 CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
-  ENDIF()
-  IF( CMAKE_CXX_COMPILER_ID MATCHES "Intel" AND CMAKE_CXX_COMPILER_LOADED)
-    STRING(REPLACE /GZ /RTC1 CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-    STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-    SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
-
-    STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-    SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
-  ENDIF()
-ENDIF()
-
-
 ########################################################################
 # User input options                                                   #
 ########################################################################
@@ -210,8 +165,6 @@ option(GMX_MPI    "Build a parallel (message-passing) version of GROMACS" OFF)
 option(GMX_THREAD_MPI  "Build a thread-MPI-based multithreaded version of GROMACS (not compatible with MPI)" ON)
 option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
 mark_as_advanced(GMX_SOFTWARE_INVSQRT)
-option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" OFF)
-mark_as_advanced(GMX_POWERPC_INVSQRT)
 option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
 mark_as_advanced(GMX_FAHCORE)
 
@@ -258,6 +211,9 @@ endif()
 option(GMX_CYCLE_SUBCOUNTERS "Enable cycle subcounters to get a more detailed cycle timings" OFF)
 mark_as_advanced(GMX_CYCLE_SUBCOUNTERS)
 
+option(GMX_SKIP_DEFAULT_CFLAGS "Don't automatically add suggested/required compiler flags." OFF)
+mark_as_advanced(GMX_SKIP_DEFAULT_CFLAGS)
+
 ######################################################################
 # Compiler tests
 # These need to be done early (before further tests).
@@ -372,9 +328,6 @@ endif(GMX_DOUBLE)
 if(GMX_SOFTWARE_INVSQRT)
   set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
 endif(GMX_SOFTWARE_INVSQRT)
-if(GMX_POWERPC_INVSQRT)
-  set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
-endif(GMX_POWERPC_INVSQRT)
 
 ########################################################################
 #Process MPI settings
@@ -536,22 +489,56 @@ test_big_endian(GMX_INTEGER_BIG_ENDIAN)
 ########################################################################
 # Find external packages                                               #
 ########################################################################
+SET(SHARED_LIBS_DEFAULT ON)
 if(UNIX)
     if(GMX_PREFER_STATIC_LIBS)
         # On Linux .a is the static library suffix, on Mac OS X .lib can also
         # be used, so we'll add both to the preference list.
         SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib;.a" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-        if(SHARED_LIBS_DEFAULT)
-            if(BUILD_SHARED_LIBS) #Warn the user about the combination. But don't overwrite the request.
-                message(WARNING "Static libraries requested, and shared Gromacs libraries requested.")
-            elseif(NOT DEFINED BUILD_SHARED_LIBS) #Change default to OFF. Don't warn if it's already off.
-                message(WARNING "Static libraries requested, the GROMACS libraries will also be build static (BUILD_SHARED_LIBS=OFF)")
-                set(SHARED_LIBS_DEFAULT OFF)
-            endif()
+        if(BUILD_SHARED_LIBS) #Warn the user about the combination. But don't overwrite the request.
+            message(WARNING "Static libraries requested, and shared Gromacs libraries requested.")
+        elseif(NOT DEFINED BUILD_SHARED_LIBS) #Change default to OFF. Don't warn if it's already off.
+            message(WARNING "Static libraries requested, the GROMACS libraries will also be build static (BUILD_SHARED_LIBS=OFF)")
+            set(SHARED_LIBS_DEFAULT OFF)
         endif()
     endif()
 endif()
-option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic with MPI, Windows)" ${SHARED_LIBS_DEFAULT})
+
+IF( WIN32 AND NOT CYGWIN)
+  SET(SHARED_LIBS_DEFAULT OFF) #because shared libs on Windows are still new - turning them off by default
+  if (NOT BUILD_SHARED_LIBS)
+      option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
+      if(NOT GMX_PREFER_STATIC_LIBS)
+          message(WARNING "Shared system libraries requested, and static Gromacs libraries requested.")
+      endif()
+  else()
+      option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" OFF)
+      if(GMX_PREFER_STATIC_LIBS)
+          #this combination segfaults (illegal passing of file handles)
+          message(FATAL_ERROR "Static system libraries requested, and shared Gromacs libraries requested.")
+      endif()
+      add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
+      set(PKG_CFLAGS "$PKG_CFLAGS -DUSE_VISIBILITY -DTMPI_USE_VISIBILITY")
+  endif()
+  mark_as_advanced(GMX_PREFER_STATIC_LIBS)
+
+  IF (GMX_PREFER_STATIC_LIBS)
+      #Only setting Debug and Release flags. Other configurations are currently not used.
+      STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+      STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+      if(CMAKE_CXX_COMPILER_LOADED)
+          STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+          STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+      endif()
+  ENDIF()
+  IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
+    if(BUILD_SHARED_LIBS) #not sure why incremental building with shared libs doesn't work
+        STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+    endif()
+  ENDIF()
+ENDIF()
+
+option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic e.g. with MPI)" ${SHARED_LIBS_DEFAULT})
 
 option(GMX_GSL "Add support for gsl" OFF)
 if (GMX_GSL)
@@ -694,28 +681,31 @@ if(NOT GMX_SYSTEM_XDR)
     set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_INTERNAL_XDR")
 endif(NOT GMX_SYSTEM_XDR)
 
+# Include the AVX test source, used if the AVX flags are set below
+include(gmxTestAVXMaskload)
+
 # Process nonbonded accelerated kernels settings
 string(TOUPPER ${GMX_CPU_ACCELERATION} ${GMX_CPU_ACCELERATION})
 if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
     # nothing to do
 elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
 
-    GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" GROMACS_C_FLAGS)
+    GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" ACCELERATION_C_FLAGS)
     if(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
-        GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
     endif(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
 
     if (CMAKE_CXX_COMPILER_LOADED)
-        GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" GROMACS_CXX_FLAGS)
+        GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" ACCELERATION_CXX_FLAGS)
         if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
-            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
         endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
     endif()
 
     # We dont warn for lacking SSE2 flag support, since that is probably standard today.
 
     # Only test the include after we have tried to add the correct flag for SSE2 support
-    check_include_file(emmintrin.h  HAVE_EMMINTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(emmintrin.h  HAVE_EMMINTRIN_H ${ACCELERATION_C_FLAGS})
 
     if(NOT HAVE_EMMINTRIN_H)
         message(FATAL_ERROR "Cannot find emmintrin.h, which is required for SSE2 intrinsics support.")
@@ -730,36 +720,38 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
 
 elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
 
-    GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" GROMACS_C_FLAGS)
+    GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" ACCELERATION_C_FLAGS)
     if (NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
-        GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" ACCELERATION_C_FLAGS)
     endif(NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
     if (NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
-        message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
         # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
-        # intrinsics when SSE2 support is enabled, so we try that instead.
+        # intrinsics when SSE2 support is enabled, so we try that first instead.
        if (GMX_NATIVE_WINDOWS)
-            GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+            GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
+            message(WARNING "Neither SSE4.1 or SSE2 seems to be supported by your Windows compiler. Something is likely broken.")
+        else()
+            message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance")
         endif()
     endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
 
     if (CMAKE_CXX_COMPILER_LOADED)
         GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAG)
         if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
-            GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
         endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
         if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG) 
             message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
             # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
             # intrinsics when SSE2 support is enabled, so we try that instead.
             if (GMX_NATIVE_WINDOWS)
-                GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+                GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
             endif()
         endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
     endif()
 
     # This must come after we have added the -msse4.1 flag on some platforms.
-    check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${ACCELERATION_C_FLAGS})
 
     if(NOT HAVE_SMMINTRIN_H)
         message(FATAL_ERROR "Cannot find smmintrin.h, which is required for SSE4.1 intrinsics support.")
@@ -777,18 +769,18 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
 
     # Set the AVX compiler flag for both these choices!
 
-    GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" GROMACS_C_FLAGS)
+    GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" ACCELERATION_C_FLAGS)
     if (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
-        GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" ACCELERATION_C_FLAGS)
     endif (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
     if (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
         message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
     endif (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
 
     if (CMAKE_CXX_COMPILER_LOADED)
-        GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" GROMACS_CXX_FLAGS)
+        GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
         if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
-            GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
         endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
         if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
             message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
@@ -797,24 +789,27 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
 
     # Set the FMA4 flags (MSVC doesn't require any)
     if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
-        GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
+        if (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+            message(FATAL_ERROR "Clang up to at least version 3.2 produces incorrect code for AVX_128_FMA. Sorry, but you will have to select a different compiler or acceleration.")
+        endif()
+        GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" ACCELERATION_C_FLAGS)
         if (NOT GNU_FMA_CFLAG)
             message(WARNING "No C FMA4 flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
         endif(NOT GNU_FMA_CFLAG)
-        GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" ACCELERATION_C_FLAGS)
         # No big deal if we do not have xop, so no point yelling warnings about it.
         if (CMAKE_CXX_COMPILER_LOADED)
-            GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" ACCELERATION_CXX_FLAGS)
             if (NOT GNU_FMA_CXXFLAG)
                 message(WARNING "No C++ FMA flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
             endif (NOT GNU_FMA_CXXFLAG)
-            GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" ACCELERATION_CXX_FLAGS)
             # No big deal if we do not have xop, so no point yelling warnings about it.
         endif()
     endif()
 
     # Only test the header after we have tried to add the flag for AVX support
-    check_include_file(immintrin.h  HAVE_IMMINTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(immintrin.h  HAVE_IMMINTRIN_H ${ACCELERATION_C_FLAGS})
 
     if(NOT HAVE_IMMINTRIN_H)
         message(FATAL_ERROR "Cannot find immintrin.h, which is required for AVX intrinsics support. Consider switching compiler.")
@@ -823,15 +818,15 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
     if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_256")
         try_compile(TEST_AVX ${CMAKE_BINARY_DIR}
             "${CMAKE_SOURCE_DIR}/cmake/TestAVX.c"
-            COMPILE_DEFINITIONS "${GROMACS_C_FLAGS}")
+            COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}")
         if(NOT TEST_AVX)
             message(FATAL_ERROR "Cannot compile AVX intrinsics. Consider switching compiler.")
         endif()
     endif()
 
     # GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
-    check_include_file(x86intrin.h HAVE_X86INTRIN_H ${GROMACS_C_FLAGS})
-    check_include_file(intrin.h HAVE_INTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(x86intrin.h HAVE_X86INTRIN_H ${ACCELERATION_C_FLAGS})
+    check_include_file(intrin.h HAVE_INTRIN_H ${ACCELERATION_C_FLAGS})
 
     # The user should not be able to set this orthogonally to the acceleration
     set(GMX_X86_SSE4_1 1)
@@ -853,6 +848,10 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
         endif()
     endif()
 
+    # Unfortunately gcc-4.5.2 and gcc-4.6.0 have a bug where they use the wrong datatype for the formal
+    # parameter of the mask for maskload/maskstore arguments. Check if this is present, since we can work around it.
+    gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG)
+
 elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
 # GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
     if (NOT ACCELERATION_QUIETLY)
@@ -864,7 +863,6 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
         set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
     endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
     set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
-    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
     set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
     set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI not compatible with BlueGene, disabled!" FORCE)
     set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
@@ -875,10 +873,6 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
 # The automatic testing for endianness does not work for the BlueGene cross-compiler
     set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
     set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
-    set(GMX_POWER6 1)
-    set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
-    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
 else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
     MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
 endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
@@ -1051,26 +1045,29 @@ if(GMX_FAHCORE)
   set(COREWRAP_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/../corewrap" CACHE STRING 
       "Path to swindirect.h")
   include_directories(${COREWRAP_INCLUDE_DIR})
+  set_property(CACHE GMX_COOL_QUOTES VALUE OFF)
 endif(GMX_FAHCORE)
 
 # # # # # # # # # # NO MORE TESTS AFTER THIS LINE! # # # # # # # # # # #
 # these are set after everything else
-if (NOT DEFINED GROMACS_C_FLAGS_SET)
-    set(GROMACS_C_FLAGS_SET true CACHE INTERNAL "Whether to reset the C flags" 
-        FORCE)
-    set(CMAKE_C_FLAGS "${GROMACS_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING 
-        "Flags used by the compiler during all build types" FORCE)
-    if (CMAKE_CXX_COMPILER_LOADED)
-        set(CMAKE_CXX_FLAGS "${GROMACS_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING 
-            "Flags used by the compiler during all build types" FORCE)
+if (NOT GMX_SKIP_DEFAULT_CFLAGS)
+    set(CMAKE_C_FLAGS "${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+else()
+    message("Recommended flags which are not added because GMX_SKIP_DEFAULT_CFLAGS=yes:")
+    message("CMAKE_C_FLAGS: ${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CFLAGS}")
+    message("CMAKE_C_FLAGS_RELEASE: ${GMXC_CFLAGS_RELEASE}")
+    message("CMAKE_C_FLAGS_DEBUG: ${GMXC_CFLAGS_DEBUG}")
+    if(CMAKE_CXX_COMPILER_LOADED)
+        message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CXXFLAGS}")
+        message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
+        message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
     endif()
-    set(CMAKE_EXE_LINKER_FLAGS 
-        "${GROMACS_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" 
-        CACHE STRING "Linker flags for creating executables" FORCE) 
-    set(CMAKE_SHARED_LINKER_FLAGS 
-        "${GROMACS_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" 
-        CACHE STRING "Linker flags for creating shared libraries" FORCE) 
-endif (NOT DEFINED GROMACS_C_FLAGS_SET)
+    message("CMAKE_EXE_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+    message("CMAKE_SHARED_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+endif()
 
 if(NOT GMX_OPENMP)
     #Unset all OpenMP flags in case OpenMP was disabled either by the user
@@ -1080,6 +1077,7 @@ if(NOT GMX_OPENMP)
     unset(OpenMP_LINKER_FLAGS CACHE)
     unset(OpenMP_SHARED_LINKER_FLAGS)
 endif()
+set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
 
 ######################################
 # Output compiler and CFLAGS used
index bee575792fffe20763105ac73012e34ee624bc4c..fc35a7dde6a6fe6ef7c6dc1f743c921ad758ff56 100644 (file)
@@ -87,7 +87,6 @@ END
 HEAD|Analyzing bonded interactions
 g_angle|calculates distributions and correlations for angles and dihedrals
 g_bond|calculates bond length distributions
-g_dih|analyzes dihedral transitions
 mk_angndx|generates index files for g_angle
 END
 
diff --git a/cmake/TestAVXMaskload.c b/cmake/TestAVXMaskload.c
new file mode 100644 (file)
index 0000000..61777b0
--- /dev/null
@@ -0,0 +1,17 @@
+#include<immintrin.h>
+int main()
+{
+    __m256d a;
+    __m256i mask;
+    double  d[4]={1,2,3,4};
+
+    a = _mm256_setzero_pd();
+    mask = _mm256_castpd_si256(a);
+
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+    a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
+#else
+    a = _mm256_maskload_pd(d,mask);
+#endif
+}
+
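The probe above first compiles _mm256_maskload_pd() with an __m256i mask (correct per the Intel intrinsics specification) and, if that fails, retries with the mask cast to a double vector, which is what the buggy gcc releases expect. A minimal sketch of how the resulting GMX_X86_AVX_GCC_MASKLOAD_BUG define is meant to be consumed (the gmx_mm*_maskload_* wrapper macros added elsewhere in this commit do exactly this; the helper name here is hypothetical):

    #include <immintrin.h>

    static __m256d load_masked_pd(const double *mem, __m256i mask)
    {
    #ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
        /* gcc-4.5.2/4.6.0 declare the mask parameter as a floating-point vector */
        return _mm256_maskload_pd(mem, _mm256_castsi256_pd(mask));
    #else
        return _mm256_maskload_pd(mem, mask);
    #endif
    }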
index e995e7709fd94e065935f1c851da72db9aaeee17..ce8552fa061287dcacd71d68e3166439f79c4150 100644 (file)
@@ -110,12 +110,10 @@ MACRO(gmx_c_flags)
             GMX_TEST_CFLAG(CFLAGS_WARN "-Wall" GMXC_CFLAGS)
             GMX_TEST_CFLAG(CFLAGS_STDGNU "-std=gnu99" GMXC_CFLAGS)
             GMX_TEST_CFLAG(CFLAGS_OPT "-ip -funroll-all-loops" GMXC_CFLAGS_RELEASE)
-            GMX_TEST_CFLAG(CFLAGS_SSE2 "-msse2" GMXC_CFLAGS_RELEASE)
             GMX_TEST_CFLAG(CFLAGS_X86 "-mtune=core2" GMXC_CFLAGS_RELEASE)
             GMX_TEST_CFLAG(CFLAGS_IA64 "-mtune=itanium2" GMXC_CFLAGS_RELEASE)
         else()
             GMX_TEST_CFLAG(CFLAGS_WARN "/W2" GMXC_CFLAGS)
-            GMX_TEST_CFLAG(CFLAGS_SSE2 "/arch:SSE2" GMXC_CFLAGS_RELEASE)
             GMX_TEST_CFLAG(CFLAGS_X86 "/Qip" GMXC_CFLAGS_RELEASE)
         endif()
     endif()
@@ -127,13 +125,11 @@ MACRO(gmx_c_flags)
             endif()
             GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wall" GMXC_CXXFLAGS)
             GMX_TEST_CXXFLAG(CXXFLAGS_OPT "-ip -funroll-all-loops" GMXC_CXXFLAGS_RELEASE)
-            GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "-msse2" GMXC_CXXFLAGS_RELEASE)
             GMX_TEST_CXXFLAG(CXXFLAGS_X86 "-mtune=core2" GMXC_CXXFLAGS_RELEASE)
             GMX_TEST_CXXFLAG(CXXFLAGS_IA64 "-mtune=itanium2" 
                               GMXC_CXXFLAGS_RELEASE)
         else()
             GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W2" GMXC_CXXFLAGS)
-            GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "/arch:SSE2" GMXC_CXXFLAGS_RELEASE)
             GMX_TEST_CXXFLAG(CXXFLAGS_X86 "/Qip" GMXC_CXXFLAGS_RELEASE)
         endif()
     endif()
@@ -178,8 +174,9 @@ MACRO(gmx_c_flags)
     if (MSVC)
         # disable warnings for: 
         #      inconsistent dll linkage
+        #      forcing value to bool (for C++)
         GMX_TEST_CFLAG(CFLAGS_WARN "/wd4273" GMXC_CFLAGS)
-        GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/wd4273" GMXC_CXXFLAGS)
+        GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/wd4273 /wd4800" GMXC_CXXFLAGS)
     endif()
 
     if (CMAKE_C_COMPILER_ID MATCHES "Clang")
@@ -198,36 +195,19 @@ MACRO(gmx_c_flags)
 
     # now actually set the flags:
     # C
-    if ( NOT DEFINED GMXCFLAGS_SET AND NOT DEFINED ENV{CFLAGS} )
-        set(GMXCFLAGS_SET true CACHE INTERNAL "Whether to reset the C flags" 
-            FORCE)
-        
-        set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}" 
-            CACHE STRING "Flags used by the compiler during all build types." 
-            FORCE)
-        set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}" 
-            CACHE STRING "Flags used by the compiler during release builds." 
-            FORCE)
-        set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}" 
-            CACHE STRING "Flags used by the compiler during debug builds." 
-            FORCE)
+    if ( NOT GMX_SKIP_DEFAULT_CFLAGS )
+        set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}")
+        set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}")
+        set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}")
     endif()
 
     # C++
-    if ( NOT DEFINED GMXCXXFLAGS_SET AND NOT DEFINED ENV{CXXFLAGS} AND CMAKE_CXX_COMPILER_LOADED)
-        set(GMXCXXFLAGS_SET true CACHE INTERNAL "Whether to reset the C++ flags" 
-            FORCE)
-        set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}" 
-            CACHE STRING "Flags used by the compiler during all build types." 
-            FORCE)
+    if ( NOT GMX_SKIP_DEFAULT_CFLAGS)
+        set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}")
         set(CMAKE_CXX_FLAGS_RELEASE 
-            "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}" 
-            CACHE STRING "Flags used by the compiler during release builds." 
-            FORCE)
+            "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}")
         set(CMAKE_CXX_FLAGS_DEBUG 
-            "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}" 
-            CACHE STRING "Flags used by the compiler during debug builds." 
-            FORCE)
+            "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}")
     endif()
 ENDMACRO(gmx_c_flags)
 
index e5b286d704e7c02c54782355df02843d786dec94..8340c28700f2ec91e0c9fc024f9c23b83981f5d9 100644 (file)
@@ -56,8 +56,8 @@ if(GMX_MPI)
       endif()
       find_package(MPI)
       if(${${MPI_PREFIX}_FOUND})
-        set(GROMACS_C_FLAGS ${GROMACS_C_FLAGS} ${${MPI_PREFIX}_COMPILE_FLAGS})
-        set(GROMACS_LINKER_FLAGS ${GROMACS_LINKER_FLAGS} ${${MPI_PREFIX}_LINK_FLAGS})
+        set(MPI_COMPILE_FLAGS ${${MPI_PREFIX}_COMPILE_FLAGS})
+        set(MPI_LINKER_FLAGS ${${MPI_PREFIX}_LINK_FLAGS})
         include_directories(${${MPI_PREFIX}_INCLUDE_PATH})
         list(APPEND GMX_EXTRA_LIBRARIES ${${MPI_PREFIX}_LIBRARIES})
       endif()
diff --git a/cmake/gmxTestAVXMaskload.cmake b/cmake/gmxTestAVXMaskload.cmake
new file mode 100644 (file)
index 0000000..a80920d
--- /dev/null
@@ -0,0 +1,72 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+#  GMX_TEST_AVX_GCC_MASKLOAD_BUG(AVX_CFLAGS VARIABLE)
+#
+#  VARIABLE will be set if the compiler is a buggy version
+#  of GCC (prior to 4.5.3, and maybe 4.6) that has an incorrect second
+#  argument to the AVX _mm256_maskload_ps() intrinsic.
+#
+#  You need to use this variable in a cmakedefine, and then handle
+#  the case separately in your code - no automatic cure, unfortunately.
+#
+MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG AVX_CFLAGS VARIABLE)
+    IF(NOT DEFINED ${VARIABLE})
+        MESSAGE(STATUS "Checking for gcc AVX maskload bug") 
+        # some compilers like clang accept both cases, 
+        # so first try a normal compile to avoid flagging those as buggy.
+        TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+                    "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+                    COMPILE_DEFINITIONS "${AVX_CFLAGS}" )
+        IF(${VARIABLE}_COMPILEOK)
+            SET(${VARIABLE} 0 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+            MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+        ELSE()
+            TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+                        "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+                         COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" )
+            IF(${VARIABLE}_COMPILEOK)
+                SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+                MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around")
+            ELSE()
+                MESSAGE(WARNING "Cannot compile AVX code - assuming gcc AVX maskload bug not present." )
+                MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+            ENDIF()
+        ENDIF()
+    ENDIF(NOT DEFINED ${VARIABLE})
+ENDMACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE)
+
+
+
+
index 80e189c6ff2fceaa95674aef543fa981a1345a96..f78215563d7b8f0ed43ccb01725881135fb713e7 100644 (file)
@@ -54,7 +54,7 @@ gmx_edsam_t ed_open(int nfile,const t_filenm fnm[],unsigned long Flags,t_commrec
 /* Sets the ED input/output filenames, opens output (.edo) file */
 
 void init_edsam(gmx_mtop_t *mtop,t_inputrec *ir,t_commrec *cr,
-                       gmx_edsam_t ed, rvec x[], matrix box);
+                       gmx_edsam_t ed, rvec x[], matrix box, edsamstate_t *edsamstate);
 /* Init routine for ED and flooding. Calls init_edi in a loop for every .edi-cycle 
  * contained in the input file, creates a NULL terminated list of t_edpar structures */
 
index 9e2fd91b5c90aaf254aeac7991f349c24a61155f..dd26ff1fd77af5b61a2d5a448a0aee0bc382f2b9 100644 (file)
@@ -108,10 +108,6 @@ GMX_LIBGMXANA_EXPORT
 int 
 gmx_dielectric(int argc,char *argv[]);
 
-GMX_LIBGMXANA_EXPORT
-int 
-gmx_dih(int argc,char *argv[]);
-
 GMX_LIBGMXANA_EXPORT
 int 
 gmx_dipoles(int argc,char *argv[]);
index 71d89a9c54194c7aee3367455e21037b63c8f8ec..3b6673c807b195fe68605e232581d1e96be25e82 100644 (file)
  */
 #ifndef GMX_CPUID_H_
 #define GMX_CPUID_H_
+
+#include <stdio.h>
+
 #include "visibility.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
index b751a7e1c9d38fd0971391a7d1546c20a0901ee6..e098a4adc7358e4b773bcd65d83932488a3e4eb8 100644 (file)
 #ifndef _gmx_math_x86_avx_128_fma_double_h_
 #define _gmx_math_x86_avx_128_fma_double_h_
 
+#include <immintrin.h> /* AVX */
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h> /* FMA */
+#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
 #include <math.h>
 
 #include "gmx_x86_avx_128_fma.h"
index 1669da94be4953d1409c23d4b1178a4d5b23334f..9b0e0bb1e3b79ebe75f5c7aee4986225e07676a4 100644 (file)
 #ifdef HAVE_X86INTRIN_H
 #include <x86intrin.h> /* FMA */
 #endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
 
 #include <stdio.h>
 
@@ -206,6 +210,19 @@ static int gmx_mm_check_and_reset_overflow(void)
     return sse_overflow;
 }
 
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
+
 
 
 #endif /* _gmx_x86_avx_128_fma_h_ */
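With these wrappers, kernel code can pass an integer __m256i mask on every supported compiler and let the macro insert the cast only where the gcc bug is present. A small usage sketch under that assumption (the function and mask choice are illustrative, not from this commit):

    #include <immintrin.h>
    /* assumes gmx_x86_avx_128_fma.h is included so the wrappers exist */

    static __m256 load_first_three_floats(const float *mem)
    {
        /* only lanes 0-2 have the sign bit set, so only the first three
         * floats are loaded; masked-out lanes are returned as zero */
        __m256i mask = _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1);
        return gmx_mm256_maskload_ps(mem, mask);
    }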
index 9f266e834ff031090a34f3aadd91d315e4ae6208..461c736283a66fa6b12a1a01c9890b64494d3be5 100644 (file)
@@ -286,6 +286,18 @@ static int gmx_mm_check_and_reset_overflow(void)
     return sse_overflow;
 }
 
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
 
 
 #endif /* _gmx_x86_avx_256_h_ */
index b6bc9650c6d1d367ee4fe5e426958a1d74bd4924..4d337cf1a3f4a49260a2b6becafa5b6724b62df5 100644 (file)
@@ -71,6 +71,14 @@ typedef struct {
     unsigned excl;  /* The exclusion (interaction) bits */
 } nbnxn_cj_t;
 
+/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ * The upper bits contain information for non-bonded kernel optimization.
+ * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ * But three flags can be used to skip interactions, currently only for subc=0
+ * !(shift & NBNXN_CI_DO_LJ(subc))   => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc)    => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
 #define NBNXN_CI_SHIFT          127
 #define NBNXN_CI_DO_LJ(subc)    (1<<(7+3*(subc)))
 #define NBNXN_CI_HALF_LJ(subc)  (1<<(8+3*(subc)))
@@ -79,7 +87,7 @@ typedef struct {
 /* Simple pair-list i-unit */
 typedef struct {
     int ci;             /* i-cluster             */
-    int shift;          /* Shift vector index plus possible flags */
+    int shift;          /* Shift vector index plus possible flags, see above */
     int cj_ind_start;   /* Start index into cj   */
     int cj_ind_end;     /* End index into cj     */
 } nbnxn_ci_t;
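To make the packed layout concrete, here is a hypothetical decoding helper (not part of this commit) that applies the macros documented in the new comment; it assumes nbnxn_pairlist.h is included:

    static void decode_ci_shift(const nbnxn_ci_t *ci, int subc,
                                int *shift_ix, int *do_lj, int *half_lj, int *do_coul)
    {
        *shift_ix = ci->shift & NBNXN_CI_SHIFT;                /* lower 7 bits: shift vector index   */
        *do_lj    = (ci->shift & NBNXN_CI_DO_LJ(subc))   != 0; /* 0 => skip LJ for all pairs         */
        *half_lj  = (ci->shift & NBNXN_CI_HALF_LJ(subc)) != 0; /* 1 => LJ only for first half of i   */
        *do_coul  = (ci->shift & NBNXN_CI_DO_COUL(subc)) != 0; /* 0 => skip Coulomb for all pairs    */
    }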
index 2b7d315ac258cbad4456b8131c29feae704918e3..107bd6c31804adb7c27386419bbdc14636702b81 100644 (file)
@@ -156,6 +156,28 @@ typedef struct
 }
 energyhistory_t;
 
+typedef struct
+{
+    /* If one uses essential dynamics or flooding on a group of atoms from
+     * more than one molecule, we cannot make this group whole with
+     * do_pbc_first_mtop(). We assume that the ED group has the correct PBC
+     * representation at the beginning of the simulation and keep track
+     * of the shifts to always get it into that representation.
+     * For proper restarts from a checkpoint we store the positions of the
+     * reference group at the time of checkpoint writing */
+    gmx_bool    bFromCpt;       /* Did we start from a checkpoint file?       */
+    int         nED;            /* No. of ED/Flooding data sets, if <1 no ED  */
+    int         *nref;          /* No. of atoms in i'th reference structure   */
+    int         *nav;           /* Same for average structure                 */
+    rvec        **old_sref;     /* Positions of the reference atoms
+                                   at the last time step (with correct PBC
+                                   representation)                            */
+    rvec        **old_sref_p;   /* Pointer to these positions                 */
+    rvec        **old_sav;      /* Same for the average positions             */
+    rvec        **old_sav_p;
+}
+edsamstate_t;
+
 typedef struct
 {
   int           natoms;
@@ -199,6 +221,7 @@ typedef struct
 
   energyhistory_t  enerhist; /* Energy history for statistics           */
   df_history_t  dfhist; /*Free energy history for free energy analysis  */
+  edsamstate_t  edsamstate;    /* Essential dynamics / flooding history */
 
   int           ddp_count; /* The DD partitioning count for this state  */
   int           ddp_count_cg_gl; /* The DD part. count for index_gl     */
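As a rough sketch of the allocation this struct implies (the checkpoint-reading code later in this commit does the same with snew(); plain calloc() is used here only to keep the fragment self-contained, and the function name is hypothetical):

    #include <stdlib.h>

    static void edsamstate_alloc(edsamstate_t *s, int nED)
    {
        /* one entry per ED/flooding data set; the position arrays themselves
         * are allocated per data set once nref[i]/nav[i] are known */
        s->nED      = nED;
        s->nref     = calloc(nED, sizeof(*s->nref));
        s->nav      = calloc(nED, sizeof(*s->nav));
        s->old_sref = calloc(nED, sizeof(*s->old_sref));
        s->old_sav  = calloc(nED, sizeof(*s->old_sav));
    }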
index 6c9995f2770bcb528c39216b0d36ac27d83e2e54..410b35e53613db3097fb933c8c9cd913885c52b3 100644 (file)
@@ -183,40 +183,6 @@ static real gmx_software_invsqrt(real x)
 #define INVSQRT_DONE 
 #endif /* gmx_invsqrt */
 
-#ifdef GMX_POWERPC_SQRT
-static real gmx_powerpc_invsqrt(real x)
-{
-  const real  half=0.5;
-  const real  three=3.0;
-  t_convert   result,bit_pattern;
-  unsigned int exp,fract;
-  real        lu;
-  real        y;
-#ifdef GMX_DOUBLE
-  real        y2;
-#endif
-
-  lu = __frsqrte((double)x);
-
-  y=(half*lu*(three-((x*lu)*lu)));
-
-#if (GMX_POWERPC_SQRT==2)
-  /* Extra iteration required */
-  y=(half*y*(three-((x*y)*y)));
-#endif
-
-#ifdef GMX_DOUBLE
-  y2=(half*y*(three-((x*y)*y)));
-
-  return y2;                    /* 10 Flops */
-#else
-  return y;                     /* 5  Flops */
-#endif
-}
-#define gmx_invsqrt(x) gmx_powerpc_invsqrt(x)
-#define INVSQRT_DONE
-#endif /* powerpc_invsqrt */
-
 #ifndef INVSQRT_DONE
 #    ifdef GMX_DOUBLE
 #        ifdef HAVE_RSQRT
index de457e2b1b8b7a0921b3bada649bc0b3556af23b..b57659a9495927e3e7b38ecb9060f5a182907956 100644 (file)
@@ -5,9 +5,8 @@
 # If you only use one shell you can copy that GMXRC.* instead.
 
 
-# only csh/tcsh understand 'set'
-set is_csh = 123
-test "$is_csh" = 123 && goto CSH
+# only csh/tcsh set the variable $shell (note: lower case!)
+test $shell && goto CSH
 
 # if we got here, shell is bsh/bash/zsh/ksh
 # bsh cannot remove part of a variable with %%
index fea72694c7824b5be192fafbb31a1c50e2c9dc91..7655758f51bda31af990ae4807f3cf01363daedb 100644 (file)
@@ -66,7 +66,6 @@ Thu 26 Aug 2010</B></td>
 <br><a href=online/g_density.html>g_density</a>
 <br><a href=online/g_densmap.html>g_densmap</a>
 <br><a href=online/g_dielectric.html>g_dielectric</a>
-<br><a href=online/g_dih.html>g_dih</a>
 <br><a href=online/g_dipoles.html>g_dipoles</a>
 <br><a href=online/g_disre.html>g_disre</a>
 <br><a href=online/g_dist.html>g_dist</a>
@@ -283,7 +282,6 @@ Thu 26 Aug 2010</B></td>
 <TR><TD><A HREF="online/g_bond.html">g_bond</A></TD><TD>calculates bond length distributions</TD>
 <TR><TD><A HREF="online/mk_angndx.html">mk_angndx</A></TD><TD>generates index files for g_angle</TD>
 <TR><TD><A HREF="online/g_angle.html">g_angle</A></TD><TD>calculates distributions and correlations for angles and dihedrals</TD>
-<TR><TD><A HREF="online/g_dih.html">g_dih</A></TD><TD>analyzes dihedral transitions</TD>
 </TABLE>
 
 <A NAME="HNR11">
diff --git a/share/html/online/g_dih.html b/share/html/online/g_dih.html
deleted file mode 100644 (file)
index e61741b..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-<HTML>
-<HEAD>
-<TITLE>g_dih</TITLE>
-<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
-<TABLE WIDTH="98%" NOBORDER >
-<TR><TD WIDTH=400>
-<TABLE WIDTH=400 NOBORDER>
-<TD WIDTH=116>
-<a href="http://www.gromacs.org/"><img SRC="../images/gmxlogo_small.png"BORDER=0 </a></td>
-<td ALIGN=LEFT VALIGN=TOP WIDTH=280><br><h2>g_dih</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
-</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p><B>VERSION 4.5<br>
-Thu 26 Aug 2010</B></td></tr></TABLE>
-<HR>
-<H3>Description</H3>
-<p>
-g_dih can do two things. The default is to analyze dihedral transitions
-by merely computing all the dihedral angles defined in your topology
-for the whole trajectory. When a dihedral flips over to another minimum
-an angle/time plot is made.<p>
-The opther option is to discretize the dihedral space into a number of
-bins, and group each conformation in dihedral space in the
-appropriate bin. The output is then given as a number of dihedral
-conformations sorted according to occupancy.
-<P>
-<H3>Files</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>filename</TH><TH>type</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-f</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">    traj.xtc</a></tt> </TD><TD> Input </TD><TD> Trajectory: <a href="xtc.html">xtc</a> <a href="trr.html">trr</a> <a href="trj.html">trj</a> <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> cpt </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-s</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">   topol.tpr</a></tt> </TD><TD> Input </TD><TD> Run input file: <a href="tpr.html">tpr</a> <a href="tpb.html">tpb</a> <a href="tpa.html">tpa</a> </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-o</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="out.html">   hello.out</a></tt> </TD><TD> Output </TD><TD> Generic output file </TD></TR>
-</TABLE>
-<P>
-<H3>Other options</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>type</TH><TH>default</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> Print help info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]version</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> Print version info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0     </tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0     </tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0     </tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]sa</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> Perform cluster analysis in dihedral space instead of analysing dihedral transitions. </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-mult</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> mulitiplicity for dihedral angles (by default read from topology) </TD></TD>
-</TABLE>
-<P>
-<hr>
-<div ALIGN=RIGHT>
-<font size="-1"><a href="http://www.gromacs.org">http://www.gromacs.org</a></font><br>
-<font size="-1"><a href="mailto:gromacs@gromacs.org">gromacs@gromacs.org</a></font><br>
-</div>
-</BODY>
index 4853808f5988ee3f49e268b2570fe5aad7dce768..963099586feaef488b35e235075c33d426932881 100644 (file)
@@ -67,7 +67,7 @@ without using the GROMACS libraries you can use the following formats:
 
 <dl>
 <dt>C format 
-<dd><tt>"%5d%5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
+<dd><tt>"%5d%-5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
 <dt>Fortran format 
 <dd><tt>(i5,2a5,i5,3f8.3,3f8.4)</tt>
 <dt>Pascal format
index 88a27c21bf34d45a72144314c9157cfe4758afb1..6ef530faf68909b57b7855d86211d6352451017c 100644 (file)
@@ -542,7 +542,7 @@ For dynamics without temperature coupling or to override the buffer size,
 use <b>verlet-buffer-drift</b>=-1 and set <b>rlist</b> manually.</dd>
 
 <dt><b>rlist: (1) [nm]</b></dt>
-<dd>Cut-off distance for the short-range neighbor list, should be &ge; 0.
+<dd>Cut-off distance for the short-range neighbor list.
 With <b>cutoff-scheme</b>=<b>Verlet</b>, this is by default set by the
 <b>verlet-buffer-drift</b> option and the value of <b>rlist</b> is ignored.</dd>
 
@@ -716,8 +716,8 @@ affect the forces or the sampling.</dd>
 <dt><b>rcoulomb-switch: (0) [nm]</b></dt>
 <dd>where to start switching the Coulomb potential</dd>
 
-<dt><b>rcoulomb: (-1) [nm]</b></dt>
-<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx-->, should be &ge; 0</dd>
+<dt><b>rcoulomb: (1) [nm]</b></dt>
+<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx--></dd>
 
 <dt><b>epsilon-r: (1)</b></dt>
 <dd>The relative <!--Idx-->dielectric constant<!--EIdx-->.
@@ -787,8 +787,8 @@ affect the forces or the sampling.</dd>
 <dt><b>rvdw-switch: (0) [nm]</b></dt>
 <dd>where to start switching the LJ potential</dd>
 
-<dt><b>rvdw: (-1) [nm]</b></dt>
-<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx-->, should be &ge; 0</dd>
+<dt><b>rvdw: (1) [nm]</b></dt>
+<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx--></dd>
 
 <dt><b>DispCorr:</b></dt>
 <dd><dl compact></dd>
index b205633cf5fc4eb7d4aed9fb464541d78238120d..a4ace20c75582bcb0164f9095be653b3902f757c 100644 (file)
@@ -43,12 +43,23 @@ add_custom_command(OUTPUT gromacs
     DEPENDS ${GROMACS_HEADERS})
 add_custom_target(gromacs_include_links DEPENDS gromacs)
 
-add_executable(template template.c)
-remove_definitions( -DHAVE_CONFIG_H )
-add_definitions("${PKG_CFLAGS}")
-target_link_libraries(template gmx)
-include_directories("${CMAKE_CURRENT_BINARY_DIR}")
-add_dependencies(template gromacs_include_links)
+option(GMX_BUILD_TEMPLATE "Build gromacs template program" ON)
+mark_as_advanced(GMX_BUILD_TEMPLATE)
+# GMX_PREFER_STATIC_OPENMP=yes is a special case for building binaries
+# to distribute, and as the template is not installed it can be
+# ignored.
+# The template is built in a user-like environment, hence we use
+# flags from PKG_CFLAGS. Again, GMX_PREFER_STATIC_OPENMP=yes would
+# need special link flags (OpenMP_LINKER_FLAGS), which are not
+# very user-like.
+if (GMX_BUILD_TEMPLATE AND NOT GMX_PREFER_STATIC_OPENMP)
+    add_executable(template template.c)
+    remove_definitions( -DHAVE_CONFIG_H )
+    add_definitions("${PKG_CFLAGS}")
+    target_link_libraries(template gmx)
+    include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+    add_dependencies(template gromacs_include_links)
+endif()
 
 install(FILES README template.c Makefile.pkg
         DESTINATION ${DATA_INSTALL_DIR}/template
index 4a2a7d9817c713c771d88f412f36ca44b976bcf2..443f1f9f2453562a17e71f1aa2cb942adec29247 100644 (file)
@@ -22,7 +22,6 @@ g_coord
 g_covar
 g_density
 g_dielectric
-g_dih
 g_dipoles
 g_disre
 g_dist
index 74e28486f9364727b3cb628f1811a8580ac6052c..fa77489709382f5fb5338c92affb33deb3587be2 100644 (file)
 /* AVX 256-bit instructions available */
 #cmakedefine GMX_X86_AVX_256
 
+/* GCC bug in AVX maskload/maskstore arguments - worked around internally */
+#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG
+
 /* SSE2 was selected as CPU acceleration level */
 #cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
 
 /* Use the GROMACS software 1/sqrt(x) */
 #cmakedefine GMX_SOFTWARE_INVSQRT
 
-/* Use the PowerPC hardware 1/sqrt(x) */
-#cmakedefine GMX_POWERPC_INVSQRT
-
 /* Use sub-counters */
 #cmakedefine GMX_CYCLE_SUBCOUNTERS
 
 /* Build special-purpose mdrun library */
 #cmakedefine GMX_FAHCORE   
 
-/* Disable gromacs quotes */
-#cmakedefine GMX_NO_QUOTES
+/* Enable gromacs quotes */
+#cmakedefine GMX_COOL_QUOTES
 
 #ifdef GMX_FAHCORE
 #define FULLINDIRECT 1
index 6c3d5a5bc045908f59091fb677a04327a33b9baf..9c2618b6e6cd6d8a4d2a77c047ad6b64411b79dc 100644 (file)
@@ -161,8 +161,6 @@ static void add_prop(aprop_t *ap,gmx_residuetype_t restype,
        ap->bAvail[i] = FALSE;
       }
     }
-    upstring(atomnm);
-    upstring(resnm);
     ap->atomnm[ap->nprop] = strdup(atomnm);
     ap->resnm[ap->nprop]  = strdup(resnm);
     j = ap->nprop;
@@ -322,9 +320,7 @@ gmx_bool gmx_atomprop_query(gmx_atomprop_t aps,
   else { 
     strncpy(atomname,atomnm,MAXQ-1);
   }
-  upstring(atomname);
   strncpy(resname,resnm,MAXQ-1);
-  upstring(resname);
   
   j = get_prop_index(&(ap->prop[eprop]),ap->restype,resname,
                     atomname,&bExact);
index 20be93520ff507525d42883a6b6b2735a07cdc17..d80c198d594c3d5d68f6f676b4b70de2363766bf 100644 (file)
@@ -103,7 +103,7 @@ gmx_ctime_r(const time_t *clock,char *buf, int n);
  * But old code can not read a new entry that is present in the file
  * (but can read a new format when new entries are not present).
  */
-static const int cpt_version = 14;
+static const int cpt_version = 15;
 
 
 const char *est_names[estNR]=
@@ -316,6 +316,39 @@ static void do_cpt_double_err(XDR *xd,const char *desc,double *f,FILE *list)
     }
 }
 
+static void do_cpt_real_err(XDR *xd,const char *desc,real *f)
+{
+    bool_t res=0;
+
+#ifdef GMX_DOUBLE
+    res = xdr_double(xd,f);
+#else
+    res = xdr_float(xd,f);
+#endif
+    if (res == 0)
+    {
+        cp_error();
+    }
+}
+
+static void do_cpt_n_rvecs_err(XDR *xd,const char *desc,int n, rvec f[],FILE *list)
+{
+    int i,j;
+
+    for (i=0; i<n; i++)
+    {
+        for (j=0; j<DIM; j++)
+        {
+            do_cpt_real_err(xd, desc, &f[i][j]);
+        }
+    }
+
+    if (list)
+    {
+        pr_rvecs(list,0,desc,f,n);
+    }
+}
+
 /* If nval >= 0, nval is used; on read this should match the passed value.
  * If nval n<0, *nptr is used; on read the value is stored in nptr
  */
@@ -771,6 +804,7 @@ static void do_cpt_header(XDR *xd,gmx_bool bRead,int *file_version,
                           int *natoms,int *ngtc, int *nnhpres, int *nhchainlength,
                           int *nlambda, int *flags_state,
                           int *flags_eks,int *flags_enh, int *flags_dfh,
+                          int *nED,
                           FILE *list)
 {
     bool_t res=0;
@@ -909,6 +943,15 @@ static void do_cpt_header(XDR *xd,gmx_bool bRead,int *file_version,
     } else {
         *flags_dfh = 0;
     }
+
+    if (*file_version >= 15)
+    {
+        do_cpt_int_err(xd,"ED data sets",nED,list);
+    }
+    else
+    {
+        *nED = 0;
+    }
 }
 
 static int do_cpt_footer(XDR *xd,gmx_bool bRead,int file_version)
@@ -1179,6 +1222,71 @@ static int do_cpt_df_hist(XDR *xd,gmx_bool bRead,int fflags,df_history_t *dfhist
     return ret;
 }
 
+
+/* This function stores the last whole configuration of the reference and
+ * average structure in the .cpt file
+ */
+static int do_cpt_EDstate(XDR *xd,gmx_bool bRead,
+        edsamstate_t *EDstate, FILE *list)
+{
+    int i,j;
+    int ret=0;
+    char buf[STRLEN];
+
+
+    EDstate->bFromCpt = bRead;
+
+    if (EDstate->nED <= 0)
+    {
+        return ret;
+    }
+
+    /* When reading, init_edsam has not been called yet,
+     * so we have to allocate memory first. */
+    if (bRead)
+    {
+        snew(EDstate->nref    , EDstate->nED);
+        snew(EDstate->old_sref, EDstate->nED);
+        snew(EDstate->nav     , EDstate->nED);
+        snew(EDstate->old_sav , EDstate->nED);
+    }
+
+    /* Read/write the last whole conformation of SREF and SAV for each ED dataset (usually only one) */
+    for (i=0; i< EDstate->nED; i++)
+    {
+        /* Reference structure SREF */
+        sprintf(buf, "ED%d # of atoms in reference structure", i+1);
+        do_cpt_int_err(xd, buf, &EDstate->nref[i],list);
+        sprintf(buf, "ED%d x_ref", i+1);
+        if (bRead)
+        {
+            snew(EDstate->old_sref[i], EDstate->nref[i]);
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref[i], list);
+        }
+        else
+        {
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref_p[i], list);
+        }
+
+        /* Average structure SAV */
+        sprintf(buf, "ED%d # of atoms in average structure", i+1);
+        do_cpt_int_err(xd, buf, &EDstate->nav[i] ,list);
+        sprintf(buf, "ED%d x_av", i+1);
+        if (bRead)
+        {
+            snew(EDstate->old_sav[i], EDstate->nav[i]);
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav[i], list);
+        }
+        else
+        {
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav_p[i], list);
+        }
+    }
+
+    return ret;
+}
+
+
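Note the read/write asymmetry in do_cpt_EDstate: on read it allocates old_sref/old_sav with snew and fills them from the file, while on write it serializes through the old_sref_p/old_sav_p pointers, which presumably point at the live coordinates held by the ED module, so no copy is needed on the write path.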
 static int do_cpt_files(XDR *xd, gmx_bool bRead, 
                         gmx_file_position_t **p_outputfiles, int *nfiles, 
                         FILE *list, int file_version)
@@ -1418,6 +1526,7 @@ void write_checkpoint(const char *fn,gmx_bool bNumberAndKeep,
                   DOMAINDECOMP(cr) ? cr->dd->nc : NULL,&npmenodes,
                   &state->natoms,&state->ngtc,&state->nnhpres,
                   &state->nhchainlength,&(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+                  &state->edsamstate.nED,
                   NULL);
     
     sfree(version);
@@ -1430,6 +1539,7 @@ void write_checkpoint(const char *fn,gmx_bool bNumberAndKeep,
        (do_cpt_ekinstate(gmx_fio_getxdr(fp),FALSE,flags_eks,&state->ekinstate,NULL) < 0)||
        (do_cpt_enerhist(gmx_fio_getxdr(fp),FALSE,flags_enh,&state->enerhist,NULL) < 0)  ||
        (do_cpt_df_hist(gmx_fio_getxdr(fp),FALSE,flags_dfh,&state->dfhist,NULL) < 0)  ||
+       (do_cpt_EDstate(gmx_fio_getxdr(fp),FALSE,&state->edsamstate,NULL) < 0)      ||
        (do_cpt_files(gmx_fio_getxdr(fp),FALSE,&outputfiles,&noutputfiles,NULL,
                      file_version) < 0))
     {
@@ -1673,7 +1783,8 @@ static void read_checkpoint(const char *fn,FILE **pfplog,
                   &eIntegrator_f,simulation_part,step,t,
                   &nppnodes_f,dd_nc_f,&npmenodes_f,
                   &natoms,&ngtc,&nnhpres,&nhchainlength,&nlambda,
-                  &fflags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+                  &fflags,&flags_eks,&flags_enh,&flags_dfh,
+                  &state->edsamstate.nED,NULL);
 
     if (bAppendOutputFiles &&
         file_version >= 13 && double_prec != GMX_CPT_BUILD_DP)
@@ -1862,6 +1973,12 @@ static void read_checkpoint(const char *fn,FILE **pfplog,
         cp_error();
     }
 
+    ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+    if (ret)
+    {
+        cp_error();
+    }
+
     if (file_version < 6)
     {
         const char *warn="Reading checkpoint file in old format, assuming that the run that generated this file started at step 0; if this is not the case, the averages stored in the energy file will be incorrect.";
@@ -2098,7 +2215,8 @@ static void read_checkpoint_data(t_fileio *fp,int *simulation_part,
                   &version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
                   &eIntegrator,simulation_part,step,t,&nppnodes,dd_nc,&npme,
                   &state->natoms,&state->ngtc,&state->nnhpres,&state->nhchainlength,
-                  &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+                  &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+                  &state->edsamstate.nED,NULL);
     ret =
         do_cpt_state(gmx_fio_getxdr(fp),TRUE,state->flags,state,bReadRNG,NULL);
     if (ret)
@@ -2124,6 +2242,12 @@ static void read_checkpoint_data(t_fileio *fp,int *simulation_part,
         cp_error();
     }
 
+    ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+    if (ret)
+    {
+        cp_error();
+    }
+
     ret = do_cpt_files(gmx_fio_getxdr(fp),TRUE,
                        outputfiles != NULL ? outputfiles : &files_loc,
                        outputfiles != NULL ? nfiles : &nfiles_loc,
@@ -2234,7 +2358,7 @@ void list_checkpoint(const char *fn,FILE *out)
                   &eIntegrator,&simulation_part,&step,&t,&nppnodes,dd_nc,&npme,
                   &state.natoms,&state.ngtc,&state.nnhpres,&state.nhchainlength,
                   &(state.dfhist.nlambda),&state.flags,
-                  &flags_eks,&flags_enh,&flags_dfh,out);
+                  &flags_eks,&flags_enh,&flags_dfh,&state.edsamstate.nED,out);
     ret = do_cpt_state(gmx_fio_getxdr(fp),TRUE,state.flags,&state,TRUE,out);
     if (ret)
     {
@@ -2255,6 +2379,12 @@ void list_checkpoint(const char *fn,FILE *out)
         ret = do_cpt_df_hist(gmx_fio_getxdr(fp),TRUE,
                              flags_dfh,&state.dfhist,out);
     }
+
+    if (ret == 0)
+    {
+        ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state.edsamstate,out);
+    }
+
     if (ret == 0)
     {
                do_cpt_files(gmx_fio_getxdr(fp),TRUE,&outputfiles,&nfiles,out,file_version);
index 5354ae8ca1b287b3427d450f1be582896ff094ed..a1187f7978ae98aca74ca5dfc9257211216b917a 100644 (file)
@@ -126,11 +126,11 @@ gmx_bool be_cool(void)
   * but we don't call this routine often, and it avoids using
    * a mutex for locking the variable...
    */
-#if defined(GMX_FAHCORE) || defined(GMX_NO_QUOTES)
+#ifdef GMX_COOL_QUOTES
+  return (getenv("GMX_NO_QUOTES") == NULL);
+#else
   /*be uncool*/
   return FALSE;
-#else
-  return (getenv("GMX_NO_QUOTES") == NULL);
 #endif
 }
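The sense of the quotes flag is inverted here: quotes must now be enabled at configure time via GMX_COOL_QUOTES, and a user can still silence an enabled build at run time by setting the GMX_NO_QUOTES environment variable. This also removes the need for a special branch for FAH, since a GMX_FAHCORE build can simply leave GMX_COOL_QUOTES undefined.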
 
@@ -656,6 +656,7 @@ void gmx_print_version_info(FILE *fp)
 #else
     fprintf(fp, "Precision:          single\n");
 #endif
+    fprintf(fp, "Memory model:       %lu bit\n",8*sizeof(void *));
 
 #ifdef GMX_THREAD_MPI
     fprintf(fp, "MPI library:        thread_mpi\n");
index 79ac2e206411baadc62810bb625832f05ac39438..e2d6f79a7456be1e7473b59f9083153363a341f9 100644 (file)
@@ -112,7 +112,7 @@ const t_interaction_function interaction_function[F_NRE]=
   def_bonded  ("RBDIHS",   "Ryckaert-Bell.",  4, 6, 6,  eNR_RB, rbdihs            ),
   def_bonded  ("FOURDIHS", "Fourier Dih.",    4, 4, 4,  eNR_FOURDIH, rbdihs       ),
   def_bonded  ("IDIHS",    "Improper Dih.",   4, 2, 2,  eNR_IMPROPER,idihs        ),
-  def_bonded  ("PIDIHS",   "Improper Dih.",   4, 3, 3,  eNR_PROPER, pdihs         ),
+  def_bonded  ("PIDIHS",   "Improper Dih.",   4, 3, 3,  eNR_IMPROPER, pdihs       ),
   def_bondedt ("TABDIHS", "Tab. Dih.",        4, 2, 2,  eNR_TABDIHS, tab_dihs     ),
   def_bonded  ("CMAP",  "CMAP Dih.",          5, -1, -1,  eNR_CMAP,   unimplemented ),
   def_bonded  ("GB12",     "GB 1-2 Pol.",     2, 4, 0,  eNR_GB,     unimplemented ),
index 3b4227907e411d21ee86beb2203b98e916dc8e9b..0bc0b0d3cc71aff5d945b0454116d8bc5628e510 100644 (file)
@@ -6,7 +6,7 @@ Description: Gromacs default lib
 URL: http://www.gromacs.org
 Version: @PROJECT_VERSION@
 Requires:
-Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@
+Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
 Libs: -L${libdir} -lgmx@GMX_LIBS_SUFFIX@ -lm
 Cflags: -I${includedir} @PKG_CFLAGS@
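Appending @OpenMP_LINKER_FLAGS@ to Libs.private means that static consumers (pkg-config --static --libs libgmx) also pick up the OpenMP runtime flags; dynamic links are unaffected, since pkg-config only emits Libs.private when --static is given.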
 
index b86c3eee5715d688527f130c666757938c52d5e2..0f076850006660da3ff8fb52aacdaac864c8bbc0 100644 (file)
@@ -9,16 +9,16 @@
  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  * a full list of developers and information, check out http://www.gromacs.org
  *
- * This program is free software; you can redistribute it and/or modify it under 
- * the terms of the GNU Lesser General Public License as published by the Free 
- * Software Foundation; either version 2 of the License, or (at your option) any 
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
  * later version.
  * As a special exception, you may use this file as part of a free software
  * library without restriction.  Specifically, if other files instantiate
  * templates or use macros or inline functions from this file, or you compile
  * this file and link it with other files to produce an executable, this
  * file does not by itself cause the resulting executable to be covered by
- * the GNU Lesser General Public License.  
+ * the GNU Lesser General Public License.
  *
  * In plain-speak: do not worry about classes/macros/templates either - only
  * changes to the library have to be LGPL, not an application linking with it.
@@ -73,16 +73,16 @@ gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                               __m128d xmm1)
 {
     __m128d t2;
-    
+
     t2       = _mm_unpackhi_pd(xmm1,xmm1);
-    _mm_store_sd(ptrA,xmm1);                                           
-    _mm_store_sd(ptrB,t2);                                         
+    _mm_store_sd(ptrA,xmm1);
+    _mm_store_sd(ptrB,t2);
 }
 
 static void
 gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 {
-    _mm_store_sd(ptrA,xmm1);                                        
+    _mm_store_sd(ptrA,xmm1);
 }
 
 
@@ -92,7 +92,7 @@ gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                   double * gmx_restrict ptrB, __m128d xmm1)
 {
     __m128d t1;
-    
+
     t1   = _mm_unpackhi_pd(xmm1,xmm1);
     xmm1 = _mm_add_sd(xmm1,_mm_load_sd(ptrA));
     t1   = _mm_add_sd(t1,_mm_load_sd(ptrB));
@@ -104,7 +104,7 @@ static void
 gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 {
     __m128d tmp;
-    
+
     tmp = gmx_mm_load_1real_pd(ptrA);
     tmp = _mm_add_sd(tmp,xmm1);
     gmx_mm_store_1real_pd(ptrA,tmp);
@@ -119,12 +119,12 @@ gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                              __m128d * gmx_restrict c12)
 {
     __m128d t1,t2,t3;
-    
+
     /* The c6/c12 array need not be aligned, hence the unaligned loads */
     t1   = _mm_loadu_pd(p1);
     t2   = _mm_loadu_pd(p2);
-    *c6  = _mm_unpacklo_pd(t1,t2);  
-    *c12 = _mm_unpackhi_pd(t1,t2);                    
+    *c6  = _mm_unpacklo_pd(t1,t2);
+    *c12 = _mm_unpackhi_pd(t1,t2);
 }
 
 static gmx_inline void
@@ -139,21 +139,21 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz;
-    
+
     mem_xy  = _mm_loadu_pd(xyz);
     mem_z   = _mm_load_sd(xyz+2);
     mem_sxy = _mm_loadu_pd(xyz_shift);
     mem_sz  = _mm_load_sd(xyz_shift+2);
-    
+
     mem_xy  = _mm_add_pd(mem_xy,mem_sxy);
     mem_z   = _mm_add_pd(mem_z,mem_sz);
-    
+
     *x1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
     *y1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
     *z1  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
@@ -162,30 +162,30 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
-    
+
     t1  = _mm_loadu_pd(xyz);
     t2  = _mm_loadu_pd(xyz+2);
     t3  = _mm_loadu_pd(xyz+4);
     t4  = _mm_loadu_pd(xyz+6);
     t5  = _mm_load_sd(xyz+8);
-    
+
     sxy = _mm_loadu_pd(xyz_shift);
     sz  = _mm_load_sd(xyz_shift+2);
     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-    
+
     t1  = _mm_add_pd(t1,sxy);
     t2  = _mm_add_pd(t2,szx);
     t3  = _mm_add_pd(t3,syz);
     t4  = _mm_add_pd(t4,sxy);
     t5  = _mm_add_sd(t5,sz);
-    
+
     *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
@@ -200,33 +200,33 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
-    
+
     t1  = _mm_loadu_pd(xyz);
     t2  = _mm_loadu_pd(xyz+2);
     t3  = _mm_loadu_pd(xyz+4);
     t4  = _mm_loadu_pd(xyz+6);
     t5  = _mm_loadu_pd(xyz+8);
     t6  = _mm_loadu_pd(xyz+10);
-    
+
     sxy = _mm_loadu_pd(xyz_shift);
     sz  = _mm_load_sd(xyz_shift+2);
     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-    
+
     t1  = _mm_add_pd(t1,sxy);
     t2  = _mm_add_pd(t2,szx);
     t3  = _mm_add_pd(t3,syz);
     t4  = _mm_add_pd(t4,sxy);
     t5  = _mm_add_pd(t5,szx);
     t6  = _mm_add_pd(t6,syz);
-    
+
     *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
@@ -247,9 +247,9 @@ static gmx_inline void
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
 }
 
 static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
 }
 
 static gmx_inline void
@@ -313,7 +313,7 @@ gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     t1           = _mm_loadu_pd(ptrA);
     t2           = _mm_loadu_pd(ptrB);
     t3           = _mm_loadu_pd(ptrA+2);
@@ -382,106 +382,16 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 
 
 /* Routines to decrement rvec in memory, typically used for j particle force updates */
-static void
-gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
-{
-    __m128d t1,t2;
-    
-    t1 = _mm_loadu_pd(ptrA);
-    t2 = _mm_load_sd(ptrA+2);
-    
-    t1 = _mm_sub_pd(t1,xy);
-    t2 = _mm_sub_sd(t2,z);
-    
-    _mm_storeu_pd(ptrA,t1);
-    _mm_store_sd(ptrA+2,t2);
-}
-
-
-static void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-    
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-    
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-    
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-    
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-    
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-    
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-    
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
-
-
 static void
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1)
 {
     __m128d t1,t2,t3;
-    
+
     t1           = _mm_load_sd(ptrA);
     t2           = _mm_load_sd(ptrA+1);
     t3           = _mm_load_sd(ptrA+2);
-    
+
     t1           = _mm_sub_sd(t1,x1);
     t2           = _mm_sub_sd(t2,y1);
     t3           = _mm_sub_sd(t3,z1);
@@ -491,26 +401,53 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_t1          = _mm_sub_pd(_t1,_x1);\
+_t2          = _mm_sub_pd(_t2,_z1);\
+_t3          = _mm_sub_pd(_t3,_y2);\
+_t4          = _mm_sub_pd(_t4,_x3);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
-                                       __m128d x3, __m128d y3, __m128d z3) 
+                                       __m128d x3, __m128d y3, __m128d z3)
 {
     __m128d t1,t2,t3,t4,t5;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
     t4          = _mm_loadu_pd(ptrA+6);
     t5          = _mm_load_sd(ptrA+8);
-    
+
     x1          = _mm_unpacklo_pd(x1,y1);
     z1          = _mm_unpacklo_pd(z1,x2);
     y2          = _mm_unpacklo_pd(y2,z2);
     x3          = _mm_unpacklo_pd(x3,y3);
     /* nothing to be done for z3 */
-    
+
     t1          = _mm_sub_pd(t1,x1);
     t2          = _mm_sub_pd(t2,z1);
     t3          = _mm_sub_pd(t3,y2);
@@ -522,31 +459,58 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6,t4);
     _mm_store_sd(ptrA+8,t5);
 }
-
-
+#endif
+
+
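Every #if defined (_MSC_VER) && defined(_M_IX86) branch in this header applies the same transformation: 32-bit MSVC refuses to pass more than three __m128/__m128d arguments by value (the by-value copies would require a 16-byte stack alignment it cannot guarantee), so each affected inline function gets a statement-macro twin whose arguments are substituted textually. The pattern in miniature, with a hypothetical helper rather than one of the kernels:

    #if defined (_MSC_VER) && defined(_M_IX86)
    /* Macro twin: nothing is passed on the stack, so the xmm parameter
       limit never applies; locals are underscore-prefixed to avoid
       capturing names from the caller. */
    #define vec4_accumulate(ptr,_a,_b,_c,_d) \
    {\
        __m128d _s = _mm_add_pd(_mm_add_pd(_a,_b),_mm_add_pd(_c,_d));\
        _mm_storeu_pd((ptr),_mm_add_pd(_mm_loadu_pd(ptr),_s));\
    }
    #else
    /* Real function for sane compilers */
    static void
    vec4_accumulate(double * gmx_restrict ptr,
                    __m128d a, __m128d b, __m128d c, __m128d d)
    {
        __m128d s = _mm_add_pd(_mm_add_pd(a,b),_mm_add_pd(c,d));
        _mm_storeu_pd(ptr,_mm_add_pd(_mm_loadu_pd(ptr),s));
    }
    #endif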
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_z3          = _mm_unpacklo_pd(_z3,_x4);\
+_y4          = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
                                        __m128d x3, __m128d y3, __m128d z3,
-                                       __m128d x4, __m128d y4, __m128d z4) 
+                                       __m128d x4, __m128d y4, __m128d z4)
 {
     __m128d t1,t2,t3,t4,t5,t6;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
     t4          = _mm_loadu_pd(ptrA+6);
     t5          = _mm_loadu_pd(ptrA+8);
     t6          = _mm_loadu_pd(ptrA+10);
-    
+
     x1          = _mm_unpacklo_pd(x1,y1);
     z1          = _mm_unpacklo_pd(z1,x2);
     y2          = _mm_unpacklo_pd(y2,z2);
     x3          = _mm_unpacklo_pd(x3,y3);
     z3          = _mm_unpacklo_pd(z3,x4);
     y4          = _mm_unpacklo_pd(y4,z4);
-    
+
     _mm_storeu_pd(ptrA,    _mm_sub_pd( t1,x1 ));
     _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2,z1 ));
     _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3,y2 ));
@@ -554,28 +518,30 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
 }
+#endif
+
 
 static void
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1)
 {
     __m128d t1,t2,t3,t4,t5,t6,t7;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_load_sd(ptrA+2);
     t3          = _mm_loadu_pd(ptrB);
     t4          = _mm_load_sd(ptrB+2);
-    
+
     t5          = _mm_unpacklo_pd(x1,y1);
     t6          = _mm_unpackhi_pd(x1,y1);
     t7          = _mm_unpackhi_pd(z1,z1);
-    
+
     t1          = _mm_sub_pd(t1,t5);
     t2          = _mm_sub_sd(t2,z1);
-    
+
     t3          = _mm_sub_pd(t3,t6);
     t4          = _mm_sub_sd(t4,t7);
-    
+
     _mm_storeu_pd(ptrA,t1);
     _mm_store_sd(ptrA+2,t2);
     _mm_storeu_pd(ptrB,t3);
@@ -583,15 +549,63 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrB);\
+_t7          = _mm_loadu_pd(ptrB+2);\
+_t8          = _mm_loadu_pd(ptrB+4);\
+_t9          = _mm_loadu_pd(ptrB+6);\
+_t10         = _mm_load_sd(ptrB+8);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpackhi_pd(_z3,_z3);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_t6          = _mm_sub_pd(_t6,_tB);\
+_t7          = _mm_sub_pd(_t7,_tD);\
+_t8          = _mm_sub_pd(_t8,_tF);\
+_t9          = _mm_sub_pd(_t9,_tH);\
+_t10         = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
-                                       __m128d x3, __m128d y3, __m128d z3) 
+                                       __m128d x3, __m128d y3, __m128d z3)
 {
     __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
@@ -602,7 +616,7 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     t8          = _mm_loadu_pd(ptrB+4);
     t9          = _mm_loadu_pd(ptrB+6);
     t10         = _mm_load_sd(ptrB+8);
-    
+
     tA          = _mm_unpacklo_pd(x1,y1);
     tB          = _mm_unpackhi_pd(x1,y1);
     tC          = _mm_unpacklo_pd(z1,x2);
@@ -612,19 +626,19 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     tG          = _mm_unpacklo_pd(x3,y3);
     tH          = _mm_unpackhi_pd(x3,y3);
     tI          = _mm_unpackhi_pd(z3,z3);
-    
+
     t1          = _mm_sub_pd(t1,tA);
     t2          = _mm_sub_pd(t2,tC);
     t3          = _mm_sub_pd(t3,tE);
     t4          = _mm_sub_pd(t4,tG);
     t5          = _mm_sub_sd(t5,z3);
-    
+
     t6          = _mm_sub_pd(t6,tB);
     t7          = _mm_sub_pd(t7,tD);
     t8          = _mm_sub_pd(t8,tF);
     t9          = _mm_sub_pd(t9,tH);
     t10         = _mm_sub_sd(t10,tI);
-    
+
     _mm_storeu_pd(ptrA,t1);
     _mm_storeu_pd(ptrA+2,t2);
     _mm_storeu_pd(ptrA+4,t3);
@@ -636,18 +650,76 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6,t9);
     _mm_store_sd(ptrB+8,t10);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_t7          = _mm_loadu_pd(ptrB);\
+_t8          = _mm_loadu_pd(ptrB+2);\
+_t9          = _mm_loadu_pd(ptrB+4);\
+_t10         = _mm_loadu_pd(ptrB+6);\
+_t11         = _mm_loadu_pd(ptrB+8);\
+_t12         = _mm_loadu_pd(ptrB+10);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpacklo_pd(_z3,_x4);\
+_tJ          = _mm_unpackhi_pd(_z3,_x4);\
+_tK          = _mm_unpacklo_pd(_y4,_z4);\
+_tL          = _mm_unpackhi_pd(_y4,_z4);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_pd(_t5,_tI);\
+_t6          = _mm_sub_pd(_t6,_tK);\
+_t7          = _mm_sub_pd(_t7,_tB);\
+_t8          = _mm_sub_pd(_t8,_tD);\
+_t9          = _mm_sub_pd(_t9,_tF);\
+_t10         = _mm_sub_pd(_t10,_tH);\
+_t11         = _mm_sub_pd(_t11,_tJ);\
+_t12         = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
                                        __m128d x3, __m128d y3, __m128d z3,
-                                       __m128d x4, __m128d y4, __m128d z4) 
+                                       __m128d x4, __m128d y4, __m128d z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
     __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
@@ -660,7 +732,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     t10         = _mm_loadu_pd(ptrB+6);
     t11         = _mm_loadu_pd(ptrB+8);
     t12         = _mm_loadu_pd(ptrB+10);
-    
+
     tA          = _mm_unpacklo_pd(x1,y1);
     tB          = _mm_unpackhi_pd(x1,y1);
     tC          = _mm_unpacklo_pd(z1,x2);
@@ -673,21 +745,21 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     tJ          = _mm_unpackhi_pd(z3,x4);
     tK          = _mm_unpacklo_pd(y4,z4);
     tL          = _mm_unpackhi_pd(y4,z4);
-    
+
     t1          = _mm_sub_pd(t1,tA);
     t2          = _mm_sub_pd(t2,tC);
     t3          = _mm_sub_pd(t3,tE);
     t4          = _mm_sub_pd(t4,tG);
     t5          = _mm_sub_pd(t5,tI);
     t6          = _mm_sub_pd(t6,tK);
-    
+
     t7          = _mm_sub_pd(t7,tB);
     t8          = _mm_sub_pd(t8,tD);
     t9          = _mm_sub_pd(t9,tF);
     t10         = _mm_sub_pd(t10,tH);
     t11         = _mm_sub_pd(t11,tJ);
     t12         = _mm_sub_pd(t12,tL);
-    
+
     _mm_storeu_pd(ptrA,  t1);
     _mm_storeu_pd(ptrA+2,t2);
     _mm_storeu_pd(ptrA+4,t3);
@@ -701,7 +773,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8,t11);
     _mm_storeu_pd(ptrB+10,t12);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -711,14 +783,41 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
 {
     fix1 = _mm_hadd_pd(fix1,fiy1);
     fiz1 = _mm_hadd_pd(fiz1,fiz1);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
     _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));
-    
+
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+_mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+fix1 = _mm_add_pd(fix1,fix3);\
+_t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+fiz1 = _mm_add_sd(fiz1,_t2);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -727,32 +826,63 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       double * gmx_restrict fshiftptr)
 {
     __m128d t1,t2;
-    
+
     fix1 = _mm_hadd_pd(fix1,fiy1);
     fiz1 = _mm_hadd_pd(fiz1,fix2);
     fiy2 = _mm_hadd_pd(fiy2,fiz2);
     fix3 = _mm_hadd_pd(fix3,fiy3);
     fiz3 = _mm_hadd_pd(fiz3,fiz3);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
     _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
     _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
     _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
     _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));
-    
+
     fix1 = _mm_add_pd(fix1,fix3);
     t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
     fix1 = _mm_add_pd(fix1,t1); /* x and y sums */
-    
+
     t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));
     fiz1 = _mm_add_sd(fiz1,fiz3);
     fiz1 = _mm_add_sd(fiz1,t2); /* z sum */
-    
+
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fix4);\
+fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+_mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+_mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+_t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+fix3 = _mm_add_pd(fix3,_t2);\
+fix1 = _mm_add_pd(fix1,fix3);\
+fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -762,35 +892,35 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       double * gmx_restrict fshiftptr)
 {
     __m128d t1,t2;
-    
+
     fix1 = _mm_hadd_pd(fix1,fiy1);
     fiz1 = _mm_hadd_pd(fiz1,fix2);
     fiy2 = _mm_hadd_pd(fiy2,fiz2);
     fix3 = _mm_hadd_pd(fix3,fiy3);
     fiz3 = _mm_hadd_pd(fiz3,fix4);
     fiy4 = _mm_hadd_pd(fiy4,fiz4);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
     _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
     _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
     _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));
     _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));
     _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));
-    
+
     t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
     fix1 = _mm_add_pd(fix1,t1);
     t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));
     fix3 = _mm_add_pd(fix3,t2);
     fix1 = _mm_add_pd(fix1,fix3); /* x and y sums */
-    
+
     fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));
     fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));
     fiz1 = _mm_add_sd(fiz1,fiz3); /* z sum */
-    
+
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
+#endif
 
 
 static gmx_inline void
@@ -806,7 +936,7 @@ gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
 {
     pot1 = _mm_hadd_pd(pot1,pot2);
     pot2 = _mm_unpackhi_pd(pot1,pot1);
-    
+
     _mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA)));
     _mm_store_sd(ptrB,_mm_add_sd(pot2,_mm_load_sd(ptrB)));
 }
index 8fe321d85cb9078763daf769b05c822a24afc382..7b663ed73392fb6e8463ed9e6dfb6829a975db6e 100644 (file)
@@ -120,10 +120,10 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
 
@@ -142,10 +142,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
@@ -180,11 +180,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
@@ -227,10 +227,10 @@ gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float *
 {
     __m128 t1,t2,t3,t4;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
-    t1             = _mm_maskload_ps(ptrA,mask);
-    t2             = _mm_maskload_ps(ptrB,mask);
-    t3             = _mm_maskload_ps(ptrC,mask);
-    t4             = _mm_maskload_ps(ptrD,mask);
+    t1             = gmx_mm_maskload_ps(ptrA,mask);
+    t2             = gmx_mm_maskload_ps(ptrB,mask);
+    t3             = gmx_mm_maskload_ps(ptrC,mask);
+    t4             = gmx_mm_maskload_ps(ptrD,mask);
     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
     *x1           = t1;
     *y1           = t2;
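The switch to gmx_mm_maskload_ps works around a compiler quirk: some gcc versions declare the mask argument of _mm_maskload_ps as __m128 instead of the __m128i that Intel's AVX specification uses, so the call has to be wrapped, presumably selected by a configure-time probe of the compiler's prototype. A sketch of what such a shim looks like (the flag name and body are assumptions, not taken from this diff):

    #ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
    #    define gmx_mm_maskload_ps(mem,mask)  _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
    #else
    #    define gmx_mm_maskload_ps(mem,mask)  _mm_maskload_ps((mem),(mask))
    #endif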
@@ -348,6 +348,72 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+    __m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+    _t13         = _mm_unpackhi_ps(_x1,_y1);\
+    _x1          = _mm_unpacklo_ps(_x1,_y1);\
+    _t14         = _mm_unpackhi_ps(_z1,_x2);\
+    _z1          = _mm_unpacklo_ps(_z1,_x2);\
+    _t15         = _mm_unpackhi_ps(_y2,_z2);\
+    _y2          = _mm_unpacklo_ps(_y2,_z2);\
+    _t16         = _mm_unpackhi_ps(_x3,_y3);\
+    _x3          = _mm_unpacklo_ps(_x3,_y3);\
+    _t17         = _mm_permute_ps(_z3,_MM_SHUFFLE(0,0,0,1));\
+    _t18         = _mm_movehl_ps(_z3,_z3);\
+    _t19         = _mm_permute_ps(_t18,_MM_SHUFFLE(0,0,0,1));\
+    _t20         = _mm_movelh_ps(_x1,_z1);\
+    _t21         = _mm_movehl_ps(_z1,_x1);\
+    _t22         = _mm_movelh_ps(_t13,_t14);\
+    _t14         = _mm_movehl_ps(_t14,_t13);\
+    _t23         = _mm_movelh_ps(_y2,_x3);\
+    _t24         = _mm_movehl_ps(_x3,_y2);\
+    _t25         = _mm_movelh_ps(_t15,_t16);\
+    _t16         = _mm_movehl_ps(_t16,_t15);\
+    _t1          = _mm_loadu_ps(ptrA);\
+    _t2          = _mm_loadu_ps(ptrA+4);\
+    _t3          = _mm_load_ss(ptrA+8);\
+    _t1          = _mm_sub_ps(_t1,_t20);\
+    _t2          = _mm_sub_ps(_t2,_t23);\
+    _t3          = _mm_sub_ss(_t3,_z3);\
+    _mm_storeu_ps(ptrA,_t1);\
+    _mm_storeu_ps(ptrA+4,_t2);\
+    _mm_store_ss(ptrA+8,_t3);\
+    _t4          = _mm_loadu_ps(ptrB);\
+    _t5          = _mm_loadu_ps(ptrB+4);\
+    _t6          = _mm_load_ss(ptrB+8);\
+    _t4          = _mm_sub_ps(_t4,_t21);\
+    _t5          = _mm_sub_ps(_t5,_t24);\
+    _t6          = _mm_sub_ss(_t6,_t17);\
+    _mm_storeu_ps(ptrB,_t4);\
+    _mm_storeu_ps(ptrB+4,_t5);\
+    _mm_store_ss(ptrB+8,_t6);\
+    _t7          = _mm_loadu_ps(ptrC);\
+    _t8          = _mm_loadu_ps(ptrC+4);\
+    _t9          = _mm_load_ss(ptrC+8);\
+    _t7          = _mm_sub_ps(_t7,_t22);\
+    _t8          = _mm_sub_ps(_t8,_t25);\
+    _t9          = _mm_sub_ss(_t9,_t18);\
+    _mm_storeu_ps(ptrC,_t7);\
+    _mm_storeu_ps(ptrC+4,_t8);\
+    _mm_store_ss(ptrC+8,_t9);\
+    _t10         = _mm_loadu_ps(ptrD);\
+    _t11         = _mm_loadu_ps(ptrD+4);\
+    _t12         = _mm_load_ss(ptrD+8);\
+    _t10         = _mm_sub_ps(_t10,_t14);\
+    _t11         = _mm_sub_ps(_t11,_t16);\
+    _t12         = _mm_sub_ss(_t12,_t19);\
+    _mm_storeu_ps(ptrD,_t10);\
+    _mm_storeu_ps(ptrD+4,_t11);\
+    _mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
@@ -414,8 +480,79 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_store_ss(ptrD+8,t12);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+    __m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+    __m128 _t23,_t24;\
+    _t13         = _mm_unpackhi_ps(_x1,_y1);\
+    _x1          = _mm_unpacklo_ps(_x1,_y1);\
+    _t14         = _mm_unpackhi_ps(_z1,_x2);\
+    _z1          = _mm_unpacklo_ps(_z1,_x2);\
+    _t15         = _mm_unpackhi_ps(_y2,_z2);\
+    _y2          = _mm_unpacklo_ps(_y2,_z2);\
+    _t16         = _mm_unpackhi_ps(_x3,_y3);\
+    _x3          = _mm_unpacklo_ps(_x3,_y3);\
+    _t17         = _mm_unpackhi_ps(_z3,_x4);\
+    _z3          = _mm_unpacklo_ps(_z3,_x4);\
+    _t18         = _mm_unpackhi_ps(_y4,_z4);\
+    _y4          = _mm_unpacklo_ps(_y4,_z4);\
+    _t19         = _mm_movelh_ps(_x1,_z1);\
+    _z1          = _mm_movehl_ps(_z1,_x1);\
+    _t20         = _mm_movelh_ps(_t13,_t14);\
+    _t14         = _mm_movehl_ps(_t14,_t13);\
+    _t21         = _mm_movelh_ps(_y2,_x3);\
+    _x3          = _mm_movehl_ps(_x3,_y2);\
+    _t22         = _mm_movelh_ps(_t15,_t16);\
+    _t16         = _mm_movehl_ps(_t16,_t15);\
+    _t23         = _mm_movelh_ps(_z3,_y4);\
+    _y4          = _mm_movehl_ps(_y4,_z3);\
+    _t24         = _mm_movelh_ps(_t17,_t18);\
+    _t18         = _mm_movehl_ps(_t18,_t17);\
+    _t1          = _mm_loadu_ps(ptrA);\
+    _t2          = _mm_loadu_ps(ptrA+4);\
+    _t3          = _mm_loadu_ps(ptrA+8);\
+    _t1          = _mm_sub_ps(_t1,_t19);\
+    _t2          = _mm_sub_ps(_t2,_t21);\
+    _t3          = _mm_sub_ps(_t3,_t23);\
+    _mm_storeu_ps(ptrA,_t1);\
+    _mm_storeu_ps(ptrA+4,_t2);\
+    _mm_storeu_ps(ptrA+8,_t3);\
+    _t4          = _mm_loadu_ps(ptrB);\
+    _t5          = _mm_loadu_ps(ptrB+4);\
+    _t6          = _mm_loadu_ps(ptrB+8);\
+    _t4          = _mm_sub_ps(_t4,_z1);\
+    _t5          = _mm_sub_ps(_t5,_x3);\
+    _t6          = _mm_sub_ps(_t6,_y4);\
+    _mm_storeu_ps(ptrB,_t4);\
+    _mm_storeu_ps(ptrB+4,_t5);\
+    _mm_storeu_ps(ptrB+8,_t6);\
+    _t7          = _mm_loadu_ps(ptrC);\
+    _t8          = _mm_loadu_ps(ptrC+4);\
+    _t9          = _mm_loadu_ps(ptrC+8);\
+    _t7          = _mm_sub_ps(_t7,_t20);\
+    _t8          = _mm_sub_ps(_t8,_t22);\
+    _t9          = _mm_sub_ps(_t9,_t24);\
+    _mm_storeu_ps(ptrC,_t7);\
+    _mm_storeu_ps(ptrC+4,_t8);\
+    _mm_storeu_ps(ptrC+8,_t9);\
+    _t10         = _mm_loadu_ps(ptrD);\
+    _t11         = _mm_loadu_ps(ptrD+4);\
+    _t12         = _mm_loadu_ps(ptrD+8);\
+    _t10         = _mm_sub_ps(_t10,_t14);\
+    _t11         = _mm_sub_ps(_t11,_t16);\
+    _t12         = _mm_sub_ps(_t12,_t18);\
+    _mm_storeu_ps(ptrD,_t10);\
+    _mm_storeu_ps(ptrD+4,_t11);\
+    _mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
@@ -488,7 +625,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_storeu_ps(ptrD+8,t12);
 }
-
+#endif
 
 static gmx_inline void
 gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
@@ -516,6 +653,38 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4;\
+\
+    fix1 = _mm_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+    fix1 = _mm_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+    _t4 = _mm_load_ss(fshiftptr+2);\
+    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+    _t3 = _mm_permute_ps(_t3  ,_MM_SHUFFLE(1,2,0,0));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _mm_store_ss(fshiftptr+2,_t1);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -554,8 +723,43 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t1);
     _mm_storeh_pi((__m64 *)(fshiftptr),t1);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5;\
+\
+    fix1 = _mm_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm_hadd_ps(fiz3,fix4);\
+    fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+    fix1 = _mm_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+    _t5 = _mm_load_ss(fshiftptr+2);\
+    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+    _t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));\
+    _t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));\
+    _t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));\
+    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+    _t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _t5 = _mm_add_ps(_t5,_t1);\
+    _mm_store_ss(fshiftptr+2,_t5);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -598,7 +802,7 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t5);
     _mm_storeh_pi((__m64 *)(fshiftptr),t5);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -621,22 +825,4 @@ gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
 }
 
 
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_permute_ps(pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_permute_ps(pot1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
 #endif /* _kernelutil_x86_avx_128_fma_single_h_ */
index 242260ac86374d045f508a8955cf061856c0049a..c00b6dad84608efb553e9b70546edf65055a9f7c 100644 (file)
@@ -211,10 +211,10 @@ gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * g
 
 static gmx_inline void
 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1,
-                                            __m256d * gmx_restrict y1,
-                                            __m256d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1,
+        __m256d * gmx_restrict y1,
+        __m256d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;
 
@@ -238,10 +238,10 @@ gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shif
 
 static gmx_inline void
 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+        __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+        __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;
 
@@ -285,11 +285,11 @@ gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shif
 
 static gmx_inline void
 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
-                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+        __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+        __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
+        __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;
 
@@ -352,27 +352,6 @@ gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 }
 
 
-static void
-gmx_mm256_load_2rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3;
-
-    t1            = _mm256_loadu_pd(p1);                         /* x2 z1 | y1 x1 */
-    t2            = _mm256_castpd128_pd256(_mm_loadu_pd(p1+4));  /*  -  - | z2 y2 */
-
-    *x1           = t1;
-    *y2           = t2;
-
-    t3            = gmx_mm256_unpack128hi_pd(t1,t1);
-
-    *z1           = t3;
-    *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
-    *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
-    *x2           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
-}
-
 static void
 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
@@ -408,7 +387,7 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
     t1            = _mm256_loadu_pd(p1);
     t2            = _mm256_loadu_pd(p1+4);
     t3            = _mm256_loadu_pd(p1+8);
-    
+
     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
@@ -419,7 +398,7 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
     *z1           = t4;
     *x3           = t5;
     *y4           = t6;
-    
+
     *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
     *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
     *x4           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
@@ -429,128 +408,12 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 }
 
 
-static void
-gmx_mm256_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
-{
-    __m256d tA,tB,tC;
-
-    tA           = _mm256_loadu_pd(ptrA); /*  - z1 | y1 x1 */
-    tB           = _mm256_loadu_pd(ptrB); /*  - z2 | y2 x2 */
-
-    tC           = _mm256_unpacklo_pd(tA,tB);  /* z2 z1 | x2 x1 */
-
-    *x1          = tC;
-    *y1          = _mm256_unpackhi_pd(tA,tB);
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(tC,0x1));
-}
-
-
-static void
-gmx_mm256_load_2rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3,t4,t5;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));        /*   -   -  | z2a y2a */
-    t4           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));        /*   -   -  | z2b y2b */
-    
-    t5           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-    *y2          = _mm256_unpacklo_pd(t3,t4);      /*   -   -  | y2b y2a */
-    *z2          = _mm256_unpackhi_pd(t3,t4);      /*   -   -  | z2b z2a */
-    *x1          = t5;
-    *y1          = t1;
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-}
-
-
-static void
-gmx_mm256_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
-    t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
-    t5           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8));        /*   -   -  |  -  z3a */
-    t6           = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8));        /*   -   -  |  -  z3b */
-
-    t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-
-    t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
-    t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
-
-    *z3          = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | z3b z3a */
-
-    *x1          = t7;
-    *y1          = t1;
-    *y2          = t2;
-    *z2          = t3;
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-    *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
-    *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-}
-
-
-static void
-gmx_mm256_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
-                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
-    t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
-    t5           = _mm256_loadu_pd(ptrA+8);        /*  z4a y4a | x4a z3a */
-    t6           = _mm256_loadu_pd(ptrB+8);        /*  z4b y4b | x4b z3b */
-
-    t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-
-    t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
-    t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
-
-    t4           = _mm256_unpacklo_pd(t5,t6);      /*  y4b y4a | z3b z3a */
-    t5           = _mm256_unpackhi_pd(t5,t6);      /*  z4b z4a | x4b x4a */
-
-    *x1          = t7;
-    *y1          = t1;
-    *y2          = t2;
-    *z2          = t3;
-    *z3          = t4;
-    *x4          = t5;
-
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-    *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
-    *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-    *y4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t4,0x1));;
-    *z4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
-}
-
-
-
 static void
 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 {
-     __m256d t1,t2,t3,t4,t5,t6;
+    __m256d t1,t2,t3,t4,t5,t6;
 
     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
@@ -567,40 +430,6 @@ gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
     *z1          = gmx_mm256_unpack128hi_pd(t5,t1);
 }
 
-static void
-gmx_mm256_load_2rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
-
-    t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
-    t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
-    t5           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));      /*   -   -  | z2a y2a */
-    t6           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));      /*   -   -  | z2b y2b */
-    t7           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrC+4));      /*   -   -  | z2c y2c */
-    t8           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrD+4));      /*   -   -  | z2d y2d */
-
-    t9           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t10          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-    t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
-    t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
-    t3           = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | y2b y2a */
-    t4           = _mm256_unpackhi_pd(t5,t6);      /*   -   -  | z2b z2a */
-    t5           = _mm256_unpacklo_pd(t7,t8);      /*   -   -  | y2d y2c */
-    t6           = _mm256_unpackhi_pd(t7,t8);      /*   -   -  | z2d z2c */
-
-    *x1          = gmx_mm256_unpack128lo_pd(t9,t1);
-    *y1          = gmx_mm256_unpack128lo_pd(t10,t2);
-    *z1          = gmx_mm256_unpack128hi_pd(t9,t1);
-
-    *x2          = gmx_mm256_unpack128hi_pd(t10,t2);
-    *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
-    *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
-}
 
 
 static void
@@ -705,375 +534,10 @@ gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
 
 
 
-/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
-{
-    __m256d t1,t2;
-
-    t1  = _mm256_loadu_pd(ptrA);
-    t2  = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
-    t1  = _mm256_sub_pd(t1,t2);
-    /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
-    _mm256_storeu_pd(ptrA,t1);
-}
-
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                            __m256d xyz1, __m256d xyz2, __m256d xyz3)
-{
-    __m256d t1,t2;
-    __m256d tA,tB;
-    __m128d tC;
-
-    tA   = _mm256_loadu_pd(ptrA);
-    tB   = _mm256_loadu_pd(ptrA+4);
-    tC   = _mm_load_sd(ptrA+8);
-
-    /* xyz1:  -  z1 | y1 x1 */
-    /* xyz2:  -  z2 | y2 x2 */
-    /* xyz3:  -  z3 | y3 x3 */
-
-    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
-    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
-    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
-    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
-    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
-    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
-
-    tA   = _mm256_sub_pd(tA,xyz1);
-    tB   = _mm256_sub_pd(tB,xyz2);
-    tC   = _mm_sub_sd(tC, _mm256_castpd256_pd128(t2));
-
-    _mm256_storeu_pd(ptrA,tA);
-    _mm256_storeu_pd(ptrA+4,tB);
-    _mm_store_sd(ptrA+8,tC);
-}
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                            __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
-{
-    __m256d t1,t2,t3;
-    __m256d tA,tB,tC;
-
-    tA   = _mm256_loadu_pd(ptrA);
-    tB   = _mm256_loadu_pd(ptrA+4);
-    tC   = _mm256_loadu_pd(ptrA+8);
-
-    /* xyz1:  -  z1 | y1 x1 */
-    /* xyz2:  -  z2 | y2 x2 */
-    /* xyz3:  -  z3 | y3 x3 */
-    /* xyz4:  -  z4 | y4 x4 */
-
-    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
-    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
-    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
-    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
-    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
-    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
-    xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1));  /*  z4 -  | x4 y4 */
-    t3   = _mm256_permute2f128_pd(xyz4,xyz4,0x21);    /*  x4 y4 | z4 - */
-    t3   = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0)); /* z4 y4| x4 - */
-    xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1)); /*  xz y4 | x4 z3 */
-
-    tA   = _mm256_sub_pd(tA,xyz1);
-    tB   = _mm256_sub_pd(tB,xyz2);
-    tC   = _mm256_sub_pd(tC,xyz4);
-
-    _mm256_storeu_pd(ptrA,tA);
-    _mm256_storeu_pd(ptrA+4,tB);
-    _mm256_storeu_pd(ptrA+8,tC);
-}
-
-
-
-static void
-gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1)
-{
-    __m128d t1,t2,t3;
-
-    t1           = _mm_sub_sd(_mm256_castpd256_pd128(x1),_mm_load_sd(ptrA));
-    t2           = _mm_sub_sd(_mm256_castpd256_pd128(y1),_mm_load_sd(ptrA+1));
-    t3           = _mm_sub_sd(_mm256_castpd256_pd128(z1),_mm_load_sd(ptrA+2));
-    _mm_store_sd(ptrA,t1);
-    _mm_store_sd(ptrA+1,t2);
-    _mm_store_sd(ptrA+2,t3);
-}
-
-
-static void
-gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1;
-    __m128d tA;
-    t1          = _mm256_loadu_pd(ptrA);
-    tA          = _mm_loadu_pd(ptrA+4);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2a z1a | y1a x1a */
-
-    t1          = _mm256_sub_pd(x1,t1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm_storeu_pd(ptrA+4,tA);
-}
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
-{
-    __m256d t1,t2;
-    __m128d tA;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrA+4);
-    tA          = _mm_load_sd(ptrA+8);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-    y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
-    t1          = _mm256_sub_pd(t1,x1);
-    t2          = _mm256_sub_pd(t2,y2);
-    tA          = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrA+4,t2);
-    _mm_store_sd(ptrA+8,tA);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
-{
-    __m256d t1,t2,t3;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrA+4);
-    t3          = _mm256_loadu_pd(ptrA+8);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    z3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
-    y4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-    y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
-    z3          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */
-
-    t1          = _mm256_sub_pd(t1,x1);
-    t2          = _mm256_sub_pd(t2,y2);
-    t3          = _mm256_sub_pd(t3,z3);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrA+4,t2);
-    _mm256_storeu_pd(ptrA+8,t3);
-}
-
-static void
-gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1)
-{
-    __m256d t1,t2,t3,t4;
-    __m256i mask;
-
-    t3          = _mm256_loadu_pd(ptrA);
-    t4          = _mm256_loadu_pd(ptrB);
-
-    t1          = _mm256_unpacklo_pd(x1,y1);   /*  -  - | y1a x1a */
-    t2          = _mm256_unpackhi_pd(x1,y1);   /*  -  - | y1b x1b */
-
-    t1          = gmx_mm256_unpack128lo_pd(t1,z1); /*  -  z1a | y1a x1a */
-    z1          = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
-    t2          = gmx_mm256_unpack128lo_pd(t2,z1); /* z1b z1a | y1b x1b */
-
-    /* Construct a mask without executing any data loads */
-    mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
-                                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
-
-    t3          = _mm256_sub_pd(t3,t1);
-    t4          = _mm256_sub_pd(t4,t2);
-
-    /* Careful with potentially overlapping stores, need to be masked */
-    _mm256_maskstore_pd(ptrA,mask,t3);
-    _mm256_maskstore_pd(ptrB,mask,t4);
-}
-
-static void
-gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1,t2,t5;
-    __m128d t3,t4;
-
-    t1          = _mm256_loadu_pd(ptrA); 
-    t2          = _mm256_loadu_pd(ptrB); 
-    t3          = _mm_loadu_pd(ptrA+4);
-    t4          = _mm_loadu_pd(ptrB+4);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t1          = _mm256_sub_pd(t1,z2);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
-    t4          = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));
-
-    /* Careful with potentially overlapping stores, need to be masked */
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm_storeu_pd(ptrA+4,t3);
-    _mm_storeu_pd(ptrB+4,t4);
-}
-
-static void
-gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
-{
-    __m256d t1,t2,t3,t4,t5,t6;
-    __m128d tA,tB;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB);
-    t3          = _mm256_loadu_pd(ptrA+4);
-    t4          = _mm256_loadu_pd(ptrB+4);
-    tA          = _mm_load_sd(ptrA+8);
-    tB          = _mm_load_sd(ptrB+8);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
-
-    t6          = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* - - | - z3b */
-
-    y3          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t5          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */     
-    x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
-    t1          = _mm256_sub_pd(t1,y3);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm256_sub_pd(t3,t5);  
-    t4          = _mm256_sub_pd(t4,x1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
-    tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrA+4,t3);
-    _mm256_storeu_pd(ptrB+4,t4);
-    _mm_store_sd(ptrA+8,tA);
-    _mm_store_sd(ptrB+8,tB);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB); 
-    t3          = _mm256_loadu_pd(ptrA+4);
-    t4          = _mm256_loadu_pd(ptrB+4);
-    t5          = _mm256_loadu_pd(ptrA+8);
-    t6          = _mm256_loadu_pd(ptrB+8);
-
-    t7          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
-
-    y3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
-    z3          = _mm256_unpackhi_pd(z3,x4); /*  -   -  | x4b z3b */
-    x4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
-    y4          = _mm256_unpackhi_pd(y4,z4); /*  -   -  | z4b y4b */
-
-    z4          = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t7          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
-    x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
-    x2          = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
-    y2          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */
-
-    t1          = _mm256_sub_pd(t1,z4);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm256_sub_pd(t3,t7);
-    t4          = _mm256_sub_pd(t4,x1);
-    t5          = _mm256_sub_pd(t5,x2);
-    t6          = _mm256_sub_pd(t6,y2);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrA+4,t3);
-    _mm256_storeu_pd(ptrB+4,t4);
-    _mm256_storeu_pd(ptrA+8,t5);
-    _mm256_storeu_pd(ptrB+8,t6);
-}
-
-
-
 static void
 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1)
 {
     __m256d t1,t2,tA,tB,tC,tD;
     __m256i mask;
@@ -1088,7 +552,7 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
 
     /* Construct a mask without executing any data loads */
     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
-                                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
+                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
 
     tA          = _mm256_loadu_pd(ptrA);
     tB          = _mm256_loadu_pd(ptrB);
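
Editor's note: the mask construction re-indented just above is a small trick worth spelling out. Comparing zero with itself under _CMP_EQ_OQ yields all-ones lanes without loading any constant from memory, and blending with immediate 0x7 clears the top lane, leaving a {-1,-1,-1,0} mask. The masked store then writes exactly the three doubles of an rvec; the fourth position must not be written because it overlaps data belonging to the next atom. A sketch under those assumptions (hypothetical names):

#include <immintrin.h>

/* Build a {-1,-1,-1,0} double-lane mask with no memory access. */
static __m256i
example_rvec_mask_pd(void)
{
    __m256d ones = _mm256_cmp_pd(_mm256_setzero_pd(),
                                 _mm256_setzero_pd(), _CMP_EQ_OQ);
    return _mm256_castpd_si256(
               _mm256_blend_pd(_mm256_setzero_pd(), ones, 0x7));
}

/* Subtract v from the three doubles at p without writing p[3].
 * Reading p[3] is harmless (the element exists); only the store
 * has to be masked. */
static void
example_masked_decrement_pd(double *p, __m256d v)
{
    __m256d t = _mm256_loadu_pd(p);
    _mm256_maskstore_pd(p, example_rvec_mask_pd(), _mm256_sub_pd(t, v));
}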
@@ -1106,65 +570,77 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
     _mm256_maskstore_pd(ptrD,mask,tD);
 }
 
-static void
-gmx_mm256_decrement_2rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1,t2,t3,t4,t5,t6;
-    __m128d tA,tB,tC,tD,tE,tF;
 
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB);
-    t3          = _mm256_loadu_pd(ptrC);
-    t4          = _mm256_loadu_pd(ptrD);
-    tA          = _mm_loadu_pd(ptrA+4);
-    tB          = _mm_loadu_pd(ptrB+4);
-    tC          = _mm_loadu_pd(ptrC+4);
-    tD          = _mm_loadu_pd(ptrD+4);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
-    y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
-    x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
 
-    t6          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    z2          = gmx_mm256_unpack128hi_pd(t5,y1); /* x2c z1c | y1c x1c */
-    t5          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-    y1          = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */
-
-    tE          = _mm256_extractf128_pd(x2,0x1); /* z2c y2c */
-    tF          = _mm256_extractf128_pd(y2,0x1); /* z2d y2d */
-
-    t1          = _mm256_sub_pd(t1,t6);
-    t2          = _mm256_sub_pd(t2,t5);
-    t3          = _mm256_sub_pd(t3,z2);
-    t4          = _mm256_sub_pd(t4,y1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(x2));
-    tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(y2));
-    tC          = _mm_sub_pd(tC,tE);
-    tD          = _mm_sub_pd(tD,tF);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrC,t3);
-    _mm256_storeu_pd(ptrD,t4);
-    _mm_storeu_pd(ptrA+4,tA);
-    _mm_storeu_pd(ptrB+4,tB);
-    _mm_storeu_pd(ptrC+4,tC);
-    _mm_storeu_pd(ptrD+4,tD);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128d _tA,_tB,_tC,_tD,_tE;\
+    _t1          = _mm256_loadu_pd(ptrA);\
+    _t2          = _mm256_loadu_pd(ptrB);\
+    _t3          = _mm256_loadu_pd(ptrC);\
+    _t4          = _mm256_loadu_pd(ptrD);\
+    _t5          = _mm256_loadu_pd(ptrA+4);\
+    _t6          = _mm256_loadu_pd(ptrB+4);\
+    _t7          = _mm256_loadu_pd(ptrC+4);\
+    _t8          = _mm256_loadu_pd(ptrD+4);\
+    _tA          = _mm_load_sd(ptrA+8);\
+    _tB          = _mm_load_sd(ptrB+8);\
+    _tC          = _mm_load_sd(ptrC+8);\
+    _tD          = _mm_load_sd(ptrD+8);\
+    _t9          = _mm256_unpacklo_pd(_x1,_y1);\
+    _x1          = _mm256_unpackhi_pd(_x1,_y1);\
+    _y1          = _mm256_unpacklo_pd(_z1,_x2);\
+    _z1          = _mm256_unpackhi_pd(_z1,_x2);\
+    _x2          = _mm256_unpacklo_pd(_y2,_z2);\
+    _y2          = _mm256_unpackhi_pd(_y2,_z2);\
+    _z2          = _mm256_unpacklo_pd(_x3,_y3);\
+    _x3          = _mm256_unpackhi_pd(_x3,_y3);\
+    _t10         = gmx_mm256_unpack128lo_pd(_t9,_y1);\
+    _y3          = gmx_mm256_unpack128hi_pd(_t9,_y1);\
+    _t9          = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+    _y1          = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+    _x1          = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+    _z1          = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+    _x2          = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+    _z2          = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+    _t1          = _mm256_sub_pd(_t1,_t10);\
+    _t2          = _mm256_sub_pd(_t2,_t9);\
+    _t3          = _mm256_sub_pd(_t3,_y3);\
+    _t4          = _mm256_sub_pd(_t4,_y1);\
+    _t5          = _mm256_sub_pd(_t5,_x1);\
+    _t6          = _mm256_sub_pd(_t6,_x2);\
+    _t7          = _mm256_sub_pd(_t7,_z1);\
+    _t8          = _mm256_sub_pd(_t8,_z2);\
+    _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3));\
+    _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3),_GMX_MM_PERMUTE128D(1,1)));\
+    _tE          = _mm256_extractf128_pd(_z3,0x1);\
+    _tC          = _mm_sub_sd(_tC, _tE);\
+    _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE,_GMX_MM_PERMUTE128D(1,1)));\
+    _mm256_storeu_pd(ptrA,_t1);\
+    _mm256_storeu_pd(ptrB,_t2);\
+    _mm256_storeu_pd(ptrC,_t3);\
+    _mm256_storeu_pd(ptrD,_t4);\
+    _mm256_storeu_pd(ptrA+4,_t5);\
+    _mm256_storeu_pd(ptrB+4,_t6);\
+    _mm256_storeu_pd(ptrC+4,_t7);\
+    _mm256_storeu_pd(ptrD+4,_t8);\
+    _mm_store_sd(ptrA+8,_tA);\
+    _mm_store_sd(ptrB+8,_tB);\
+    _mm_store_sd(ptrC+8,_tC);\
+    _mm_store_sd(ptrD+8,_tD);\
 }
-
-
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1,
+        __m256d x2, __m256d y2, __m256d z2,
+        __m256d x3, __m256d y3, __m256d z3)
 {
     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128d tA,tB,tC,tD,tE;
@@ -1235,15 +711,85 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
     _mm_store_sd(ptrC+8,tC);
     _mm_store_sd(ptrD+8,tD);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{ \
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14;\
+    __m128d _tA,_tB,_tC,_tD,_tE;\
+    _t1          = _mm256_loadu_pd(ptrA);\
+    _t2          = _mm256_loadu_pd(ptrB);\
+    _t3          = _mm256_loadu_pd(ptrC);\
+    _t4          = _mm256_loadu_pd(ptrD);\
+    _t5          = _mm256_loadu_pd(ptrA+4);\
+    _t6          = _mm256_loadu_pd(ptrB+4);\
+    _t7          = _mm256_loadu_pd(ptrC+4);\
+    _t8          = _mm256_loadu_pd(ptrD+4);\
+    _t9          = _mm256_loadu_pd(ptrA+8);\
+    _t10         = _mm256_loadu_pd(ptrB+8);\
+    _t11         = _mm256_loadu_pd(ptrC+8);\
+    _t12         = _mm256_loadu_pd(ptrD+8);\
+    _t13         = _mm256_unpacklo_pd(_x1,_y1);\
+    _x1          = _mm256_unpackhi_pd(_x1,_y1);\
+    _y1          = _mm256_unpacklo_pd(_z1,_x2);\
+    _z1          = _mm256_unpackhi_pd(_z1,_x2);\
+    _x2          = _mm256_unpacklo_pd(_y2,_z2);\
+    _y2          = _mm256_unpackhi_pd(_y2,_z2);\
+    _z2          = _mm256_unpacklo_pd(_x3,_y3);\
+    _x3          = _mm256_unpackhi_pd(_x3,_y3);\
+    _y3          = _mm256_unpacklo_pd(_z3,_x4);\
+    _z3          = _mm256_unpackhi_pd(_z3,_x4);\
+    _x4          = _mm256_unpacklo_pd(_y4,_z4);\
+    _y4          = _mm256_unpackhi_pd(_y4,_z4);\
+    _z4          = gmx_mm256_unpack128lo_pd(_t13,_y1);\
+    _t13         = gmx_mm256_unpack128hi_pd(_t13,_y1);\
+    _y1          = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+    _x1          = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+    _z1          = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+    _x2          = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+    _z2          = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+    _y2          = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+    _x3          = gmx_mm256_unpack128lo_pd(_y3,_x4);\
+    _y3          = gmx_mm256_unpack128hi_pd(_y3,_x4);\
+    _x4          = gmx_mm256_unpack128lo_pd(_z3,_y4);\
+    _z3          = gmx_mm256_unpack128hi_pd(_z3,_y4);\
+    _t1          = _mm256_sub_pd(_t1,_z4);\
+    _t2          = _mm256_sub_pd(_t2,_y1);\
+    _t3          = _mm256_sub_pd(_t3,_t13);\
+    _t4          = _mm256_sub_pd(_t4,_x1);\
+    _t5          = _mm256_sub_pd(_t5,_z1);\
+    _t6          = _mm256_sub_pd(_t6,_z2);\
+    _t7          = _mm256_sub_pd(_t7,_x2);\
+    _t8          = _mm256_sub_pd(_t8,_y2);\
+    _t9          = _mm256_sub_pd(_t9,_x3);\
+    _t10         = _mm256_sub_pd(_t10,_x4);\
+    _t11         = _mm256_sub_pd(_t11,_y3);\
+    _t12         = _mm256_sub_pd(_t12,_z3);\
+    _mm256_storeu_pd(ptrA,_t1);\
+    _mm256_storeu_pd(ptrB,_t2);\
+    _mm256_storeu_pd(ptrC,_t3);\
+    _mm256_storeu_pd(ptrD,_t4);\
+    _mm256_storeu_pd(ptrA+4,_t5);\
+    _mm256_storeu_pd(ptrB+4,_t6);\
+    _mm256_storeu_pd(ptrC+4,_t7);\
+    _mm256_storeu_pd(ptrD+4,_t8);\
+    _mm256_storeu_pd(ptrA+8,_t9);\
+    _mm256_storeu_pd(ptrB+8,_t10);\
+    _mm256_storeu_pd(ptrC+8,_t11);\
+    _mm256_storeu_pd(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1,
+        __m256d x2, __m256d y2, __m256d z2,
+        __m256d x3, __m256d y3, __m256d z3,
+        __m256d x4, __m256d y4, __m256d z4)
 {
     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
     __m128d tA,tB,tC,tD,tE;
@@ -1314,6 +860,7 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
     _mm256_storeu_pd(ptrC+8,t11);
     _mm256_storeu_pd(ptrD+8,t12);
 }
+#endif
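
Editor's note: both the macro and the function versions above lean heavily on gmx_mm256_unpack128lo_pd/gmx_mm256_unpack128hi_pd. AVX's _mm256_unpacklo_pd and friends shuffle only within each 128-bit lane, so merging the low (or high) lanes of two registers requires the cross-lane _mm256_permute2f128_pd. The helpers are defined elsewhere in the tree; assuming the conventional definitions, they amount to:

#include <immintrin.h>

/* Assumed equivalents of gmx_mm256_unpack128{lo,hi}_pd: the result
 * holds the low (resp. high) 128-bit lane of a in its low lane and
 * the low (resp. high) 128-bit lane of b in its high lane. */
#define example_unpack128lo_pd(a, b) _mm256_permute2f128_pd(a, b, 0x20)
#define example_unpack128hi_pd(a, b) _mm256_permute2f128_pd(a, b, 0x31)

With these definitions, unpack128lo on x1 = { -, -, y1a, x1a } and z1 = { -, -, x2a, z1a } yields { x2a, z1a, y1a, x1a } (high lane listed first), matching the inline comments in the function bodies above.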
 
 
 
@@ -1321,8 +868,8 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
 
 static gmx_inline void
 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
 {
     __m256d t1,t2;
     __m128d tA,tB;
@@ -1345,63 +892,59 @@ gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     _mm256_storeu_pd(fshiftptr,t2);
 }
 
-static gmx_inline void
-gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
-{
-    __m256d t1,t2,t3;
-    __m128d tA,tB,tC,tD,tE;
 
-    fix1 = _mm256_hadd_pd(fix1,fiy1);
-    fiz1 = _mm256_hadd_pd(fiz1,fix2);
-    fiy2 = _mm256_hadd_pd(fiy2,fiz2);
 
-    /* Add across the two lanes by swapping and adding back */
-    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1)); /* fiy1 fix1 */
-    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
-    tC   = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
-    
-    t1   = gmx_mm256_set_m128d(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
-
-    t2   = _mm256_loadu_pd(fptr);
-    tD   = _mm_loadu_pd(fptr+4);
-
-    t2   = _mm256_add_pd(t2,t1);
-    tD   = _mm_add_pd(tD,tC);
-    _mm256_storeu_pd(fptr,t2);
-    _mm_storeu_pd(fptr+4,tD);
-
-    /* Add up shift force */
-    /* t1:  fix2 fiz1 | fiy1 fix1 */
-    /* tC:              fiz2 fiy2 */
-
-    tA   = _mm256_extractf128_pd(t1,0x1); /* fix2 fiz1 */
-    tB   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1));   /* fiy2 fix2 */
-    tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1));      /*  -   fiz2 */
-    
-    tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));
-    tC   = _mm_add_sd(tC,tA);
-
-    tD   = _mm_loadu_pd(fshiftptr);
-    tE   = _mm_load_sd(fshiftptr+2);
-
-    tD   = _mm_add_pd(tD,tB);
-    tE   = _mm_add_pd(tE,tC);
-
-    _mm_storeu_pd(fshiftptr,tD);
-    _mm_store_sd(fshiftptr+2,tE);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{ \
+    __m256d _t1,_t2,_t3,_t4;\
+    __m128d _tz3,_tA,_tB,_tC,_tD;\
+    fix1 = _mm256_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm256_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd());\
+    _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+    _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+    _t1   = _mm256_add_pd(_t1,_t2);\
+    _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+    _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+    _t3   = _mm256_add_pd(_t3,_t4);\
+    _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1));\
+    _t2   = _mm256_loadu_pd(fptr);\
+    _t4   = _mm256_loadu_pd(fptr+4);\
+    _tA   = _mm_load_sd(fptr+8);\
+    _t2   = _mm256_add_pd(_t2,_t1);\
+    _t4   = _mm256_add_pd(_t4,_t3);\
+    _tA   = _mm_add_sd(_tA,_tz3);\
+    _mm256_storeu_pd(fptr,_t2);\
+    _mm256_storeu_pd(fptr+4,_t4);\
+    _mm_store_sd(fptr+8,_tA);\
+    _tB   = _mm256_extractf128_pd(_t1,0x1);\
+    _tC   = _mm256_extractf128_pd(_t3,0x1);\
+    _tz3  = _mm_add_sd(_tz3,_tB);\
+    _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3),_GMX_MM_PERMUTE128D(1,1));\
+    _tz3  = _mm_add_sd(_tz3,_tD);\
+    _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t1));\
+    _tD   = _mm_shuffle_pd(_tB,_mm256_castpd256_pd128(_t3),_MM_SHUFFLE2(0,1));\
+    _tC   = _mm_add_pd(_tC,_tD);\
+    _tA   = _mm_loadu_pd(fshiftptr);\
+    _tB   = _mm_load_sd(fshiftptr+2);\
+    _tA   = _mm_add_pd(_tA,_tC);\
+    _tB   = _mm_add_sd(_tB,_tz3);\
+    _mm_storeu_pd(fshiftptr,_tA);\
+    _mm_store_sd(fshiftptr+2,_tB);\
 }
-
-
-
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        __m256d fix2, __m256d fiy2, __m256d fiz2,
+        __m256d fix3, __m256d fiy3, __m256d fiz3,
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
 {
     __m256d t1,t2,t3,t4;
     __m128d tz3,tA,tB,tC,tD;
@@ -1459,15 +1002,66 @@ gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     _mm_storeu_pd(fshiftptr,tA);
     _mm_store_sd(fshiftptr+2,tB);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6;\
+    __m128d _tA,_tB,_tC,_tD;\
+    fix1 = _mm256_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm256_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm256_hadd_pd(fiz3,fix4);\
+    fiy4 = _mm256_hadd_pd(fiy4,fiz4);\
+    _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+    _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+    _t1   = _mm256_add_pd(_t1,_t2);\
+    _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+    _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+    _t3   = _mm256_add_pd(_t3,_t4);\
+    _t5   = gmx_mm256_unpack128lo_pd(fiz3,fiy4);\
+    _t6   = gmx_mm256_unpack128hi_pd(fiz3,fiy4);\
+    _t5   = _mm256_add_pd(_t5,_t6);\
+    _t2   = _mm256_loadu_pd(fptr);\
+    _t4   = _mm256_loadu_pd(fptr+4);\
+    _t6   = _mm256_loadu_pd(fptr+8);\
+    _t2   = _mm256_add_pd(_t2,_t1);\
+    _t4   = _mm256_add_pd(_t4,_t3);\
+    _t6   = _mm256_add_pd(_t6,_t5);\
+    _mm256_storeu_pd(fptr,_t2);\
+    _mm256_storeu_pd(fptr+4,_t4);\
+    _mm256_storeu_pd(fptr+8,_t6);\
+    _tA   = _mm256_extractf128_pd(_t1,0x1);\
+    _tB   = _mm256_extractf128_pd(_t3,0x1);\
+    _tC   = _mm256_extractf128_pd(_t5,0x1);\
+    _tB   = _mm_add_pd(_tB,_mm256_castpd256_pd128(_t1));\
+    _tA   = _mm_add_pd(_tA,_mm256_castpd256_pd128(_t5));\
+    _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t3));\
+    _tD   = _mm_shuffle_pd(_tA,_tC,_MM_SHUFFLE2(0,1));\
+    _tB   = _mm_add_pd(_tB,_tD);\
+    _tC   = _mm_permute_pd(_tC,_GMX_MM_PERMUTE128D(1,1));\
+    _tC   = _mm_add_sd(_tC,_tA);\
+    _tA   = _mm_loadu_pd(fshiftptr);\
+    _tD   = _mm_load_sd(fshiftptr+2);\
+    _tA   = _mm_add_pd(_tA,_tB);\
+    _tD   = _mm_add_sd(_tD,_tC);\
+    _mm_storeu_pd(fshiftptr,_tA);\
+    _mm_store_sd(fshiftptr+2,_tD);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
-                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        __m256d fix2, __m256d fiy2, __m256d fiz2,
+        __m256d fix3, __m256d fiy3, __m256d fiz3,
+        __m256d fix4, __m256d fiy4, __m256d fiz4,
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
 {
     __m256d t1,t2,t3,t4,t5,t6;
     __m128d tA,tB,tC,tD;
@@ -1530,6 +1124,7 @@ gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     _mm_storeu_pd(fshiftptr,tA);
     _mm_store_sd(fshiftptr+2,tD);
 }
+#endif
 
 
 
@@ -1547,7 +1142,7 @@ gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
 
 static void
 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
-                      __m256d pot2, double * gmx_restrict ptrB)
+                         __m256d pot2, double * gmx_restrict ptrB)
 {
     __m128d t1,t2;
 
@@ -1561,49 +1156,4 @@ gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
 }
 
 
-static void
-gmx_mm256_update_4pot_pd(__m256d pot1, double * gmx_restrict ptrA,
-                         __m256d pot2, double * gmx_restrict ptrB,
-                         __m256d pot3, double * gmx_restrict ptrC,
-                         __m256d pot4, double * gmx_restrict ptrD)
-{
-    __m256d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF,tG,tH;
-
-    tA   = _mm_load_sd(ptrA);
-    tB   = _mm_load_sd(ptrB);
-    tC   = _mm_load_sd(ptrC);
-    tD   = _mm_load_sd(ptrD);
-
-    /* do a transpose */
-    t1   = _mm256_unpacklo_pd(pot1, pot2);   /* p2c p1c | p2a p1a */
-    t2   = _mm256_unpackhi_pd(pot1, pot2);   /* p2d p1d | p2b p1b */
-    t3   = _mm256_unpacklo_pd(pot3, pot4);   /* p4c p3c | p4a p3a */
-    t4   = _mm256_unpackhi_pd(pot3, pot4);   /* p4d p3d | p4b p3b */
-    pot1 = _mm256_permute2f128_pd(t1, t3, 0x20);   /* p4a p3a | p2a p1a */
-    pot2 = _mm256_permute2f128_pd(t2, t4, 0x20);   /* p4b p3b | p2b p1b */
-    pot3 = _mm256_permute2f128_pd(t1, t3, 0x31);   /* p4c p3c | p2c p1c */
-    pot4 = _mm256_permute2f128_pd(t2, t4, 0x31);   /* p4d p3d | p2d p1d */
-
-    pot1 = _mm256_add_pd(pot1,pot2);
-    pot3 = _mm256_add_pd(pot3,pot4);
-    pot1 = _mm256_add_pd(pot1,pot3);  /* Sum in the four elements */
-
-    tE   = _mm256_castpd256_pd128(pot1);
-    tF   = _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1));
-    tG   = _mm256_extractf128_pd(pot1,0x1);
-    tH   = _mm_permute_pd(tG,_GMX_MM_PERMUTE128D(1,1));
-
-    tA   = _mm_add_sd(tA,tE);
-    tB   = _mm_add_sd(tB,tF);
-    tC   = _mm_add_sd(tC,tG);
-    tD   = _mm_add_sd(tD,tH);
-
-       _mm_store_sd(ptrA,tA);
-       _mm_store_sd(ptrB,tB);
-       _mm_store_sd(ptrC,tC);
-       _mm_store_sd(ptrD,tD);
-}
-
-
 #endif /* _kernelutil_x86_avx_256_double_h_ */
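
Editor's note: the gmx_mm256_update_4pot_pd helper removed in the last hunk built its reduction on the standard AVX 4x4 double transpose (two unpack stages plus two cross-lane permutes), visible in the deleted lines above. Since the pattern recurs throughout these kernels, a standalone sketch for reference (hypothetical name; element comments list lane 0 first):

#include <immintrin.h>

/* In-place 4x4 transpose of four rows of doubles.  After the call,
 * *r0 holds element 0 of each original row, *r1 element 1, etc. */
static void
example_transpose_4x4_pd(__m256d *r0, __m256d *r1, __m256d *r2, __m256d *r3)
{
    __m256d t0 = _mm256_unpacklo_pd(*r0, *r1);  /* { a0, b0, a2, b2 } */
    __m256d t1 = _mm256_unpackhi_pd(*r0, *r1);  /* { a1, b1, a3, b3 } */
    __m256d t2 = _mm256_unpacklo_pd(*r2, *r3);  /* { c0, d0, c2, d2 } */
    __m256d t3 = _mm256_unpackhi_pd(*r2, *r3);  /* { c1, d1, c3, d3 } */
    *r0 = _mm256_permute2f128_pd(t0, t2, 0x20); /* { a0, b0, c0, d0 } */
    *r1 = _mm256_permute2f128_pd(t1, t3, 0x20); /* { a1, b1, c1, d1 } */
    *r2 = _mm256_permute2f128_pd(t0, t2, 0x31); /* { a2, b2, c2, d2 } */
    *r3 = _mm256_permute2f128_pd(t1, t3, 0x31); /* { a3, b3, c3, d3 } */
}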
index 1f2ab31be04f996cd47402a9cfed44e24fe12270..f3a1f6740a1b5f7d512fde1006d0b8f0005290e7 100644 (file)
@@ -199,10 +199,10 @@ gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx
 
 static gmx_inline void
 gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1,
-                                            __m256 * gmx_restrict y1,
-                                            __m256 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1,
+        __m256 * gmx_restrict y1,
+        __m256 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
 
@@ -225,10 +225,10 @@ gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift
 
 static gmx_inline void
 gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
-                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+        __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+        __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9;
@@ -273,11 +273,11 @@ gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift
 
 static gmx_inline void
 gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
-                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
-                                            __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+        __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+        __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
+        __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
@@ -334,10 +334,10 @@ gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
 {
     __m128 t1,t2,t3,t4;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
-    t1             = _mm_maskload_ps(ptrA,mask);
-    t2             = _mm_maskload_ps(ptrB,mask);
-    t3             = _mm_maskload_ps(ptrC,mask);
-    t4             = _mm_maskload_ps(ptrD,mask);
+    t1             = gmx_mm_maskload_ps(ptrA,mask);
+    t2             = gmx_mm_maskload_ps(ptrB,mask);
+    t3             = gmx_mm_maskload_ps(ptrC,mask);
+    t4             = gmx_mm_maskload_ps(ptrD,mask);
     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
     *x1           = _mm256_castps128_ps256(t1);
     *y1           = _mm256_castps128_ps256(t2);
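
Editor's note: the surrounding hunks replace direct _mm_maskload_ps/_mm_maskstore_ps calls with gmx_mm_maskload_ps/gmx_mm_maskstore_ps wrappers. The likely motivation — stated here as an assumption — is that some compilers declare the mask operand of the AVX maskload/maskstore intrinsics with a floating-point vector type rather than __m128i, so a configure-selected wrapper keeps every call site uniform. A guess at the wrapper's shape (EXAMPLE_FP_MASKLOAD is a hypothetical configure result, not the real GROMACS define):

#include <immintrin.h>

#ifdef EXAMPLE_FP_MASKLOAD
/* Compiler wants a float-typed mask: cast the integer mask through. */
#    define example_maskload_ps(mem, mask) \
         _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
#else
/* Standard prototype: pass the __m128i mask straight on. */
#    define example_maskload_ps(mem, mask) \
         _mm_maskload_ps((mem), (mask))
#endif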
@@ -431,10 +431,10 @@ gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
     __m256 t1,t2,t3,t4,t5,t6,t7,t8;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
 
-    t1             = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask)); /*  - zE yE xE |  - zA yA xA */
-    t2             = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask)); /*  - zF yF xF |  - zB yB xB */
-    t3             = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask)); /*  - zG yG xG |  - zC yC xC */
-    t4             = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask)); /*  - zH yH xH |  - zD yD xD */
+    t1             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask)); /*  - zE yE xE |  - zA yA xA */
+    t2             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask)); /*  - zF yF xF |  - zB yB xB */
+    t3             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask)); /*  - zG yG xG |  - zC yC xC */
+    t4             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask)); /*  - zH yH xH |  - zD yD xD */
 
     t5            = _mm256_unpacklo_ps(t1,t2); /* yF yE xF xE | yB yA xB xA */
     t6            = _mm256_unpacklo_ps(t3,t4); /* yH yG xH xG | yD yC xD xC */
@@ -503,7 +503,7 @@ gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
 
     t1           = _mm256_unpacklo_ps(t1,t3);  /*  -   -  z3g z3e |  -   -  z3c z3a */
     t2           = _mm256_unpacklo_ps(t2,t4);  /*  -   -  z3h z3f |  -   -  z3d z3b */
-    
+
     *z3          = _mm256_unpacklo_ps(t1,t2);
 }
 
@@ -567,7 +567,7 @@ gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
     t6           = _mm256_unpackhi_ps(t1,t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
     t7           = _mm256_unpacklo_ps(t3,t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
     t8           = _mm256_unpackhi_ps(t3,t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */
-    
+
     *z3          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
     *x4          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
     *y4          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
@@ -577,8 +577,8 @@ gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
 
 static gmx_inline void
 gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC,float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1)
+        float * gmx_restrict ptrC,float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8;
     __m128i mask;
@@ -594,30 +594,79 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     t3          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0)); /*  -  z1c y1c x1c */
     t4          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2)); /*  -  z1d y1d x1d */
 
-    t5          = _mm_maskload_ps(ptrA,mask);
-    t6          = _mm_maskload_ps(ptrB,mask);
-    t7          = _mm_maskload_ps(ptrC,mask);
-    t8          = _mm_maskload_ps(ptrD,mask);
+    t5          = gmx_mm_maskload_ps(ptrA,mask);
+    t6          = gmx_mm_maskload_ps(ptrB,mask);
+    t7          = gmx_mm_maskload_ps(ptrC,mask);
+    t8          = gmx_mm_maskload_ps(ptrD,mask);
 
     t5          = _mm_sub_ps(t5,t1);
     t6          = _mm_sub_ps(t6,t2);
     t7          = _mm_sub_ps(t7,t3);
     t8          = _mm_sub_ps(t8,t4);
 
-    _mm_maskstore_ps(ptrA,mask,t5);
-    _mm_maskstore_ps(ptrB,mask,t6);
-    _mm_maskstore_ps(ptrC,mask,t7);
-    _mm_maskstore_ps(ptrD,mask,t8);
+    gmx_mm_maskstore_ps(ptrA,mask,t5);
+    gmx_mm_maskstore_ps(ptrB,mask,t6);
+    gmx_mm_maskstore_ps(ptrC,mask,t7);
+    gmx_mm_maskstore_ps(ptrD,mask,t8);
 }
 
-
-
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                                  x1,y1,z1,x2,y2,z2,x3,y3,z3) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6;\
+    __m128 _tA,_tB,_tC,_tD;\
+\
+    _t1         = _mm256_loadu_ps(ptrA);\
+    _t2         = _mm256_loadu_ps(ptrB);\
+    _t3         = _mm256_loadu_ps(ptrC);\
+    _t4         = _mm256_loadu_ps(ptrD);\
+    _tA         = _mm_load_ss(ptrA+8);\
+    _tB         = _mm_load_ss(ptrB+8);\
+    _tC         = _mm_load_ss(ptrC+8);\
+    _tD         = _mm_load_ss(ptrD+8);\
+    _t5         = _mm256_unpacklo_ps(x1,y1);\
+    x1          = _mm256_unpackhi_ps(x1,y1);\
+    y1          = _mm256_unpacklo_ps(z1,x2);\
+    z1          = _mm256_unpackhi_ps(z1,x2);\
+    x2          = _mm256_unpacklo_ps(y2,z2);\
+    y2          = _mm256_unpackhi_ps(y2,z2);\
+    _t6         = _mm256_unpacklo_ps(x3,y3);\
+    x3          = _mm256_unpackhi_ps(x3,y3);\
+    _t5         = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+    x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+    y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1);\
+    z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+    z2          = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(1,0,1,0));\
+    _t5         = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(3,2,3,2));\
+    y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+    x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_sub_ps(_t1,z2);\
+    _t2         = _mm256_sub_ps(_t2,_t5);\
+    _t3         = _mm256_sub_ps(_t3,y1);\
+    _t4         = _mm256_sub_ps(_t4,x1);\
+    _tA         = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3));\
+    _tB         = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(1,1,1,1)));\
+    _tC         = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(2,2,2,2)));\
+    _tD         = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(3,3,3,3)));\
+    _mm256_storeu_ps(ptrA,_t1);\
+    _mm256_storeu_ps(ptrB,_t2);\
+    _mm256_storeu_ps(ptrC,_t3);\
+    _mm256_storeu_ps(ptrD,_t4);\
+    _mm_store_ss(ptrA+8,_tA);\
+    _mm_store_ss(ptrB+8,_tB);\
+    _mm_store_ss(ptrC+8,_tC);\
+    _mm_store_ss(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3)
 {
     __m256 t1,t2,t3,t4,t5,t6;
     __m128 tA,tB,tC,tD;
@@ -672,15 +721,76 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_store_ss(ptrC+8,tC);
     _mm_store_ss(ptrD+8,tD);
 }
-
-
+#endif
+
+
+
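
The #if block above is the first of many in this change with the same shape. The limitation it works around: the 32-bit MSVC ABI can pass at most three __m128/__m256 arguments in registers and cannot pass additional ones by value at all, so an ordinary function with more vector parameters does not compile there (error C2719 in the MSVC versions of this era). A hypothetical illustration, not from the patch:

    #include <immintrin.h>

    /* Rejected by 32-bit MSVC: the fourth by-value __m128 parameter
     * cannot be passed with the required 16-byte alignment. */
    static __m128 add4_xmm(__m128 a, __m128 b, __m128 c, __m128 d)
    {
        return _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d));
    }

Rewriting the helper as a statement macro sidesteps the ABI entirely, at the cost that callers must pass lvalue vector variables, since the macro body overwrites its "arguments" in place.
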
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                                  x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5;\
+    __m128 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH;\
+\
+    _t1         = _mm256_loadu_ps(ptrA);\
+    _t2         = _mm256_loadu_ps(ptrB);\
+    _t3         = _mm256_loadu_ps(ptrC);\
+    _t4         = _mm256_loadu_ps(ptrD);\
+    _tA         = _mm_loadu_ps(ptrA+8);\
+    _tB         = _mm_loadu_ps(ptrB+8);\
+    _tC         = _mm_loadu_ps(ptrC+8);\
+    _tD         = _mm_loadu_ps(ptrD+8);\
+    _t5         = _mm256_unpacklo_ps(x1,y1);\
+    x1          = _mm256_unpackhi_ps(x1,y1);\
+    y1          = _mm256_unpacklo_ps(z1,x2);\
+    z1          = _mm256_unpackhi_ps(z1,x2);\
+    x2          = _mm256_unpacklo_ps(y2,z2);\
+    y2          = _mm256_unpackhi_ps(y2,z2);\
+    z2          = _mm256_unpacklo_ps(x3,y3);\
+    x3          = _mm256_unpackhi_ps(x3,y3);\
+    y3          = _mm256_unpacklo_ps(z3,x4);\
+    z3          = _mm256_unpackhi_ps(z3,x4);\
+    x4          = _mm256_unpacklo_ps(y4,z4);\
+    y4          = _mm256_unpackhi_ps(y4,z4);\
+    x2          = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+    x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+    y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);\
+    z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+    z2          = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(1,0,1,0));\
+    _t5         = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(3,2,3,2));\
+    y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+    x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+    _tE         = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(1,0,1,0));\
+    _tF         = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(3,2,3,2));\
+    _tG         = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(1,0,1,0));\
+    _tH         = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_sub_ps(_t1,z2);\
+    _t2         = _mm256_sub_ps(_t2,_t5);\
+    _t3         = _mm256_sub_ps(_t3,y1);\
+    _t4         = _mm256_sub_ps(_t4,x1);\
+    _tA         = _mm_sub_ps(_tA,_tE);\
+    _tB         = _mm_sub_ps(_tB,_tF);\
+    _tC         = _mm_sub_ps(_tC,_tG);\
+    _tD         = _mm_sub_ps(_tD,_tH);\
+    _mm256_storeu_ps(ptrA,_t1);\
+    _mm256_storeu_ps(ptrB,_t2);\
+    _mm256_storeu_ps(ptrC,_t3);\
+    _mm256_storeu_ps(ptrD,_t4);\
+    _mm_storeu_ps(ptrA+8,_tA);\
+    _mm_storeu_ps(ptrB+8,_tB);\
+    _mm_storeu_ps(ptrC+8,_tC);\
+    _mm_storeu_ps(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3,
-                                          __m256 x4, __m256 y4, __m256 z4)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3,
+        __m256 x4, __m256 y4, __m256 z4)
 {
     __m256 t1,t2,t3,t4,t5;
     __m128 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -745,15 +855,15 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_storeu_ps(ptrC+8,tC);
     _mm_storeu_ps(ptrD+8,tD);
 }
-
+#endif
 
 
 static gmx_inline void
 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1)
 {
     __m256 t1,t2,t3,t4,t5,t6;
     __m256 tA,tB,tC,tD;
@@ -762,10 +872,10 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     /* Construct a mask without executing any data loads */
     mask        = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
 
-    tA          = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask));
-    tB          = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask));
-    tC          = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask));
-    tD          = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask));
+    tA          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
+    tB          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
+    tC          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
+    tD          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
     t1          = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
     t2          = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 
@@ -779,26 +889,103 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     tC          = _mm256_sub_ps(tC,t5);
     tD          = _mm256_sub_ps(tD,t6);
 
-    _mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
-    _mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
-    _mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
-    _mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
-    _mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
-    _mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
-    _mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
-    _mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
+    gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
+    gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
+    gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
+    gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
+    gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
+    gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
+    gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
+    gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
 }
 
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+    _tA         = _mm256_loadu_ps(ptrA);\
+    _tB         = _mm256_loadu_ps(ptrB);\
+    _tC         = _mm256_loadu_ps(ptrC);\
+    _tD         = _mm256_loadu_ps(ptrD);\
+    _tE         = _mm256_loadu_ps(ptrE);\
+    _tF         = _mm256_loadu_ps(ptrF);\
+    _tG         = _mm256_loadu_ps(ptrG);\
+    _tH         = _mm256_loadu_ps(ptrH);\
+    _t1         = _mm256_unpacklo_ps(_x1,_y1);\
+    _t2         = _mm256_unpackhi_ps(_x1,_y1);\
+    _t3         = _mm256_unpacklo_ps(_z1,_x2);\
+    _t4         = _mm256_unpackhi_ps(_z1,_x2);\
+    _t5         = _mm256_unpacklo_ps(_y2,_z2);\
+    _t6         = _mm256_unpackhi_ps(_y2,_z2);\
+    _t7         = _mm256_unpacklo_ps(_x3,_y3);\
+    _t8         = _mm256_unpackhi_ps(_x3,_y3);\
+    _t9         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+    _t10        = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+    _t11        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+    _t12        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+    _t2         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+    _t3         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+    _t4         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+    _t5         = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+    _t6         = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+    _t7         = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+    _t8         = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+    _t1         = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+    _t2         = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+    _t9         = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+    _t10        = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+    _tA         = _mm256_sub_ps(_tA,_t5);\
+    _tB         = _mm256_sub_ps(_tB,_t7);\
+    _tC         = _mm256_sub_ps(_tC,_t1);\
+    _tD         = _mm256_sub_ps(_tD,_t9);\
+    _tE         = _mm256_sub_ps(_tE,_t6);\
+    _tF         = _mm256_sub_ps(_tF,_t8);\
+    _tG         = _mm256_sub_ps(_tG,_t2);\
+    _tH         = _mm256_sub_ps(_tH,_t10);\
+    _mm256_storeu_ps(ptrA,_tA);\
+    _mm256_storeu_ps(ptrB,_tB);\
+    _mm256_storeu_ps(ptrC,_tC);\
+    _mm256_storeu_ps(ptrD,_tD);\
+    _mm256_storeu_ps(ptrE,_tE);\
+    _mm256_storeu_ps(ptrF,_tF);\
+    _mm256_storeu_ps(ptrG,_tG);\
+    _mm256_storeu_ps(ptrH,_tH);\
+    _tI         = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));\
+    _tJ         = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));\
+    _tK         = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));\
+    _tL         = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));\
+    _tI         = _mm256_unpacklo_ps(_tI,_tK);\
+    _tJ         = _mm256_unpacklo_ps(_tJ,_tL);\
+    _tI         = _mm256_unpacklo_ps(_tI,_tJ);\
+    _tI         = _mm256_sub_ps(_tI,_z3);\
+    _tJ         = _mm256_permute_ps(_tI,_MM_SHUFFLE(1,1,1,1));\
+    _tK         = _mm256_permute_ps(_tI,_MM_SHUFFLE(2,2,2,2));\
+    _tL         = _mm256_permute_ps(_tI,_MM_SHUFFLE(3,3,3,3));\
+    _mm_store_ss(ptrA+8,_mm256_castps256_ps128(_tI));\
+    _mm_store_ss(ptrB+8,_mm256_castps256_ps128(_tJ));\
+    _mm_store_ss(ptrC+8,_mm256_castps256_ps128(_tK));\
+    _mm_store_ss(ptrD+8,_mm256_castps256_ps128(_tL));\
+    _mm_store_ss(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+    _mm_store_ss(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+    _mm_store_ss(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+    _mm_store_ss(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3)
 {
     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
     __m256 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -859,12 +1046,12 @@ gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm256_storeu_ps(ptrF,tF);
     _mm256_storeu_ps(ptrG,tG);
     _mm256_storeu_ps(ptrH,tH);
-    
+
     tI          = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
     tJ          = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
     tK          = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
     tL          = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));
-    
+
     tI          = _mm256_unpacklo_ps(tI,tK);  /*  -  - zG zE |  -  - zC zA */
     tJ          = _mm256_unpacklo_ps(tJ,tL);  /*  -  - zH zF |  -  - zD zB */
     tI          = _mm256_unpacklo_ps(tI,tJ);  /* zH zG zF zE | zD zC zB zA */
@@ -883,17 +1070,102 @@ gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_store_ss(ptrG+8,_mm256_extractf128_ps(tK,0x1));
     _mm_store_ss(ptrH+8,_mm256_extractf128_ps(tL,0x1));
 }
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+    _tA         = _mm256_loadu_ps(ptrA);\
+    _tB         = _mm256_loadu_ps(ptrB);\
+    _tC         = _mm256_loadu_ps(ptrC);\
+    _tD         = _mm256_loadu_ps(ptrD);\
+    _tE         = _mm256_loadu_ps(ptrE);\
+    _tF         = _mm256_loadu_ps(ptrF);\
+    _tG         = _mm256_loadu_ps(ptrG);\
+    _tH         = _mm256_loadu_ps(ptrH);\
+    _t1         = _mm256_unpacklo_ps(_x1,_y1);\
+    _t2         = _mm256_unpackhi_ps(_x1,_y1);\
+    _t3         = _mm256_unpacklo_ps(_z1,_x2);\
+    _t4         = _mm256_unpackhi_ps(_z1,_x2);\
+    _t5         = _mm256_unpacklo_ps(_y2,_z2);\
+    _t6         = _mm256_unpackhi_ps(_y2,_z2);\
+    _t7         = _mm256_unpacklo_ps(_x3,_y3);\
+    _t8         = _mm256_unpackhi_ps(_x3,_y3);\
+    _t9         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+    _t10        = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+    _t11        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+    _t12        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+    _t2         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+    _t3         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+    _t4         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+    _t5         = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+    _t6         = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+    _t7         = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+    _t8         = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+    _t1         = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+    _t2         = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+    _t9         = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+    _t10        = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+    _tA         = _mm256_sub_ps(_tA,_t5);\
+    _tB         = _mm256_sub_ps(_tB,_t7);\
+    _tC         = _mm256_sub_ps(_tC,_t1);\
+    _tD         = _mm256_sub_ps(_tD,_t9);\
+    _tE         = _mm256_sub_ps(_tE,_t6);\
+    _tF         = _mm256_sub_ps(_tF,_t8);\
+    _tG         = _mm256_sub_ps(_tG,_t2);\
+    _tH         = _mm256_sub_ps(_tH,_t10);\
+    _mm256_storeu_ps(ptrA,_tA);\
+    _mm256_storeu_ps(ptrB,_tB);\
+    _mm256_storeu_ps(ptrC,_tC);\
+    _mm256_storeu_ps(ptrD,_tD);\
+    _mm256_storeu_ps(ptrE,_tE);\
+    _mm256_storeu_ps(ptrF,_tF);\
+    _mm256_storeu_ps(ptrG,_tG);\
+    _mm256_storeu_ps(ptrH,_tH);\
+    _tI         = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8));\
+    _tJ         = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));\
+    _tK         = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));\
+    _tL         = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));\
+    _t1         = _mm256_unpacklo_ps(_z3,_x4);\
+    _t2         = _mm256_unpackhi_ps(_z3,_x4);\
+    _t3         = _mm256_unpacklo_ps(_y4,_z4);\
+    _t4         = _mm256_unpackhi_ps(_y4,_z4);\
+    _t5         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+    _t6         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+    _t7         = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+    _t8         = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+    _tI         = _mm256_sub_ps(_tI,_t5);\
+    _tJ         = _mm256_sub_ps(_tJ,_t6);\
+    _tK         = _mm256_sub_ps(_tK,_t7);\
+    _tL         = _mm256_sub_ps(_tL,_t8);\
+    _mm_storeu_ps(ptrA+8,_mm256_castps256_ps128(_tI));\
+    _mm_storeu_ps(ptrB+8,_mm256_castps256_ps128(_tJ));\
+    _mm_storeu_ps(ptrC+8,_mm256_castps256_ps128(_tK));\
+    _mm_storeu_ps(ptrD+8,_mm256_castps256_ps128(_tL));\
+    _mm_storeu_ps(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+    _mm_storeu_ps(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+    _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+    _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3,
-                                          __m256 x4, __m256 y4, __m256 z4)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3,
+        __m256 x4, __m256 y4, __m256 z4)
 {
     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
     __m256 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -959,7 +1231,7 @@ gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     tJ          = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));
     tK          = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));
     tL          = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));
-    
+
     t1          = _mm256_unpacklo_ps(z3,x4); /* x4f z3f x4e z3e | x4b z3b x4a z3a */
     t2          = _mm256_unpackhi_ps(z3,x4); /* x4h z3h x4g z3g | x4d z3d x4c z3c */
     t3          = _mm256_unpacklo_ps(y4,z4); /* z4f y4f z4e y4e | z4b y4b z4a y4a */
@@ -984,13 +1256,13 @@ gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(tK,0x1));
     _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(tL,0x1));
 }
-
+#endif
 
 
 static gmx_inline void
 gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                      float * gmx_restrict fptr,
-                                      float * gmx_restrict fshiftptr)
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
 {
     __m128 t1,t2,t3;
 
@@ -1000,7 +1272,7 @@ gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
 
     /* Add across the two lanes */
     t1   = _mm_add_ps(_mm256_castps256_ps128(fix1),_mm256_extractf128_ps(fix1,0x1));
-    
+
     t2 = _mm_load_ss(fptr);
     t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
     t3 = _mm_load_ss(fshiftptr);
@@ -1015,12 +1287,53 @@ gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
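
The function above also shows the reduction idiom shared by all the update routines in this file: hadd folds within each 128-bit lane, then a final _mm_add_ps combines the two lanes (the "Add across the two lanes" comment marks that step). The same idea as a free-standing horizontal sum (a sketch with a hypothetical name, not part of the patch):

    /* Sketch: sum all eight floats of a __m256 into a scalar. */
    static gmx_inline float
    gmx_mm256_sum8_example(__m256 v)
    {
        __m128 s = _mm_add_ps(_mm256_castps256_ps128(v),
                              _mm256_extractf128_ps(v, 0x1));
        s = _mm_hadd_ps(s, s);
        s = _mm_hadd_ps(s, s);
        return _mm_cvtss_f32(s);
    }
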
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                                 fptr,fshiftptr) \
+{ \
+    __m256 _t1,_t2,_t3;\
+    __m128 _tA,_tB,_tC;\
+\
+    fix1 = _mm256_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm256_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+    fix1 = _mm256_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+\
+    _t1  = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+    _t2  = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+    _t1  = _mm256_add_ps(_t1,_t2);\
+    _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+    _t3  = _mm256_loadu_ps(fptr);\
+    _t3  = _mm256_add_ps(_t3,_t1);\
+    _mm256_storeu_ps(fptr,_t3);\
+    _tB  = _mm_load_ss(fptr+8);\
+    _tB  = _mm_add_ss(_tB,_tA);\
+    _mm_store_ss(fptr+8,_tB);\
+\
+    _tB  = _mm256_extractf128_ps(_t1,0x1);\
+    _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+    _tB  = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+    _tC  = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+    _tB  = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+    _tA  = _mm_add_ps(_tB,_tC);\
+    _tA  = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+    _tC  = _mm_loadu_ps(fshiftptr);\
+    _tC  = _mm_add_ps(_tC,_tA);\
+    _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
-                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
-                                         float * gmx_restrict fptr,
-                                         float * gmx_restrict fshiftptr)
+        __m256 fix2, __m256 fiy2, __m256 fiz2,
+        __m256 fix3, __m256 fiy3, __m256 fiz3,
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
 {
     __m256 t1,t2,t3;
     __m128 tA,tB,tC;
@@ -1057,22 +1370,68 @@ gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
 
     tB   = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
     tA   = _mm_add_ps(tB,tC); /*  - z y x */
-    
+
     tA   = _mm_blend_ps(_mm_setzero_ps(),tA,0x7); /* 0 z y x */
 
     tC   = _mm_loadu_ps(fshiftptr);
     tC   = _mm_add_ps(tC,tA);
     _mm_storeu_ps(fshiftptr,tC);
 }
-
-
+#endif
+
+
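
A detail worth noting in the function that just ended: before accumulating into fshiftptr, the summed force passes through _mm_blend_ps with immediate 0x7 (the line commented /* 0 z y x */). Bit i of the blend immediate selects element i from the second operand, so 0x7 keeps x, y, z and takes the fourth element from zero; the four-wide load-add-store then leaves fshiftptr[3] unchanged. The idiom in isolation (hypothetical helper name):

    /* Sketch: zero the unused fourth element before a 4-wide accumulate. */
    static gmx_inline __m128
    gmx_mm_zero_element3_ps(__m128 v)
    {
        return _mm_blend_ps(_mm_setzero_ps(), v, 0x7);   /* { x, y, z, 0 } */
    }
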
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                                fptr,fshiftptr) \
+{ \
+    __m256 _t1,_t2,_t3; \
+    __m128 _tA,_tB,_tC; \
+\
+    fix1 = _mm256_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm256_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm256_hadd_ps(fiz3,fix4);\
+    fiy4 = _mm256_hadd_ps(fiy4,fiz4);\
+\
+    fix1 = _mm256_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm256_hadd_ps(fiz3,fiy4);\
+\
+    _t1  = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+    _t2  = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+    _t1  = _mm256_add_ps(_t1,_t2);\
+    _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+    _t3  = _mm256_loadu_ps(fptr);\
+    _t3  = _mm256_add_ps(_t3,_t1);\
+    _mm256_storeu_ps(fptr,_t3);\
+    _tB  = _mm_loadu_ps(fptr+8);\
+    _tB  = _mm_add_ps(_tB,_tA);\
+    _mm_storeu_ps(fptr+8,_tB);\
+\
+    _tB  = _mm256_extractf128_ps(_t1,0x1);\
+    _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+    _tB  = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+    _tC  = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+    _tA  = _mm_permute_ps(_tA,_MM_SHUFFLE(0,3,2,1));\
+    _tB  = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+    _tA  = _mm_add_ps(_tA,_tC);\
+    _tA  = _mm_add_ps(_tA,_tB);\
+    _tA  = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+    _tC  = _mm_loadu_ps(fshiftptr);\
+    _tC  = _mm_add_ps(_tC,_tA);\
+    _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
-                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
-                                         __m256 fix4, __m256 fiy4, __m256 fiz4,
-                                         float * gmx_restrict fptr,
-                                         float * gmx_restrict fshiftptr)
+        __m256 fix2, __m256 fiy2, __m256 fiz2,
+        __m256 fix3, __m256 fiy3, __m256 fiz3,
+        __m256 fix4, __m256 fiy4, __m256 fiz4,
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
 {
     __m256 t1,t2,t3;
     __m128 tA,tB,tC;
@@ -1120,6 +1479,7 @@ gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
     tC   = _mm_add_ps(tC,tA);
     _mm_storeu_ps(fshiftptr,tC);
 }
+#endif
 
 
 
@@ -1153,26 +1513,4 @@ gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
 }
 
 
-static gmx_inline void
-gmx_mm256_update_4pot_ps(__m256 pot1, float * gmx_restrict ptrA,
-                         __m256 pot2, float * gmx_restrict ptrB,
-                         __m256 pot3, float * gmx_restrict ptrC,
-                         __m256 pot4, float * gmx_restrict ptrD)
-{
-    __m128 t1,t2,t3,t4;
-
-    pot1 = _mm256_hadd_ps(pot1,pot2);
-    pot3 = _mm256_hadd_ps(pot3,pot4);
-    pot1 = _mm256_hadd_ps(pot1,pot3);
-    t1   = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
-    t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
-    t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
-    t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
-    _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
-    _mm_store_ss(ptrC,_mm_add_ss(_mm_load_ss(ptrC),t3));
-    _mm_store_ss(ptrD,_mm_add_ss(_mm_load_ss(ptrD),t4));
-}
-
-
 #endif /* _kernelutil_x86_avx_256_single_h_ */
index 006439173d4e8011d395e3195bbdd33580354c0a..35fb80eafc4936c869567813feb2868270545f5b 100644 (file)
@@ -138,10 +138,10 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz;
 
@@ -161,10 +161,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
 
@@ -199,11 +199,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
 
@@ -247,9 +247,9 @@ static gmx_inline void
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
 }
 
 static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
 }
 
 static gmx_inline void
@@ -385,7 +385,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 /* Routines to decrement rvec in memory, typically used for j particle force updates */
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
+        __m128d xy, __m128d z)
 {
     __m128d t1,t2;
 
@@ -399,77 +399,6 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
     _mm_store_sd(ptrA+2,t2);
 }
 
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
@@ -490,6 +419,33 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_load_sd(ptrA+8);\
+    _x1          = _mm_unpacklo_pd(_x1,_y1);\
+    _z1          = _mm_unpacklo_pd(_z1,_x2);\
+    _y2          = _mm_unpacklo_pd(_y2,_z2);\
+    _x3          = _mm_unpacklo_pd(_x3,_y3);\
+    _t1          = _mm_sub_pd(_t1,_x1);\
+    _t2          = _mm_sub_pd(_t2,_z1);\
+    _t3          = _mm_sub_pd(_t3,_y2);\
+    _t4          = _mm_sub_pd(_t4,_x3);\
+    _t5          = _mm_sub_sd(_t5,_z3);\
+    _mm_storeu_pd(ptrA,_t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -521,8 +477,35 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6,t4);
     _mm_store_sd(ptrA+8,t5);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_loadu_pd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrA+10);\
+    _x1          = _mm_unpacklo_pd(_x1,_y1);\
+    _z1          = _mm_unpacklo_pd(_z1,_x2);\
+    _y2          = _mm_unpacklo_pd(_y2,_z2);\
+    _x3          = _mm_unpacklo_pd(_x3,_y3);\
+    _z3          = _mm_unpacklo_pd(_z3,_x4);\
+    _y4          = _mm_unpacklo_pd(_y4,_z4);\
+    _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+    _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -553,6 +536,8 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
 }
+#endif
+
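
Every swizzle in this double-precision file reduces to the same two primitives: _mm_unpacklo_pd pairs the low elements of two registers and _mm_unpackhi_pd pairs the high ones, which is why the ptrA stream receives the _tA/_tC/... halves and the ptrB stream the _tB/_tD/... halves in the two-pointer variants that follow. A standalone demonstration (hypothetical test program, not part of the patch):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        __m128d a = _mm_set_pd(1.0, 0.0);     /* _mm_set_pd lists high, low */
        __m128d b = _mm_set_pd(11.0, 10.0);
        double  r[2];

        _mm_storeu_pd(r, _mm_unpacklo_pd(a, b));
        printf("%g %g\n", r[0], r[1]);        /* 0 10 : the two low halves  */
        _mm_storeu_pd(r, _mm_unpackhi_pd(a, b));
        printf("%g %g\n", r[0], r[1]);        /* 1 11 : the two high halves */
        return 0;
    }
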
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
@@ -581,6 +566,54 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_store_sd(ptrB+2,t4);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_load_sd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrB);\
+    _t7          = _mm_loadu_pd(ptrB+2);\
+    _t8          = _mm_loadu_pd(ptrB+4);\
+    _t9          = _mm_loadu_pd(ptrB+6);\
+    _t10         = _mm_load_sd(ptrB+8);\
+    _tA          = _mm_unpacklo_pd(_x1,_y1);\
+    _tB          = _mm_unpackhi_pd(_x1,_y1);\
+    _tC          = _mm_unpacklo_pd(_z1,_x2);\
+    _tD          = _mm_unpackhi_pd(_z1,_x2);\
+    _tE          = _mm_unpacklo_pd(_y2,_z2);\
+    _tF          = _mm_unpackhi_pd(_y2,_z2);\
+    _tG          = _mm_unpacklo_pd(_x3,_y3);\
+    _tH          = _mm_unpackhi_pd(_x3,_y3);\
+    _tI          = _mm_unpackhi_pd(_z3,_z3);\
+    _t1          = _mm_sub_pd(_t1,_tA);\
+    _t2          = _mm_sub_pd(_t2,_tC);\
+    _t3          = _mm_sub_pd(_t3,_tE);\
+    _t4          = _mm_sub_pd(_t4,_tG);\
+    _t5          = _mm_sub_sd(_t5,_z3);\
+    _t6          = _mm_sub_pd(_t6,_tB);\
+    _t7          = _mm_sub_pd(_t7,_tD);\
+    _t8          = _mm_sub_pd(_t8,_tF);\
+    _t9          = _mm_sub_pd(_t9,_tH);\
+    _t10         = _mm_sub_sd(_t10,_tI);\
+    _mm_storeu_pd(ptrA,_t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_store_sd(ptrA+8,_t5);\
+    _mm_storeu_pd(ptrB,_t6);\
+    _mm_storeu_pd(ptrB+2,_t7);\
+    _mm_storeu_pd(ptrB+4,_t8);\
+    _mm_storeu_pd(ptrB+6,_t9);\
+    _mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -634,8 +667,66 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6,t9);
     _mm_store_sd(ptrB+8,t10);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_loadu_pd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrA+10);\
+    _t7          = _mm_loadu_pd(ptrB);\
+    _t8          = _mm_loadu_pd(ptrB+2);\
+    _t9          = _mm_loadu_pd(ptrB+4);\
+    _t10         = _mm_loadu_pd(ptrB+6);\
+    _t11         = _mm_loadu_pd(ptrB+8);\
+    _t12         = _mm_loadu_pd(ptrB+10);\
+    _tA          = _mm_unpacklo_pd(_x1,_y1);\
+    _tB          = _mm_unpackhi_pd(_x1,_y1);\
+    _tC          = _mm_unpacklo_pd(_z1,_x2);\
+    _tD          = _mm_unpackhi_pd(_z1,_x2);\
+    _tE          = _mm_unpacklo_pd(_y2,_z2);\
+    _tF          = _mm_unpackhi_pd(_y2,_z2);\
+    _tG          = _mm_unpacklo_pd(_x3,_y3);\
+    _tH          = _mm_unpackhi_pd(_x3,_y3);\
+    _tI          = _mm_unpacklo_pd(_z3,_x4);\
+    _tJ          = _mm_unpackhi_pd(_z3,_x4);\
+    _tK          = _mm_unpacklo_pd(_y4,_z4);\
+    _tL          = _mm_unpackhi_pd(_y4,_z4);\
+    _t1          = _mm_sub_pd(_t1,_tA);\
+    _t2          = _mm_sub_pd(_t2,_tC);\
+    _t3          = _mm_sub_pd(_t3,_tE);\
+    _t4          = _mm_sub_pd(_t4,_tG);\
+    _t5          = _mm_sub_pd(_t5,_tI);\
+    _t6          = _mm_sub_pd(_t6,_tK);\
+    _t7          = _mm_sub_pd(_t7,_tB);\
+    _t8          = _mm_sub_pd(_t8,_tD);\
+    _t9          = _mm_sub_pd(_t9,_tF);\
+    _t10         = _mm_sub_pd(_t10,_tH);\
+    _t11         = _mm_sub_pd(_t11,_tJ);\
+    _t12         = _mm_sub_pd(_t12,_tL);\
+    _mm_storeu_pd(ptrA,  _t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_storeu_pd(ptrA+8,_t5);\
+    _mm_storeu_pd(ptrA+10,_t6);\
+    _mm_storeu_pd(ptrB,  _t7);\
+    _mm_storeu_pd(ptrB+2,_t8);\
+    _mm_storeu_pd(ptrB+4,_t9);\
+    _mm_storeu_pd(ptrB+6,_t10);\
+    _mm_storeu_pd(ptrB+8,_t11);\
+    _mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -699,6 +790,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8,t11);
     _mm_storeu_pd(ptrB+10,t12);
 }
+#endif
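
The double-precision i-force macros in the next hunk lean on GMX_MM_TRANSPOSE2_PD, the two-wide counterpart of _MM_TRANSPOSE4_PS. It is presumably an in-place 2x2 transpose of two __m128d rows, along these lines (a sketch, not quoted from the patch):

    /* Sketch: afterwards row0 holds the two low elements and row1 the
     * two high elements of the original rows, i.e. a 2x2 transpose. */
    #define GMX_MM_TRANSPOSE2_PD(row0, row1)         \
        {                                            \
            __m128d __gmx_t1 = row0;                 \
            row0 = _mm_unpacklo_pd(row0, row1);      \
            row1 = _mm_unpackhi_pd(__gmx_t1, row1);  \
        }
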
 
 
 
@@ -726,6 +818,39 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+    _t1 = fix3;\
+    fix3 = _mm_unpacklo_pd(fix3,fiy3);\
+    fiy3 = _mm_unpackhi_pd(_t1,fiy3);\
+    fix1 = _mm_add_pd(fix1,fiy1);\
+    fiz1 = _mm_add_pd(fiz1,fix2);\
+    fiy2 = _mm_add_pd(fiy2,fiz2);\
+    fix3 = _mm_add_pd(fix3,fiy3);\
+    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3));\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -767,8 +892,46 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+    GMX_MM_TRANSPOSE2_PD(fix3,fiy3);\
+    GMX_MM_TRANSPOSE2_PD(fiz3,fix4);\
+    GMX_MM_TRANSPOSE2_PD(fiy4,fiz4);\
+    fix1 = _mm_add_pd(fix1,fiy1);\
+    fiz1 = _mm_add_pd(fiz1,fix2);\
+    fiy2 = _mm_add_pd(fiy2,fiz2);\
+    fix3 = _mm_add_pd(fix3,fiy3);\
+    fiz3 = _mm_add_pd(fiz3,fix4);\
+    fiy4 = _mm_add_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -793,7 +956,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     fix3 = _mm_add_pd(fix3,fiy3);
     fiz3 = _mm_add_pd(fiz3,fix4);
     fiy4 = _mm_add_pd(fiy4,fiz4);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
     _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
     _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
@@ -814,7 +977,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
+#endif
 
 
 static gmx_inline void
index 8174970457d3fcfe9bfe4d6d2d0893b831c9f24f..7d3ff2ab7aaeabad285036343c73a249819b2958 100644 (file)
@@ -38,7 +38,7 @@
 
 /* We require SSE2 now! */
 
-#include <math.h> 
+#include <math.h>
 
 #include "gmx_x86_sse2.h"
 
@@ -135,20 +135,20 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
-    
+
     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
     t3   = _mm_load_ss(xyz_shift+2);
     t4   = _mm_load_ss(xyz+2);
     t1   = _mm_add_ps(t1,t2);
     t3   = _mm_add_ss(t3,t4);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
@@ -157,30 +157,30 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     tB   = _mm_load_ss(xyz_shift+2);
-    
+
     t1   = _mm_loadu_ps(xyz);
     t2   = _mm_loadu_ps(xyz+4);
     t3   = _mm_load_ss(xyz+8);
-    
+
     tA   = _mm_movelh_ps(tA,tB);
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ss(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
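
Two idioms recur in this file. First, the three shuffles of tA above build rotated copies of the shift vector, (sx, sy, sz, sx), (sy, sz, sx, sy) and (sz, sx, sy, sz), so that three packed adds apply the correct shift component to every slot of the packed x1 y1 z1 x2 y2 z2 ... stream. Second, _mm_shuffle_ps of a register with itself under an _MM_SHUFFLE(i,i,i,i) immediate broadcasts element i to all four positions, SSE2 having no dedicated register broadcast. The splat idiom in isolation (hypothetical macro name):

    /* Sketch: replicate element i of v into all four positions. */
    #define gmx_mm_splat_ps_example(v, i) \
        _mm_shuffle_ps((v), (v), _MM_SHUFFLE(i, i, i, i))
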
@@ -195,31 +195,31 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
     tB   = _mm_load_ss(xyz_shift+2);
-    
+
     t1   = _mm_loadu_ps(xyz);
     t2   = _mm_loadu_ps(xyz+4);
     t3   = _mm_loadu_ps(xyz+8);
-    
+
     tA   = _mm_movelh_ps(tA,tB);
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ps(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -270,7 +270,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   const float * gmx_restrict ptrD,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3) 
+                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 t1,t2,t3,t4;
     t1            = _mm_loadu_ps(ptrA);
@@ -309,7 +309,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                   __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4) 
+                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 t1,t2,t3,t4;
     t1            = _mm_loadu_ps(ptrA);
@@ -380,12 +380,78 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
 
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18         = _mm_movehl_ps(_z3,_z3);\
+_t19         = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20         = _mm_movelh_ps(_x1,_z1);\
+_t21         = _mm_movehl_ps(_z1,_x1);\
+_t22         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t23         = _mm_movelh_ps(_y2,_x3);\
+_t24         = _mm_movehl_ps(_x3,_y2);\
+_t25         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_load_ss(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t20);\
+_t2          = _mm_sub_ps(_t2,_t23);\
+_t3          = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_load_ss(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_t21);\
+_t5          = _mm_sub_ps(_t5,_t24);\
+_t6          = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_load_ss(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t22);\
+_t8          = _mm_sub_ps(_t8,_t25);\
+_t9          = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_load_ss(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
-                                       __m128 x3, __m128 y3, __m128 z3) 
+                                       __m128 x3, __m128 y3, __m128 z3)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
@@ -447,15 +513,87 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_store_ss(ptrD+8,t12);
 }
-
-
+#endif
+
+
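
The #if branch above exists because 32-bit MSVC refuses more than three __m128/__m128d arguments passed by value (error C2719: aligned types cannot be passed on the stack), so every routine with a wider signature is duplicated as a statement macro. A minimal sketch of the pattern, using a hypothetical four-argument helper rather than any routine from this header:

    #include <xmmintrin.h>

    #if defined (_MSC_VER) && defined(_M_IX86)
    /* Four __m128 value parameters trigger MSVC error C2719 on x86,
     * so the operation is spelled as a statement macro instead. */
    #define sum4(a,b,c,d,out) \
    { *(out) = _mm_add_ps(_mm_add_ps((a),(b)),_mm_add_ps((c),(d))); }
    #else
    static void
    sum4(__m128 a, __m128 b, __m128 c, __m128 d, __m128 *out)
    {
        *out = _mm_add_ps(_mm_add_ps(a,b),_mm_add_ps(c,d));
    }
    #endif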
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_unpackhi_ps(_z3,_x4);\
+_z3          = _mm_unpacklo_ps(_z3,_x4);\
+_t18         = _mm_unpackhi_ps(_y4,_z4);\
+_y4          = _mm_unpacklo_ps(_y4,_z4);\
+_t19         = _mm_movelh_ps(_x1,_z1);\
+_z1          = _mm_movehl_ps(_z1,_x1);\
+_t20         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t21         = _mm_movelh_ps(_y2,_x3);\
+_x3          = _mm_movehl_ps(_x3,_y2);\
+_t22         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t23         = _mm_movelh_ps(_z3,_y4);\
+_y4          = _mm_movehl_ps(_y4,_z3);\
+_t24         = _mm_movelh_ps(_t17,_t18);\
+_t18         = _mm_movehl_ps(_t18,_t17);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_loadu_ps(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t19);\
+_t2          = _mm_sub_ps(_t2,_t21);\
+_t3          = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_loadu_ps(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_z1);\
+_t5          = _mm_sub_ps(_t5,_x3);\
+_t6          = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_loadu_ps(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t20);\
+_t8          = _mm_sub_ps(_t8,_t22);\
+_t9          = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_loadu_ps(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
                                        __m128 x3, __m128 y3, __m128 z3,
-                                       __m128 x4, __m128 y4, __m128 z4) 
+                                       __m128 x4, __m128 y4, __m128 z4)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
     __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
@@ -521,7 +659,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_storeu_ps(ptrD+8,t12);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -550,6 +688,38 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4;\
+\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _t2   = _mm_movehl_ps(_mm_setzero_ps(),fiz3);\
+    _t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));\
+    _t3   = _mm_shuffle_ps(_t2,_t2,_MM_SHUFFLE(0,0,0,1));\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,_t1)  , _mm_add_ps(_t2,_t3));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+    _t4 = _mm_load_ss(fshiftptr+2);\
+    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+    _t3 = _mm_shuffle_ps(_t3  ,_t3  ,_MM_SHUFFLE(1,2,0,0));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _mm_store_ss(fshiftptr+2,_t1);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -589,8 +759,39 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t1);
     _mm_storeh_pi((__m64 *)(fshiftptr),t1);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5;\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+    _t5 = _mm_load_ss(fshiftptr+2);\
+    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+    _t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+    _t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+    _t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _t5 = _mm_add_ps(_t5,_t1);\
+    _mm_store_ss(fshiftptr+2,_t5);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -631,7 +832,7 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t5);
     _mm_storeh_pi((__m64 *)(fshiftptr),t5);
 }
-
+#endif
 
 
 static void
@@ -658,22 +859,4 @@ gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
 }
 
 
-static void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
 #endif /* _kernelutil_x86_sse2_single_h_ */
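
gmx_mm_update_4pot_ps, deleted above, reduced four lane-wise potential accumulators with _MM_TRANSPOSE4_PS followed by vertical adds; after the transpose, lane i of the summed register is the horizontal sum of accumulator i. Its scalar equivalent, for reference:

    /* Scalar equivalent of the deleted helper: each output pointer is
     * incremented by the horizontal sum of one accumulator register. */
    static void update_4pot_scalar(const float p1[4], float *ptrA,
                                   const float p2[4], float *ptrB,
                                   const float p3[4], float *ptrC,
                                   const float p4[4], float *ptrD)
    {
        *ptrA += p1[0] + p1[1] + p1[2] + p1[3];
        *ptrB += p2[0] + p2[1] + p2[2] + p2[3];
        *ptrC += p3[0] + p3[1] + p3[2] + p3[3];
        *ptrD += p4[0] + p4[1] + p4[2] + p4[3];
    }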
index e7bb484515c65505b2963a05b71e98c29613664f..f304aa5d222f9cbefedebd244abba4041f7974de 100644 (file)
@@ -138,10 +138,10 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz;
 
@@ -161,10 +161,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
 
@@ -199,11 +199,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
 
@@ -247,9 +247,9 @@ static gmx_inline void
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
 }
 
 static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
 }
 
 static gmx_inline void
@@ -385,7 +385,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 /* Routines to decrement rvec in memory, typically used for j-particle force updates */
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
+        __m128d xy, __m128d z)
 {
     __m128d t1,t2;
 
@@ -399,77 +399,6 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
     _mm_store_sd(ptrA+2,t2);
 }
 
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
@@ -490,6 +419,33 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_t1          = _mm_sub_pd(_t1,_x1);\
+_t2          = _mm_sub_pd(_t2,_z1);\
+_t3          = _mm_sub_pd(_t3,_y2);\
+_t4          = _mm_sub_pd(_t4,_x3);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -521,8 +477,35 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6,t4);
     _mm_store_sd(ptrA+8,t5);
 }
-
-
+#endif
+
+
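
In the double-precision swizzle decrements, _mm_unpacklo_pd packs the low lanes of two registers so that a single 16-byte load/subtract/store updates two consecutive doubles in memory. The core of the trick in isolation (decrement_xy is an illustrative name):

    #include <emmintrin.h>

    /* _mm_unpacklo_pd(x,y) = (x0,y0): pack two scalars, then one vector
     * subtract decrements fx and fy together. */
    static void decrement_xy(double *ptr, __m128d x, __m128d y)
    {
        __m128d xy  = _mm_unpacklo_pd(x, y);
        __m128d mem = _mm_loadu_pd(ptr);
        _mm_storeu_pd(ptr, _mm_sub_pd(mem, xy));
    }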
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_z3          = _mm_unpacklo_pd(_z3,_x4);\
+_y4          = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -553,6 +536,8 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
 }
+#endif
+
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
@@ -581,6 +566,54 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_store_sd(ptrB+2,t4);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrB);\
+_t7          = _mm_loadu_pd(ptrB+2);\
+_t8          = _mm_loadu_pd(ptrB+4);\
+_t9          = _mm_loadu_pd(ptrB+6);\
+_t10         = _mm_load_sd(ptrB+8);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpackhi_pd(_z3,_z3);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_t6          = _mm_sub_pd(_t6,_tB);\
+_t7          = _mm_sub_pd(_t7,_tD);\
+_t8          = _mm_sub_pd(_t8,_tF);\
+_t9          = _mm_sub_pd(_t9,_tH);\
+_t10         = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -634,8 +667,66 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6,t9);
     _mm_store_sd(ptrB+8,t10);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_t7          = _mm_loadu_pd(ptrB);\
+_t8          = _mm_loadu_pd(ptrB+2);\
+_t9          = _mm_loadu_pd(ptrB+4);\
+_t10         = _mm_loadu_pd(ptrB+6);\
+_t11         = _mm_loadu_pd(ptrB+8);\
+_t12         = _mm_loadu_pd(ptrB+10);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpacklo_pd(_z3,_x4);\
+_tJ          = _mm_unpackhi_pd(_z3,_x4);\
+_tK          = _mm_unpacklo_pd(_y4,_z4);\
+_tL          = _mm_unpackhi_pd(_y4,_z4);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_pd(_t5,_tI);\
+_t6          = _mm_sub_pd(_t6,_tK);\
+_t7          = _mm_sub_pd(_t7,_tB);\
+_t8          = _mm_sub_pd(_t8,_tD);\
+_t9          = _mm_sub_pd(_t9,_tF);\
+_t10         = _mm_sub_pd(_t10,_tH);\
+_t11         = _mm_sub_pd(_t11,_tJ);\
+_t12         = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -699,7 +790,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8,t11);
     _mm_storeu_pd(ptrB+10,t12);
 }
-
+#endif
 
 
 
@@ -719,6 +810,34 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
 
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -751,8 +870,39 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
+
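
The double-precision i-force updates lean on SSE3 _mm_hadd_pd(a,b) = (a0+a1, b0+b1): one instruction turns two per-lane accumulators into the packed (fx,fy) pair that is added to memory. A minimal sketch (reduce_fxy is an illustrative name):

    #include <pmmintrin.h>  /* SSE3: _mm_hadd_pd */

    static void reduce_fxy(double *f, __m128d fx_acc, __m128d fy_acc)
    {
        /* (fx_acc0+fx_acc1, fy_acc0+fy_acc1) accumulated into f[0],f[1] */
        __m128d fxy = _mm_hadd_pd(fx_acc, fy_acc);
        _mm_storeu_pd(f, _mm_add_pd(_mm_loadu_pd(f), fxy));
    }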
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fix4);\
+    fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -790,8 +940,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
 
 static gmx_inline void
 gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
index 761febfada665f306549e3ac3101c3e82df1964f..ef8362c1630fe12a5c73d643e55f8596b0f7f5c5 100644 (file)
 #ifndef _kernelutil_x86_sse4_1_single_h_
 #define _kernelutil_x86_sse4_1_single_h_
 
-#include <math.h> 
+#include <math.h>
 
 #include "gmx_x86_sse4_1.h"
 
 #undef gmx_restrict
-#define gmx_restrict 
+#define gmx_restrict
 
 /* Normal sum of four xmm registers */
 #define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
@@ -67,7 +67,7 @@ gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                              const float * gmx_restrict ptrD)
 {
     __m128 t1,t2;
-    
+
     t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
     t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
     return _mm_unpacklo_ps(t1,t2);
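
The two-level unpack above gathers four scattered scalars into lane order: unpacklo(A,C) and unpacklo(B,D) interleave the pairs, and a final unpacklo yields (A,B,C,D). A standalone check of the pattern:

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void)
    {
        float a = 1, b = 2, c = 3, d = 4, out[4];
        __m128 t1 = _mm_unpacklo_ps(_mm_load_ss(&a), _mm_load_ss(&c)); /* a c 0 0 */
        __m128 t2 = _mm_unpacklo_ps(_mm_load_ss(&b), _mm_load_ss(&d)); /* b d 0 0 */
        _mm_storeu_ps(out, _mm_unpacklo_ps(t1, t2));                   /* a b c d */
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);       /* 1 2 3 4 */
        return 0;
    }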
@@ -81,14 +81,14 @@ gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                               __m128 xmm1)
 {
     __m128 t2,t3,t4;
-    
-    t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               
-    t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     
-    t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1)); 
-    _mm_store_ss(ptrA,xmm1);                                           
-    _mm_store_ss(ptrB,t2);                                         
-    _mm_store_ss(ptrC,t3);                                         
-    _mm_store_ss(ptrD,t4);                                         
+
+    t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
+    t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+    t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(ptrA,xmm1);
+    _mm_store_ss(ptrB,t2);
+    _mm_store_ss(ptrC,t3);
+    _mm_store_ss(ptrD,t4);
 }
 
 /* Similar to store, but increments value in memory */
@@ -99,7 +99,7 @@ gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                   float * gmx_restrict ptrD, __m128 xmm1)
 {
     __m128 tmp;
-    
+
     tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
     tmp = _mm_add_ps(tmp,xmm1);
     gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
@@ -115,7 +115,7 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                              __m128 * gmx_restrict c12)
 {
     __m128 t1,t2,t3,t4;
-    
+
     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a  c6a */
     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b  c6b */
     t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c  c6c */
@@ -129,20 +129,20 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
-    
+
     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
     t3   = _mm_load_ss(xyz_shift+2);
     t4   = _mm_load_ss(xyz+2);
     t1   = _mm_add_ps(t1,t2);
     t3   = _mm_add_ss(t3,t4);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
@@ -151,14 +151,14 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     tB   = _mm_load_ss(xyz_shift+2);
 
@@ -170,11 +170,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ss(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -189,31 +189,31 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
     tB   = _mm_load_ss(xyz_shift+2);
-    
+
     t1   = _mm_loadu_ps(xyz);
     t2   = _mm_loadu_ps(xyz+4);
     t3   = _mm_loadu_ps(xyz+8);
-    
+
     tA   = _mm_movelh_ps(tA,tB);
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ps(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -264,7 +264,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   const float * gmx_restrict ptrD,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3) 
+                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 t1,t2,t3,t4;
     t1            = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrA ) );
@@ -303,7 +303,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                   __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4) 
+                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 t1,t2,t3,t4;
     t1            = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA) ) );
@@ -375,12 +375,78 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
 
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18         = _mm_movehl_ps(_z3,_z3);\
+_t19         = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20         = _mm_movelh_ps(_x1,_z1);\
+_t21         = _mm_movehl_ps(_z1,_x1);\
+_t22         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t23         = _mm_movelh_ps(_y2,_x3);\
+_t24         = _mm_movehl_ps(_x3,_y2);\
+_t25         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_load_ss(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t20);\
+_t2          = _mm_sub_ps(_t2,_t23);\
+_t3          = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_load_ss(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_t21);\
+_t5          = _mm_sub_ps(_t5,_t24);\
+_t6          = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_load_ss(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t22);\
+_t8          = _mm_sub_ps(_t8,_t25);\
+_t9          = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_load_ss(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
-                                       __m128 x3, __m128 y3, __m128 z3) 
+                                       __m128 x3, __m128 y3, __m128 z3)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
@@ -417,7 +483,7 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     t10         = _mm_loadu_ps(ptrD);
     t11         = _mm_loadu_ps(ptrD+4);
     t12         = _mm_load_ss(ptrD+8);
-    
+
     t1          = _mm_sub_ps(t1,t20);
     t2          = _mm_sub_ps(t2,t23);
     t3          = _mm_sub_ss(t3,z3);
@@ -443,15 +509,86 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_store_ss(ptrD+8,t12);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_unpackhi_ps(_z3,_x4);\
+_z3          = _mm_unpacklo_ps(_z3,_x4);\
+_t18         = _mm_unpackhi_ps(_y4,_z4);\
+_y4          = _mm_unpacklo_ps(_y4,_z4);\
+_t19         = _mm_movelh_ps(_x1,_z1);\
+_z1          = _mm_movehl_ps(_z1,_x1);\
+_t20         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t21         = _mm_movelh_ps(_y2,_x3);\
+_x3          = _mm_movehl_ps(_x3,_y2);\
+_t22         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t23         = _mm_movelh_ps(_z3,_y4);\
+_y4          = _mm_movehl_ps(_y4,_z3);\
+_t24         = _mm_movelh_ps(_t17,_t18);\
+_t18         = _mm_movehl_ps(_t18,_t17);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_loadu_ps(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t19);\
+_t2          = _mm_sub_ps(_t2,_t21);\
+_t3          = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_loadu_ps(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_z1);\
+_t5          = _mm_sub_ps(_t5,_x3);\
+_t6          = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_loadu_ps(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t20);\
+_t8          = _mm_sub_ps(_t8,_t22);\
+_t9          = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_loadu_ps(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
                                        __m128 x3, __m128 y3, __m128 z3,
-                                       __m128 x4, __m128 y4, __m128 z4) 
+                                       __m128 x4, __m128 y4, __m128 z4)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
     __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
@@ -517,7 +654,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_storeu_ps(ptrD+8,t12);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -525,27 +662,59 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       float * gmx_restrict fptr,
                                       float * gmx_restrict fshiftptr)
 {
-       __m128 t2,t3;
-       
+    __m128 t2,t3;
+
     fix1 = _mm_hadd_ps(fix1,fix1);
-       fiy1 = _mm_hadd_ps(fiy1,fiz1);
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
-    
-       t2 = _mm_load_ss(fptr);
-       t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
-       t3 = _mm_load_ss(fshiftptr);
-       t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
-       
-       t2 = _mm_add_ps(t2,fix1);
-       t3 = _mm_add_ps(t3,fix1);
-       
-       _mm_store_ss(fptr,t2);
-       _mm_storeh_pi((__m64 *)(fptr+1),t2);
-       _mm_store_ss(fshiftptr,t3);
-       _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
+    fiy1 = _mm_hadd_ps(fiy1,fiz1);
+
+    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
+
+    t2 = _mm_load_ss(fptr);
+    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
+    t3 = _mm_load_ss(fshiftptr);
+    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
+
+    t2 = _mm_add_ps(t2,fix1);
+    t3 = _mm_add_ps(t3,fix1);
+
+    _mm_store_ss(fptr,t2);
+    _mm_storeh_pi((__m64 *)(fptr+1),t2);
+    _mm_store_ss(fshiftptr,t3);
+    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+_mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+_t4 = _mm_load_ss(fshiftptr+2);\
+_t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+_t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+_t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+_t3 = _mm_shuffle_ps(_t3,_t3,_MM_SHUFFLE(1,2,0,0));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_mm_store_ss(fshiftptr+2,_t1);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -553,39 +722,74 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       float * gmx_restrict fptr,
                                       float * gmx_restrict fshiftptr)
 {
-       __m128 t1,t2,t3,t4;
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1);
-       fiz1 = _mm_hadd_ps(fiz1,fix2);
-       fiy2 = _mm_hadd_ps(fiy2,fiz2);
-       fix3 = _mm_hadd_ps(fix3,fiy3);
-       fiz3 = _mm_hadd_ps(fiz3,fiz3);
-       
-       fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
-       fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
-       fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
-    
-       _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-       _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
-       _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
-       
-       t4 = _mm_load_ss(fshiftptr+2);
-       t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-       
-       t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
-       t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
-       t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
-       t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
-    
-       t1 = _mm_add_ps(t1,t2);
-       t3 = _mm_add_ps(t3,t4);
-       t1 = _mm_add_ps(t1,t3); /* y x - z */
-       
-       _mm_store_ss(fshiftptr+2,t1);
-       _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
+    __m128 t1,t2,t3,t4;
+
+    fix1 = _mm_hadd_ps(fix1,fiy1);
+    fiz1 = _mm_hadd_ps(fiz1,fix2);
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);
+    fix3 = _mm_hadd_ps(fix3,fiy3);
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);
+
+    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+    fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
+
+    t4 = _mm_load_ss(fshiftptr+2);
+    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
+    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
+    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
+    t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
 
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3); /* y x - z */
 
+    _mm_store_ss(fshiftptr+2,t1);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+#endif
+
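
The single-precision 3-atom update uses the same horizontal-add cascade in float form: _mm_hadd_ps(a,b) = (a0+a1, a2+a3, b0+b1, b2+b3), so two rounds collapse four accumulators into their four horizontal sums, matching the fix1/fiy1/fiz1/fix2 packing noted in the comments above. The reduction kernel in isolation (hsum4 is an illustrative name):

    #include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */

    /* Returns (sum(a), sum(b), sum(c), sum(d)) via two hadd rounds. */
    static __m128 hsum4(__m128 a, __m128 b, __m128 c, __m128 d)
    {
        return _mm_hadd_ps(_mm_hadd_ps(a, b), _mm_hadd_ps(c, d));
    }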
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fix4);\
+fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+_mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+_t5 = _mm_load_ss(fshiftptr+2);\
+_t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+_t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+_t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+_t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+_t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_t5 = _mm_add_ps(_t5,_t1);\
+_mm_store_ss(fshiftptr+2,_t5);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -594,41 +798,41 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       float * gmx_restrict fptr,
                                       float * gmx_restrict fshiftptr)
 {
-       __m128 t1,t2,t3,t4,t5;
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1);
-       fiz1 = _mm_hadd_ps(fiz1,fix2);
-       fiy2 = _mm_hadd_ps(fiy2,fiz2);
-       fix3 = _mm_hadd_ps(fix3,fiy3);
-       fiz3 = _mm_hadd_ps(fiz3,fix4);
-       fiy4 = _mm_hadd_ps(fiy4,fiz4);
-       
-       fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
-       fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
-       fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
-    
-       _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-       _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
-       _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
-       
-       t5 = _mm_load_ss(fshiftptr+2);
-       t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
-       
-       t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
-       t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
-       t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
-       t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
-       t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
-       
-       t1 = _mm_add_ps(t1,t2);
-       t3 = _mm_add_ps(t3,t4);
-       t1 = _mm_add_ps(t1,t3);
-       t5 = _mm_add_ps(t5,t1);
-       
-       _mm_store_ss(fshiftptr+2,t5);
-       _mm_storeh_pi((__m64 *)(fshiftptr),t5);
+    __m128 t1,t2,t3,t4,t5;
+
+    fix1 = _mm_hadd_ps(fix1,fiy1);
+    fiz1 = _mm_hadd_ps(fiz1,fix2);
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);
+    fix3 = _mm_hadd_ps(fix3,fiy3);
+    fiz3 = _mm_hadd_ps(fiz3,fix4);
+    fiy4 = _mm_hadd_ps(fiy4,fiz4);
+
+    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
+
+    t5 = _mm_load_ss(fshiftptr+2);
+    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
+    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
+    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
+    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
+    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
+
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3);
+    t5 = _mm_add_ps(t5,t1);
+
+    _mm_store_ss(fshiftptr+2,t5);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
 }
-
+#endif
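
A note on the macro branch above: as its comment says, 32-bit MSVC cannot pass
more than three __m128/__m256 arguments by value (a fourth no longer fits in a
register, and MSVC refuses to place aligned vector types on the x86 stack,
error C2719), hence the statement-macro form with underscore-prefixed
temporaries to avoid capturing caller names. A hedged illustration of the
limitation (my reading of the comment, not part of the patch):

    #include <xmmintrin.h>

    /* Accepted by x64 and by gcc/clang on x86; 32-bit MSVC rejects the
       fourth by-value __m128 parameter with error C2719. */
    static __m128 sum4(__m128 a, __m128 b, __m128 c, __m128 d)
    {
        return _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d));
    }
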
 
 
 static gmx_inline void
@@ -643,33 +847,15 @@ static gmx_inline void
 gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                       __m128 pot2, float * gmx_restrict ptrB)
 {
-       __m128 t1,t2;
-       t1   = _mm_movehl_ps(pot2,pot1); 
-       t2   = _mm_movelh_ps(pot1,pot2); 
-       t1   = _mm_add_ps(t1,t2);       
-       t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
-       pot1 = _mm_add_ps(t1,t2);       
-       pot2 = _mm_movehl_ps(t2,pot1);
-       _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-       _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-}
-
-
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
-       _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-       _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-       _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-       _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
+    __m128 t1,t2;
+    t1   = _mm_movehl_ps(pot2,pot1);
+    t2   = _mm_movelh_ps(pot1,pot2);
+    t1   = _mm_add_ps(t1,t2);
+    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
+    pot1 = _mm_add_ps(t1,t2);
+    pot2 = _mm_movehl_ps(t2,pot1);
+    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
 }
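
For readers new to these kernels: gmx_mm_update_2pot_ps horizontally sums each
potential vector and adds the two totals to one scalar accumulator per energy
group. A scalar reference of the same arithmetic (a sketch, not the SIMD path):

    #include <stdio.h>

    static void update_2pot_ref(const float pot1[4], float *ptrA,
                                const float pot2[4], float *ptrB)
    {
        *ptrA += pot1[0] + pot1[1] + pot1[2] + pot1[3];
        *ptrB += pot2[0] + pot2[1] + pot2[2] + pot2[3];
    }

    int main(void)
    {
        float pot1[4] = {1, 2, 3, 4}, pot2[4] = {5, 6, 7, 8};
        float vA = 0.0f, vB = 0.0f;
        update_2pot_ref(pot1, &vA, pot2, &vB);
        printf("%g %g\n", vA, vB);    /* prints 10 26 */
        return 0;
    }
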
 
 
index 223f77e629581cab1c8026077c6cb0e3ba024b35..cd5468ad8461bea809982e08ad98019fbe8b65b0 100644 (file)
@@ -2081,7 +2081,6 @@ analyze_static(gmx_sel_evaluate_t *data, t_selelem *sel, gmx_ana_index_t *g)
 
         case SEL_EXPRESSION:
         case SEL_MODIFIER:
-            assert(g);
             rc = _gmx_sel_evaluate_method_params(data, sel, g);
             if (rc != 0)
             {
index 76bf1a3a5a9faf7d0de83b46b3d5ea20a953a25c..345598589d1a8ac9b11065683ce78475a8db70f6 100644 (file)
@@ -134,7 +134,6 @@ extern t_complex *** rc_tensor_allocation(int x, int y, int z)
   t_complex ***t;
   int i,j;
 
-  snew(t,x);
   t = (t_complex ***)calloc(x,sizeof(t_complex**));
   if(!t) exit(fprintf(stderr,"\nallocation error"));
   t[0] = (t_complex **)calloc(x*y,sizeof(t_complex*));
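
The dropped snew(t,x) in rc_tensor_allocation fixes a small leak: the buffer it
allocated was orphaned as soon as t was reassigned by the calloc on the next
line. The same pattern in miniature:

    #include <stdlib.h>

    int main(void)
    {
        double *p = malloc(4 * sizeof *p);    /* first allocation...            */
        p = calloc(4, sizeof *p);             /* ...leaks once p is overwritten */
        free(p);
        return 0;
    }
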
index 84f5b7a6e4031bbe6c180b01a49052c8e6bcaf86..27a70417093424741999f03015a93aa00592f10f 100644 (file)
@@ -92,17 +92,38 @@ static void set_ljparams(int comb,double reppow,real v,real w,
   }
 }
 
-static void assign_param(t_functype ftype,t_iparams *newparam,
+/* A return value of 0 means parameters were assigned successfully,
+ * while a return value of -1 means this is an all-zero interaction that should not be added.
+ */
+static int
+assign_param(t_functype ftype,t_iparams *newparam,
                         real old[MAXFORCEPARAM],int comb,double reppow)
 {
   int  i,j;
   real tmp;
+  gmx_bool all_param_zero=TRUE;
 
   /* Set to zero */
   for(j=0; (j<MAXFORCEPARAM); j++) 
-    {
+  {
       newparam->generic.buf[j]=0.0;
-    }
+      /* If all parameters are zero we might not add some interaction types (selected below).
+       * We cannot apply this to ALL interactions, since many have valid reasons for having
+       * zero parameters (e.g. an index to a Cmap interaction, or LJ parameters), but
+       * we use it for angles and torsions that are typically generated automatically.
+       */
+      all_param_zero = (all_param_zero==TRUE) && fabs(old[j])<GMX_REAL_MIN;
+  }
+
+  if(all_param_zero==TRUE)
+  {
+      if(IS_ANGLE(ftype) || IS_RESTRAINT_TYPE(ftype) || ftype==F_IDIHS ||
+         ftype==F_PDIHS || ftype==F_PIDIHS || ftype==F_RBDIHS || ftype==F_FOURDIHS)
+      {
+          return -1;
+      }
+  }
+
   switch (ftype) {
   case F_G96ANGLES:
     /* Post processing of input data: store cosine iso angle itself */
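
The all_param_zero test above compares every parameter against GMX_REAL_MIN, so
only exact (or denormal) zeros count as "zero". A standalone sketch of the
test, with DBL_MIN standing in for GMX_REAL_MIN:

    #include <float.h>
    #include <math.h>

    static int all_params_zero(const double p[], int n)
    {
        int j;
        for (j = 0; j < n; j++)
        {
            if (fabs(p[j]) >= DBL_MIN)    /* the patch uses GMX_REAL_MIN */
            {
                return 0;
            }
        }
        return 1;
    }
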
@@ -248,29 +269,23 @@ static void assign_param(t_functype ftype,t_iparams *newparam,
   case F_PIDIHS:
   case F_ANGRES:
   case F_ANGRESZ:
-    newparam->pdihs.phiA = old[0];
-    newparam->pdihs.cpA  = old[1];
-                 
-    /* Dont do any checks if all parameters are zero (such interactions will be removed).
-     * Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
-     * so I have changed the lower limit to -99 /EL
-     *
-     * Second, if the force constant is zero in both A and B states, we set the phase
-     * and multiplicity to zero too so the interaction gets removed during clean-up.
-     */        
-    newparam->pdihs.phiB = old[3];
-    newparam->pdihs.cpB  = old[4];
-          
-    if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
-    {
-        newparam->pdihs.phiA = 0.0; 
-        newparam->pdihs.phiB = 0.0; 
-        newparam->pdihs.mult = 0; 
-    } 
-    else
-    {
-        newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
-    }
+          newparam->pdihs.phiA = old[0];
+          newparam->pdihs.cpA  = old[1];
+
+          /* Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
+           * so I have changed the lower limit to -99 /EL
+           */
+          newparam->pdihs.phiB = old[3];
+          newparam->pdihs.cpB  = old[4];
+          /* If both force constants are zero there is no interaction. Return -1 to signal
+           * this entry should NOT be added.
+           */
+          if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
+          {
+              return -1;
+          }
+    
+          newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
           
     break;
   case F_POSRES:
@@ -336,7 +351,7 @@ static void assign_param(t_functype ftype,t_iparams *newparam,
     newparam->rbdihs.rbcB[3]=-2.0*old[NR_FOURDIHS+2];
     newparam->rbdihs.rbcB[4]=-4.0*old[NR_FOURDIHS+3];
     newparam->rbdihs.rbcB[5]=0.0;
-    break;    
+    break;
   case F_CONSTR:
   case F_CONSTRNC:
     newparam->constr.dA = old[0];
@@ -388,6 +403,7 @@ static void assign_param(t_functype ftype,t_iparams *newparam,
     gmx_fatal(FARGS,"unknown function type %d in %s line %d",
              ftype,__FILE__,__LINE__);
   }
+    return 0;
 }
 
 static int enter_params(gmx_ffparams_t *ffparams, t_functype ftype,
@@ -396,8 +412,14 @@ static int enter_params(gmx_ffparams_t *ffparams, t_functype ftype,
 {
   t_iparams newparam;
   int       type;
-  
-  assign_param(ftype,&newparam,forceparams,comb,reppow);
+  int       rc;
+
+  if( (rc=assign_param(ftype,&newparam,forceparams,comb,reppow))<0 )
+  {
+      /* -1 means this interaction is all-zero and should not be added */
+      return rc;
+  }
+
   if (!bAppend) {
     for (type=start; (type<ffparams->ntypes); type++) {
       if (ffparams->functype[type]==ftype) {
@@ -467,7 +489,8 @@ static void enter_function(t_params *p,t_functype ftype,int comb,real reppow,
                __FILE__,__LINE__,*maxtypes);
     }
     type = enter_params(ffparams,ftype,p->param[k].c,comb,reppow,start,bAppend);
-    if (!bNB) {
+    /* Type==-1 is used as a signal that this interaction is all-zero and should not be added. */
+    if (!bNB && type>=0) {
       nral  = NRAL(ftype);
       delta = nr*(nral+1);
       srenew(il->iatoms,il->nr+delta);
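
Taken together, the convparm.c hunks thread one sentinel through three layers:
assign_param returns -1 for a removable all-zero angle/torsion, enter_params
passes it through, and enter_function only grows the interaction list when
type>=0. A compressed sketch of that control flow (hypothetical names, not the
real signatures):

    /* returns a parameter-table index, or -1 to signal "do not add" */
    static int enter_params_sketch(const double p[], int n)
    {
        int j, nonzero = 0;
        for (j = 0; j < n; j++)
        {
            nonzero |= (p[j] != 0.0);
        }
        return nonzero ? 0 : -1;    /* pretend index 0 was assigned */
    }

    static void enter_function_sketch(const double p[], int n)
    {
        int type = enter_params_sketch(p, n);
        if (type >= 0)
        {
            /* append the interaction to il->iatoms here */
        }
        /* type == -1: skip the all-zero interaction entirely */
    }
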
index a6a2d534b7d4f9ba6b5d781cd898b6c1c88e9272..80294d386b0679d8bedb074cf0c784186a79b0d3 100644 (file)
@@ -216,8 +216,10 @@ static void chk_bonds(t_idef *idef,int ePBC,rvec *x,matrix box,real tol)
        b0   = 0;    
        switch (ftype) {
        case F_BONDS:
-       case F_G96BONDS:
          b0 = idef->iparams[type].harmonic.rA;
+      break;
+       case F_G96BONDS:
+         b0 = sqrt(idef->iparams[type].harmonic.rA);
          break;
        case F_MORSE:
          b0 = idef->iparams[type].morse.b0A;
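
On the gmxcheck fix above: the GROMOS-96 quartic bond potential
V(r) = (1/4) k (r^2 - b0^2)^2 is parameterized by the squared reference length,
so harmonic.rA holds b0^2 for F_G96BONDS and the length check needs sqrt(rA),
while plain harmonic bonds store b0 directly (an inference from the fix
itself). In miniature:

    #include <math.h>

    static double g96_reference_length(double rA)    /* rA = b0^2 for F_G96BONDS */
    {
        return sqrt(rA);    /* b0, in nm */
    }
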
index 16e8baac4a75e2e82340b750fa917b7008bd3367..8810780bb57dd525afb5270ff5588de54d66bb7d 100644 (file)
@@ -215,7 +215,7 @@ void set_histp(t_atoms *pdba,rvec *x,real angle,real dist){
   /* A histidine residue exists that requires automated assignment, so
    * doing the analysis of donors and acceptors is worthwhile. */
   fprintf(stderr,
-         "Analysing hydrogen-bonding network for automated assigment of histidine\n"
+         "Analysing hydrogen-bonding network for automated assignment of histidine\n"
          " protonation.");
 
   snew(donor,natom);
index ad1e79cb398e567fcf7a00194dabcef0a3c8fa90..687080c3d1387e4b92bc2094ba49b4f0ceb1ddfb 100644 (file)
@@ -268,9 +268,10 @@ static char *search_resrename(int nrr,rtprename_t *rr,
         {
             nn = rr[i].main;
         }
+        
         if (nn[0] == '-')
         {
-            gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? " as a starting terminus" : (bEnd ? " as an ending terminus" : ""));
+            gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
         }
     }
 
index 0d473670583e4228f48839b858f0a1483b62ad81..e22db5cacfcabd901c8917e73a0afa9604f3dd9b 100644 (file)
@@ -1621,7 +1621,7 @@ void get_ir(const char *mdparin,const char *mdparout,
   CTYPE ("a value of -1 means: use rlist");
   RTYPE("verlet-buffer-drift", ir->verletbuf_drift,    0.005);
   CTYPE ("nblist cut-off");
-  RTYPE ("rlist",      ir->rlist,      -1);
+  RTYPE ("rlist",      ir->rlist,      1.0);
   CTYPE ("long-range cut-off for switched potentials");
   RTYPE ("rlistlong",  ir->rlistlong,  -1);
   ITYPE ("nstcalclr",  ir->nstcalclr,  -1);
@@ -1633,7 +1633,7 @@ void get_ir(const char *mdparin,const char *mdparout,
   EETYPE("coulomb-modifier",   ir->coulomb_modifier,    eintmod_names);
   CTYPE ("cut-off lengths");
   RTYPE ("rcoulomb-switch",    ir->rcoulomb_switch,    0.0);
-  RTYPE ("rcoulomb",   ir->rcoulomb,   -1);
+  RTYPE ("rcoulomb",   ir->rcoulomb,   1.0);
   CTYPE ("Relative dielectric constant for the medium and the reaction field");
   RTYPE ("epsilon-r",   ir->epsilon_r,  1.0);
   RTYPE ("epsilon-rf",  ir->epsilon_rf, 0.0);
@@ -1642,12 +1642,12 @@ void get_ir(const char *mdparin,const char *mdparout,
   EETYPE("vdw-modifier",       ir->vdw_modifier,    eintmod_names);
   CTYPE ("cut-off lengths");
   RTYPE ("rvdw-switch",        ir->rvdw_switch,        0.0);
-  RTYPE ("rvdw",       ir->rvdw,       -1);
+  RTYPE ("rvdw",       ir->rvdw,       1.0);
   CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
   EETYPE("DispCorr",    ir->eDispCorr,  edispc_names);
   CTYPE ("Extension of the potential lookup tables beyond the cut-off");
   RTYPE ("table-extension", ir->tabext, 1.0);
-  CTYPE ("Seperate tables between energy group pairs");
+  CTYPE ("Separate tables between energy group pairs");
   STYPE ("energygrp-table", egptable,   NULL);
   CTYPE ("Spacing for the PME/PPPM FFT grid");
   RTYPE ("fourierspacing", ir->fourier_spacing,0.12);
index 86878aa014e6ba08b1461e90c8d637d0e0404324..2b3a7c2b93ff81c31d2b1b8c302bef395a5cf062 100644 (file)
@@ -729,6 +729,17 @@ static void print_allswitchind(FILE *fplog,int n,int *ind,int *pind, int *allswa
     }
     fprintf(fplog,"\n");
 
+    /* the "Order After Exchange" is the state label corresponding to the configuration that
+       started in state listed in order, i.e.
+
+       3 0 1 2
+
+       means that the:
+       configuration starting in simulation 3 is now in simulation 0,
+       configuration starting in simulation 0 is now in simulation 1,
+       configuration starting in simulation 1 is now in simulation 2,
+       configuration starting in simulation 2 is now in simulation 3
+     */
     fprintf(fplog,"Order After Exchange: ");
     for (i=0;i<n;i++)
     {
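
A standalone reading of the "Order After Exchange" line documented above: entry
i names the simulation whose starting configuration now lives in simulation i
(a sketch, not repl_ex.c code):

    #include <stdio.h>

    int main(void)
    {
        const int order[4] = {3, 0, 1, 2};    /* the example from the comment */
        int i;
        for (i = 0; i < 4; i++)
        {
            printf("configuration from simulation %d is now in simulation %d\n",
                   order[i], i);
        }
        return 0;
    }
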
@@ -797,6 +808,7 @@ static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int
                  =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
                  =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
                  =  de[b][a] + de[a][b] */
+
         /* permuted:
            ediff =  E_new - E_old
                  =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
@@ -804,6 +816,16 @@ static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int
                  =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
                 =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
                  =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+        /* but, in the current code implementation, we flip configurations, not indices . . .
+           So let's examine that.
+                 =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+                 =  [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_bp)]
+                 =  (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+                 So, if we exchange b <=> bp and a <=> ap, we return to the same result.
+                 So the simple solution is to flip the position of perturbed
+                 and original indices in the tests.
+        */
+
         ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
         delta = ediff*beta[a]; /* assume all same temperature in this case */
         break;
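
Downstream of calc_delta, the swap is then accepted with the usual Metropolis
rule on the dimensionless delta computed here. A hedged sketch of that test
(mirroring the "if (delta <= 0)" branch further below):

    #include <math.h>
    #include <stdlib.h>

    static int accept_exchange(double delta)
    {
        if (delta <= 0)
        {
            return 1;    /* downhill swaps are always accepted */
        }
        return (rand() / (double)RAND_MAX) < exp(-delta);
    }
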
@@ -869,7 +891,7 @@ test_for_replica_exchange(FILE *fplog,
     gmx_bool bPrint,bMultiEx;
     gmx_bool *bEx = re->bEx;
     real *prob = re->prob;
-    int *pind = re->destinations;
+    int *pind = re->destinations; /* permuted index */
     gmx_bool bEpot=FALSE;
     gmx_bool bDLambda=FALSE;
     gmx_bool bVol=FALSE;
@@ -955,24 +977,32 @@ test_for_replica_exchange(FILE *fplog,
         for (i=0;i<re->nex;i++)
         {
             /* randomly select a pair  */
-            /* find out which state it is from, and what label that state currently has */
+            /* in theory, could reduce this by identifying only which switches had a non-negligible
+               probability of occurring (log p > -100) and only operating on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than it is worth. */
             i0 = (int)(re->nrepl*rando(&(re->seed)));
             i1 = (int)(re->nrepl*rando(&(re->seed)));
             if (i0==i1)
             {
                 i--;
-                continue;  /* got the same pair, back up and do it again */
+                continue;  /* self-exchange, back up and do it again */
             }
 
-            a = re->ind[i0];
+            a = re->ind[i0]; /* what are the indices of these states? */
             b = re->ind[i1];
             ap = pind[i0];
             bp = pind[i1];
 
             bPrint = FALSE; /* too noisy */
-            delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); /* calculate the energy difference */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog,bPrint,re,ap,bp,a,b);
 
-            /* we actually only use the first space, since there are actually many switches between pairs. */
+            /* we only use the first slot of the prob and bEx arrays,
+               since there can be many switches between pairs. */
 
             if (delta <= 0)
             {
@@ -1067,6 +1097,7 @@ test_for_replica_exchange(FILE *fplog,
         re->nmoves[re->ind[i]][pind[i]] +=1;
         re->nmoves[pind[i]][re->ind[i]] +=1;
     }
+    fflush(fplog); /* make sure we can see what the last exchange was */
 }
 
 static void write_debug_x(t_state *state)
@@ -1306,6 +1337,7 @@ gmx_bool replica_exchange(FILE *fplog,const t_commrec *cr,struct gmx_repl_ex *re
             /* There will be only one swap cycle with standard replica
              * exchange, but there may be multiple swap cycles if we
              * allow multiple swaps. */
+
             for (j = 0; j < maxswap; j++)
             {
                 exchange_partner = re->order[replica_id][j];
index 37e9854baa56eefe538113d0fa2179e5c61de49a..be3ed6b310c9a3a60ccdd5bdd9a496c62ef6936d 100644 (file)
@@ -1043,23 +1043,32 @@ static void set_cpu_affinity(FILE *fplog,
         }
         else
         {
-            /* check if some threads failed to set their affinities */
+            /* check & warn if some threads failed to set their affinities */
             if (nth_affinity_set != nthread_local)
             {
-                char sbuf[STRLEN];
-                sbuf[0] = '\0';
+                char sbuf1[STRLEN], sbuf2[STRLEN];
+
+                /* sbuf1 contains rank info, while sbuf2 contains OpenMP thread info */
+                sbuf1[0] = sbuf2[0] = '\0';
 #ifdef GMX_MPI
 #ifdef GMX_THREAD_MPI
-                sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
+                sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
 #else /* GMX_LIB_MPI */
+                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
 #endif
-                sprintf(sbuf, "In MPI process #%d", cr->nodeid);
 #endif /* GMX_MPI */
+
+                if (nthread_local > 1)
+                {
+                    sprintf(sbuf2, "of %d/%d thread%s ",
+                            nthread_local - nth_affinity_set, nthread_local,
+                            (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+                }
+
                 md_print_warn(NULL, fplog,
-                              "%s%d/%d thread%s failed to set their affinities. "
-                              "This can cause performance degradation!",
-                              sbuf, nthread_local - nth_affinity_set, nthread_local,
-                              (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+                              "NOTE: %sAffinity setting %sfailed.\n"
+                              "      This can cause performance degradation!",
+                              sbuf1, sbuf2);
             }
         }
     }
index eb92debad552515233485d1db3256cedd9b1849c..2c08a97042854522f2b15a96e8141e081e392ea7 100644 (file)
@@ -1252,9 +1252,9 @@ gmx_constr_t init_constraints(FILE *fplog,
     /* Initialize the essential dynamics sampling.
      * Put the pointer to the ED struct in constr */
     constr->ed = ed;
-    if (ed != NULL
+    if (ed != NULL || state->edsamstate.nED > 0)
     {
-        init_edsam(mtop,ir,cr,ed,state->x,state->box);
+        init_edsam(mtop,ir,cr,ed,state->x,state->box,&state->edsamstate);
     }
     
     constr->warn_mtop = mtop;
index 80f2cd63349583311d6a585ef37aca94ff1bdca1..56b173861524e4211fe5da87e3f02957875fd44f 100644 (file)
@@ -138,9 +138,11 @@ typedef struct gmx_edx
                                    * with respect to the collective
                                    * anrs[0...nr-1] array                     */
     rvec          *x;             /* positions for this structure             */
-    rvec          *x_old;         /* used to keep track of the shift vectors
-                                     such that the ED molecule can always be
-                                     made whole in the parallel case          */
+    rvec          *x_old;         /* Last positions which have the correct PBC
+                                     representation of the ED group. In
+                                     combination with keeping track of the
+                                     shift vectors, the ED group can always
+                                     be made whole                            */
     real          *m;             /* masses                                   */
     real          mtot;           /* total mass (only used in sref)           */
     real          *sqrtm;         /* sqrt of the masses used for mass-
@@ -186,7 +188,6 @@ typedef struct gmx_edsam
     FILE          *edo;           /* output file pointer                  */
     t_edpar       *edpar;
     gmx_bool      bFirst;
-    gmx_bool      bStartFromCpt;
 } t_gmx_edsam;
 
 
@@ -238,7 +239,9 @@ static real projectx(t_edpar *edi, rvec *xcoll, rvec *vec)
 
 
     for (i=0; i<edi->sav.nr; i++)
+    {
         proj += edi->sav.sqrtm[i]*iprod(vec[i], xcoll[i]);
+    }
 
     return proj;
 }
@@ -254,7 +257,9 @@ static void rad_project(t_edpar *edi, rvec *x, t_eigvec *vec, t_commrec *cr)
 
     /* Subtract average positions */
     for (i = 0; i < edi->sav.nr; i++)
+    {
         rvec_dec(x[i], edi->sav.x[i]);
+    }
 
     for (i = 0; i < vec->neig; i++)
     {
@@ -265,7 +270,9 @@ static void rad_project(t_edpar *edi, rvec *x, t_eigvec *vec, t_commrec *cr)
 
     /* Add average positions */
     for (i = 0; i < edi->sav.nr; i++)
+    {
         rvec_inc(x[i], edi->sav.x[i]);
+    }
 }
 
 
@@ -283,14 +290,20 @@ static void project_to_eigvectors(rvec       *x,    /* The positions to project
 
     /* Subtract average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_dec(x[i], edi->sav.x[i]);
+    }
 
     for (i=0; i<vec->neig; i++)
+    {
         vec->xproj[i] = projectx(edi, x, vec->vec[i]);
+    }
 
     /* Add average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_inc(x[i], edi->sav.x[i]);
+    }
 }
 
 
@@ -316,7 +329,9 @@ static real calc_radius(t_eigvec *vec)
 
 
     for (i=0; i<vec->neig; i++)
+    {
         rad += pow((vec->refproj[i]-vec->xproj[i]),2);
+    }
 
     return rad=sqrt(rad);
 }
@@ -345,11 +360,13 @@ static void dump_xcoll(t_edpar *edi, struct t_do_edsam *buf, t_commrec *cr,
     fp = fopen(fn, "w");
 
     for (i=0; i<edi->sav.nr; i++)
+    {
         fprintf(fp, "%d %9.5f %9.5f %9.5f   %d %d %d   %d %d %d\n",
                 edi->sav.anrs[i]+1,
                 xcoll[i][XX]  , xcoll[i][YY]  , xcoll[i][ZZ],
                 shifts[i][XX] , shifts[i][YY] , shifts[i][ZZ],
                 eshifts[i][XX], eshifts[i][YY], eshifts[i][ZZ]);
+    }
 
     fclose(fp);
 }
@@ -363,16 +380,22 @@ static void dump_edi_positions(FILE *out, struct gmx_edx *s, const char name[])
 
     fprintf(out, "#%s positions:\n%d\n", name, s->nr);
     if (s->nr == 0)
+    {
         return;
+    }
 
     fprintf(out, "#index, x, y, z");
     if (s->sqrtm)
+    {
         fprintf(out, ", sqrt(m)");
+    }
     for (i=0; i<s->nr; i++)
     {
         fprintf(out, "\n%6d  %11.6f %11.6f %11.6f",s->anrs[i], s->x[i][XX], s->x[i][YY], s->x[i][ZZ]);
         if (s->sqrtm)
+        {
             fprintf(out,"%9.3f",s->sqrtm[i]);
+        }
     }
     fprintf(out, "\n");
 }
@@ -392,7 +415,9 @@ static void dump_edi_eigenvecs(FILE *out, t_eigvec *ev,
         fprintf(out, "EV %4d\ncomponents %d\nstepsize %f\nxproj %f\nfproj %f\nrefproj %f\nradius %f\nComponents:\n",
                 ev->ieig[i], length, ev->stpsz[i], ev->xproj[i], ev->fproj[i], ev->refproj[i], ev->radius);
         for (j=0; j<length; j++)
+        {
             fprintf(out, "%11.6f %11.6f %11.6f\n", ev->vec[i][j][XX], ev->vec[i][j][YY], ev->vec[i][j][ZZ]);
+        }
     }
 }
 
@@ -457,7 +482,9 @@ static void dump_rvec(FILE *out, int dim, rvec *x)
 
 
     for (i=0; i<dim; i++)
+    {
         fprintf(out,"%4d   %f %f %f\n",i,x[i][XX],x[i][YY],x[i][ZZ]);
+    }
 }
 
 
@@ -471,7 +498,9 @@ static void dump_mat(FILE* out, int dim, double** mat)
     for (i=0;i<dim;i++)
     {
         for (j=0;j<dim;j++)
+        {
             fprintf(out,"%f ",mat[i][j]);
+        }
         fprintf(out,"\n");
     }
 }
@@ -496,7 +525,9 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
     gmx_bool bFirst;
 
     if(edi->buf->do_edfit != NULL)
+    {
         bFirst = FALSE;
+    }
     else
     {
         bFirst = TRUE;
@@ -543,7 +574,9 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
     /* construct loc->omega */
     /* loc->omega is symmetric -> loc->omega==loc->omega' */
     for(r=0;(r<6);r++)
+    {
         for(c=0;(c<=r);c++)
+        {
             if ((r>=3) && (c<3))
             {
                 loc->omega[r][c]=u[r-3][c];
@@ -554,6 +587,8 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
                 loc->omega[r][c]=0;
                 loc->omega[c][r]=0;
             }
+        }
+    }
 
     /* determine h and k */
 #ifdef DEBUG
@@ -561,13 +596,17 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
         int i;
         dump_mat(stderr,2*DIM,loc->omega);
         for (i=0; i<6; i++)
+        {
             fprintf(stderr,"d[%d] = %f\n",i,d[i]);
+        }
     }
 #endif
     jacobi(loc->omega,6,d,loc->om,&irot);
 
     if (irot==0)
+    {
         fprintf(stderr,"IROT=0\n");
+    }
 
     index=0; /* For the compiler only */
 
@@ -575,11 +614,13 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
     {
         max_d=-1000;
         for(i=0;(i<6);i++)
+        {
             if (d[i]>max_d)
             {
                 max_d=d[i];
                 index=i;
             }
+        }
         d[index]=-10000;
         for(i=0;(i<3);i++)
         {
@@ -590,16 +631,26 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
 
     /* determine R */
     for(c=0;(c<3);c++)
+    {
         for(r=0;(r<3);r++)
+        {
             R[c][r]=vk[0][r]*vh[0][c]+
-            vk[1][r]*vh[1][c]+
-            vk[2][r]*vh[2][c];
+                    vk[1][r]*vh[1][c]+
+                    vk[2][r]*vh[2][c];
+        }
+    }
     if (det(R) < 0)
+    {
         for(c=0;(c<3);c++)
+        {
             for(r=0;(r<3);r++)
+            {
                 R[c][r]=vk[0][r]*vh[0][c]+
-                vk[1][r]*vh[1][c]-
-                vk[2][r]*vh[2][c];
+                        vk[1][r]*vh[1][c]-
+                        vk[2][r]*vh[2][c];
+            }
+        }
+    }
 }
 
 
@@ -696,7 +747,9 @@ static void write_edo_flood(t_edpar *edi, FILE *fp, gmx_large_int_t step)
         for (i = 0; i < edi->flood.vecs.neig; i++)
         {
             if (edi->flood.vecs.refprojslope[i] != 0.0)
+            {
                 bOutputRef=TRUE;
+            }
         }
         if (bOutputRef)
         {
@@ -711,7 +764,9 @@ static void write_edo_flood(t_edpar *edi, FILE *fp, gmx_large_int_t step)
     fprintf(fp,"FL_FORCES: ");
 
     for (i=0; i<edi->flood.vecs.neig; i++)
+    {
         fprintf(fp," %12.5e",edi->flood.vecs.fproj[i]);
+    }
 
     fprintf(fp,"\n");
 }
@@ -777,16 +832,20 @@ static void flood_forces(t_edpar *edi)
 
 
     if (edi->flood.bHarmonic)
+    {
         for (i=0; i<edi->flood.vecs.neig; i++)
         {
             edi->flood.vecs.fproj[i] = edi->flood.Efl* edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]);
         }
+    }
     else
+    {
         for (i=0; i<edi->flood.vecs.neig; i++)
         {
             /* if Efl is zero the forces are zero if not use the formula */
             edi->flood.vecs.fproj[i] = edi->flood.Efl!=0 ? edi->flood.kT/edi->flood.Efl/edi->flood.alpha2*energy*edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]) : 0;
         }
+    }
 }
 
 
@@ -817,7 +876,9 @@ static void flood_blowup(t_edpar *edi, rvec *forces_cart)
 
     /* Clear forces first */
     for (j=0; j<edi->sav.nr_loc; j++)
+    {
         clear_rvec(forces_cart[j]);
+    }
 
     /* Now compute atomwise */
     for (j=0; j<edi->sav.nr_loc; j++)
@@ -846,7 +907,9 @@ static void update_adaption(t_edpar *edi)
         edi->flood.Efl = edi->flood.Efl+edi->flood.dt/edi->flood.tau*(edi->flood.deltaF0-edi->flood.deltaF);
         /* check if restrain (inverted flooding) -> don't let EFL become positive */
         if (edi->flood.alpha2<0 && edi->flood.Efl>-0.00000001)
+        {
             edi->flood.Efl = 0;
+        }
 
         edi->flood.deltaF = (1-edi->flood.dt/edi->flood.tau)*edi->flood.deltaF+edi->flood.dt/edi->flood.tau*edi->flood.Vfl;
     }
@@ -881,8 +944,10 @@ static void do_single_flood(
 
     /* Only assembly REFERENCE positions if their indices differ from the average ones */
     if (!edi->bRefEqAv)
+    {
         communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, bNS, x,
                 edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+    }
 
     /* If bUpdateShifts was TRUE, the shifts have just been updated in get_positions.
      * We do not need to update the shifts until the next NS step */
@@ -893,9 +958,13 @@ static void do_single_flood(
 
     /* Fit the reference indices to the reference structure */
     if (edi->bRefEqAv)
+    {
         fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+    }
     else
+    {
         fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+    }
 
     /* Now apply the translation and rotation to the ED structure */
     translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
@@ -924,11 +993,15 @@ static void do_single_flood(
 
     /* Finally add forces to the main force variable */
     for (i=0; i<edi->sav.nr_loc; i++)
+    {
         rvec_inc(force[edi->sav.anrs_loc[i]],edi->flood.forces_cartesian[i]);
+    }
 
     /* Output is written by the master process */
     if (do_per_step(step,edi->outfrq) && MASTER(cr))
+    {
         write_edo_flood(edi,edo,step);
+    }
 }
 
 
@@ -954,7 +1027,9 @@ extern void do_flood(
     {
         /* Call flooding for one matrix */
         if (edi->flood.vecs.neig)
+        {
             do_single_flood(ed->edo,x,force,edi,step,box,cr,bNS);
+        }
         edi = edi->next_edi;
     }
 }
@@ -962,7 +1037,7 @@ extern void do_flood(
 
 /* Called by init_edi, configure some flooding related variables and structures,
  * print headers to output files */
-static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr)
+static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr, gmx_bool bPrintheader)
 {
     int i;
 
@@ -991,9 +1066,13 @@ static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr)
                         edi->flood.vecs.ieig[i], edi->flood.vecs.fproj[i]);
             }
         }
-        fprintf(ed->edo,"FL_HEADER: Flooding of matrix %d is switched on! The flooding output will have the following format:\n",
-                edi->flood.flood_id);
-        fprintf(ed->edo,"FL_HEADER: Step     Efl          Vfl       deltaF\n");
+
+        if (bPrintheader)
+        {
+            fprintf(ed->edo,"FL_HEADER: Flooding of matrix %d is switched on! The flooding output will have the following format:\n",
+                    edi->flood.flood_id);
+            fprintf(ed->edo,"FL_HEADER: Step     Efl          Vfl       deltaF\n");
+        }
     }
 }
 
@@ -1035,7 +1114,9 @@ static void get_flood_energies(t_edpar *edi, real Vfl[],int nnames)
         count++;
     }
     if (nnames!=count-1)
+    {
         gmx_fatal(FARGS,"Number of energies is not consistent with t_edi structure");
+    }
 }
 /************* END of FLOODING IMPLEMENTATION ****************************/
 #endif
@@ -1060,7 +1141,6 @@ gmx_edsam_t ed_open(int nfile,const t_filenm fnm[],unsigned long Flags,t_commrec
         fprintf(stderr,"ED sampling will be performed!\n");
         ed->edonam = ftp2fn(efEDO,nfile,fnm);
         ed->edo    = gmx_fio_fopen(ed->edonam,(Flags & MD_APPENDFILES)? "a+" : "w+");
-        ed->bStartFromCpt = Flags & MD_STARTFROMCPT;
     }
     return ed;
 }
@@ -1281,7 +1361,9 @@ static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir,
 static void check(const char *line, const char *label)
 {
     if (!strstr(line,label))
+    {
         gmx_fatal(FARGS,"Could not find input parameter %s at expected position in edsam input-file (.edi)\nline read instead is %s",label,line);
+    }
 }
 
 
@@ -1351,7 +1433,9 @@ static void read_edx(FILE *file,int number,int *anrs,rvec *x)
         sscanf (line,"%d%lf%lf%lf",&anrs[i],&d[0],&d[1],&d[2]);
         anrs[i]--; /* we are reading FORTRAN indices */
         for(j=0; j<3; j++)
+        {
             x[i][j]=d[j]; /* always read as double and convert to single */
+        }
     }
 }
 
@@ -1432,7 +1516,9 @@ static void read_edvec(FILE *in,int nr,t_eigvec *tvec,gmx_bool bReadRefproj, gmx
             {
                 nscan = sscanf(line,"%d%lf",&idum,&rdum);
                 if (nscan != 2)
+                {
                     gmx_fatal(FARGS,"Expected 2 values for flooding vec: <nr> <stpsz>\n");
+                }
             }
             tvec->ieig[i]=idum;
             tvec->stpsz[i]=rdum;
@@ -1471,14 +1557,18 @@ static gmx_bool check_if_same(struct gmx_edx sref, struct gmx_edx sav)
     /* If the number of atoms differs between the two structures,
      * they cannot be identical */
     if (sref.nr != sav.nr)
+    {
         return FALSE;
+    }
 
     /* Now that we know that both structures have the same number of atoms,
      * check if also the indices are identical */
     for (i=0; i < sav.nr; i++)
     {
         if (sref.anrs[i] != sav.anrs[i])
+        {
             return FALSE;
+        }
     }
     fprintf(stderr, "ED: Note: Reference and average structure are composed of the same atom indices.\n");
 
@@ -1502,21 +1592,29 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
     readmagic=read_edint(in,&bEOF);
     /* Check whether we have reached the end of the input file */
     if (bEOF)
+    {
         return 0;
+    }
 
     if (readmagic != magic)
     {
         if (readmagic==666 || readmagic==667 || readmagic==668)
+        {
             gmx_fatal(FARGS,"Wrong magic number: Use newest version of make_edi to produce edi file");
+        }
         else if (readmagic != 669)
+        {
             gmx_fatal(FARGS,"Wrong magic number %d in %s",readmagic,ed->edinam);
+        }
     }
 
     /* check the number of atoms */
     edi->nini=read_edint(in,&bEOF);
     if (edi->nini != nr_mdatoms)
+    {
         gmx_fatal(FARGS,"Nr of atoms in %s (%d) does not match nr of md atoms (%d)",
                 ed->edinam,edi->nini,nr_mdatoms);
+    }
 
     /* Done checking. For the rest we blindly trust the input */
     edi->fitmas          = read_checked_edint(in,"FITMAS");
@@ -1534,9 +1632,13 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
     edi->flood.kT        = read_checked_edreal(in,"KT");
     edi->flood.bHarmonic = read_checked_edint(in,"HARMONIC");
     if (readmagic > 669)
+    {
         edi->flood.bConstForce = read_checked_edint(in,"CONST_FORCE_FLOODING");
+    }
     else
+    {
         edi->flood.bConstForce = FALSE;
+    }
     edi->flood.flood_id  = edi_nr;
     edi->sref.nr         = read_checked_edint(in,"NREF");
 
@@ -1575,13 +1677,13 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
     edi->sori.nr=read_edint(in,&bEOF);
     if (edi->sori.nr > 0)
     {
-       if (bHaveReference)
-       {
-               /* Both an -ori structure and a at least one manual reference point have been
-                * specified. That's ambiguous and probably not intentional. */
-               gmx_fatal(FARGS, "ED: An origin structure has been provided and a at least one (moving) reference\n"
-                                "    point was manually specified in the edi file. That is ambiguous. Aborting.\n");
-       }
+        if (bHaveReference)
+        {
+            /* Both an -ori structure and at least one manual reference point have been
+             * specified. That's ambiguous and probably not intentional. */
+            gmx_fatal(FARGS, "ED: An origin structure has been provided and at least one (moving) reference\n"
+                             "    point was manually specified in the edi file. That is ambiguous. Aborting.\n");
+        }
         snew(edi->sori.anrs,edi->sori.nr);
         snew(edi->sori.x   ,edi->sori.nr);
         edi->sori.sqrtm    =NULL;
@@ -1597,7 +1699,7 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
 /* Read in the edi input file. Note that it may contain several ED data sets which were
  * achieved by concatenating multiple edi files. The standard case would be a single ED
  * data set, though. */
-static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commrec *cr)
+static int read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commrec *cr)
 {
     FILE    *in;
     t_edpar *curr_edi,*last_edi;
@@ -1619,8 +1721,10 @@ static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commre
         edi_nr++;
        /* Make sure that the number of atoms in each dataset is the same as in the tpr file */
         if (edi->nini != nr_mdatoms)
+        {
             gmx_fatal(FARGS,"edi file %s (dataset #%d) was made for %d atoms, but the simulation contains %d atoms.",
                     ed->edinam, edi_nr, edi->nini, nr_mdatoms);
+        }
         /* Since we arrived within this while loop we know that there is still another data set to be read in */
         /* We need to allocate space for the data: */
         snew(edi_read,1);
@@ -1632,7 +1736,9 @@ static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commre
         curr_edi = edi_read;
     }
     if (edi_nr == 0)
+    {
         gmx_fatal(FARGS, "No complete ED data set found in edi file %s.", ed->edinam);
+    }
 
     /* Terminate the edi dataset list with a NULL pointer: */
     last_edi->next_edi = NULL;
@@ -1641,6 +1747,8 @@ static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commre
 
     /* Close the .edi file again */
     gmx_fio_fclose(in);
+
+    return edi_nr;
 }
 
 
@@ -1674,7 +1782,9 @@ static void fit_to_reference(rvec      *xcoll,    /* The positions to be fitted
 
     /* We do not touch the original positions but work on a copy. */
     for (i=0; i<edi->sref.nr; i++)
+    {
         copy_rvec(xcoll[i], loc->xcopy[i]);
+    }
 
     /* Calculate the center of mass */
     get_center(loc->xcopy, edi->sref.m, edi->sref.nr, com);
@@ -1716,7 +1826,9 @@ static real rmsd_from_structure(rvec           *x,  /* The positions under consi
 
 
     for (i=0; i < s->nr; i++)
+    {
         rmsd += distance2(s->x[i], x[i]);
+    }
 
     rmsd /= (real) s->nr;
     rmsd = sqrt(rmsd);
@@ -1739,8 +1851,10 @@ void dd_make_local_ed_indices(gmx_domdec_t *dd, struct gmx_edsam *ed)
             /* Local atoms of the reference structure (for fitting), need only be assembled
              * if their indices differ from the average ones */
             if (!edi->bRefEqAv)
+            {
                 dd_make_local_group_indices(dd->ga2la, edi->sref.nr, edi->sref.anrs,
                         &edi->sref.nr_loc, &edi->sref.anrs_loc, &edi->sref.nalloc_loc, edi->sref.c_ind);
+            }
 
             /* Local atoms of the average structure (on these ED will be performed) */
             dd_make_local_group_indices(dd->ga2la, edi->sav.nr, edi->sav.anrs,
@@ -1773,7 +1887,8 @@ static inline void ed_unshift_single_coord(matrix box, const rvec x, const ivec
         xu[XX] = x[XX]-tx*box[XX][XX]-ty*box[YY][XX]-tz*box[ZZ][XX];
         xu[YY] = x[YY]-ty*box[YY][YY]-tz*box[ZZ][YY];
         xu[ZZ] = x[ZZ]-tz*box[ZZ][ZZ];
-    } else
+    }
+    else
     {
         xu[XX] = x[XX]-tx*box[XX][XX];
         xu[YY] = x[YY]-ty*box[YY][YY];
@@ -1829,12 +1944,16 @@ static void do_linacc(rvec *xcoll, t_edpar *edi, t_commrec *cr)
         if (edi->vecs.linacc.stpsz[i] > 0.0)
         {
             if ((proj-edi->vecs.linacc.refproj[i]) < 0.0)
+            {
                 add = edi->vecs.linacc.refproj[i] - proj;
+            }
         }
         if (edi->vecs.linacc.stpsz[i] < 0.0)
         {
             if ((proj-edi->vecs.linacc.refproj[i]) > 0.0)
+            {
                 add = edi->vecs.linacc.refproj[i] - proj;
+            }
         }
 
         /* apply the correction */
@@ -1883,7 +2002,8 @@ static void do_radfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
         /* apply the correction */
         proj[i] /= edi->sav.sqrtm[i];
         proj[i] *= ratio;
-        for (j=0; j<edi->sav.nr; j++) {
+        for (j=0; j<edi->sav.nr; j++)
+        {
             svmul(proj[i], edi->vecs.radfix.vec[i][j], vec_dum);
             rvec_inc(xcoll[j], vec_dum);
         }
@@ -2026,20 +2146,28 @@ static void ed_apply_constraints(rvec *xcoll, t_edpar *edi, gmx_large_int_t step
 
     /* subtract the average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_dec(xcoll[i], edi->sav.x[i]);
+    }
 
     /* apply the constraints */
     if (step >= 0)
+    {
         do_linfix(xcoll, edi, step, cr);
+    }
     do_linacc(xcoll, edi, cr);
     if (step >= 0)
+    {
         do_radfix(xcoll, edi, step, cr);
+    }
     do_radacc(xcoll, edi, cr);
     do_radcon(xcoll, edi, cr);
 
     /* add back the average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_inc(xcoll[i], edi->sav.x[i]);
+    }
 
     GMX_MPE_LOG(ev_ed_apply_cons_finish);
 }
@@ -2055,7 +2183,9 @@ static void write_edo(int nr_edi, t_edpar *edi, gmx_edsam_t ed, gmx_large_int_t
     if (edi->bNeedDoEdsam)
     {
         if (step == -1)
+        {
             fprintf(ed->edo, "Initial projections:\n");
+        }
         else
         {
             fprintf(ed->edo,"Step %s, ED #%d  ", gmx_step_str(step, buf), nr_edi);
@@ -2066,28 +2196,36 @@ static void write_edo(int nr_edi, t_edpar *edi, gmx_edsam_t ed, gmx_large_int_t
         {
             fprintf(ed->edo,"  Monitor eigenvectors");
             for (i=0; i<edi->vecs.mon.neig; i++)
+            {
                 fprintf(ed->edo," %d: %12.5e ",edi->vecs.mon.ieig[i],edi->vecs.mon.xproj[i]);
+            }
             fprintf(ed->edo,"\n");
         }
         if (edi->vecs.linfix.neig)
         {
             fprintf(ed->edo,"  Linfix  eigenvectors");
             for (i=0; i<edi->vecs.linfix.neig; i++)
+            {
                 fprintf(ed->edo," %d: %12.5e ",edi->vecs.linfix.ieig[i],edi->vecs.linfix.xproj[i]);
+            }
             fprintf(ed->edo,"\n");
         }
         if (edi->vecs.linacc.neig)
         {
             fprintf(ed->edo,"  Linacc  eigenvectors");
             for (i=0; i<edi->vecs.linacc.neig; i++)
+            {
                 fprintf(ed->edo," %d: %12.5e ",edi->vecs.linacc.ieig[i],edi->vecs.linacc.xproj[i]);
+            }
             fprintf(ed->edo,"\n");
         }
         if (edi->vecs.radfix.neig)
         {
             fprintf(ed->edo,"  Radfix  eigenvectors");
             for (i=0; i<edi->vecs.radfix.neig; i++)
+            {
                 fprintf(ed->edo," %d: %12.5e ",edi->vecs.radfix.ieig[i],edi->vecs.radfix.xproj[i]);
+            }
             fprintf(ed->edo,"\n");
             fprintf(ed->edo,"  fixed increment radius = %f\n", calc_radius(&edi->vecs.radfix));
         }
@@ -2095,7 +2233,9 @@ static void write_edo(int nr_edi, t_edpar *edi, gmx_edsam_t ed, gmx_large_int_t
         {
             fprintf(ed->edo,"  Radacc  eigenvectors");
             for (i=0; i<edi->vecs.radacc.neig; i++)
+            {
                 fprintf(ed->edo," %d: %12.5e ",edi->vecs.radacc.ieig[i],edi->vecs.radacc.xproj[i]);
+            }
             fprintf(ed->edo,"\n");
             fprintf(ed->edo,"  acceptance radius      = %f\n", calc_radius(&edi->vecs.radacc));
         }
@@ -2103,7 +2243,9 @@ static void write_edo(int nr_edi, t_edpar *edi, gmx_edsam_t ed, gmx_large_int_t
         {
             fprintf(ed->edo,"  Radcon  eigenvectors");
             for (i=0; i<edi->vecs.radcon.neig; i++)
+            {
                 fprintf(ed->edo," %d: %12.5e ",edi->vecs.radcon.ieig[i],edi->vecs.radcon.xproj[i]);
+            }
             fprintf(ed->edo,"\n");
             fprintf(ed->edo,"  contracting radius     = %f\n", calc_radius(&edi->vecs.radcon));
         }
@@ -2131,7 +2273,9 @@ static void copyEvecReference(t_eigvec* floodvecs)
 
 
     if (NULL==floodvecs->refproj0)
+    {
         snew(floodvecs->refproj0, floodvecs->neig);
+    }
 
     for (i=0; i<floodvecs->neig; i++)
     {
@@ -2140,31 +2284,144 @@ static void copyEvecReference(t_eigvec* floodvecs)
 }
 
 
+/* Call on MASTER only. Check whether the essential dynamics / flooding
+ * datasets of the checkpoint file are consistent with the provided .edi file. */
+static void crosscheck_edi_file_vs_checkpoint(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+    t_edpar *edi = NULL;    /* points to a single edi data set */
+    int i, edinum;
+
+
+    if (NULL == EDstate->nref || NULL == EDstate->nav)
+    {
+        gmx_fatal(FARGS, "Essential dynamics and flooding can only be switched on (or off) at the\n"
+                         "start of a new simulation. If a simulation runs with/without ED constraints,\n"
+                         "it must also continue with/without ED constraints when checkpointing.\n"
+                         "To switch on (or off) ED constraints, please prepare a new .tpr to start\n"
+                         "from without a checkpoint.\n");
+    }
+
+    edi=ed->edpar;
+    edinum = 0;
+    while(edi != NULL)
+    {
+        /* Check number of atoms in the reference and average structures */
+        if (EDstate->nref[edinum] != edi->sref.nr)
+        {
+            gmx_fatal(FARGS, "The number of reference structure atoms in ED dataset #%d is\n"
+                             "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+                    edinum+1, EDstate->nref[edinum], edi->sref.nr);
+        }
+        if (EDstate->nav[edinum] != edi->sav.nr)
+        {
+            gmx_fatal(FARGS, "The number of average structure atoms in ED dataset #%d is\n"
+                             "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+                    edinum+1, EDstate->nav[edinum], edi->sav.nr);
+        }
+        edi=edi->next_edi;
+        edinum++;
+    }
+
+    if (edinum != EDstate->nED)
+    {
+        gmx_fatal(FARGS, "The number of essential dynamics / flooding datasets is not consistent.\n"
+                         "There are %d ED datasets in .cpt file, but %d in .edi file!\n"
+                         "Are you sure this is the correct .edi file?\n", EDstate->nED, edinum);
+    }
+}
+
+
+/* The edsamstate struct stores the information we need to make the ED group
+ * whole again after restarts from a checkpoint file. Here we do the following:
+ * a) If we did not start from .cpt, we prepare the struct for proper .cpt writing,
+ * b) if we did start from .cpt, we copy over the last whole structures from .cpt,
+ * c) in any case, for subsequent checkpoint writing, we set the pointers in
+ * edsamstate to the x_old arrays, which contain the correct PBC representation of
+ * all ED structures at the last time step. */
+static void init_edsamstate(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+    int     i, nr_edi;
+    t_edpar *edi;
+
+
+    snew(EDstate->old_sref_p, EDstate->nED);
+    snew(EDstate->old_sav_p , EDstate->nED);
+
+    /* If we did not read in a .cpt file, these arrays are not yet allocated */
+    if (!EDstate->bFromCpt)
+    {
+        snew(EDstate->nref, EDstate->nED);
+        snew(EDstate->nav , EDstate->nED);
+    }
+
+    /* Loop over all ED/flooding data sets (usually only one, though) */
+    edi = ed->edpar;
+    for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
+    {
+        /* We always need the last reference and average positions such that
+         * in the next time step we can make the ED group whole again
+         * if the atoms do not have the correct PBC representation */
+        if (EDstate->bFromCpt)
+        {
+            /* Copy the last whole positions of reference and average group from .cpt */
+            for (i=0; i<edi->sref.nr; i++)
+            {
+                copy_rvec(EDstate->old_sref[nr_edi-1][i], edi->sref.x_old[i]);
+            }
+            for (i=0; i<edi->sav.nr ; i++)
+            {
+                copy_rvec(EDstate->old_sav [nr_edi-1][i], edi->sav.x_old [i]);
+            }
+        }
+        else
+        {
+            EDstate->nref[nr_edi-1] = edi->sref.nr;
+            EDstate->nav [nr_edi-1] = edi->sav.nr;
+        }
+
+        /* For subsequent checkpoint writing, set the edsamstate pointers to the edi arrays: */
+        EDstate->old_sref_p[nr_edi-1] = edi->sref.x_old;
+        EDstate->old_sav_p [nr_edi-1] = edi->sav.x_old ;
+
+        edi = edi->next_edi;
+    }
+}
+
+
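
The key point in init_edsamstate is that old_sref_p/old_sav_p are not copies:
they alias the live x_old buffers, so whatever PBC-whole positions were stored
last are exactly what the next checkpoint records. The idea in miniature
(hypothetical names):

    typedef float rvec3[3];

    typedef struct {
        rvec3 **old_sref_p;    /* one pointer per ED dataset */
    } sketch_edstate_t;

    static void alias_live_buffer(sketch_edstate_t *s, int set, rvec3 *x_old)
    {
        s->old_sref_p[set] = x_old;    /* no copy: checkpointing reads x_old */
    }
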
 void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                 t_inputrec  *ir,     /* input record                       */
                 t_commrec   *cr,     /* communication record               */
                 gmx_edsam_t ed,      /* contains all ED data               */
                 rvec        x[],     /* positions of the whole MD system   */
-                matrix      box)     /* the box                            */
+                matrix      box,     /* the box                            */
+                edsamstate_t *EDstate)
 {
     t_edpar *edi = NULL;    /* points to a single edi data set */
-    int     numedis=0;      /* keep track of the number of ED data sets in edi file */
     int     i,nr_edi,avindex;
     rvec    *x_pbc  = NULL; /* positions of the whole MD system with pbc removed  */
-    rvec    *xfit   = NULL; /* the positions which will be fitted to the reference structure  */
-    rvec    *xstart = NULL; /* the positions which are subject to ED sampling */
+    rvec    *xfit=NULL, *xstart=NULL; /* dummy arrays to determine initial RMSDs  */
     rvec    fit_transvec;   /* translation ... */
     matrix  fit_rotmat;     /* ... and rotation from fit to reference structure */
 
 
     if (!DOMAINDECOMP(cr) && PAR(cr) && MASTER(cr))
+    {
         gmx_fatal(FARGS, "Please switch on domain decomposition to use essential dynamics in parallel.");
+    }
 
     GMX_MPE_LOG(ev_edsam_start);
 
     if (MASTER(cr))
+    {
         fprintf(stderr, "ED: Initializing essential dynamics constraints.\n");
 
+        if (NULL == ed)
+        {
+            gmx_fatal(FARGS, "The checkpoint file you provided is from an essential dynamics or\n"
+                             "flooding simulation. Please also provide the correct .edi file with -ei.\n");
+        }
+    }
+
     /* Needed for initializing radacc radius in do_edsam */
     ed->bFirst = 1;
 
@@ -2175,7 +2432,14 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
     {
         snew(ed->edpar,1);
         /* Read the whole edi file at once: */
-        read_edi_file(ed,ed->edpar,mtop->natoms,cr);
+        EDstate->nED = read_edi_file(ed,ed->edpar,mtop->natoms,cr);
+
+        /* Make sure the checkpoint was produced in a run using this .edi file */
+        if (EDstate->bFromCpt)
+        {
+            crosscheck_edi_file_vs_checkpoint(ed, EDstate);
+        }
+        init_edsamstate(ed, EDstate);
 
         /* Initialization for every ED/flooding dataset. Flooding uses one edi dataset per
          * flooding vector, Essential dynamics can be applied to more than one structure
@@ -2187,10 +2451,9 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             init_edi(mtop,ir,cr,ed,edi);
 
             /* Init flooding parameters if needed */
-            init_flood(edi,ed,ir->delta_t,cr);
+            init_flood(edi,ed,ir->delta_t,cr,!EDstate->bFromCpt);
 
             edi=edi->next_edi;
-            numedis++;
         }
     }
 
@@ -2209,32 +2472,34 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
         edi=ed->edpar;
 
         /* Loop over all ED/flooding data sets (usually only one, though) */
-        for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+        for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
         {
-            /* We use srenew to allocate memory since the size of the buffers
-             * is likely to change with every ED dataset */
-            srenew(xfit  , edi->sref.nr );
-            srenew(xstart, edi->sav.nr  );
-
-            /* Extract the positions of the atoms to which will be fitted */
-            for (i=0; i < edi->sref.nr; i++)
+            /* Extract the initial reference and average positions. When starting
+             * from .cpt, these have already been read into sref.x_old
+             * in init_edsamstate() */
+            if (!EDstate->bFromCpt)
             {
-                copy_rvec(x_pbc[edi->sref.anrs[i]], xfit[i]);
+                /* If this is the first run (i.e. no checkpoint present) we assume
+                 * that the starting positions give us the correct PBC representation */
+                for (i=0; i < edi->sref.nr; i++)
+                {
+                    copy_rvec(x_pbc[edi->sref.anrs[i]], edi->sref.x_old[i]);
+                }
 
-                /* Save the sref positions such that in the next time step we can make the ED group whole
-                 * in case any of the atoms do not have the correct PBC representation */
-                copy_rvec(xfit[i], edi->sref.x_old[i]);
+                for (i=0; i < edi->sav.nr; i++)
+                {
+                    copy_rvec(x_pbc[edi->sav.anrs[i]], edi->sav.x_old[i]);
+                }
             }
 
-            /* Extract the positions of the atoms subject to ED sampling */
-            for (i=0; i < edi->sav.nr; i++)
-            {
-                copy_rvec(x_pbc[edi->sav.anrs[i]], xstart[i]);
-
-                /* Save the sav positions such that in the next time step we can make the ED group whole
-                 * in case any of the atoms do not have the correct PBC representation */
-                copy_rvec(xstart[i], edi->sav.x_old[i]);
-            }
+            /* Now we have the PBC-correct start positions of the reference and
+               average structure. We copy these over to dummy arrays on which we
+               can apply fitting to print out the RMSD. We srenew the memory since
+               the size of the buffers is likely different for every ED dataset */
+            srenew(xfit  , edi->sref.nr );
+            srenew(xstart, edi->sav.nr  );
+            copy_rvecn(edi->sref.x_old, xfit, 0, edi->sref.nr);
+            copy_rvecn(edi->sav.x_old, xstart, 0, edi->sav.nr);
 
             /* Make the fit to the REFERENCE structure, get translation and rotation */
             fit_to_reference(xfit, fit_transvec, fit_rotmat, edi);
@@ -2278,12 +2543,17 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                     avindex = edi->star.nr - edi->sav.nr;
                 }
                 rad_project(edi, &edi->star.x[avindex], &edi->vecs.radcon, cr);
-            } else
+            }
+            else
+            {
                 rad_project(edi, xstart, &edi->vecs.radcon, cr);
+            }
 
             /* process structure that will serve as origin of expansion circle */
             if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
+            {
                 fprintf(stderr, "ED: Setting center of flooding potential (0 = average structure)\n");
+            }
 
             if (edi->sori.nr > 0)
             {
@@ -2325,7 +2595,9 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                     {
                         fprintf(stderr, "ED: A (possibly changing) ref. projection will define the flooding potential center.\n");
                         for (i=0; i<edi->flood.vecs.neig; i++)
+                        {
                             edi->flood.vecs.refproj[i] = edi->flood.vecs.refproj0[i];
+                        }
                     }
                     else
                     {
@@ -2333,7 +2605,9 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                         /* Set center of flooding potential to the center of the covariance matrix,
                          * i.e. the average structure, i.e. zero in the projected system */
                         for (i=0; i<edi->flood.vecs.neig; i++)
+                        {
                             edi->flood.vecs.refproj[i] = 0.0;
+                        }
                     }
                 }
             }
@@ -2342,9 +2616,11 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             {
                 for (i=0; i<edi->flood.vecs.neig; i++)
                 {
-                    fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", i, edi->flood.vecs.refproj[i]);
+                    fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", edi->flood.vecs.ieig[i], edi->flood.vecs.refproj[i]);
                     if (edi->flood.bHarmonic)
+                    {
                         fprintf(stdout, " (adding %11.4e/timestep)", edi->flood.vecs.refprojslope[i]);
+                    }
                     fprintf(stdout, "\n");
                 }
             }
@@ -2354,8 +2630,10 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             rad_project(edi, xstart, &edi->vecs.linfix, cr);
 
             /* Output to file, set the step to -1 so that write_edo knows it was called from init_edsam */
-            if (ed->edo && !(ed->bStartFromCpt))
+            if (ed->edo && !(EDstate->bFromCpt))
+            {
                 write_edo(nr_edi, edi, ed, -1, 0);
+            }
 
             /* Prepare for the next edi data set: */
             edi=edi->next_edi;
@@ -2370,9 +2648,9 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
     if (PAR(cr))
     {
         /* First let everybody know how many ED data sets to expect */
-        gmx_bcast(sizeof(numedis), &numedis, cr);
+        gmx_bcast(sizeof(EDstate->nED), &EDstate->nED, cr);
         /* Broadcast the essential dynamics / flooding data to all nodes */
-        broadcast_ed_data(cr, ed, numedis);
+        broadcast_ed_data(cr, ed, EDstate->nED);
     }
     else
     {
@@ -2381,7 +2659,7 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
 
         /* Loop over all ED data sets (usually only one, though) */
         edi=ed->edpar;
-        for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+        for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
         {
             edi->sref.anrs_loc = edi->sref.anrs;
             edi->sav.anrs_loc  = edi->sav.anrs;
@@ -2391,13 +2669,17 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             snew(edi->sav.c_ind, edi->sav.nr);
             /* Initialize the array */
             for (i=0; i<edi->sav.nr; i++)
+            {
                 edi->sav.c_ind[i] = i;
+            }
             /* In the general case we will need a different-sized array for the reference indices: */
             if (!edi->bRefEqAv)
             {
                 snew(edi->sref.c_ind, edi->sref.nr);
                 for (i=0; i<edi->sref.nr; i++)
+                {
                     edi->sref.c_ind[i] = i;
+                }
             }
             /* Point to the very same array in case of other structures: */
             edi->star.c_ind = edi->sav.c_ind;
@@ -2416,7 +2698,7 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
     /* Allocate space for ED buffer variables */
     /* Again, loop over ED data sets */
     edi=ed->edpar;
-    for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+    for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
     {
         /* Allocate space for ED buffer */
         snew(edi->buf, 1);
@@ -2451,7 +2733,9 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
     /* Flush the edo file so that the user can check some things
      * when the simulation has started */
     if (ed->edo)
+    {
         fflush(ed->edo);
+    }
 
     GMX_MPE_LOG(ev_edsam_finish);
 }
@@ -2479,12 +2763,16 @@ void do_edsam(t_inputrec  *ir,
 
     /* Check if ED sampling has to be performed */
     if ( ed->eEDtype==eEDnone )
+    {
         return;
+    }
 
     /* Suppress output on first call of do_edsam if
      * two-step sd2 integrator is used */
     if ( (ir->eI==eiSD2) && (v != NULL) )
+    {
         bSuppress = TRUE;
+    }
 
     dt_1 = 1.0/ir->delta_t;
 
@@ -2500,8 +2788,10 @@ void do_edsam(t_inputrec  *ir,
             buf=edi->buf->do_edsam;
 
             if (ed->bFirst)
+            {
                 /* initialise radacc radius for slope criterion */
                 buf->oldrad=calc_radius(&edi->vecs.radacc);
+            }
 
             /* Copy the positions into buf->xc* arrays and after ED
              * feed back corrections to the official positions */
@@ -2519,8 +2809,10 @@ void do_edsam(t_inputrec  *ir,
 #endif
             /* Only assemble reference positions if their indices differ from the average ones */
             if (!edi->bRefEqAv)
+            {
                 communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
                         edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+            }
 
             /* If bUpdateShifts was TRUE then the shifts have just been updated in communicate_group_positions.
              * We do not need to update the shifts until the next NS step. Note that dd_make_local_ed_indices
@@ -2532,9 +2824,13 @@ void do_edsam(t_inputrec  *ir,
 
             /* Fit the reference indices to the reference structure */
             if (edi->bRefEqAv)
+            {
                 fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+            }
             else
+            {
                 fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+            }
 
             /* Now apply the translation and rotation to the ED structure */
             translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
@@ -2574,8 +2870,11 @@ void do_edsam(t_inputrec  *ir,
                     project(buf->xcoll, edi);
                     rad_project(edi, buf->xcoll, &edi->vecs.radacc, cr);
                     buf->oldrad = 0.0;
-                } else
+                }
+                else
+                {
                     buf->oldrad = edi->vecs.radacc.radius;
+                }
             }
 
             /* apply the constraints */
@@ -2591,7 +2890,9 @@ void do_edsam(t_inputrec  *ir,
             {
                 project(buf->xcoll, edi);
                 if (MASTER(cr) && !bSuppress)
+                {
                     write_edo(edinr, edi, ed, step, rmsdev);
+                }
             }
 
             /* Copy back the positions unless monitoring only */
index dc0758a3fccc3ecd0592fee4a932c8e4af81a476..c1955a99548da156ae3e27ad8b764c97997509dd 100644 (file)
@@ -1433,7 +1433,7 @@ static void pick_nbnxn_kernel_cpu(FILE *fp,
 #endif
         if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
         {
-#ifdef GMX_NBNXN_SIMD_2XNN
+#ifdef GMX_NBNXN_SIMD_4XN
             *kernel_type = nbnxnk4xN_SIMD_4xN;
 #else
             gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
index 97a0ef84b9dc535806fa6e60a7935f913619edb4..ce5a6734c8953ba4ab7bb48f1890e226871e84ed 100644 (file)
@@ -108,7 +108,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
     real       *nbfp_i;
     int        n,ci,ci_sh;
     int        ish,ishf;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
 
@@ -213,8 +213,15 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef CALC_ENERGIES
 #ifndef ENERGY_GROUPS
@@ -237,8 +244,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
             }
         }
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
 #ifdef CALC_ENERGIES
             real Vc_sub_self;
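
The flag logic introduced above folds the five possible LJ/Coulomb combinations of a pair-list entry onto three inner loops. A standalone sketch of the resulting truth table, with illustrative flag bits standing in for the real NBNXN_CI_* masks (not part of the patch):

    #include <stdio.h>

    #define DO_LJ   (1<<0)  /* stands in for NBNXN_CI_DO_LJ(0)   */
    #define DO_COUL (1<<1)  /* stands in for NBNXN_CI_DO_COUL(0) */
    #define HALF_LJ (1<<2)  /* stands in for NBNXN_CI_HALF_LJ(0) */

    int main(void)
    {
        int flags;

        for (flags = 0; flags < 8; flags++)
        {
            int do_LJ     = (flags & DO_LJ)   != 0;
            int do_coul   = (flags & DO_COUL) != 0;
            int want_half = (flags & HALF_LJ) != 0;
            /* no-LJ + C is served by the half-LJ + C inner loop */
            int half_LJ   = (want_half || !do_LJ) && do_coul;

            /* entries with neither LJ nor Coulomb do not occur in real lists */
            printf("LJ=%d half=%d C=%d -> %s\n", do_LJ, want_half, do_coul,
                   half_LJ ? "inner half-LJ + C"
                           : (do_coul ? "inner LJ + C" : "inner LJ only"));
        }
        return 0;
    }
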
index cab66c3e346310461148ab9489e42a5cde9eab5b..fa50cbeb4b7e6346ec0541bf142a6028a6fd4eac 100644 (file)
 #ifdef CALC_LJ
 
 #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
-            load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+            load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE0,c12_SSE0);
 #ifndef HALF_LJ
-            load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+            load_lj_pair_params2(nbfp2,nbfp3,type,aj,c6_SSE2,c12_SSE2);
 #endif
 #endif /* not defined any LJ rule */
 
index faa445efbfb1fb2ef9d67becdbc88ed1a9b46de2..f656e4d6dd05eba0c8023d140866291090052cc0 100644 (file)
@@ -35,7 +35,7 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+/* GMX_MM256_HERE should be set before including this file */
 #include "gmx_simd_macros.h"
 
 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
 #define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
 
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE     4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#if defined GMX_MM256_HERE
 #define STRIDE     4
 #endif 
 
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
-#else
-/* SSE double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
-#endif
-#endif
-
 #ifdef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 2x(4+4) kernel */
 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
 #define TAB_FDV0
 #else
-/* AVX double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+#error "unsupported kernel configuration"
 #endif
 #endif
 
@@ -167,7 +152,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     int        nbfp_stride;
     int        n,ci,ci_sh;
     int        ish,ish3;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        sci,scix,sciy,sciz,sci2;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
@@ -206,9 +191,6 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     gmx_mm_pr  diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
     gmx_mm_pr  diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
 
-#ifndef GMX_MM256_HERE
-    __m128i    zeroi_SSE = _mm_setzero_si128();
-#endif
 #ifdef GMX_X86_SSE4_1
     gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
 #endif
@@ -229,8 +211,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     const real *tab_coul_V;
 #endif
 #ifdef GMX_MM256_HERE
-    int        ti0_array[2*UNROLLJ-1],*ti0;
-    int        ti2_array[2*UNROLLJ-1],*ti2;
+    int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+    int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
 #endif
 #ifdef CALC_ENERGIES
     gmx_mm_pr  mhalfsp_SSE;
@@ -310,9 +292,9 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
 
 #ifdef CALC_COUL_TAB
 #ifdef GMX_MM256_HERE
-    /* Generate aligned table pointers */
-    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+    /* Generate aligned table index pointers */
+    ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 #endif
 
     invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
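
For reference, the pointer arithmetic above is the usual round-up-and-mask alignment idiom: over-allocate by WIDTH-1 elements, then mask the address down to a WIDTH*sizeof(int) boundary, which is why the array size and the mask now consistently use GMX_SIMD_WIDTH_HERE and sizeof(int). A standalone sketch (not from the patch; WIDTH stands in for GMX_SIMD_WIDTH_HERE):

    #include <stdint.h>
    #include <stdio.h>

    #define WIDTH 8

    int main(void)
    {
        int  buf[2*WIDTH-1];  /* worst case still leaves WIDTH aligned slots */
        int *aligned;

        /* add WIDTH-1 elements, then mask down to the alignment boundary */
        aligned = (int *)(((uintptr_t)(buf + WIDTH - 1)) &
                          ~((uintptr_t)(WIDTH*sizeof(int) - 1)));

        printf("raw %p -> aligned %p (offset mod %zu = %zu)\n",
               (void *)buf, (void *)aligned, WIDTH*sizeof(int),
               (size_t)((uintptr_t)aligned % (WIDTH*sizeof(int))));
        return 0;
    }
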
@@ -407,7 +389,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     egps_jshift  = 2*nbat->neg_2log;
     egps_jmask   = (1<<egps_jshift) - 1;
     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
-    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    /* Major division is over i-particle energy groups; determine the stride */
     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 #endif
 
@@ -420,9 +402,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
 
         ish              = (nbln->shift & NBNXN_CI_SHIFT);
         ish3             = ish*3;
-        cjind0           = nbln->cj_ind_start;      
-        cjind1           = nbln->cj_ind_end;    
-        /* Currently only works super-cells equal to sub-cells */
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
@@ -441,8 +422,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
         sci             += (ci & 1)*(STRIDE>>1);
 #endif
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef ENERGY_GROUPS
         egps_i = nbat->energrp[ci];
@@ -513,8 +501,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
         iz_SSE0          = gmx_add_pr(gmx_load2_hpr(x+sciz)  ,shZ_SSE);
         iz_SSE2          = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
             gmx_mm_pr facel_SSE;
 
index 1ab915deaecc3e5670ac0de2fffbd6b7da69bf78..ee6e0051f1b7e42bc02118e58ecce16ac444a902 100644 (file)
 
 #ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
 #define SUM_SIMD(x) SUM_SIMD4(x)
 #define TAB_FDV0
 #else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
 #define SUM_SIMD(x) (x[0]+x[1])
 #endif
 #endif
 
 #ifdef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
 #define TAB_FDV0
 #else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
 #define SUM_SIMD(x) SUM_SIMD4(x)
 #endif
 #endif
@@ -167,7 +167,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     int        nbfp_stride;
     int        n,ci,ci_sh;
     int        ish,ish3;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        sci,scix,sciy,sciz,sci2;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
@@ -203,7 +203,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
 #endif
 
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
     __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
     __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
@@ -216,7 +216,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
     __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
 #endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
     /* AVX: use floating point masks, as there are no integer instructions */
 #ifndef GMX_DOUBLE
     gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
@@ -230,7 +231,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 #endif
 #endif
 
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
     __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
     __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
@@ -246,7 +247,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
     __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 #endif
-#else /* GMX_MM256_HERE */
+#endif
+#ifdef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
     gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
     gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
@@ -264,7 +266,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 #endif
 #endif
 
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
     __m128i    zeroi_SSE = _mm_setzero_si128();
 #endif
 #ifdef GMX_X86_SSE4_1
@@ -289,10 +291,10 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     const real *tab_coul_V;
 #endif
 #ifdef GMX_MM256_HERE
-    int        ti0_array[2*UNROLLJ-1],*ti0;
-    int        ti1_array[2*UNROLLJ-1],*ti1;
-    int        ti2_array[2*UNROLLJ-1],*ti2;
-    int        ti3_array[2*UNROLLJ-1],*ti3;
+    int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+    int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1],*ti1;
+    int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
+    int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1],*ti3;
 #endif
 #ifdef CALC_ENERGIES
     gmx_mm_pr  mhalfsp_SSE;
@@ -376,11 +378,11 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 
 #ifdef CALC_COUL_TAB
 #ifdef GMX_MM256_HERE
-    /* Generate aligned table pointers */
-    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+    /* Generate aligned table index pointers */
+    ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 #endif
 
     invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
@@ -475,7 +477,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     egps_jshift  = 2*nbat->neg_2log;
     egps_jmask   = (1<<egps_jshift) - 1;
     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
-    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    /* Major division is over i-particle energy groups; determine the stride */
     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 #endif
 
@@ -488,9 +490,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 
         ish              = (nbln->shift & NBNXN_CI_SHIFT);
         ish3             = ish*3;
-        cjind0           = nbln->cj_ind_start;      
-        cjind1           = nbln->cj_ind_end;    
-        /* Currently only works super-cells equal to sub-cells */
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
@@ -509,8 +510,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
         sci             += (ci & 1)*(STRIDE>>1);
 #endif
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef ENERGY_GROUPS
         egps_i = nbat->energrp[ci];
@@ -585,8 +593,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
         iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
         iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
             iq_SSE0      = gmx_set1_pr(facel*q[sci]);
             iq_SSE1      = gmx_set1_pr(facel*q[sci+1]);
index 45ab2aedcc9205f3d7d86d41cb3fbef186d09ef2..dc7112e7a3e9d74b6a6a7f0595413234e12e9f55 100644 (file)
  *   energy group pair energy storage
  */
 
+/* Transpose 2 double precision registers */
 #define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1)                      \
 {                                                                       \
-    out0 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(0,0));                   \
-    out1 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(1,1));                   \
+    out0 = _mm_unpacklo_pd(in0,in1);                                    \
+    out1 = _mm_unpackhi_pd(in0,in1);                                    \
 }
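
The rewritten macro uses the unpack instructions, which interleave the low elements and the high elements of the two inputs; this is the same 2x2 transpose the replaced shuffles computed, expressed with the instructions intended for it. A standalone sketch of the effect (not from the patch):

    /* Compile with SSE2 enabled, e.g. gcc -msse2 */
    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        __m128d in0 = _mm_set_pd(1.0, 0.0);  /* elements (a0=0, a1=1) */
        __m128d in1 = _mm_set_pd(3.0, 2.0);  /* elements (b0=2, b1=3) */
        double  o0[2], o1[2];

        _mm_storeu_pd(o0, _mm_unpacklo_pd(in0, in1));  /* (a0, b0) */
        _mm_storeu_pd(o1, _mm_unpackhi_pd(in0, in1));  /* (a1, b1) */

        printf("out0 = (%g, %g), out1 = (%g, %g)\n", o0[0], o0[1], o1[0], o1[1]);
        return 0;
    }
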
 
 #if defined GMX_MM128_HERE || !defined GMX_DOUBLE
+/* Collect elements 0 and 1 of the 4 inputs to out0 and out1, respectively */
 #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1)    \
 {                                                                       \
     __m128 _c01,_c23;                                                   \
-    _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(1,0,1,0));                \
-    _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(1,0,1,0));                \
+    _c01 = _mm_movelh_ps(in0,in1);                                      \
+    _c23 = _mm_movelh_ps(in2,in3);                                      \
     out0 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0));              \
     out1 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(3,1,3,1));              \
 }
 #else
+/* Collect elements 0 and 1 of the 4 inputs to out0 and out1, respectively */
 #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1)    \
 {                                                                       \
     __m256d _c01,_c23;                                                  \
@@ -72,6 +75,7 @@
 }
 #endif
 
+/* Collect element 2 of the 4 inputs to out */
 #define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0,in1,in2,in3,out)           \
 {                                                                       \
     __m128 _c01,_c23;                                                   \
 
 #ifndef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out)                   \
 {                                                                       \
-    _MM_TRANSPOSE4_PS(i_SSE0,i_SSE1,i_SSE2,i_SSE3);                     \
-    i_SSE0 = _mm_add_ps(i_SSE0,i_SSE1);                                 \
-    i_SSE2 = _mm_add_ps(i_SSE2,i_SSE3);                                 \
-    o_SSE  = _mm_add_ps(i_SSE0,i_SSE2);                                 \
+    _MM_TRANSPOSE4_PS(in0,in1,in2,in3);                                 \
+    in0 = _mm_add_ps(in0,in1);                                          \
+    in2 = _mm_add_ps(in2,in3);                                          \
+    out  = _mm_add_ps(in0,in2);                                         \
 }
 #else
-#define GMX_MM_TRANSPOSE_SUM2_PD(i_SSE0,i_SSE1,o_SSE)                   \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM2_PD(in0,in1,out)                           \
 {                                                                       \
-    GMX_MM_TRANSPOSE2_PD(i_SSE0,i_SSE1);                                \
-    o_SSE  = _mm_add_pd(i_SSE0,i_SSE1);                                 \
+    GMX_MM_TRANSPOSE2_PD(in0,in1);                                      \
+    out  = _mm_add_pd(in0,in1);                                         \
 }
 #endif
 #else
 #ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out)                   \
 {                                                                       \
-    i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE1);                             \
-    i_SSE2 = _mm256_hadd_ps(i_SSE2,i_SSE3);                             \
-    i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
-    o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
+    in0 = _mm256_hadd_ps(in0,in1);                                      \
+    in2 = _mm256_hadd_ps(in2,in3);                                      \
+    in1 = _mm256_hadd_ps(in0,in2);                                      \
+    out = _mm_add_ps(_mm256_castps256_ps128(in1),_mm256_extractf128_ps(in1,1)); \
 }
-#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE)                  \
+/* Sum the elements of the halves of each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4H_PR(in0,in2,out)                          \
 {                                                                       \
-    i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps());                \
-    i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps());                \
-    i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
-    i_SSE2 = _mm256_permute_ps(i_SSE0,0b10110001);                      \
-    o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+    in0 = _mm256_hadd_ps(in0,_mm256_setzero_ps());                      \
+    in2 = _mm256_hadd_ps(in2,_mm256_setzero_ps());                      \
+    in0 = _mm256_hadd_ps(in0,in2);                                      \
+    in2 = _mm256_permute_ps(in0,_MM_SHUFFLE(2,3,0,1));                  \
+    out = _mm_add_ps(_mm256_castps256_ps128(in0),_mm256_extractf128_ps(in2,1)); \
 }
 #else
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out)                   \
 {                                                                       \
-    i_SSE0 = _mm256_hadd_pd(i_SSE0,i_SSE1);                             \
-    i_SSE2 = _mm256_hadd_pd(i_SSE2,i_SSE3);                             \
-    o_SSE  = _mm256_add_pd(_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x20),_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x31)); \
+    in0 = _mm256_hadd_pd(in0,in1);                                      \
+    in2 = _mm256_hadd_pd(in2,in3);                                      \
+    out = _mm256_add_pd(_mm256_permute2f128_pd(in0,in2,0x20),_mm256_permute2f128_pd(in0,in2,0x31)); \
 }
 #endif
 #endif
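
One detail worth noting in GMX_MM_TRANSPOSE_SUM4H_PR above: the permute immediate changed from the binary literal 0b10110001, a GCC-only extension, to the portable _MM_SHUFFLE(2,3,0,1). Both encode the same control byte, which swaps adjacent element pairs. A quick check (not from the patch):

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void)
    {
        /* _MM_SHUFFLE(z,y,x,w) == (z<<6)|(y<<4)|(x<<2)|w */
        printf("0x%02X\n", _MM_SHUFFLE(2,3,0,1));  /* prints 0xB1 == 0b10110001 */
        return 0;
    }
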
@@ -136,24 +145,24 @@ gmx_mm128_invsqrt_ps_single(__m128 x)
     return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
 }
 
-/* Do 2/4 double precision invsqrt operations.
- * Doing the SSE rsqrt and the first Newton Raphson iteration
+/* Do 2 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton-Raphson iteration
  * in single precision gives full double precision accuracy.
- * The speed is more than twice as fast as two gmx_mm_invsqrt_pd calls.
+ * The speed is more than double that of two gmx_mm_invsqrt_pd calls.
  */
-#define GMX_MM128_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1)              \
+#define GMX_MM128_INVSQRT2_PD(in0,in1,out0,out1)                        \
 {                                                                       \
     const __m128d half  = _mm_set1_pd(0.5);                             \
     const __m128d three = _mm_set1_pd(3.0);                             \
-    __m128  s_SSE,ir_SSE;                                               \
+    __m128  s,ir;                                                       \
     __m128d lu0,lu1;                                                    \
                                                                         \
-    s_SSE  = _mm_movelh_ps(_mm_cvtpd_ps(i_SSE0),_mm_cvtpd_ps(i_SSE1));  \
-    ir_SSE = gmx_mm128_invsqrt_ps_single(s_SSE);                        \
-    lu0    = _mm_cvtps_pd(ir_SSE);                                      \
-    lu1    = _mm_cvtps_pd(_mm_movehl_ps(ir_SSE,ir_SSE));                \
-    o_SSE0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
-    o_SSE1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+    s    = _mm_movelh_ps(_mm_cvtpd_ps(in0),_mm_cvtpd_ps(in1));          \
+    ir   = gmx_mm128_invsqrt_ps_single(s);                              \
+    lu0  = _mm_cvtps_pd(ir);                                            \
+    lu1  = _mm_cvtps_pd(_mm_movehl_ps(ir,ir));                          \
+    out0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),in0)),lu0)); \
+    out1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),in1)),lu1)); \
 }
 
 #define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
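
The macro above works because each Newton-Raphson step for 1/sqrt(x), lu <- 0.5*lu*(3 - x*lu*lu), roughly doubles the number of correct bits: a single precision estimate refined once in double precision is close to full double accuracy. A scalar sketch of the update (not from the patch):

    /* Link with -lm */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double x  = 2.0;
        double lu = (double)(1.0f/sqrtf((float)x));  /* ~24-bit seed */

        lu = 0.5*lu*(3.0 - x*lu*lu);                 /* one step: ~48 bits */

        printf("refined = %.17g\nlibm    = %.17g\n", lu, 1.0/sqrt(x));
        return 0;
    }
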
@@ -173,19 +182,23 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
     return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
 }
 
-#define GMX_MM256_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1)              \
+/* Do 4 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton-Raphson iteration
+ * in single precision gives full double precision accuracy.
+ */
+#define GMX_MM256_INVSQRT2_PD(in0,in1,out0,out1)                        \
 {                                                                       \
     const __m256d half  = _mm256_set1_pd(0.5);                          \
     const __m256d three = _mm256_set1_pd(3.0);                          \
-    __m256  s_SSE,ir_SSE;                                               \
+    __m256  s,ir;                                                       \
     __m256d lu0,lu1;                                                    \
                                                                         \
-    s_SSE  = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(i_SSE0)),_mm256_cvtpd_ps(i_SSE1),1); \
-    ir_SSE = gmx_mm256_invsqrt_ps_single(s_SSE);                        \
-    lu0    = _mm256_cvtps_pd(_mm256_castps256_ps128(ir_SSE));           \
-    lu1    = _mm256_cvtps_pd(_mm256_extractf128_ps(ir_SSE,1));          \
-    o_SSE0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
-    o_SSE1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+    s    = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(in0)),_mm256_cvtpd_ps(in1),1); \
+    ir   = gmx_mm256_invsqrt_ps_single(s);                              \
+    lu0  = _mm256_cvtps_pd(_mm256_castps256_ps128(ir));                 \
+    lu1  = _mm256_cvtps_pd(_mm256_extractf128_ps(ir,1));                \
+    out0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),in0)),lu0)); \
+    out1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),in1)),lu1)); \
 }
 
 #define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
@@ -236,18 +249,23 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
     GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
 }
 
-#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE)                \
+#define load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE,c12_SSE)        \
 {                                                                       \
-    __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2];                     \
+    __m128 clj_SSE0[UNROLLJ],clj_SSE1[UNROLLJ],c6t_SSE[2],c12t_SSE[2];  \
     int p;                                                              \
                                                                         \
-    for(p=0; p<2*UNROLLJ; p++)                                            \
+    for(p=0; p<UNROLLJ; p++)                                            \
     {                                                                   \
         /* Here we load 4 aligned floats, but we need just 2 */         \
-        clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);          \
+        clj_SSE0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE);        \
     }                                                                   \
-    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
-    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+    for(p=0; p<UNROLLJ; p++)                                            \
+    {                                                                   \
+        /* Here we load 4 aligned floats, but we need just 2 */         \
+        clj_SSE1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE);        \
+    }                                                                   \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE0[0],clj_SSE0[1],clj_SSE0[2],clj_SSE0[3],c6t_SSE[0],c12t_SSE[0]); \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE1[0],clj_SSE1[1],clj_SSE1[2],clj_SSE1[3],c6t_SSE[1],c12t_SSE[1]); \
                                                                         \
     GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE);                     \
     GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
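
The corrected load_lj_pair_params2 above takes a separate LJ parameter-table row pointer for each of the two i-atom pairs packed into the 256-bit register, instead of reading 2*UNROLLJ entries through a single base pointer. A toy sketch of per-i-row lookups (not from the patch; the table layout here is simplified to plain c6,c12 pairs):

    #include <stdio.h>

    #define UNROLLJ 4

    int main(void)
    {
        /* one row per i-type; c6,c12 per j-type (layout illustrative) */
        static const float nbfp[2][4] = {
            { 1.f, 10.f, 2.f, 20.f },  /* i-type 0 vs j-types 0,1 */
            { 3.f, 30.f, 4.f, 40.f },  /* i-type 1 vs j-types 0,1 */
        };
        const float *nbfp0 = nbfp[0];  /* row for the first packed i-atom  */
        const float *nbfp1 = nbfp[1];  /* row for the second packed i-atom */
        int type[UNROLLJ] = { 0, 1, 1, 0 };
        int p;

        for (p = 0; p < UNROLLJ; p++)
        {
            printf("j=%d: i0 c6=%g c12=%g | i1 c6=%g c12=%g\n", p,
                   nbfp0[type[p]*2], nbfp0[type[p]*2+1],
                   nbfp1[type[p]*2], nbfp1[type[p]*2+1]);
        }
        return 0;
    }
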
@@ -298,7 +316,9 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
  * But AMD CPUs perform significantly worse with gcc than with icc.
  * Performance is improved a bit by using the extract function UNROLLJ times,
  * instead of doing an _mm_store_si128 for every i-particle.
- * With AVX this significantly deteriorates performance (8 extracts iso 4).
+ * This is only faster when we use FDV0 formatted tables, where we also need
+ * to multiply the index by 4, which can be done by a SIMD bit shift.
+ * With single precision AVX, 8 extracts are much slower than 1 store.
  * Because of this, the load_table_f macro always takes the ti parameter,
  * but it is only used with AVX.
  */
index 0358e3523d40217794eafd3a8067056093e33f36..6b39abd7818db0c19854936c306fc6de35914423 100644 (file)
@@ -424,6 +424,7 @@ static real grid_atom_density(int n,rvec corner0,rvec corner1)
 
 static int set_grid_size_xy(const nbnxn_search_t nbs,
                             nbnxn_grid_t *grid,
+                            int dd_zone,
                             int n,rvec corner0,rvec corner1,
                             real atom_density,
                             int XFormat)
@@ -470,6 +471,23 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
         grid->ncy = 1;
     }
 
+    grid->sx = size[XX]/grid->ncx;
+    grid->sy = size[YY]/grid->ncy;
+    grid->inv_sx = 1/grid->sx;
+    grid->inv_sy = 1/grid->sy;
+
+    if (dd_zone > 0)
+    {
+        /* This is a non-home zone, add an extra row of cells
+         * for particles communicated for bonded interactions.
+         * These can be beyond the cut-off. It doesn't matter where
+         * they end up on the grid, but for performance it's better
+         * if they don't end up in cells that can be within cut-off range.
+         */
+        grid->ncx++;
+        grid->ncy++;
+    }
+
     /* We need one additional cell entry for particles moved by DD */
     if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
     {
@@ -532,23 +550,39 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
 
     copy_rvec(corner0,grid->c0);
     copy_rvec(corner1,grid->c1);
-    grid->sx = size[XX]/grid->ncx;
-    grid->sy = size[YY]/grid->ncy;
-    grid->inv_sx = 1/grid->sx;
-    grid->inv_sy = 1/grid->sy;
 
     return nc_max;
 }
 
-#define SORT_GRID_OVERSIZE 2
+/* We need to sort particles in grid columns on the z-coordinate.
+ * As particles are very often distributed homogeneously, we use a sorting
+ * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ * by a factor, cast to an int and try to store in that hole. If the hole
+ * is full, we move this or another particle. A second pass is needed to
+ * make the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes
+ * to particles. 4 is the optimal value for a homogeneous particle
+ * distribution and allows for an O(#particles) sort up to distributions
+ * where all particles are concentrated in 1/4 of the space. No NlogN
+ * fallback is implemented, as it can be expensive to detect inhomogeneous
+ * particle distributions.
+ * SGSF is the maximum ratio of holes used; in the worst case all particles
+ * end up in the last hole and we need #particles extra holes at the end.
+ */
+#define SORT_GRID_OVERSIZE 4
 #define SGSF (SORT_GRID_OVERSIZE + 1)
 
+/* Sort the particle indices a on the coordinates x along dim.
+ * Backwards tells if we want decreasing instead of increasing coordinates.
+ * h0 is the minimum of the coordinate range.
+ * invh is the inverse hole spacing.
+ * nsort, the theoretical hole limit, is only used for debugging.
+ * sort is the sorting work array.
+ */
 static void sort_atoms(int dim,gmx_bool Backwards,
                        int *a,int n,rvec *x,
                        real h0,real invh,int nsort,int *sort)
 {
     int i,c;
-    int zi,zim;
+    int zi,zim,zi_min,zi_max;
     int cp,tmp;
 
     if (n <= 1)
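
A compact standalone sketch of the pigeonhole-style pass documented above (not from the patch): scatter indices into an oversized hole array keyed by the scaled coordinate, keep colliding runs ordered by shifting particles forward, then compact; holes are reset to -1 after use so the work array can be reused, matching the invariant the patch introduces in calc_cell_indices().

    #include <stdio.h>

    static void pigeonhole_sort(int *a, int n, const float *x,
                                float h0, float invh, int nsort, int *sort)
    {
        int i, c, zi, cp;

        for (i = 0; i < n; i++)
        {
            zi = (int)((x[a[i]] - h0)*invh);  /* target hole */
            cp = a[i];
            while (sort[zi] >= 0)             /* hole taken: keep the run
                                                 ordered, shift forward */
            {
                if (x[cp] < x[sort[zi]])
                {
                    int tmp  = sort[zi];
                    sort[zi] = cp;
                    cp       = tmp;
                }
                zi++;
            }
            sort[zi] = cp;
        }
        for (zi = c = 0; zi < nsort; zi++)    /* compaction pass */
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;                /* reset hole for reuse */
            }
        }
    }

    int main(void)
    {
        float x[4] = { 0.9f, 0.1f, 0.5f, 0.3f };
        int   a[4] = { 0, 1, 2, 3 };
        int   sort[4*4 + 4];                  /* oversize holes + tail */
        int   i;

        for (i = 0; i < 20; i++)
        {
            sort[i] = -1;
        }
        pigeonhole_sort(a, 4, x, 0.0f, 16.0f, 20, sort);
        for (i = 0; i < 4; i++)
        {
            printf("%d ", a[i]);              /* prints: 1 3 2 0 */
        }
        printf("\n");
        return 0;
    }
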
@@ -557,13 +591,10 @@ static void sort_atoms(int dim,gmx_bool Backwards,
         return;
     }
 
-    /* For small oversize factors clearing the whole area is fastest.
-     * For large oversize we should clear the used elements after use.
-     */
-    for(i=0; i<nsort; i++)
-    {
-        sort[i] = -1;
-    }
+    /* Determine the index range used, so we can limit it for the second pass */
+    zi_min = INT_MAX;
+    zi_max = -1;
+
     /* Sort the particles using a simple index sort */
     for(i=0; i<n; i++)
     {
@@ -588,6 +619,8 @@ static void sort_atoms(int dim,gmx_bool Backwards,
         if (sort[zi] < 0)
         {
             sort[zi] = a[i];
+            zi_min = min(zi_min,zi);
+            zi_max = max(zi_max,zi);
         }
         else
         {
@@ -617,8 +650,10 @@ static void sort_atoms(int dim,gmx_bool Backwards,
                     zim++;
                 }
                 sort[zim] = cp;
+                zi_max = max(zi_max,zim);
             }
             sort[zi] = a[i];
+            zi_max = max(zi_max,zi);
         }
     }
 
@@ -630,16 +665,18 @@ static void sort_atoms(int dim,gmx_bool Backwards,
             if (sort[zi] >= 0)
             {
                 a[c++] = sort[zi];
+                sort[zi] = -1;
             }
         }
     }
     else
     {
-        for(zi=nsort-1; zi>=0; zi--)
+        for(zi=zi_max; zi>=zi_min; zi--)
         {
             if (sort[zi] >= 0)
             {
                 a[c++] = sort[zi];
+                sort[zi] = -1;
             }
         }
     }
@@ -1359,7 +1396,8 @@ static void sort_columns_supersub(const nbnxn_search_t nbs,
 /* Determine in which grid column atoms should go */
 static void calc_column_indices(nbnxn_grid_t *grid,
                                 int a0,int a1,
-                                rvec *x,const int *move,
+                                rvec *x,
+                                int dd_zone,const int *move,
                                 int thread,int nthread,
                                 int *cell,
                                 int *cxy_na)
@@ -1375,50 +1413,78 @@ static void calc_column_indices(nbnxn_grid_t *grid,
 
     n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
     n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
-    for(i=n0; i<n1; i++)
+    if (dd_zone == 0)
     {
-        if (move == NULL || move[i] >= 0)
+        /* Home zone */
+        for(i=n0; i<n1; i++)
         {
-            /* We need to be careful with rounding,
-             * particles might be a few bits outside the local box.
-             * The int cast takes care of the lower bound,
-             * we need to explicitly take care of the upper bound.
-             */
-            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
-            if (cx == grid->ncx)
-            {
-                cx = grid->ncx - 1;
-            }
-            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
-            if (cy == grid->ncy)
+            if (move == NULL || move[i] >= 0)
             {
-                cy = grid->ncy - 1;
-            }
-            /* For the moment cell contains only the, grid local,
-             * x and y indices, not z.
-             */
-            cell[i] = cx*grid->ncy + cy;
+                /* We need to be careful with rounding:
+                 * particles might be a few bits outside the local zone.
+                 * The int cast takes care of the lower bound;
+                 * we will explicitly take care of the upper bound.
+                 */
+                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 
 #ifdef DEBUG_NBNXN_GRIDDING
-            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
+                if (cx < 0 || cx >= grid->ncx ||
+                    cy < 0 || cy >= grid->ncy)
+                {
+                    gmx_fatal(FARGS,
+                              "grid cell cx %d cy %d out of range (max %d %d)\n"
+                              "atom %f %f %f, grid->c0 %f %f",
+                              cx,cy,grid->ncx,grid->ncy,
+                              x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+                }
+#endif
+                /* Take care of potential rounding issues */
+                cx = min(cx,grid->ncx - 1);
+                cy = min(cy,grid->ncy - 1);
+
+                /* For the moment cell will contain only the grid-local
+                 * x and y indices, not z.
+                 */
+                cell[i] = cx*grid->ncy + cy;
+            }
+            else
             {
-                gmx_fatal(FARGS,
-                          "grid cell cx %d cy %d out of range (max %d %d)\n"
-                          "atom %f %f %f, grid->c0 %f %f",
-                          cx,cy,grid->ncx,grid->ncy,
-                          x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+                /* Put this moved particle after the end of the grid,
+                 * so we can process it later without using conditionals.
+                 */
+                cell[i] = grid->ncx*grid->ncy;
             }
-#endif
+
+            cxy_na[cell[i]]++;
         }
-        else
+    }
+    else
+    {
+        /* Non-home zone */
+        for(i=n0; i<n1; i++)
         {
-            /* Put this moved particle after the end of the grid,
-             * so we can process it later without using conditionals.
+            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+            /* For non-home zones there could be particles outside
+             * the non-bonded cut-off range, which have been communicated
+             * for bonded interactions only. For the result it doesn't
+             * matter where these end up on the grid. For performance
+             * we put them in an extra row at the border.
              */
-            cell[i] = grid->ncx*grid->ncy;
-        }
+            cx = max(cx,0);
+            cx = min(cx,grid->ncx - 1);
+            cy = max(cy,0);
+            cy = min(cy,grid->ncy - 1);
 
-        cxy_na[cell[i]]++;
+            /* For the moment cell will contain only the grid-local
+             * x and y indices, not z.
+             */
+            cell[i] = cx*grid->ncy + cy;
+
+            cxy_na[cell[i]]++;
+        }
     }
 }
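
The binning above relies on the int cast truncating toward zero, which floors the non-negative offset; the min() clamp then catches atoms sitting exactly on the upper edge, which would otherwise land one cell out of range. A standalone sketch (not from the patch):

    #include <stdio.h>

    #define min(a,b) (((a) < (b)) ? (a) : (b))

    int main(void)
    {
        float c0 = 0.0f, size = 3.0f;
        int   ncx = 4;
        float inv_sx = ncx/size;
        float xs[4] = { 0.0f, 1.49f, 2.999f, 3.0f };  /* last one on the edge */
        int   i, cx;

        for (i = 0; i < 4; i++)
        {
            cx = (int)((xs[i] - c0)*inv_sx);
            cx = min(cx, ncx - 1);  /* clamp the x == upper-edge case */
            printf("x=%.3f -> cx=%d\n", xs[i], cx);
        }
        return 0;
    }
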
 
@@ -1442,7 +1508,7 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
 #pragma omp parallel for num_threads(nthread) schedule(static)
     for(thread=0; thread<nthread; thread++)
     {
-        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
+        calc_column_indices(grid,a0,a1,x,dd_zone,move,thread,nthread,
                             nbs->cell,nbs->work[thread].cxy_na);
     }
 
@@ -1509,6 +1575,11 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
                 over_alloc_large(ncz_max*grid->na_sc*SGSF);
             srenew(nbs->work[thread].sort_work,
                    nbs->work[thread].sort_work_nalloc);
+            /* When not in use, all elements should be -1 */
+            for(i=0; i<nbs->work[thread].sort_work_nalloc; i++)
+            {
+                nbs->work[thread].sort_work[i] = -1;
+            }
         }
     }
 
@@ -1522,12 +1593,15 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
     }
 
-    /* Set the cell indices for the moved particles */
-    n0 = grid->nc*grid->na_sc;
-    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
-    for(i=n0; i<n1; i++)
+    if (dd_zone == 0)
     {
-        nbs->cell[nbs->a[i]] = i;
+        /* Set the cell indices for the moved particles */
+        n0 = grid->nc*grid->na_sc;
+        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+        for(i=n0; i<n1; i++)
+        {
+            nbs->cell[nbs->a[i]] = i;
+        }
     }
 
     /* Sort the super-cell columns along z into the sub-cells. */
@@ -1672,7 +1749,8 @@ void nbnxn_put_on_grid(nbnxn_search_t nbs,
         nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
     }
 
-    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
+    nc_max_grid = set_grid_size_xy(nbs,grid,
+                                   dd_zone,n-nmoved,corner0,corner1,
                                    nbs->grid[0].atom_density,
                                    nbat->XFormat);
 
@@ -3362,13 +3440,17 @@ static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
     {
         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
 
-        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+        /* The counts below are used for non-bonded pair/flop counts
+         * and should therefore match the available kernel setups.
+         */
+        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
         {
-            nbl->work->ncj_hlj += jlen;
+            nbl->work->ncj_noq += jlen;
         }
-        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
         {
-            nbl->work->ncj_noq += jlen;
+            nbl->work->ncj_hlj += jlen;
         }
 
         nbl->nci++;
index ab2b2e814dbb28278be23926f7c7995d359c9272..a158b5e0811a39dca01d6c77ddab89ffb716b37b 100644 (file)
@@ -43,7 +43,7 @@ add_library(gmxana
             gmx_analyze.c   gmx_anaeig.c    gmx_angle.c     gmx_bond.c      
             gmx_bundle.c    gmx_chi.c       gmx_cluster.c   gmx_confrms.c   
             gmx_covar.c     gmx_current.c   
-            gmx_density.c   gmx_densmap.c   gmx_dih.c       
+            gmx_density.c   gmx_densmap.c       
             gmx_dielectric.c        
             gmx_kinetics.c  gmx_spatial.c   gmx_tune_pme.c
             gmx_dipoles.c   gmx_disre.c     gmx_dist.c      gmx_dyndom.c    
@@ -82,7 +82,7 @@ set(GMX_TOOLS_PROGRAMS
     make_ndx mk_angndx trjcat trjconv trjorder g_wheel 
     xpm2ps genion g_anadock make_edi g_analyze g_anaeig
     g_angle g_bond g_bundle g_chi g_cluster g_confrms g_covar
-    g_current g_density g_densmap g_dih g_dielectric
+    g_current g_density g_densmap g_dielectric
     g_helixorient g_principal g_dipoles g_disre g_dist
     g_dyndom g_enemat g_energy g_lie g_filter g_gyrate
     g_h2order g_hbond g_helix g_mindist g_msd g_morph g_nmeig
diff --git a/src/tools/g_dih.c b/src/tools/g_dih.c
deleted file mode 100644 (file)
index 5b6b448..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
- * Copyright (c) 2012, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <gmx_ana.h>
-
-
-/* This is just a wrapper binary.
-* The code that used to be in g_dih.c is now in gmx_dih.c,
-* where the old main function is called gmx_dih().
-*/
-int
-main(int argc, char *argv[])
-{
-  gmx_dih(argc,argv);
-  return 0;
-}
-
-
-  
index 3f290d6dceae9129b6a5c383d8cdba0c0da80ead..51ba5123b50aa561d843d9901b47730aa5fefcc2 100644 (file)
@@ -97,20 +97,20 @@ int gmx_g_angle(int argc,char *argv[])
 {
   static const char *desc[] = {
     "[TT]g_angle[tt] computes the angle distribution for a number of angles",
-    "or dihedrals. This way you can check whether your simulation",
-    "is correct. With option [TT]-ov[tt] you can plot the average angle of",
-    "a group of angles as a function of time. With the [TT]-all[tt] option",
-    "the first graph is the average, the rest are the individual angles.[PAR]",
+    "or dihedrals.[PAR]",
+    "With option [TT]-ov[tt], you can plot the average angle of",
+    "a group of angles as a function of time. With the [TT]-all[tt] option,",
+    "the first graph is the average and the rest are the individual angles.[PAR]",
     "With the [TT]-of[tt] option, [TT]g_angle[tt] also calculates the fraction of trans",
     "dihedrals (only for dihedrals) as function of time, but this is",
-    "probably only fun for a selected few.[PAR]",
-    "With option [TT]-oc[tt] a dihedral correlation function is calculated.[PAR]",
-    "It should be noted that the index file should contain",
-    "atom-triples for angles or atom-quadruplets for dihedrals.",
+    "probably only fun for a select few.[PAR]",
+    "With option [TT]-oc[tt], a dihedral correlation function is calculated.[PAR]",
+    "It should be noted that the index file must contain",
+    "atom triplets for angles or atom quadruplets for dihedrals.",
     "If this is not the case, the program will crash.[PAR]",
-    "With option [TT]-or[tt] a trajectory file is dumped containing cos and",
-    "sin of selected dihedral angles which subsequently can be used as",
-    "input for a PCA analysis using [TT]g_covar[tt].[PAR]",
+    "With option [TT]-or[tt], a trajectory file is dumped containing cos and",
+    "sin of selected dihedral angles, which subsequently can be used as",
+    "input for a principal components analysis using [TT]g_covar[tt].[PAR]",
     "Option [TT]-ot[tt] plots when transitions occur between",
     "dihedral rotamers of multiplicity 3 and [TT]-oh[tt]",
     "records a histogram of the times between such transitions,",
diff --git a/src/tools/gmx_dih.c b/src/tools/gmx_dih.c
deleted file mode 100644 (file)
index 7956210..0000000
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
- * Copyright (c) 2012, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <math.h>
-
-#include "sysstuff.h"
-#include "string2.h"
-#include "copyrite.h"
-#include "futil.h"
-#include "smalloc.h"
-#include "statutil.h"
-#include "nrama.h"
-#include "physics.h"
-#include "macros.h"
-#include "xvgr.h"
-#include "vec.h"
-#include "gmx_ana.h"
-
-
-#define NOMIN 'X'
-
-static void ana_dih(FILE *out,char *index,int nframes,real dih[],t_dih *dd)
-{
-  int i;
-  real mind,maxd,sum,av,var,prev,width;
-  gmx_bool bTrans;
-  
-  mind=5400,maxd=-5400,sum=0,av=0,var=0;
-
-  prev=dih[0];
-  for(i=0; (i<nframes); i++) {
-    if ((dih[i]-prev) > 180) {
-      /* PBC.. */
-      dih[i]-=360;
-    }
-    else if ((dih[i]-prev) < -180)
-      dih[i]+=360;
-    prev=dih[i];
-      
-    sum+=dih[i];
-    mind=min(mind,dih[i]);
-    maxd=max(maxd,dih[i]);
-  }
-  av=sum/nframes;
-  for(i=0; (i<nframes); i++)
-    var+=sqr(dih[i]-av);
-  var/=nframes;
-  width=(360.0/dd->mult);
-  bTrans=((maxd - mind) > width);
-
-  fprintf(out,"%-10s %10.3f %10.3f %10.3f %10.3f %10.3f %-10s%3.0f\n",
-         index,mind,av,maxd,var,sqrt(var),
-         bTrans ? "Yep" : "",width);
-}
-
-static int find_min(real phi,int ntab,real phitab[])
-{
-  int  i,imin;
-  real mind,mm;
-  real width;
-  /* Set closest minimum to the first one */
-  width=360.0/ntab;
-  mind=fabs(phi-phitab[0]);
-  imin=0;
-  for(i=1; (i<ntab); i++) {
-    mm=fabs(phi-phitab[i]);
-    if (mm < mind) {
-      imin=i;
-      mind=mm;
-    }
-  }
-  if (mind < width*0.5 )
-    return imin;
-  else
-    return -1;
-}
-
-static int vphi(t_dih *dih,real phi,int mult)
-{
-  static real m2[] = { 90, 270 };
-  static real m3[] = { 60, 180, 300 };
-  static real m4[] = { 45, 135, 225, 315 };
-  static real m6[] = { 30, 90, 150, 210, 270, 330 };
-
-  real phiref;
-  int  vpp=0;
-  
-  phiref=RAD2DEG*(phi-dih->phi0);
-  while (phiref < 0)
-    phiref+=360;
-  while (phiref > 360)
-    phiref-=360;
-  
-  switch(mult) {
-  case 2:
-    vpp=find_min(phiref,2,m2);
-    break;
-  case 3:
-    vpp=find_min(phiref,3,m3);
-    break;
-  case 4:
-    vpp=find_min(phiref,4,m4);
-    break;
-  case 6:
-    vpp=find_min(phiref,6,m6);
-    break;
-  default:
-    gmx_fatal(FARGS,"No such multiplicity %d",dih->mult);
-  }
-
-  if (vpp == -1)
-    return NOMIN;
-  else
-    return vpp+'0';
-}
-
-typedef struct t_cluster {
-  int    ndih;
-  int    freq;
-  char   *minimum;
-  struct t_cluster *next;
-} t_cluster;
-
-static t_cluster *search_cluster(t_cluster *cl,char *minimum)
-{
-  t_cluster *ccl=cl;
-
-  while (ccl != NULL) {
-    if (strcmp(minimum,ccl->minimum)==0)
-      return ccl;
-    ccl=ccl->next;
-  }
-  return NULL;
-}
-
-static void add_cluster(t_cluster **cl,int ndih,char *minimum)
-{
-  t_cluster *loper;
-  t_cluster *ccl;
-
-  snew(ccl,1);
-  ccl->ndih=ndih;
-  ccl->freq=1;
-  ccl->minimum=strdup(minimum);
-  ccl->next=NULL;
-  
-  if (*cl == NULL)
-    *cl=ccl;
-  else {
-    loper=*cl;
-    while (loper->next != NULL) 
-      loper=loper->next;
-    loper->next=ccl;
-  }
-}
-
-static void p_cluster(FILE *out,t_cluster *cl)
-{
-  t_cluster *loper;
-
-  fprintf(out,"* * * C L U S T E R   A N A L Y S I S * * *\n\n");
-  fprintf(out," Frequency  Dihedral minima\n");
-  loper=cl;
-  while (loper != NULL) {
-    fprintf(out,"%10d  %s\n",loper->freq,loper->minimum);
-    loper=loper->next;
-  }
-}
-
-static void ana_cluster(FILE *out, t_xrama *xr,real **dih,real time[],
-                       t_topology *top,int nframes,int mult)
-{
-  t_cluster *cl=NULL,*scl;
-  char      *minimum;
-  int       i,j,nx;
-
-  /* Number of dihedrals + terminating NULL 
-   * this allows for using string routines
-   */
-  snew(minimum,xr->ndih+1);
-  
-  for(i=0; (i<nframes); i++) {
-    nx=0;
-    for(j=0; (j<xr->ndih); j++) {
-      minimum[j] = vphi(&xr->dih[j],dih[j][i],
-                       mult == -1 ? xr->dih[j].mult : mult);
-      if (minimum[j] == NOMIN)
-       nx++;
-    }
-    if (nx == 0) {
-      if ((scl=search_cluster(cl,minimum)) == NULL)
-       add_cluster(&cl,xr->ndih,minimum);
-      else
-       scl->freq++;
-    }
-  }
-  p_cluster(out,cl);
-
-  sfree(minimum);
-}
-
-static void ana_trans(FILE *out, t_xrama *xr,real **dih,real time[],
-                     t_topology *top,int nframes, const output_env_t oenv)
-{
-  FILE *outd;
-  real prev_phi,prev_psi;
-  int  i,j,phi,psi;
-  char buf[10];
-
-  fprintf(out,"\n\t* * * D I H E D R A L    S T A T I S T I C S * * *\n\n");
-  fprintf(out,"%-10s %10s %10s %10s %10s %10s %10s\n",
-         "index","minimum","average","maximum","variance","std.dev",
-         "transition");
-  for(i=0; (i<xr->ndih); i++) {
-    sprintf(buf,"dih-%d",i);
-    ana_dih(out,buf,nframes,dih[i],&(xr->dih[i]));
-  }
-  for(i=0; (i<xr->npp); i++) {
-    sprintf(buf,"%s",xr->pp[i].label);
-    outd=xvgropen(buf,"Dihedral Angles","Time (ps)","Degrees",oenv);
-
-    phi=xr->pp[i].iphi;
-    psi=xr->pp[i].ipsi;
-    prev_phi=dih[phi][0];
-    prev_psi=dih[psi][0];
-    for(j=0; (j<nframes); j++) {
-      /* PBC.. */
-      if ((dih[phi][j]-prev_phi) > 180) 
-       dih[phi][j]-=360;
-      else if ((dih[phi][j]-prev_phi) < -180)
-       dih[phi][j]+=360;
-      prev_phi=dih[phi][j];
-      if ((dih[psi][j]-prev_psi) > 180) 
-       dih[psi][j]-=360;
-      else if ((dih[psi][j]-prev_psi) < -180)
-       dih[psi][j]+=360;
-      prev_psi=dih[psi][j];
-      fprintf(outd,"%10g  %10g  %10g\n",time[j],prev_phi,prev_psi);
-    }
-    ffclose(outd);
-  }
-}
-
-int gmx_dih(int argc,char *argv[])
-{
-  const char *desc[] = {
-    "[TT]g_dih[tt] can do two things. The default is to analyze dihedral transitions",
-    "by merely computing all the dihedral angles defined in your topology",
-    "for the whole trajectory. When a dihedral flips over to another minimum",
-    "an angle/time plot is made.[PAR]",
-    "The opther option is to discretize the dihedral space into a number of",
-    "bins, and group each conformation in dihedral space in the",
-    "appropriate bin. The output is then given as a number of dihedral",
-    "conformations sorted according to occupancy."
-  };
-  static int  mult = -1;
-  static gmx_bool bSA  = FALSE;
-  t_pargs pa[] = {
-    { "-sa", FALSE, etBOOL, {&bSA},
-      "Perform cluster analysis in dihedral space instead of analysing dihedral transitions." },
-    { "-mult", FALSE, etINT, {&mult},
-      "mulitiplicity for dihedral angles (by default read from topology)" }
-  };
-  FILE       *out;
-  t_xrama    *xr;
-  t_topology *top;
-  real       **dih,*time;
-  real       dd;
-  int        i,nframes,maxframes=1000;
-  output_env_t oenv;
-  t_filenm   fnm[] = {
-    { efTRX, "-f", NULL, ffREAD },
-    { efTPX, NULL, NULL, ffREAD },
-    { efOUT, NULL, NULL, ffWRITE }
-  };
-#define NFILE asize(fnm)
-
-  CopyRight(stderr,argv[0]);
-  parse_common_args(&argc,argv,PCA_CAN_VIEW | PCA_CAN_TIME | PCA_BE_NICE,
-                   NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL,&oenv);
-  
-  if (mult != -1)
-    fprintf(stderr,"Using %d for dihedral multiplicity rather than topology values\n",mult);
-    
-  snew(xr,1);
-  init_rama(oenv,ftp2fn(efTRX,NFILE,fnm),
-           ftp2fn(efTPX,NFILE,fnm),xr,3);
-  top=read_top(ftp2fn(efTPX,NFILE,fnm),NULL);
-              
-  /* Brute force malloc, may be too big... */
-  snew(dih,xr->ndih);
-  for(i=0; (i<xr->ndih); i++)
-    snew(dih[i],maxframes);
-  snew(time,maxframes);
-
-  fprintf(stderr,"\n");
-  nframes = 0;
-  while (new_data(xr)) {
-    for(i=0; (i<xr->ndih); i++) {
-      dd=xr->dih[i].ang*RAD2DEG;
-      while (dd < 0)
-       dd+=360;
-      while (dd > 360)
-       dd-=360;
-      dih[i][nframes]=dd;
-    }
-    time[nframes]=xr->t;
-    nframes++;
-    if (nframes > maxframes) {
-      maxframes += 1000;
-      for(i=0; (i<xr->ndih); i++)
-       srenew(dih[i],maxframes);
-      srenew(time,maxframes);
-    }
-  } 
-
-  fprintf(stderr,"\nCalculated all dihedrals, now analysing...\n");
-
-  out=ftp2FILE(efOUT,NFILE,fnm,"w");
-
-  if (bSA) {
-    /* Cluster and structure analysis */
-    ana_cluster(out,xr,dih,time,top,nframes,mult);
-  }
-  else {
-    /* Analyse transitions... */
-    ana_trans(out,xr,dih,time,top,nframes,oenv);
-  }
-  ffclose(out);
-    
-  thanx(stderr);
-    
-  return 0;
-}
index 705d7a5da36a17b619fe497c297a3b3b7eae0c80..bf86a3e467ee1cecc0c3a8b7dca9fc09c4ae45e5 100644 (file)
@@ -137,7 +137,7 @@ int gmx_genpr(int argc,char *argv[])
   nfn     = opt2fn_null("-n",NFILE,fnm);
   
   if (( nfn == NULL ) && ( xfn == NULL))
-    gmx_fatal(FARGS,"no index file and no structure file suplied");
+    gmx_fatal(FARGS,"no index file and no structure file supplied");
       
   if ((disre_frac < 0) || (disre_frac >= 1))
     gmx_fatal(FARGS,"disre_frac should be between 0 and 1");
index e1165e29b3a09c93328463565c7996553e0a9645..f813d8fc8dcec6c135dfbf00802bdb9d8b0621e9 100644 (file)
@@ -118,14 +118,12 @@ int gmx_helix(int argc,char *argv[])
     "of the", 
     "helix in nm. This is simply the average rise (see above) times the",  
     "number of helical residues (see below).[BR]",
-    "[BB]5.[bb] Number of helical residues (file [TT]n-ahx.xvg[tt]). The title says",
-    "it all.[BR]",
-    "[BB]6.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
-    "[BB]7.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
+    "[BB]5.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
+    "[BB]6.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
     "atoms only (file [TT]rms-ahx.xvg[tt]).[BR]",
-    "[BB]8.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
-    "[BB]9.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
-    "[BB]10.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
+    "[BB]7.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
+    "[BB]8.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
+    "[BB]9.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
     "[PAR]"
   };
   static const char *ppp[efhNR+2] = { 
index 4da7814f2ad1b8d339b83405c8af048630987a5f..3ea5828637bba53be0229f0fc090c1975a0749fe 100644 (file)
@@ -473,7 +473,7 @@ int gmx_make_edi(int argc,char *argv[])
   static const char *desc[] = {
       "[TT]make_edi[tt] generates an essential dynamics (ED) sampling input file to be used with [TT]mdrun[tt]",
       "based on eigenvectors of a covariance matrix ([TT]g_covar[tt]) or from a",
-      "normal modes anaysis ([TT]g_nmeig[tt]).",
+      "normal modes analysis ([TT]g_nmeig[tt]).",
       "ED sampling can be used to manipulate the position along collective coordinates",
       "(eigenvectors) of (biological) macromolecules during a simulation. Particularly,",
       "it may be used to enhance the sampling efficiency of MD simulations by stimulating",
@@ -518,10 +518,16 @@ int gmx_make_edi(int argc,char *argv[])
       "before a new cycle is started.[PAR]",
       "Note on the parallel implementation: since ED sampling is a 'global' thing",
       "(collective coordinates etc.), at least on the 'protein' side, ED sampling",
-      "is not very parallel-friendly from an implentation point of view. Because",
+      "is not very parallel-friendly from an implementation point of view. Because",
       "parallel ED requires some extra communication, expect the performance to be",
-      "lower as in a free MD simulation, especially on a large number of nodes. [PAR]",
-      "All output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a .edo file. In the output",
+      "lower as in a free MD simulation, especially on a large number of nodes and/or",
+      "when the ED group contains a lot of atoms. [PAR]",
+      "Please also note that if your ED group contains more than a single protein,",
+      "then the [TT].tpr[tt] file must contain the correct PBC representation of the ED group.",
+      "Take a look on the initial RMSD from the reference structure, which is printed",
+      "out at the start of the simulation; if this is much higher than expected, one",
+      "of the ED molecules might be shifted by a box vector. [PAR]",
+      "All output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a [TT].edo[tt] file. In the output",
       "file, per OUTFRQ step the following information is present: [PAR]",
       "[TT]*[tt] the step number[BR]",
       "[TT]*[tt] the number of the ED dataset. ([BB]Note[bb] that you can impose multiple ED constraints in",
@@ -537,7 +543,7 @@ int gmx_make_edi(int argc,char *argv[])
       "is kept in that region.",
       "[PAR]",
       "The origin is normally the average structure stored in the [TT]eigvec.trr[tt] file.",
-      "It can be changed with [TT]-ori[tt] to an arbitrary position in configurational space.",
+      "It can be changed with [TT]-ori[tt] to an arbitrary position in configuration space.",
       "With [TT]-tau[tt], [TT]-deltaF0[tt], and [TT]-Eflnull[tt] you control the flooding behaviour.",
       "Efl is the flooding strength, it is updated according to the rule of adaptive flooding.",
       "Tau is the time constant of adaptive flooding, high [GRK]tau[grk] means slow adaption (i.e. growth). ",
index 321d6b099d8ad56bb978ab49b965ce298809dff1..b3471401024f06ac4ffe7986ebeff82753cb965c 100644 (file)
@@ -426,7 +426,7 @@ int gmx_rmsf(int argc,char *argv[])
                                 *(top.atoms.atomname[index[i]]));
        
        fprintf(fp,"%5d  %10.5f  %10.5f\n",
-               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,rmsf[i]*bfac,
+               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,rmsf[i]*bfac,
                pdb_bfac);
       }
     }
@@ -437,7 +437,7 @@ int gmx_rmsf(int argc,char *argv[])
       if (!bRes || i+1==isize ||
          top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
        fprintf(fp,"%5d %8.4f\n",
-               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
     ffclose(fp);
   }
   
@@ -455,7 +455,7 @@ int gmx_rmsf(int argc,char *argv[])
       if (!bRes || i+1==isize ||
          top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
        fprintf(fp,"%5d %8.4f\n",
-               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
     ffclose(fp);
   }
 
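
(Editor's note: all three hunks above make the same correction — the label written to the output file is now the global, 1-based atom number of the selected atom, not its position within the index group. With a hypothetical selection the difference looks like this:

    /* Hypothetical selection of three atoms (0-based global indices) */
    atom_id index[] = { 14, 15, 42 };
    /* old label: i+1        ->  1,  2,  3  (position inside the group) */
    /* new label: index[i]+1 -> 15, 16, 43  (global atom number)        */
)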
index 7ca28ce89f9e0040fbc4b62c7ca7a43add38bb3a..c124f19353bae9e53ab825c15ed72b89a2c923af 100644 (file)
@@ -65,21 +65,22 @@ int gmx_sans(int argc,char *argv[])
         "This is simple tool to compute SANS spectra using Debye formula",
         "It currently uses topology file (since it need to assigne element for each atom)",
         "[PAR]",
-        "[TT]-pr[tt] Computes normalized g(r) function",
-        "[PAR]",
-        "[TT]-sq[tt] Computes SANS intensity curve for needed q diapason",
-        "[PAR]",
-        "[TT]-startq[tt] Starting q value in nm",
-        "[PAR]",
-        "[TT]-endq[tt] Ending q value in nm",
-        "[PAR]",
-        "[TT]-qstep[tt] Stepping in q space",
-        "[PAR]",
+        "Parameters:[PAR]"
+        "[TT]-pr[tt] Computes normalized g(r) function averaged over trajectory[PAR]",
+        "[TT]-prframe[tt] Computes normalized g(r) function for each frame[PAR]",
+        "[TT]-sq[tt] Computes SANS intensity curve averaged over trajectory[PAR]",
+        "[TT]-sqframe[tt] Computes SANS intensity curve for each frame[PAR]",
+        "[TT]-startq[tt] Starting q value in nm[PAR]",
+        "[TT]-endq[tt] Ending q value in nm[PAR]",
+        "[TT]-qstep[tt] Stepping in q space[PAR]",
         "Note: When using Debye direct method computational cost increases as",
-        "1/2 * N * (N - 1) where N is atom number in group of interest"
+        "1/2 * N * (N - 1) where N is atom number in group of interest",
+        "[PAR]",
+        "WARNING: If sq or pr specified this tool can produce large number of files! Up to two times larger than number of frames!"
     };
     static gmx_bool bPBC=TRUE;
-    static real binwidth=0.2,grid=0.05; /* bins shouldnt be smaller then bond (~0.1nm) length */
+    static gmx_bool bNORM=FALSE;
+    static real binwidth=0.2,grid=0.05; /* bins shouldn't be smaller than the smallest bond length (~0.1nm) */
     static real start_q=0.0, end_q=2.0, q_step=0.01;
     static real mcover=-1;
     static unsigned int  seed=0;
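
(Editor's note: the N*(N-1)/2 cost quoted in the help text is the unique-pair sum of the Debye formula. A self-contained sketch of that direct sum — a hypothetical helper, not the GROMACS kernel; b[] are scattering lengths, x[] positions in nm:

    #include <math.h>

    static double debye_direct(double q, int n,
                               const double b[], const double x[][3])
    {
        double intensity = 0.0;
        int    i, j;

        for (i = 0; i < n; i++)
        {
            for (j = i+1; j < n; j++)       /* N*(N-1)/2 unique pairs */
            {
                double dx = x[i][0] - x[j][0];
                double dy = x[i][1] - x[j][1];
                double dz = x[i][2] - x[j][2];
                double r  = sqrt(dx*dx + dy*dy + dz*dz);
                double qr = q*r;

                /* sin(qr)/qr -> 1 as qr -> 0 */
                intensity += b[i]*b[j]*(qr > 0 ? sin(qr)/qr : 1.0);
            }
        }
        return intensity;
    }
)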
@@ -88,7 +89,7 @@ int gmx_sans(int argc,char *argv[])
     static const char *emode[]= { NULL, "direct", "mc", NULL };
     static const char *emethod[]={ NULL, "debye", "fft", NULL };
 
-    gmx_nentron_atomic_structurefactors_t    *gnsf;
+    gmx_neutron_atomic_structurefactors_t    *gnsf;
     gmx_sans_t              *gsans;
 
 #define NPA asize(pa)
@@ -120,7 +121,7 @@ int gmx_sans(int argc,char *argv[])
 #endif
     };
   FILE      *fp;
-  const char *fnTPX,*fnNDX,*fnDAT=NULL;
+  const char *fnTPX,*fnNDX,*fnTRX,*fnDAT=NULL;
   t_trxstatus *status;
   t_topology *top=NULL;
   t_atom    *atom=NULL;
@@ -138,24 +139,30 @@ int gmx_sans(int argc,char *argv[])
   atom_id    *index=NULL;
   int        isize;
   int         i,j;
-  gmx_radial_distribution_histogram_t  *pr=NULL;
-  gmx_static_structurefator_t  *sq=NULL;
+  char       *hdr=NULL;
+  char       *suffix=NULL;
+  t_filenm   *fnmdup=NULL;
+  gmx_radial_distribution_histogram_t  *prframecurrent=NULL, *pr=NULL;
+  gmx_static_structurefactor_t  *sqframecurrent=NULL, *sq=NULL;
   output_env_t oenv;
 
 #define NFILE asize(fnm)
 
   t_filenm   fnm[] = {
-      { efTPX,  "-s",         NULL,   ffREAD },
-      { efNDX,  NULL,         NULL,   ffOPTRD },
-      { efDAT,  "-d",   "nsfactor",   ffOPTRD },
-      { efXVG, "-sq",         "sq",   ffWRITE },
-      { efXVG, "-pr",         "pr",   ffWRITE }
+      { efTPX,  "-s",       NULL,       ffREAD },
+      { efTRX,  "-f",       NULL,       ffREAD },
+      { efNDX,  NULL,       NULL,       ffOPTRD },
+      { efDAT,  "-d",       "nsfactor", ffOPTRD },
+      { efXVG,  "-pr",      "pr",       ffWRITE },
+      { efXVG,  "-sq",       "sq",      ffWRITE },
+      { efXVG,  "-prframe", "prframe",  ffOPTWR },
+      { efXVG,  "-sqframe", "sqframe",  ffOPTWR }
   };
 
   nthreads = gmx_omp_get_max_threads();
 
   CopyRight(stderr,argv[0]);
-  parse_common_args(&argc,argv,PCA_BE_NICE,
+  parse_common_args(&argc,argv,PCA_CAN_TIME | PCA_TIME_UNIT | PCA_BE_NICE,
                     NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL,&oenv);
 
   /* check that binwidth not smaller than smallers distance */
@@ -187,11 +194,22 @@ int gmx_sans(int argc,char *argv[])
       break;
   }
 
-  if (!bDEBYE && !bFFT)
-      gmx_fatal(FARGS,"Unknown method. Set pr or fft!\n");
+  if (bDEBYE) {
+      if (bMC) {
+          fprintf(stderr,"Using Monte Carlo Debye method to calculate spectrum\n");
+      } else {
+          fprintf(stderr,"Using direct Debye method to calculate spectrum\n");
+      }
+  } else if (bFFT) {
+      gmx_fatal(FARGS,"FFT method not implemented!");
+  } else {
+      gmx_fatal(FARGS,"Unknown combination for mode and method!");
+  }
+
   /* Try to read files */
   fnDAT = ftp2fn(efDAT,NFILE,fnm);
   fnTPX = ftp2fn(efTPX,NFILE,fnm);
+  fnTRX = ftp2fn(efTRX,NFILE,fnm);
 
   gnsf = gmx_neutronstructurefactors_init(fnDAT);
   fprintf(stderr,"Read %d atom names from %s with neutron scattering parameters\n\n",gnsf->nratoms,fnDAT);
@@ -213,38 +231,120 @@ int gmx_sans(int argc,char *argv[])
       gmx_rmpbc(gpbc,top->atoms.nr,box,x);
   }
 
-  natoms=top->atoms.nr;
+  natoms=read_first_x(oenv,&status,fnTRX,&t,&x,box);
+  if (natoms != top->atoms.nr) {
+      fprintf(stderr,"\nWARNING: number of atoms in tpx (%d) and trajectory (%d) do not match\n",natoms,top->atoms.nr);
+  }
 
-  if (bDEBYE) {
-      if (bMC) {
-          fprintf(stderr,"Using Monte Carlo Debye method to calculate spectrum\n");
+  do {
+      if (bPBC) {
+          gmx_rmpbc(gpbc,top->atoms.nr,box,x);
+      }
+      /* allocate memory for pr */
+      if (pr == NULL) {
+          /* in case it's the first frame read */
+          snew(pr,1);
+      }
+      /* really calculate p(r) for the current frame */
+      prframecurrent = calc_radial_distribution_histogram(gsans,x,box,index,isize,binwidth,bMC,bNORM,mcover,seed);
+      /* copy prframecurrent -> pr and sum up pr->gr[i] */
+      /* allocate and/or resize memory for pr->gr[i] and pr->r[i] */
+      if (pr->gr == NULL) {
+          /* check if we are using pr->gr for the first time */
+          snew(pr->gr,prframecurrent->grn);
+          snew(pr->r,prframecurrent->grn);
       } else {
-          fprintf(stderr,"Using direct Debye method to calculate spectrum\n");
+          /* resize pr->gr and pr->r if needed to prevent overruns */
+          if(prframecurrent->grn > pr->grn) {
+              srenew(pr->gr,prframecurrent->grn);
+              srenew(pr->r,prframecurrent->grn);
+              /* srenew does not zero new memory, so clear the added bins before summing into them */
+              for(i=pr->grn;i<prframecurrent->grn;i++) {
+                  pr->gr[i]=0.0;
+              }
+          }
       }
-  } else if (bFFT) {
-      gmx_fatal(FARGS,"Not implented!");
-  } else {
-      gmx_fatal(FARGS,"Whats this!");
-  }
-
-  /*  realy calc p(r) */
-  pr = calc_radial_distribution_histogram(gsans,x,box,index,isize,binwidth,bMC,mcover,seed);
+      pr->grn = prframecurrent->grn;
+      pr->binwidth = prframecurrent->binwidth;
+      /* sum up gr and fill r */
+      for(i=0;i<prframecurrent->grn;i++) {
+          pr->gr[i] += prframecurrent->gr[i];
+          pr->r[i] = prframecurrent->r[i];
+      }
+      /* normalize the per-frame histogram before writing it out */
+      normalize_probability(prframecurrent->grn,prframecurrent->gr);
+      /* convert p(r) to sq */
+      sqframecurrent = convert_histogram_to_intensity_curve(prframecurrent,start_q,end_q,q_step);
+      /* print frame data if needed */
+      if(opt2fn_null("-prframe",NFILE,fnm)) {
+          snew(hdr,64); /* room for the formatted time stamp */
+          snew(suffix,GMX_PATH_MAX);
+          /* prepare header */
+          sprintf(hdr,"g(r), t = %f",t);
+          /* prepare output filename */
+          fnmdup = dup_tfn(NFILE,fnm);
+          sprintf(suffix,"-t%.2f",t);
+          add_suffix_to_output_names(fnmdup,NFILE,suffix);
+          fp = xvgropen(opt2fn_null("-prframe",NFILE,fnmdup),hdr,"Distance (nm)","Probability",oenv);
+          for(i=0;i<prframecurrent->grn;i++) {
+              fprintf(fp,"%10.6f%10.6f\n",prframecurrent->r[i],prframecurrent->gr[i]);
+          }
+          done_filenms(NFILE,fnmdup);
+          xvgrclose(fp);
+          sfree(hdr);
+          sfree(suffix);
+          sfree(fnmdup);
+      }
+      if(opt2fn_null("-sqframe",NFILE,fnm)) {
+          snew(hdr,64); /* room for the formatted time stamp */
+          snew(suffix,GMX_PATH_MAX);
+          /* prepare header */
+          sprintf(hdr,"I(q), t = %f",t);
+          /* prepare output filename */
+          fnmdup = dup_tfn(NFILE,fnm);
+          sprintf(suffix,"-t%.2f",t);
+          add_suffix_to_output_names(fnmdup,NFILE,suffix);
+          fp = xvgropen(opt2fn_null("-sqframe",NFILE,fnmdup),hdr,"q (nm^-1)","s(q)/s(0)",oenv);
+          for(i=0;i<sqframecurrent->qn;i++) {
+              fprintf(fp,"%10.6f%10.6f\n",sqframecurrent->q[i],sqframecurrent->s[i]);
+          }
+          done_filenms(NFILE,fnmdup);
+          xvgrclose(fp);
+          sfree(hdr);
+          sfree(suffix);
+          sfree(fnmdup);
+      }
+      /* free the per-frame pr structure */
+      sfree(prframecurrent->gr);
+      sfree(prframecurrent->r);
+      sfree(prframecurrent);
+      /* free the per-frame sq structure */
+      sfree(sqframecurrent->q);
+      sfree(sqframecurrent->s);
+      sfree(sqframecurrent);
+  } while (read_next_x(oenv,status,&t,natoms,x,box));
+  close_trj(status);
 
+  /* normalize the histogram accumulated over all frames */
+  normalize_probability(pr->grn,pr->gr);
+  sq = convert_histogram_to_intensity_curve(pr,start_q,end_q,q_step);
   /* prepare pr.xvg */
   fp = xvgropen(opt2fn_null("-pr",NFILE,fnm),"G(r)","Distance (nm)","Probability",oenv);
   for(i=0;i<pr->grn;i++)
-      fprintf(fp,"%10.6lf%10.6lf\n",pr->r[i],pr->gr[i]);
+      fprintf(fp,"%10.6f%10.6f\n",pr->r[i],pr->gr[i]);
   xvgrclose(fp);
 
   /* prepare sq.xvg */
-  sq = convert_histogram_to_intensity_curve(pr,start_q,end_q,q_step);
   fp = xvgropen(opt2fn_null("-sq",NFILE,fnm),"I(q)","q (nm^-1)","s(q)/s(0)",oenv);
   for(i=0;i<sq->qn;i++) {
-      fprintf(fp,"%10.6lf%10.6lf\n",sq->q[i],sq->s[i]);
+      fprintf(fp,"%10.6f%10.6f\n",sq->q[i],sq->s[i]);
   }
   xvgrclose(fp);
-
+  /*
+   * Clean up memory
+   */
+  sfree(pr->gr);
+  sfree(pr->r);
   sfree(pr);
+  sfree(sq->q);
+  sfree(sq->s);
+  sfree(sq);
 
   please_cite(stdout,"Garmay2012");
   thanx(stderr);
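
(Editor's note: the reason calc_radial_distribution_histogram() gained the bNORM flag is visible in the loop above — per-frame histograms must stay unnormalized so they can be summed into a running total, which is normalized once after the trajectory loop. The pattern, condensed into a hedged sketch with a hypothetical frame reader; snew/srenew/sfree as in GROMACS:

    double *total = NULL;
    double *frame;
    int     nbins = 0, n, i;

    while ((frame = next_frame_histogram(&n)) != NULL)  /* hypothetical */
    {
        if (total == NULL)
        {
            snew(total, n);               /* snew zero-initializes      */
            nbins = n;
        }
        else if (n > nbins)
        {
            srenew(total, n);             /* srenew does not zero...    */
            for (i = nbins; i < n; i++)
            {
                total[i] = 0.0;           /* ...so clear the new bins   */
            }
            nbins = n;
        }
        for (i = 0; i < n; i++)
        {
            total[i] += frame[i];         /* accumulate raw counts      */
        }
        sfree(frame);
    }
    normalize_probability(nbins, total);  /* normalize once at the end  */
)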
index c3f8027f3cf0a5564a061098439574fc5a21d83c..4d819202fe28062f60b9faca02be02824f2b2121 100644 (file)
@@ -66,14 +66,14 @@ void check_mcover(real mcover) {
     }
 }
 
-void normalize_probability(int n,double *a){
+void normalize_probability(int n,double *a) {
     int i;
     double norm=0.0;
     for (i=0;i<n;i++) norm +=a[i];
     for (i=0;i<n;i++) a[i]/=norm;
 }
 
-gmx_nentron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn) {
+gmx_neutron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn) {
     /* read nsfactor.dat */
     FILE    *fp;
     char    line[STRLEN];
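
(Editor's note: normalize_probability() simply rescales an array so its entries sum to one. A tiny hypothetical usage:

    double hist[4] = { 2.0, 4.0, 3.0, 1.0 };

    normalize_probability(4, hist);
    /* hist is now { 0.2, 0.4, 0.3, 0.1 }, i.e. it sums to 1 */
)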
@@ -82,7 +82,7 @@ gmx_nentron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const ch
     int     i, line_no;
     char    atomnm[8];
     double  slength;
-    gmx_nentron_atomic_structurefactors_t   *gnsf;
+    gmx_neutron_atomic_structurefactors_t   *gnsf;
 
     fp=libopen(datfn);
     line_no = 0;
@@ -122,10 +122,10 @@ gmx_nentron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const ch
 
     fclose(fp);
 
-    return (gmx_nentron_atomic_structurefactors_t *) gnsf;
+    return (gmx_neutron_atomic_structurefactors_t *) gnsf;
 }
 
-gmx_sans_t *gmx_sans_init (t_topology *top, gmx_nentron_atomic_structurefactors_t *gnsf) {
+gmx_sans_t *gmx_sans_init (t_topology *top, gmx_neutron_atomic_structurefactors_t *gnsf) {
     gmx_sans_t    *gsans=NULL;
     int     i,j;
     /* Try to assing scattering length from nsfactor.dat */
@@ -159,6 +159,7 @@ gmx_radial_distribution_histogram_t *calc_radial_distribution_histogram (
                             int isize,
                             double binwidth,
                             gmx_bool bMC,
+                            gmx_bool bNORM,
                             real mcover,
                             unsigned int seed) {
     gmx_radial_distribution_histogram_t    *pr=NULL;
@@ -283,8 +284,11 @@ gmx_radial_distribution_histogram_t *calc_radial_distribution_histogram (
 #endif
     }
 
-    /* normalize */
-    normalize_probability(pr->grn,pr->gr);
+    /* normalize if needed */
+    if (bNORM) {
+        normalize_probability(pr->grn,pr->gr);
+    }
+
     snew(pr->r,pr->grn);
     for(i=0;i<pr->grn;i++)
         pr->r[i]=(pr->binwidth*i+pr->binwidth*0.5);
@@ -292,8 +296,8 @@ gmx_radial_distribution_histogram_t *calc_radial_distribution_histogram (
     return (gmx_radial_distribution_histogram_t *) pr;
 }
 
-gmx_static_structurefator_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step) {
-    gmx_static_structurefator_t    *sq=NULL;
+gmx_static_structurefactor_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step) {
+    gmx_static_structurefactor_t    *sq=NULL;
     int         i,j;
     /* init data */
     snew(sq,1);
@@ -318,5 +322,5 @@ gmx_static_structurefator_t *convert_histogram_to_intensity_curve (gmx_radial_di
         }
     }
 
-    return (gmx_static_structurefator_t *) sq;
+    return (gmx_static_structurefactor_t *) sq;
 }
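
(Editor's note: convert_histogram_to_intensity_curve() maps the accumulated g(r) onto a q grid. Assuming it uses the standard Debye transform, the core of the computation looks like this — a sketch of the idea, not the verbatim body; i, j as in the function's locals:

    /* I(q) = sum over r bins of g(r)*sin(q*r)/(q*r) */
    for (i = 0; i < sq->qn; i++)
    {
        double q = start_q + i*q_step;

        sq->s[i] = 0.0;
        for (j = 0; j < pr->grn; j++)
        {
            double qr = q*pr->r[j];

            sq->s[i] += pr->gr[j]*(qr > 0 ? sin(qr)/qr : 1.0);
        }
    }
)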
index 0923299d308d8f6fb1126473932f18248cfccd51..11862daa9af0e44a331146687aa44f7323c7d206 100644 (file)
 extern "C" {
 #endif
 
-typedef struct gmx_nentron_atomic_structurefactors_t {
+typedef struct gmx_neutron_atomic_structurefactors_t {
     int     nratoms;
     int     *p; /* proton number */
     int     *n; /* neuton number */
     double  *slength; /* scattering length in fm */
     char    **atomnm; /* atom symbol */
-} gmx_nentron_atomic_structurefactors_t;
+} gmx_neutron_atomic_structurefactors_t;
 
 typedef struct gmx_sans_t {
     t_topology *top; /* topology */
@@ -68,12 +68,12 @@ typedef struct gmx_radial_distribution_histogram_t {
     double *gr; /* Probability */
 } gmx_radial_distribution_histogram_t;
 
-typedef struct gmx_static_structurefator_t {
+typedef struct gmx_static_structurefactor_t {
     int     qn; /* number of items */
     double  *s; /* scattering */
     double  *q; /* q vectors */
     double  qstep; /* q increment */
-} gmx_static_structurefator_t;
+} gmx_static_structurefactor_t;
 
 void check_binwidth(real binwidth);
 
@@ -81,9 +81,9 @@ void check_mcover(real mcover);
 
 void normalize_probability(int n, double *a);
 
-gmx_nentron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn);
+gmx_neutron_atomic_structurefactors_t *gmx_neutronstructurefactors_init(const char *datfn);
 
-gmx_sans_t *gmx_sans_init(t_topology *top, gmx_nentron_atomic_structurefactors_t *gnsf);
+gmx_sans_t *gmx_sans_init(t_topology *top, gmx_neutron_atomic_structurefactors_t *gnsf);
 
 gmx_radial_distribution_histogram_t *calc_radial_distribution_histogram  (gmx_sans_t *gsans,
                             rvec *x,
@@ -92,10 +92,11 @@ gmx_radial_distribution_histogram_t *calc_radial_distribution_histogram  (gmx_sa
                             int isize,
                             double binwidth,
                             gmx_bool bMC,
+                            gmx_bool bNORM,
                             real mcover,
                             unsigned int seed);
 
-gmx_static_structurefator_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step);
+gmx_static_structurefactor_t *convert_histogram_to_intensity_curve (gmx_radial_distribution_histogram_t *pr, double start_q, double end_q, double q_step);
 
 
 #ifdef __cplusplus
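
(Editor's note: taken together, the renamed nsfactor.h API is used in a pipeline like the following — a hedged sketch; top, x, box, index and isize are assumed to come from the caller, and the numeric arguments are illustrative defaults:

    gmx_neutron_atomic_structurefactors_t *gnsf;
    gmx_sans_t                            *gsans;
    gmx_radial_distribution_histogram_t   *pr;
    gmx_static_structurefactor_t          *sq;

    gnsf  = gmx_neutronstructurefactors_init("nsfactor.dat");
    gsans = gmx_sans_init(top, gnsf);
    pr    = calc_radial_distribution_histogram(gsans, x, box, index, isize,
                                               0.2,   /* binwidth (nm) */
                                               FALSE, /* bMC           */
                                               TRUE,  /* bNORM         */
                                               -1,    /* mcover        */
                                               0);    /* seed          */
    sq    = convert_histogram_to_intensity_curve(pr, 0.0, 2.0, 0.01);
)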