First part of commit for redesigned SIMD module - namechanges.

author Erik Lindahl <erik@kth.se>

Fri, 24 Jan 2014 20:04:47 +0000 (21:04 +0100)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Tue, 11 Feb 2014 13:02:23 +0000 (14:02 +0100)
author Erik Lindahl <erik@kth.se>
Fri, 24 Jan 2014 20:04:47 +0000 (21:04 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Tue, 11 Feb 2014 13:02:23 +0000 (14:02 +0100)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 9a3ce3c3ced4e570b9f3bcf5c3155f60fde9ed05..61eb7d20a60ba4509042ef488a3bca1cee4bcccf 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,22 +185,22 @@ gmx_add_cache_dependency(GMX_COOL_QUOTES BOOL "NOT GMX_FAHCORE" OFF)
  include(gmxManageGPU)
  
  # Detect the architecture the compiler is targetting, detect
-# acceleration possibilities on that hardware, suggest an acceleration
+# SIMD instruction possibilities on that hardware, suggest a SIMD instruction set
  # to use if none is specified, and populate the cache option for CPU
-# accleration.
+# SIMD.
  include(gmxDetectTargetArchitecture)
  gmx_detect_target_architecture()
-include(gmxDetectAcceleration)
-gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
-if("${GMX_SUGGESTED_CPU_ACCELERATION}" STREQUAL "AVX2_256")
+include(gmxDetectSimd)
+gmx_detect_simd(GMX_SUGGESTED_SIMD)
+if("${GMX_SUGGESTED_SIMD}" STREQUAL "AVX2_256")
      message(STATUS "Changing acceleration from AVX2 to AVX (until AVX2 patches commited).")
-    set(GMX_SUGGESTED_CPU_ACCELERATION "AVX_256")
+    set(GMX_SUGGESTED_SIMD "AVX_256")
  endif()
  
  gmx_option_multichoice(
-    GMX_CPU_ACCELERATION
-    "Acceleration for CPU kernels and compiler optimization"
-    "${GMX_SUGGESTED_CPU_ACCELERATION}"
+    GMX_SIMD
+    "SIMD instruction set for CPU kernels and compiler optimization"
+    "${GMX_SUGGESTED_SIMD}"
      None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 IBM_QPX Sparc64_HPC_ACE Reference)
  
  gmx_option_multichoice(
@@ -227,8 +227,8 @@ gmx_option_multichoice(
      None
      none gaussian mopac gamess orca)
  
-gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_TYPE "Reference kernel type (4xn or 2xnn)" STRING "4xn" "GMX_CPU_ACCELERATION STREQUAL REFERENCE")
-gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_WIDTH "Reference kernel width" STRING "4" "GMX_CPU_ACCELERATION STREQUAL REFERENCE")
+gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_TYPE "Reference kernel type (4xn or 2xnn)" STRING "4xn" "GMX_SIMD STREQUAL REFERENCE")
+gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_WIDTH "Reference kernel width" STRING "4" "GMX_SIMD STREQUAL REFERENCE")
  
  option(GMX_BROKEN_CALLOC "Work around broken calloc()" OFF)
  mark_as_advanced(GMX_BROKEN_CALLOC)
@@ -286,7 +286,7 @@ include(gmxCFlags)
  gmx_c_flags()
  
  # This variable should be used for additional compiler flags which are not
-# generated in gmxCFlags nor are acceleration or MPI related.
+# generated in gmxCFlags nor are SIMD or MPI related.
  set(EXTRA_C_FLAGS "")
  set(EXTRA_CXX_FLAGS "")
  
@@ -576,13 +576,13 @@ endif(NOT GMX_SYSTEM_XDR)
  
  
  ##################################################
-# Process CPU acceleration settings
+# Process SIMD instruction settings
  ##################################################
  # This checks what flags to add in order to
  # support the SIMD instructions we need, and sets
-# correct defines for the acceleration supported.
-include(gmxTestCPUAcceleration)
-gmx_test_cpu_acceleration()
+# correct defines for the SIMD instructions supported.
+include(gmxTestSimd)
+gmx_test_simd()
  
  
  # Process QM/MM Settings
@@ -689,16 +689,16 @@ endif()
  # # # # # # # # # # NO MORE TESTS AFTER THIS LINE! # # # # # # # # # # #
  # these are set after everything else
  if (NOT GMX_SKIP_DEFAULT_CFLAGS)
-    set(CMAKE_C_FLAGS "${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "${SIMD_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${SIMD_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${CMAKE_CXX_FLAGS}")
      set(CMAKE_EXE_LINKER_FLAGS "${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
      set(CMAKE_SHARED_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
  else()
      message("Recommended flags which are not added because GMX_SKIP_DEFAULT_CFLAGS=yes:")
-    message("CMAKE_C_FLAGS: ${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${GMXC_CFLAGS}")
+    message("CMAKE_C_FLAGS: ${SIMD_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${GMXC_CFLAGS}")
      message("CMAKE_C_FLAGS_RELEASE: ${GMXC_CFLAGS_RELEASE}")
      message("CMAKE_C_FLAGS_DEBUG: ${GMXC_CFLAGS_DEBUG}")
-    message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${GMXC_CXXFLAGS}")
+    message("CMAKE_CXX_FLAGS: ${SIMD_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${GMXC_CXXFLAGS}")
      message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
      message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
      message("CMAKE_EXE_LINKER_FLAGS: ${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS}")
diff --git a/admin/installguide/installguide.tex b/admin/installguide/installguide.tex

index 953ee9f442217506e0e164db1273770adfa131b5..4368cb539a40f2bdace36bb5ff7d6ab3bc186c96 100644 (file)
--- a/admin/installguide/installguide.tex
+++ b/admin/installguide/installguide.tex
@@ -1,7 +1,7 @@
  %
  % This file is part of the GROMACS molecular simulation package.
  %
-% Copyright (c) 2013, by the GROMACS development team, led by
+% Copyright (c) 2013,2014, by the GROMACS development team, led by
  % Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  % and including many others, as listed in the AUTHORS file in the
  % top-level source directory and at http://www.gromacs.org.
@@ -425,6 +425,45 @@ also do this kind of thing with \verb+ccmake+, but you should avoid
  this, because the options set with '\verb+-D+' will not be able to be
  changed interactively in that run of \verb+ccmake+.
  
+\subsection{SIMD support}
+\gromacs{} has extensive support for detecting and using the SIMD
+capabilities of nearly all modern HPC CPUs. If you are building
+\gromacs{} on the same hardware you will run it on, then you don't
+need to read more about this. Otherwise, you may wish to choose the
+value of \verb+GMX_SIMD+ to match the execution environment. If you
+make no choice, the default will be based on the computer on which you
+are running \cmake{}. Valid values are listed below, and the
+applicable value lowest on the list is generally the one you should
+choose:
+\begin{enumerate}
+\item \verb+None+ For use only on an architecture either lacking SIMD,
+  or to which \gromacs{} has not yet been ported and none of the
+  options below are applicable.
+\item \verb+SSE2+ Essentially all x86 machines in existence have this
+\item \verb+SSE4.1+ More recent x86 have this
+\item \verb+AVX_128_FMA+ More recent AMD x86 have this
+\item \verb+AVX_256+ More recent Intel x86 have this
+\item \verb+AVX2_256+ Yet more recent Intel x86 have this
+\item \verb+IBM_QPX+ BlueGene/Q A2 cores have this
+\item \verb+Sparc64_HPC_ACE+ Fujitsu machines like the K computer have this
+\end{enumerate}
+The \cmake{} configure system will check that the compiler you have
+chosen can target the architecture you have chosen. mdrun will check
+further at runtime, so if in doubt, choose the lowest setting you
+think might work, and see what mdrun says. The configure system also
+works around many known issues in many versions of common HPC
+compilers.
+
+A further \verb+GMX_SIMD=Reference+ option exists, which is a special
+SIMD-like implementation written in plain C that developers can use
+when developing support in GROMACS for new SIMD architectures. It is
+not designed for use in production simulations, but if you are using
+an architecture with SIMD support to which \gromacs{} has not yet been
+ported, you may wish to try the performance of this option, in case
+the auto-vectorization in your compiler does a good job. Please also post on
+the \gromacs{} mailing lists, because \gromacs{} can probably be
+ported for new SIMD architectures in a few days.
+
  \subsection{CMake advanced options}
  The options that can be seen with \verb+ccmake+ are ones that we
  think a reasonable number of users might want to consider
diff --git a/cmake/Platform/BlueGeneL-static-XL-C.cmake b/cmake/Platform/BlueGeneL-static-XL-C.cmake

index c1de08aa19f31bd6b28e017ba3244f3ce1728749..ddc87997fea86ad0e82bb8956470bd09a7bcf7b5 100644 (file)
--- a/cmake/Platform/BlueGeneL-static-XL-C.cmake
+++ b/cmake/Platform/BlueGeneL-static-XL-C.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2010,2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2010,2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -82,4 +82,4 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
  
-set(GMX_CPU_ACCELERATION "BlueGene" CACHE STRING "Forcing BlueGene acceleration when using BlueGene toolchain")
+set(GMX_SIMD "BlueGene" CACHE STRING "Forcing BlueGene SIMD when using BlueGene toolchain")
diff --git a/cmake/Platform/BlueGeneP-static-XL-C.cmake b/cmake/Platform/BlueGeneP-static-XL-C.cmake

index 6558fd415a9a39d0b47c1d9134721b1e7fdc852d..b2ad826a062f7e22de8aec18d8fdb3b68b9ebd92 100644 (file)
--- a/cmake/Platform/BlueGeneP-static-XL-C.cmake
+++ b/cmake/Platform/BlueGeneP-static-XL-C.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -39,4 +39,4 @@ set(CMAKE_SYSTEM_NAME BlueGeneP-static CACHE STRING "Cross-compiling for BlueGen
  set(CMAKE_C_COMPILER mpixlc_r)
  set(CMAKE_CXX_COMPILER mpixlcxx_r)
  
-set(GMX_CPU_ACCELERATION "BlueGene" CACHE STRING "Forcing BlueGene acceleration when using BlueGene toolchain")
+set(GMX_SIMD "BlueGene" CACHE STRING "Forcing BlueGene SIMD when using BlueGene toolchain")
diff --git a/cmake/TestAVXMaskload.c b/cmake/TestAVXMaskload.c

index 61777b077fc21033fdbf3cc44f9e8e2d540cf3d9..508c2ec17468977f09fba9bc4dfaf1c0f66b3ecd 100644 (file)
--- a/cmake/TestAVXMaskload.c
+++ b/cmake/TestAVXMaskload.c
@@ -8,7 +8,7 @@ int main()
      a = _mm256_setzero_pd();
      mask = _mm256_castpd_si256(a);
  
-#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
      a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
  #else
      a = _mm256_maskload_pd(d,mask);
diff --git a/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake b/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake

index 0ed9b51b6bd59e647bca6e3775734081874c2904..2b0a180b563b9b015876d2443527d7c6659aa44a 100644 (file)
--- a/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake
+++ b/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -52,4 +52,4 @@ set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to H
  set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE)
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE)
  
-set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain")
+set(GMX_SIMD "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE SIMD when using Fujitsu Sparc64 toolchain")
diff --git a/cmake/Toolchain-Fujitsu-Sparc64.cmake b/cmake/Toolchain-Fujitsu-Sparc64.cmake

index 923869f88d28e05184e7b34b0de048c21d5e201c..3f301eed64a30e015fc73cb9d0476ad4373b2066 100644 (file)
--- a/cmake/Toolchain-Fujitsu-Sparc64.cmake
+++ b/cmake/Toolchain-Fujitsu-Sparc64.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -51,4 +51,4 @@ set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to H
  set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE)
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE)
  
-set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain")
+set(GMX_SIMD "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE SIMD when using Fujitsu Sparc64 toolchain")
diff --git a/cmake/gmxBuildTypeReference.cmake b/cmake/gmxBuildTypeReference.cmake

index 532392440888897ea1dd9ed947a7db731f95a4df..2a6f7fac19217e937b6bff74b19044ab2287dc51 100644 (file)
--- a/cmake/gmxBuildTypeReference.cmake
+++ b/cmake/gmxBuildTypeReference.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -42,7 +42,7 @@ mark_as_advanced( CMAKE_CXX_FLAGS_REFERENCE CMAKE_C_FLAGS_REFERENCE)
  if("${CMAKE_BUILD_TYPE}" STREQUAL "Reference")
      set(GMX_GPU OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
      set(GMX_OPENMP OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
-    set(GMX_CPU_ACCELERATION "None" CACHE STRING "Disabled for regressiontests reference builds" FORCE)
+    set(GMX_SIMD "None" CACHE STRING "Disabled for regressiontests reference builds" FORCE)
      set(GMX_FFT_LIBRARY "fftpack" CACHE STRING "Use fftpack for regressiontests reference builds" FORCE)
      set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
      set(GMX_THREAD_MPI OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
diff --git a/cmake/gmxDetectAcceleration.cmake b/cmake/gmxDetectSimd.cmake

similarity index 71%

rename from cmake/gmxDetectAcceleration.cmake

rename to cmake/gmxDetectSimd.cmake

index 884368ffb26241170963f36be05e59e991d13aab..550a9ac16165a1f944b9e1e66630a4672bd1a6d4 100644 (file)
--- a/cmake/gmxDetectAcceleration.cmake
+++ b/cmake/gmxDetectSimd.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -34,22 +34,22 @@
  
  # - Check the username performing the build, as well as date and time
  #
-# gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
+# gmx_detect_simd(GMX_SUGGESTED_SIMD)
  #
-# Try to detect CPU information and suggest an acceleration option
+# Try to detect CPU information and suggest SIMD instructions
  # (such as SSE/AVX) that fits the current CPU. These functions assume
  # that gmx_detect_target_architecture() has already been run, so that
  # things like GMX_TARGET_X86 are already available.
  #
-# Sets ${GMX_SUGGESTED_CPU_ACCELERATION} in the parent scope if
-# GMX_CPU_ACCELERATION is not set (e.g. by the user, or a previous run
+# Sets ${GMX_SUGGESTED_SIMD} in the parent scope if
+# GMX_SIMD is not set (e.g. by the user, or a previous run
  # of CMake).
  #
  
  # we rely on inline asm support for GNU!
  include(gmxTestInlineASM)
  
-function(gmx_suggest_x86_acceleration _suggested_acceleration)
+function(gmx_suggest_x86_simd _suggested_simd)
  
      gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
  
@@ -59,47 +59,47 @@ function(gmx_suggest_x86_acceleration _suggested_acceleration)
          set(GCC_INLINE_ASM_DEFINE "")
      endif(GMX_X86_GCC_INLINE_ASM)
  
-    message(STATUS "Detecting best acceleration for this CPU")
+    message(STATUS "Detecting best SIMD instructions for this CPU")
  
-    # Get CPU acceleration information
+    # Get CPU SIMD properties information
      set(_compile_definitions "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders -DGMX_CPUID_STANDALONE")
      if(GMX_TARGET_X86)
          set(_compile_definitions "${_compile_definitions} -DGMX_TARGET_X86")
      endif()
-    try_run(GMX_CPUID_RUN_ACC GMX_CPUID_COMPILED
+    try_run(GMX_CPUID_RUN_SIMD GMX_CPUID_COMPILED
              ${CMAKE_BINARY_DIR}
              ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_cpuid.c
              COMPILE_DEFINITIONS ${_compile_definitions}
              RUN_OUTPUT_VARIABLE OUTPUT_TMP
              COMPILE_OUTPUT_VARIABLE GMX_CPUID_COMPILE_OUTPUT
-            ARGS "-acceleration")
+            ARGS "-simd")
  
      if(NOT GMX_CPUID_COMPILED)
-        message(WARNING "Cannot compile CPUID code, which means no CPU-specific acceleration.")
+        message(WARNING "Cannot compile CPUID code, which means no SIMD instructions.")
          message(STATUS "Compile output: ${GMX_CPUID_COMPILE_OUTPUT}")
          set(OUTPUT_TMP "None")
-    elseif(NOT GMX_CPUID_RUN_ACC EQUAL 0)
-        message(WARNING "Cannot run CPUID code, which means no CPU-specific optimization.")
+    elseif(NOT GMX_CPUID_RUN_SIMD EQUAL 0)
+        message(WARNING "Cannot run CPUID code, which means no SIMD instructions.")
          message(STATUS "Run output: ${OUTPUT_TMP}")
          set(OUTPUT_TMP "None")
      endif(NOT GMX_CPUID_COMPILED)
  
-    string(STRIP "@OUTPUT_TMP@" OUTPUT_ACC)
+    string(STRIP "@OUTPUT_TMP@" OUTPUT_SIMD)
  
-    set(${_suggested_acceleration} "@OUTPUT_ACC@" PARENT_SCOPE)
-    message(STATUS "Detected best acceleration for this CPU - @OUTPUT_ACC@")
+    set(${_suggested_simd} "@OUTPUT_SIMD@" PARENT_SCOPE)
+    message(STATUS "Detected best SIMD instructions for this CPU - @OUTPUT_SIMD@")
  endfunction()
  
-function(gmx_detect_acceleration _suggested_acceleration)
-    if(NOT DEFINED GMX_CPU_ACCELERATION)
+function(gmx_detect_simd _suggested_simd)
+    if(NOT DEFINED GMX_SIMD)
          if(GMX_TARGET_BGQ)
-            set(${_suggested_acceleration} "IBM_QPX")
+            set(${_suggested_simd} "IBM_QPX")
          elseif(GMX_TARGET_X86)
-            gmx_suggest_x86_acceleration(${_suggested_acceleration})
+            gmx_suggest_x86_simd(${_suggested_simd})
          else()
-            set(${_suggested_acceleration} "None")
+            set(${_suggested_simd} "None")
          endif()
  
-        set(${_suggested_acceleration} ${${_suggested_acceleration}} PARENT_SCOPE)
+        set(${_suggested_simd} ${${_suggested_simd}} PARENT_SCOPE)
      endif()
  endfunction()
diff --git a/cmake/gmxDetectTargetArchitecture.cmake b/cmake/gmxDetectTargetArchitecture.cmake

index e888732e77058f43c8a9c17d15e5c7228a64148a..45a2a47fd5ec509f6ce7d41afb5994d0302e6b68 100644 (file)
--- a/cmake/gmxDetectTargetArchitecture.cmake
+++ b/cmake/gmxDetectTargetArchitecture.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2013, by the GROMACS development team, led by
+# Copyright (c) 2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -34,7 +34,7 @@
  
  # - Define function to detect whether the compiler's target
  # - architecture is one for which GROMACS has special treatment
-# - (e.g. kernel acceleration)
+#   (e.g. SIMD instructions)
  #
  # Sets GMX_TARGET_X86 or GMX_TARGET_BGQ if targetting that
  # architecture. May set other such variables if/when there is future
diff --git a/cmake/gmxFindFlagsForSource.cmake b/cmake/gmxFindFlagsForSource.cmake

index 3ebb57b6b83e87b8f742351e4029239a424ec536..6a569b22b7eb842cdf3bda60624faaeaab8d80ac 100644 (file)
--- a/cmake/gmxFindFlagsForSource.cmake
+++ b/cmake/gmxFindFlagsForSource.cmake
@@ -38,7 +38,7 @@
  # SOURCE              Source code to test
  #                     The compiler is chosen based on the extension of this file
  # FLAGSVAR            Variable (string) to which we should add the correct flag
-# Args 5 through N    Multiple strings with acceleration flags to test
+# Args 5 through N    Multiple strings with optimization flags to test
  FUNCTION(GMX_FIND_CFLAG_FOR_SOURCE VARIABLE DESCRIPTION SOURCE CFLAGSVAR)
      IF(NOT DEFINED ${VARIABLE})
          # Insert a blank element last in the list (try without any flags too)
@@ -71,7 +71,7 @@ ENDFUNCTION(GMX_FIND_CFLAG_FOR_SOURCE VARIABLE DESCRIPTION SOURCE CFLAGSVAR)
  # SOURCE              Source code to test
  #                     The compiler is chosen based on the extension of this file
  # FLAGSVAR            Variable (string) to which we should add the correct flag
-# Args 5 through N    Multiple strings with acceleration flags to test
+# Args 5 through N    Multiple strings with optimization flags to test
  FUNCTION(GMX_FIND_CXXFLAG_FOR_SOURCE VARIABLE DESCRIPTION SOURCE CXXFLAGSVAR)
      IF(NOT DEFINED ${VARIABLE})
          # Insert a blank element last in the list (try without any flags too)
diff --git a/cmake/gmxManageFFTLibraries.cmake b/cmake/gmxManageFFTLibraries.cmake

index 653321ddc83471fc4fc271929c6e27925484e4aa..35c600001ee7ca404e8a24d6b40f71dbcc9a9ae1 100644 (file)
--- a/cmake/gmxManageFFTLibraries.cmake
+++ b/cmake/gmxManageFFTLibraries.cmake
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -81,12 +81,12 @@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3")
      set(FFT_LIBRARIES ${${FFTW}_LIBRARIES})
      set(GMX_FFT_FFTW3 1)
  
-    if ((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD)
+    if ((${GMX_SIMD} MATCHES "SSE" OR ${GMX_SIMD} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD)
        message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
      endif()
  
-    if((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND ${FFTW}_HAVE_AVX)
-        # If we're not doing CPU acceleration, we don't care about FFTW performance on x86 either
+    if((${GMX_SIMD} MATCHES "SSE" OR ${GMX_SIMD} MATCHES "AVX") AND ${FFTW}_HAVE_AVX)
+        # If we're not using SIMD instructions, we don't care about FFTW performance on x86 either
          message(WARNING "The FFTW library was compiled with --enable-avx to enable AVX SIMD instructions. That might sound like a good idea for your processor, but for FFTW versions up to 3.3.3, these are slower than the SSE/SSE2 SIMD instructions for the way GROMACS uses FFTs. Limitations in the way FFTW allows GROMACS to measure performance make it awkward for either GROMACS or FFTW to make the decision for you based on runtime performance. You should compile a different FFTW library with --enable-sse or --enable-sse2. If you have a more recent FFTW, you may like to compare the performance of GROMACS with FFTW libraries compiled with and without --enable-avx. However, the GROMACS developers do not really expect the FFTW AVX optimization to help, because the performance is limited by memory access, not computation.")
      endif()
  
diff --git a/cmake/gmxTestAVXMaskload.cmake b/cmake/gmxTestAVXMaskload.cmake

index 8b05b12979b490a505882a96678009d9eb4c7b37..40d76c78ac16ed00f83222208c5c7e495ab28990 100644 (file)
--- a/cmake/gmxTestAVXMaskload.cmake
+++ b/cmake/gmxTestAVXMaskload.cmake
@@ -55,7 +55,7 @@ MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE AVX_CFLAGS)
          ELSE()
              TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
                          "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
-                         COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" )
+                         COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG" )
              IF(${VARIABLE}_COMPILEOK)
                  SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
                  MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around")
diff --git a/cmake/gmxTestCPUAcceleration.cmake b/cmake/gmxTestSimd.cmake

similarity index 72%

rename from cmake/gmxTestCPUAcceleration.cmake

rename to cmake/gmxTestSimd.cmake

index 537379dd683ef7b5380772cc74a2292cd2d880f7..9d910073dc099e5f4757deb67c3e448e0b5c5f97 100644 (file)
--- a/cmake/gmxTestCPUAcceleration.cmake
+++ b/cmake/gmxTestSimd.cmake
@@ -44,75 +44,75 @@ macro(gmx_use_clang_as_with_gnu_compilers_on_osx)
      # compilers assembler instead - and this has to happen before we detect AVX
      # flags.
      if(APPLE AND ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
-        gmx_test_cflag(GNU_C_USE_CLANG_AS "-Wa,-q" ACCELERATION_C_FLAGS)
+        gmx_test_cflag(GNU_C_USE_CLANG_AS "-Wa,-q" SIMD_C_FLAGS)
      endif()
      if(APPLE AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
-        gmx_test_cxxflag(GNU_CXX_USE_CLANG_AS "-Wa,-q" ACCELERATION_CXX_FLAGS)
+        gmx_test_cxxflag(GNU_CXX_USE_CLANG_AS "-Wa,-q" SIMD_CXX_FLAGS)
      endif()
  endmacro()
  
  
-macro(gmx_test_cpu_acceleration)
+macro(gmx_test_simd)
  #
  # To improve backward compatibility on x86 SIMD architectures,
-# we set the flags for all accelerations that are supported, not only
+# we set the flags for all SIMD instructions that are supported, not only
  # the most recent instruction set. I.e., if your machine supports AVX2_256,
  # we will set flags both for AVX2_256, AVX_256, SSE4.1, and SSE2 support.
  
-if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
+if(${GMX_SIMD} STREQUAL "NONE")
      # nothing to do configuration-wise
-    set(ACCELERATION_STATUS_MESSAGE "CPU SIMD acceleration disabled")
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
+    set(SIMD_STATUS_MESSAGE "SIMD instructions disabled")
+elseif(${GMX_SIMD} STREQUAL "SSE2")
  
      gmx_find_cflag_for_source(CFLAGS_SSE2 "C compiler SSE2 flag"
                                "#include<xmmintrin.h>
                                int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_rsqrt_ps(x);return 0;}"
-                              ACCELERATION_C_FLAGS
+                              SIMD_C_FLAGS
                                "-msse2" "/arch:SSE2")
      gmx_find_cxxflag_for_source(CXXFLAGS_SSE2 "C++ compiler SSE2 flag"
                                  "#include<xmmintrin.h>
                                  int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_rsqrt_ps(x);return 0;}"
-                                ACCELERATION_CXX_FLAGS
+                                SIMD_CXX_FLAGS
                                  "-msse2" "/arch:SSE2")
  
      if(NOT CFLAGS_SSE2 OR NOT CXXFLAGS_SSE2)
-        message(FATAL_ERROR "Cannot find SSE2 compiler flag. Use a newer compiler, or disable acceleration (slower).")
+        message(FATAL_ERROR "Cannot find SSE2 compiler flag. Use a newer compiler, or disable SIMD (slower).")
      endif()
  
-    set(GMX_CPU_ACCELERATION_X86_SSE2 1)
-    set(GMX_X86_SSE2 1)
+    set(GMX_SIMD_X86_SSE2 1)
+    set(GMX_SIMD_X86_SSE2_OR_HIGHER 1)
  
-    set(ACCELERATION_STATUS_MESSAGE "Enabling SSE2 SIMD Gromacs acceleration")
+    set(SIMD_STATUS_MESSAGE "Enabling SSE2 SIMD instructions")
  
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
+elseif(${GMX_SIMD} STREQUAL "SSE4.1")
  
      # Note: MSVC enables SSE4.1 with the SSE2 flag, so we include that in testing.
      gmx_find_cflag_for_source(CFLAGS_SSE4_1 "C compiler SSE4.1 flag"
                                "#include<smmintrin.h>
                                int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_dp_ps(x,x,0x77);return 0;}"
-                              ACCELERATION_C_FLAGS
+                              SIMD_C_FLAGS
                                "-msse4.1" "/arch:SSE4.1" "/arch:SSE2")
      gmx_find_cxxflag_for_source(CXXFLAGS_SSE4_1 "C++ compiler SSE4.1 flag"
                                  "#include<smmintrin.h>
                                  int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_dp_ps(x,x,0x77);return 0;}"
-                                ACCELERATION_CXX_FLAGS
+                                SIMD_CXX_FLAGS
                                  "-msse4.1" "/arch:SSE4.1" "/arch:SSE2")
  
      if(NOT CFLAGS_SSE4_1 OR NOT CXXFLAGS_SSE4_1)
          message(FATAL_ERROR "Cannot find SSE4.1 compiler flag. "
-                            "Use a newer compiler, or choose SSE2 acceleration (slower).")
+                            "Use a newer compiler, or choose SSE2 SIMD (slower).")
      endif()
  
      if(CMAKE_C_COMPILER_ID MATCHES "Intel" AND CMAKE_C_COMPILER_VERSION VERSION_EQUAL "11.1")
-        message(FATAL_ERROR "You are using Intel compiler version 11.1, which produces incorrect results with SSE4.1 acceleration. You need to use a newer compiler (e.g. icc >= 12.0) or in worst case try a lower level of acceleration if performance is not critical.")
+        message(FATAL_ERROR "You are using Intel compiler version 11.1, which produces incorrect results with SSE4.1 SIMD. You need to use a newer compiler (e.g. icc >= 12.0) or in worst case try a lower level of SIMD if performance is not critical.")
      endif()
  
-    set(GMX_CPU_ACCELERATION_X86_SSE4_1 1)
-    set(GMX_X86_SSE4_1 1)
-    set(GMX_X86_SSE2   1)
-    set(ACCELERATION_STATUS_MESSAGE "Enabling SSE4.1 SIMD Gromacs acceleration")
+    set(GMX_SIMD_X86_SSE4_1 1)
+    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1)
+    set(GMX_SIMD_X86_SSE2_OR_HIGHER   1)
+    set(SIMD_STATUS_MESSAGE "Enabling SSE4.1 SIMD instructions")
  
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA")
+elseif(${GMX_SIMD} STREQUAL "AVX_128_FMA")
  
      gmx_use_clang_as_with_gnu_compilers_on_osx()
  
@@ -125,18 +125,18 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA")
      gmx_find_cflag_for_source(CFLAGS_AVX_128 "C compiler AVX (128 bit) flag"
                                "#include<immintrin.h>
                                int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_permute_ps(x,1);return 0;}"
-                              ACCELERATION_C_FLAGS
+                              SIMD_C_FLAGS
                                "-mavx" "/arch:AVX")
      gmx_find_cxxflag_for_source(CXXFLAGS_AVX_128 "C++ compiler AVX (128 bit) flag"
                                  "#include<immintrin.h>
                                  int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_permute_ps(x,1);return 0;}"
-                                ACCELERATION_CXX_FLAGS
+                                SIMD_CXX_FLAGS
                                  "-mavx" "/arch:AVX")
  
      ### STAGE 2: Find the fused-multiply add flag.
      # GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
-    check_include_file(x86intrin.h HAVE_X86INTRIN_H ${ACCELERATION_C_FLAGS})
-    check_include_file(intrin.h HAVE_INTRIN_H ${ACCELERATION_C_FLAGS})
+    check_include_file(x86intrin.h HAVE_X86INTRIN_H ${SIMD_C_FLAGS})
+    check_include_file(intrin.h HAVE_INTRIN_H ${SIMD_C_FLAGS})
      if(HAVE_X86INTRIN_H)
          set(INCLUDE_X86INTRIN_H "#include <x86intrin.h>")
      endif()
@@ -149,19 +149,19 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA")
  ${INCLUDE_X86INTRIN_H}
  ${INCLUDE_INTRIN_H}
  int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_macc_ps(x,x,x);return 0;}"
-                              ACCELERATION_C_FLAGS
+                              SIMD_C_FLAGS
                                "-mfma4")
      gmx_find_cxxflag_for_source(CXXFLAGS_AVX_128_FMA "C++ compiler AVX (128 bit) FMA4 flag"
  "#include<immintrin.h>
  ${INCLUDE_X86INTRIN_H}
  ${INCLUDE_INTRIN_H}
  int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_macc_ps(x,x,x);return 0;}"
-                                ACCELERATION_CXX_FLAGS
+                                SIMD_CXX_FLAGS
                                  "-mfma4")
  
      # We only need to check the last (FMA) test; that will always fail if the basic AVX128 test failed
      if(NOT CFLAGS_AVX_128_FMA OR NOT CXXFLAGS_AVX_128_FMA)
-        message(FATAL_ERROR "Cannot find compiler flags for 128 bit AVX with FMA support. Use a newer compiler, or choose SSE4.1 acceleration (slower).")
+        message(FATAL_ERROR "Cannot find compiler flags for 128 bit AVX with FMA support. Use a newer compiler, or choose SSE4.1 SIMD (slower).")
      endif()
  
      ### STAGE 3: Optional: Find the XOP instruction flag (No point in yelling if this does not work)
@@ -170,14 +170,14 @@ int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_macc_ps(x,x,x);return 0;}"
  ${INCLUDE_X86INTRIN_H}
  ${INCLUDE_INTRIN_H}
  int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}"
-                              ACCELERATION_C_FLAGS
+                              SIMD_C_FLAGS
                                "-mxop")
      gmx_find_cxxflag_for_source(CXXFLAGS_AVX_128_XOP "C++ compiler AVX (128 bit) XOP flag"
  "#include<immintrin.h>
  ${INCLUDE_X86INTRIN_H}
  ${INCLUDE_INTRIN_H}
  int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}"
-                                ACCELERATION_CXX_FLAGS
+                                SIMD_CXX_FLAGS
                                  "-mxop")
  
      # We don't have the full compiler version string yet (BUILD_C_COMPILER),
@@ -185,7 +185,7 @@ int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}"
      # hackintoshes is not worth the effort.
      if (APPLE AND (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR
                  ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang"))
-        message(WARNING "Due to a known compiler bug, Clang up to version 3.2 (and Apple Clang up to version 4.1) produces incorrect code with AVX_128_FMA acceleration. As we cannot work around this bug on OS X, you will have to select a different compiler or CPU acceleration.")
+        message(WARNING "Due to a known compiler bug, Clang up to version 3.2 (and Apple Clang up to version 4.1) produces incorrect code with AVX_128_FMA SIMD. As we cannot work around this bug on OS X, you will have to select a different compiler or SIMD instruction set.")
      endif()
  
  
@@ -200,44 +200,44 @@ int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}"
          set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -no-integrated-as")
      endif()
  
-    gmx_test_avx_gcc_maskload_bug(GMX_X86_AVX_GCC_MASKLOAD_BUG "${ACCELERATION_C_FLAGS}")
+    gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}")
  
-    set(GMX_CPU_ACCELERATION_X86_AVX_128_FMA 1)
-    set(GMX_X86_AVX_128_FMA 1)
-    set(GMX_X86_SSE4_1      1)
-    set(GMX_X86_SSE2        1)
+    set(GMX_SIMD_X86_AVX_128_FMA 1)
+    set(GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER 1)
+    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER      1)
+    set(GMX_SIMD_X86_SSE2_OR_HIGHER        1)
  
-    set(ACCELERATION_STATUS_MESSAGE "Enabling 128-bit AVX SIMD Gromacs acceleration (with fused-multiply add)")
+    set(SIMD_STATUS_MESSAGE "Enabling 128-bit AVX SIMD instructions (with fused-multiply add)")
  
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_256")
+elseif(${GMX_SIMD} STREQUAL "AVX_256")
  
      gmx_use_clang_as_with_gnu_compilers_on_osx()
  
      gmx_find_cflag_for_source(CFLAGS_AVX "C compiler AVX (256 bit) flag"
                                "#include<immintrin.h>
                                int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_add_ps(x,x);return 0;}"
-                              ACCELERATION_C_FLAGS
+                              SIMD_C_FLAGS
                                "-mavx" "/arch:AVX")
      gmx_find_cxxflag_for_source(CXXFLAGS_AVX "C++ compiler AVX (256 bit) flag"
                                  "#include<immintrin.h>
                                  int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_add_ps(x,x);return 0;}"
-                                ACCELERATION_CXX_FLAGS
+                                SIMD_CXX_FLAGS
                                  "-mavx" "/arch:AVX")
  
      if(NOT CFLAGS_AVX OR NOT CXXFLAGS_AVX)
-        message(FATAL_ERROR "Cannot find AVX compiler flag. Use a newer compiler, or choose SSE4.1 acceleration (slower).")
+        message(FATAL_ERROR "Cannot find AVX compiler flag. Use a newer compiler, or choose SSE4.1 SIMD (slower).")
      endif()
  
-    gmx_test_avx_gcc_maskload_bug(GMX_X86_AVX_GCC_MASKLOAD_BUG "${ACCELERATION_C_FLAGS}")
+    gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}")
  
-    set(GMX_CPU_ACCELERATION_X86_AVX_256 1)
-    set(GMX_X86_AVX_256  1)
-    set(GMX_X86_SSE4_1   1)
-    set(GMX_X86_SSE2     1)
+    set(GMX_SIMD_X86_AVX_256 1)
+    set(GMX_SIMD_X86_AVX_256_OR_HIGHER  1)
+    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER   1)
+    set(GMX_SIMD_X86_SSE2_OR_HIGHER     1)
  
-    set(ACCELERATION_STATUS_MESSAGE "Enabling 256-bit AVX SIMD Gromacs acceleration")
+    set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX SIMD instructions")
  
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX2_256")
+elseif(${GMX_SIMD} STREQUAL "AVX2_256")
  
      # Comment out this line for AVX2 development
      message(FATAL_ERROR "AVX2_256 is disabled until the implementation has been commited.")
@@ -247,50 +247,50 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX2_256")
      gmx_find_cflag_for_source(CFLAGS_AVX2 "C compiler AVX2 flag"
                                "#include<immintrin.h>
                                int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_fmadd_ps(x,x,x);return 0;}"
-                              ACCELERATION_C_FLAGS
+                              SIMD_C_FLAGS
                                "-march=core-avx2" "-mavx2" "/arch:AVX") # no AVX2-specific flag for MSVC yet
      gmx_find_cxxflag_for_source(CXXFLAGS_AVX2 "C++ compiler AVX2 flag"
                                  "#include<immintrin.h>
                                  int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_fmadd_ps(x,x,x);return 0;}"
-                                ACCELERATION_CXX_FLAGS
+                                SIMD_CXX_FLAGS
                                  "-march=core-avx2" "-mavx2" "/arch:AVX") # no AVX2-specific flag for MSVC yet
  
      if(NOT CFLAGS_AVX2 OR NOT CXXFLAGS_AVX2)
-        message(FATAL_ERROR "Cannot find AVX2 compiler flag. Use a newer compiler, or choose AVX acceleration (slower).")
+        message(FATAL_ERROR "Cannot find AVX2 compiler flag. Use a newer compiler, or choose AVX SIMD (slower).")
      endif()
  
      # No need to test for Maskload bug - it was fixed before gcc added AVX2 support
  
-    set(GMX_CPU_ACCELERATION_X86_AVX2_256 1)
-    set(GMX_X86_AVX2_256 1)
-    set(GMX_X86_AVX_256  1)
-    set(GMX_X86_SSE4_1   1)
-    set(GMX_X86_SSE2     1)
+    set(GMX_SIMD_X86_AVX2_256 1)
+    set(GMX_SIMD_X86_AVX2_256_OR_HIGHER 1)
+    set(GMX_SIMD_X86_AVX_256_OR_HIGHER  1)
+    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER   1)
+    set(GMX_SIMD_X86_SSE2_OR_HIGHER     1)
  
-    set(ACCELERATION_STATUS_MESSAGE "Enabling 256-bit AVX2 Gromacs acceleration")
+    set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions")
  
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX")
+elseif(${GMX_SIMD} STREQUAL "IBM_QPX")
  
      try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
          "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c")
  
      if (TEST_QPX)
-        message(WARNING "IBM QPX acceleration was selected. This will work, but SIMD-accelerated kernels are only available for the Verlet cut-off scheme. The plain C kernels that are used for the group cut-off scheme kernels will be slow, so please consider using the Verlet cut-off scheme.")
-        set(GMX_CPU_ACCELERATION_IBM_QPX 1)
-        set(ACCELERATION_STATUS_MESSAGE "Enabling IBM QPX SIMD acceleration")
+        message(WARNING "IBM QPX SIMD instructions selected. This will work, but SIMD kernels are only available for the Verlet cut-off scheme. The plain C kernels that are used for the group cut-off scheme kernels will be slow, so please consider using the Verlet cut-off scheme.")
+        set(GMX_SIMD_IBM_QPX 1)
+        set(SIMD_STATUS_MESSAGE "Enabling IBM QPX SIMD instructions")
  
      else()
          message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics. If you are compiling for BlueGene/Q with the XL compilers, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-C' to set up the tool chain.")
      endif()
  
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "SPARC64_HPC_ACE")
+elseif(${GMX_SIMD} STREQUAL "SPARC64_HPC_ACE")
  
-    set(GMX_CPU_ACCELERATION_SPARC64_HPC_ACE 1)
-    set(ACCELERATION_STATUS_MESSAGE "Enabling Sparc64 HPC-ACE SIMD acceleration")
+    set(GMX_SIMD_SPARC64_HPC_ACE 1)
+    set(SIMD_STATUS_MESSAGE "Enabling Sparc64 HPC-ACE SIMD instructions")
  
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "REFERENCE")
+elseif(${GMX_SIMD} STREQUAL "REFERENCE")
  
-    add_definitions(-DGMX_SIMD_REFERENCE_PLAIN_C)
+    add_definitions(-DGMX_SIMD_REFERENCE)
      if(${GMX_NBNXN_REF_KERNEL_TYPE} STREQUAL "4xn")
          if(${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "2" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "4" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "8")
              add_definitions(-DGMX_NBNXN_SIMD_4XN -DGMX_SIMD_REF_WIDTH=${GMX_NBNXN_REF_KERNEL_WIDTH})
@@ -308,13 +308,13 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "REFERENCE")
      endif()
  
  else()
-    gmx_invalid_option_value(GMX_CPU_ACCELERATION)
+    gmx_invalid_option_value(GMX_SIMD)
  endif()
  
  
-gmx_check_if_changed(ACCELERATION_CHANGED GMX_CPU_ACCELERATION)
-if (ACCELERATION_CHANGED AND DEFINED ACCELERATION_STATUS_MESSAGE)
-    message(STATUS "${ACCELERATION_STATUS_MESSAGE}")
+gmx_check_if_changed(SIMD_CHANGED GMX_SIMD)
+if (SIMD_CHANGED AND DEFINED SIMD_STATUS_MESSAGE)
+    message(STATUS "${SIMD_STATUS_MESSAGE}")
  endif()
  
  endmacro()
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 1804b63879e2956e0d6d22e95b061e278aa19534..1c09d5b41bb4c55e864c71817614a43e1e6da5fd 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -110,40 +110,40 @@
  #cmakedefine GMX_TARGET_BGQ
  
  /* SSE2 instructions available */
-#cmakedefine GMX_X86_SSE2
+#cmakedefine GMX_SIMD_X86_SSE2_OR_HIGHER
  
  /* SSE4.1 instructions available */
-#cmakedefine GMX_X86_SSE4_1
+#cmakedefine GMX_SIMD_X86_SSE4_1_OR_HIGHER
  
-/* AVX 128-bit FMA instructions available */
-#cmakedefine GMX_X86_AVX_128_FMA
+/* AVX 128-bit FMA instructions available (AMD side of the AVX world) */
+#cmakedefine GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
  
-/* AVX 256-bit instructions available */
-#cmakedefine GMX_X86_AVX_256
+/* AVX 256-bit instructions available (Intel side of the AVX world) */
+#cmakedefine GMX_SIMD_X86_AVX_256_OR_HIGHER
  
  /* GCC bug in AVX maskload/maskstore arguments - worked around internally */
-#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG
+#cmakedefine GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
  
-/* SSE2 was selected as CPU acceleration level */
-#cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
+/* SSE2 was selected as the SIMD instruction set */
+#cmakedefine GMX_SIMD_X86_SSE2
  
-/* SSE4.1 was selected as CPU acceleration level */
-#cmakedefine GMX_CPU_ACCELERATION_X86_SSE4_1
+/* SSE4.1 was selected as the SIMD instruction set */
+#cmakedefine GMX_SIMD_X86_SSE4_1
  
-/* AVX 128-bit FMA was selected as CPU acceleration level */
-#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_128_FMA
+/* AVX 128-bit FMA was selected as the SIMD instruction set */
+#cmakedefine GMX_SIMD_X86_AVX_128_FMA
  
-/* AVX 256-bit was selected as CPU acceleration level */
-#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_256
+/* AVX 256-bit was selected as the SIMD instruction set */
+#cmakedefine GMX_SIMD_X86_AVX_256
  
-/* IBM QPX was selected as CPU acceleration type (e.g. BlueGene/Q) */
-#cmakedefine GMX_CPU_ACCELERATION_IBM_QPX
+/* IBM QPX was selected as the SIMD instruction set (e.g. BlueGene/Q) */
+#cmakedefine GMX_SIMD_IBM_QPX
  
  /* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
-#cmakedefine GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
+#cmakedefine GMX_SIMD_SPARC64_HPC_ACE
  
-/* String for CPU acceleration choice (for writing to log files and stdout) */
-#define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
+/* String for SIMD instruction choice (for writing to log files and stdout) */
+#define GMX_SIMD_STRING "@GMX_SIMD@"
  
  /* Integer byte order is big endian. */
  #cmakedefine GMX_INTEGER_BIG_ENDIAN
diff --git a/src/contrib/fftw/CMakeLists.txt b/src/contrib/fftw/CMakeLists.txt

index e02a0c1ac112795627f2ab4aaf57905a7e7ddd78..1365dda4a7d182919d5f9cb344922a217ea7314a 100644 (file)
--- a/src/contrib/fftw/CMakeLists.txt
+++ b/src/contrib/fftw/CMakeLists.txt
@@ -56,7 +56,7 @@ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND BUILD_SHARED_LIBS) # FFTW doesn
  endif()
  
  # Testing shows FFTW configured with --enable-avx is never better than --enable-sse2, so we do the latter always.
-if(${GMX_CPU_ACCELERATION} MATCHES "^(SSE|AVX)")
+if("${GMX_SIMD}" MATCHES "^(SSE|AVX)")
      set(GMX_BUILD_OWN_FFTW_OPTIMIZATION_CONFIGURATION --enable-sse2 CACHE INTERNAL "Optimization flags for FFTW compilation")
  endif()
  
diff --git a/src/gromacs/gmxlib/bondfree.c b/src/gromacs/gmxlib/bondfree.c

index 40989ed89c011d0e3c3939114c2da732f1bfbf65..006221a92bf674d2538a032da08256ebed4715d0 100644 (file)
--- a/src/gromacs/gmxlib/bondfree.c
+++ b/src/gromacs/gmxlib/bondfree.c
@@ -120,15 +120,15 @@ static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
  
  /* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
  typedef struct {
-    gmx_mm_pr inv_bzz;
-    gmx_mm_pr inv_byy;
-    gmx_mm_pr inv_bxx;
-    gmx_mm_pr bzx;
-    gmx_mm_pr bzy;
-    gmx_mm_pr bzz;
-    gmx_mm_pr byx;
-    gmx_mm_pr byy;
-    gmx_mm_pr bxx;
+    gmx_simd_real_t inv_bzz;
+    gmx_simd_real_t inv_byy;
+    gmx_simd_real_t inv_bxx;
+    gmx_simd_real_t bzx;
+    gmx_simd_real_t bzy;
+    gmx_simd_real_t bzz;
+    gmx_simd_real_t byx;
+    gmx_simd_real_t byy;
+    gmx_simd_real_t bxx;
  } pbc_simd_t;
  
  /* Set the SIMD pbc data from a normal t_pbc struct */
@@ -147,48 +147,48 @@ static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
          }
      }
  
-    pbc_simd->inv_bzz = gmx_set1_pr(inv_bdiag[ZZ]);
-    pbc_simd->inv_byy = gmx_set1_pr(inv_bdiag[YY]);
-    pbc_simd->inv_bxx = gmx_set1_pr(inv_bdiag[XX]);
+    pbc_simd->inv_bzz = gmx_simd_set1_r(inv_bdiag[ZZ]);
+    pbc_simd->inv_byy = gmx_simd_set1_r(inv_bdiag[YY]);
+    pbc_simd->inv_bxx = gmx_simd_set1_r(inv_bdiag[XX]);
  
      if (pbc != NULL)
      {
-        pbc_simd->bzx = gmx_set1_pr(pbc->box[ZZ][XX]);
-        pbc_simd->bzy = gmx_set1_pr(pbc->box[ZZ][YY]);
-        pbc_simd->bzz = gmx_set1_pr(pbc->box[ZZ][ZZ]);
-        pbc_simd->byx = gmx_set1_pr(pbc->box[YY][XX]);
-        pbc_simd->byy = gmx_set1_pr(pbc->box[YY][YY]);
-        pbc_simd->bxx = gmx_set1_pr(pbc->box[XX][XX]);
+        pbc_simd->bzx = gmx_simd_set1_r(pbc->box[ZZ][XX]);
+        pbc_simd->bzy = gmx_simd_set1_r(pbc->box[ZZ][YY]);
+        pbc_simd->bzz = gmx_simd_set1_r(pbc->box[ZZ][ZZ]);
+        pbc_simd->byx = gmx_simd_set1_r(pbc->box[YY][XX]);
+        pbc_simd->byy = gmx_simd_set1_r(pbc->box[YY][YY]);
+        pbc_simd->bxx = gmx_simd_set1_r(pbc->box[XX][XX]);
      }
      else
      {
-        pbc_simd->bzx = gmx_setzero_pr();
-        pbc_simd->bzy = gmx_setzero_pr();
-        pbc_simd->bzz = gmx_setzero_pr();
-        pbc_simd->byx = gmx_setzero_pr();
-        pbc_simd->byy = gmx_setzero_pr();
-        pbc_simd->bxx = gmx_setzero_pr();
+        pbc_simd->bzx = gmx_simd_setzero_r();
+        pbc_simd->bzy = gmx_simd_setzero_r();
+        pbc_simd->bzz = gmx_simd_setzero_r();
+        pbc_simd->byx = gmx_simd_setzero_r();
+        pbc_simd->byy = gmx_simd_setzero_r();
+        pbc_simd->bxx = gmx_simd_setzero_r();
      }
  }
  
  /* Correct distance vector *dx,*dy,*dz for PBC using SIMD */
  static gmx_inline void
-pbc_dx_simd(gmx_mm_pr *dx, gmx_mm_pr *dy, gmx_mm_pr *dz,
+pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz,
              const pbc_simd_t *pbc)
  {
-    gmx_mm_pr sh;
+    gmx_simd_real_t sh;
  
-    sh  = gmx_round_pr(gmx_mul_pr(*dz, pbc->inv_bzz));
-    *dx = gmx_nmsub_pr(sh, pbc->bzx, *dx);
-    *dy = gmx_nmsub_pr(sh, pbc->bzy, *dy);
-    *dz = gmx_nmsub_pr(sh, pbc->bzz, *dz);
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dz, pbc->inv_bzz));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->bzx, *dx);
+    *dy = gmx_simd_fnmadd_r(sh, pbc->bzy, *dy);
+    *dz = gmx_simd_fnmadd_r(sh, pbc->bzz, *dz);
  
-    sh  = gmx_round_pr(gmx_mul_pr(*dy, pbc->inv_byy));
-    *dx = gmx_nmsub_pr(sh, pbc->byx, *dx);
-    *dy = gmx_nmsub_pr(sh, pbc->byy, *dy);
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dy, pbc->inv_byy));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->byx, *dx);
+    *dy = gmx_simd_fnmadd_r(sh, pbc->byy, *dy);
  
-    sh  = gmx_round_pr(gmx_mul_pr(*dx, pbc->inv_bxx));
-    *dx = gmx_nmsub_pr(sh, pbc->bxx, *dx);
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dx, pbc->inv_bxx));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->bxx, *dx);
  }
  
  #endif /* SIMD_BONDEDS */
@@ -1051,57 +1051,57 @@ angles_noener_simd(int nbonds,
                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
                     int gmx_unused *global_atom_index)
  {
-#define UNROLL GMX_SIMD_WIDTH_HERE
-    const int      nfa1 = 4;
-    int            i, iu, s, m;
-    int            type, ai[UNROLL], aj[UNROLL], ak[UNROLL];
-    real           coeff_array[2*UNROLL+UNROLL], *coeff;
-    real           dr_array[2*DIM*UNROLL+UNROLL], *dr;
-    real           f_buf_array[6*UNROLL+UNROLL], *f_buf;
-    gmx_mm_pr      k_S, theta0_S;
-    gmx_mm_pr      rijx_S, rijy_S, rijz_S;
-    gmx_mm_pr      rkjx_S, rkjy_S, rkjz_S;
-    gmx_mm_pr      one_S;
-    gmx_mm_pr      min_one_plus_eps_S;
-    gmx_mm_pr      rij_rkj_S;
-    gmx_mm_pr      nrij2_S, nrij_1_S;
-    gmx_mm_pr      nrkj2_S, nrkj_1_S;
-    gmx_mm_pr      cos_S, invsin_S;
-    gmx_mm_pr      theta_S;
-    gmx_mm_pr      st_S, sth_S;
-    gmx_mm_pr      cik_S, cii_S, ckk_S;
-    gmx_mm_pr      f_ix_S, f_iy_S, f_iz_S;
-    gmx_mm_pr      f_kx_S, f_ky_S, f_kz_S;
-    pbc_simd_t     pbc_simd;
+    const int            nfa1 = 4;
+    int                  i, iu, s, m;
+    int                  type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH];
+    int                  ak[GMX_SIMD_REAL_WIDTH];
+    real                 coeff_array[2*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *coeff;
+    real                 dr_array[2*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                 f_buf_array[6*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *f_buf;
+    gmx_simd_real_t      k_S, theta0_S;
+    gmx_simd_real_t      rijx_S, rijy_S, rijz_S;
+    gmx_simd_real_t      rkjx_S, rkjy_S, rkjz_S;
+    gmx_simd_real_t      one_S;
+    gmx_simd_real_t      min_one_plus_eps_S;
+    gmx_simd_real_t      rij_rkj_S;
+    gmx_simd_real_t      nrij2_S, nrij_1_S;
+    gmx_simd_real_t      nrkj2_S, nrkj_1_S;
+    gmx_simd_real_t      cos_S, invsin_S;
+    gmx_simd_real_t      theta_S;
+    gmx_simd_real_t      st_S, sth_S;
+    gmx_simd_real_t      cik_S, cii_S, ckk_S;
+    gmx_simd_real_t      f_ix_S, f_iy_S, f_iz_S;
+    gmx_simd_real_t      f_kx_S, f_ky_S, f_kz_S;
+    pbc_simd_t           pbc_simd;
  
      /* Ensure register memory alignment */
-    coeff = gmx_simd_align_real(coeff_array);
-    dr    = gmx_simd_align_real(dr_array);
-    f_buf = gmx_simd_align_real(f_buf_array);
+    coeff = gmx_simd_align_r(coeff_array);
+    dr    = gmx_simd_align_r(dr_array);
+    f_buf = gmx_simd_align_r(f_buf_array);
  
      set_pbc_simd(pbc, &pbc_simd);
  
-    one_S = gmx_set1_pr(1.0);
+    one_S = gmx_simd_set1_r(1.0);
  
      /* The smallest number > -1 */
-    min_one_plus_eps_S = gmx_set1_pr(-1.0 + 2*GMX_REAL_EPS);
+    min_one_plus_eps_S = gmx_simd_set1_r(-1.0 + 2*GMX_REAL_EPS);
  
-    /* nbonds is the number of angles times nfa1, here we step UNROLL angles */
-    for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+    /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
      {
-        /* Collect atoms for UNROLL angles.
+        /* Collect atoms for GMX_SIMD_REAL_WIDTH angles.
           * iu indexes into forceatoms, we should not let iu go beyond nbonds.
           */
          iu = i;
-        for (s = 0; s < UNROLL; s++)
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
          {
              type  = forceatoms[iu];
              ai[s] = forceatoms[iu+1];
              aj[s] = forceatoms[iu+2];
              ak[s] = forceatoms[iu+3];
  
-            coeff[s]        = forceparams[type].harmonic.krA;
-            coeff[UNROLL+s] = forceparams[type].harmonic.rA*DEG2RAD;
+            coeff[s]                     = forceparams[type].harmonic.krA;
+            coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA*DEG2RAD;
  
              /* If you can't use pbc_dx_simd below for PBC, e.g. because
               * you can't round in SIMD, use pbc_rvec_sub here.
@@ -1109,8 +1109,8 @@ angles_noener_simd(int nbonds,
              /* Store the non PBC corrected distances packed and aligned */
              for (m = 0; m < DIM; m++)
              {
-                dr[s +      m *UNROLL] = x[ai[s]][m] - x[aj[s]][m];
-                dr[s + (DIM+m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m];
+                dr[s +      m *GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
+                dr[s + (DIM+m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
              }
  
              /* At the end fill the arrays with identical entries */
@@ -1120,70 +1120,70 @@ angles_noener_simd(int nbonds,
              }
          }
  
-        k_S       = gmx_load_pr(coeff);
-        theta0_S  = gmx_load_pr(coeff+UNROLL);
+        k_S       = gmx_simd_load_r(coeff);
+        theta0_S  = gmx_simd_load_r(coeff+GMX_SIMD_REAL_WIDTH);
  
-        rijx_S    = gmx_load_pr(dr + 0*UNROLL);
-        rijy_S    = gmx_load_pr(dr + 1*UNROLL);
-        rijz_S    = gmx_load_pr(dr + 2*UNROLL);
-        rkjx_S    = gmx_load_pr(dr + 3*UNROLL);
-        rkjy_S    = gmx_load_pr(dr + 4*UNROLL);
-        rkjz_S    = gmx_load_pr(dr + 5*UNROLL);
+        rijx_S    = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
+        rijy_S    = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
+        rijz_S    = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
+        rkjx_S    = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
+        rkjy_S    = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
+        rkjz_S    = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
  
          pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
          pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
  
-        rij_rkj_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
-                                 rkjx_S, rkjy_S, rkjz_S);
+        rij_rkj_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                     rkjx_S, rkjy_S, rkjz_S);
  
-        nrij2_S   = gmx_norm2_pr(rijx_S, rijy_S, rijz_S);
-        nrkj2_S   = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
+        nrij2_S   = gmx_simd_norm2_r(rijx_S, rijy_S, rijz_S);
+        nrkj2_S   = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S);
  
-        nrij_1_S  = gmx_invsqrt_pr(nrij2_S);
-        nrkj_1_S  = gmx_invsqrt_pr(nrkj2_S);
+        nrij_1_S  = gmx_simd_invsqrt_r(nrij2_S);
+        nrkj_1_S  = gmx_simd_invsqrt_r(nrkj2_S);
  
-        cos_S     = gmx_mul_pr(rij_rkj_S, gmx_mul_pr(nrij_1_S, nrkj_1_S));
+        cos_S     = gmx_simd_mul_r(rij_rkj_S, gmx_simd_mul_r(nrij_1_S, nrkj_1_S));
  
          /* To allow for 180 degrees, we take the max of cos and -1 + 1bit,
           * so we can safely get the 1/sin from 1/sqrt(1 - cos^2).
           * This also ensures that rounding errors would cause the argument
-         * of gmx_acos_pr to be < -1.
+         * of gmx_simd_acos_r to be < -1.
           * Note that we do not take precautions for cos(0)=1, so the outer
           * atoms in an angle should not be on top of each other.
           */
-        cos_S     = gmx_max_pr(cos_S, min_one_plus_eps_S);
-
-        theta_S   = gmx_acos_pr(cos_S);
-
-        invsin_S  = gmx_invsqrt_pr(gmx_sub_pr(one_S, gmx_mul_pr(cos_S, cos_S)));
-
-        st_S      = gmx_mul_pr(gmx_mul_pr(k_S, gmx_sub_pr(theta0_S, theta_S)),
-                               invsin_S);
-        sth_S     = gmx_mul_pr(st_S, cos_S);
-
-        cik_S     = gmx_mul_pr(st_S,  gmx_mul_pr(nrij_1_S, nrkj_1_S));
-        cii_S     = gmx_mul_pr(sth_S, gmx_mul_pr(nrij_1_S, nrij_1_S));
-        ckk_S     = gmx_mul_pr(sth_S, gmx_mul_pr(nrkj_1_S, nrkj_1_S));
-
-        f_ix_S    = gmx_mul_pr(cii_S, rijx_S);
-        f_ix_S    = gmx_nmsub_pr(cik_S, rkjx_S, f_ix_S);
-        f_iy_S    = gmx_mul_pr(cii_S, rijy_S);
-        f_iy_S    = gmx_nmsub_pr(cik_S, rkjy_S, f_iy_S);
-        f_iz_S    = gmx_mul_pr(cii_S, rijz_S);
-        f_iz_S    = gmx_nmsub_pr(cik_S, rkjz_S, f_iz_S);
-        f_kx_S    = gmx_mul_pr(ckk_S, rkjx_S);
-        f_kx_S    = gmx_nmsub_pr(cik_S, rijx_S, f_kx_S);
-        f_ky_S    = gmx_mul_pr(ckk_S, rkjy_S);
-        f_ky_S    = gmx_nmsub_pr(cik_S, rijy_S, f_ky_S);
-        f_kz_S    = gmx_mul_pr(ckk_S, rkjz_S);
-        f_kz_S    = gmx_nmsub_pr(cik_S, rijz_S, f_kz_S);
-
-        gmx_store_pr(f_buf + 0*UNROLL, f_ix_S);
-        gmx_store_pr(f_buf + 1*UNROLL, f_iy_S);
-        gmx_store_pr(f_buf + 2*UNROLL, f_iz_S);
-        gmx_store_pr(f_buf + 3*UNROLL, f_kx_S);
-        gmx_store_pr(f_buf + 4*UNROLL, f_ky_S);
-        gmx_store_pr(f_buf + 5*UNROLL, f_kz_S);
+        cos_S     = gmx_simd_max_r(cos_S, min_one_plus_eps_S);
+
+        theta_S   = gmx_simd_acos_r(cos_S);
+
+        invsin_S  = gmx_simd_invsqrt_r(gmx_simd_sub_r(one_S, gmx_simd_mul_r(cos_S, cos_S)));
+
+        st_S      = gmx_simd_mul_r(gmx_simd_mul_r(k_S, gmx_simd_sub_r(theta0_S, theta_S)),
+                                   invsin_S);
+        sth_S     = gmx_simd_mul_r(st_S, cos_S);
+
+        cik_S     = gmx_simd_mul_r(st_S,  gmx_simd_mul_r(nrij_1_S, nrkj_1_S));
+        cii_S     = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrij_1_S, nrij_1_S));
+        ckk_S     = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrkj_1_S, nrkj_1_S));
+
+        f_ix_S    = gmx_simd_mul_r(cii_S, rijx_S);
+        f_ix_S    = gmx_simd_fnmadd_r(cik_S, rkjx_S, f_ix_S);
+        f_iy_S    = gmx_simd_mul_r(cii_S, rijy_S);
+        f_iy_S    = gmx_simd_fnmadd_r(cik_S, rkjy_S, f_iy_S);
+        f_iz_S    = gmx_simd_mul_r(cii_S, rijz_S);
+        f_iz_S    = gmx_simd_fnmadd_r(cik_S, rkjz_S, f_iz_S);
+        f_kx_S    = gmx_simd_mul_r(ckk_S, rkjx_S);
+        f_kx_S    = gmx_simd_fnmadd_r(cik_S, rijx_S, f_kx_S);
+        f_ky_S    = gmx_simd_mul_r(ckk_S, rkjy_S);
+        f_ky_S    = gmx_simd_fnmadd_r(cik_S, rijy_S, f_ky_S);
+        f_kz_S    = gmx_simd_mul_r(ckk_S, rkjz_S);
+        f_kz_S    = gmx_simd_fnmadd_r(cik_S, rijz_S, f_kz_S);
+
+        gmx_simd_store_r(f_buf + 0*GMX_SIMD_REAL_WIDTH, f_ix_S);
+        gmx_simd_store_r(f_buf + 1*GMX_SIMD_REAL_WIDTH, f_iy_S);
+        gmx_simd_store_r(f_buf + 2*GMX_SIMD_REAL_WIDTH, f_iz_S);
+        gmx_simd_store_r(f_buf + 3*GMX_SIMD_REAL_WIDTH, f_kx_S);
+        gmx_simd_store_r(f_buf + 4*GMX_SIMD_REAL_WIDTH, f_ky_S);
+        gmx_simd_store_r(f_buf + 5*GMX_SIMD_REAL_WIDTH, f_kz_S);
  
          iu = i;
          s  = 0;
@@ -1191,16 +1191,15 @@ angles_noener_simd(int nbonds,
          {
              for (m = 0; m < DIM; m++)
              {
-                f[ai[s]][m] += f_buf[s + m*UNROLL];
-                f[aj[s]][m] -= f_buf[s + m*UNROLL] + f_buf[s + (DIM+m)*UNROLL];
-                f[ak[s]][m] += f_buf[s + (DIM+m)*UNROLL];
+                f[ai[s]][m] += f_buf[s + m*GMX_SIMD_REAL_WIDTH];
+                f[aj[s]][m] -= f_buf[s + m*GMX_SIMD_REAL_WIDTH] + f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH];
+                f[ak[s]][m] += f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH];
              }
              s++;
              iu += nfa1;
          }
-        while (s < UNROLL && iu < nbonds);
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
      }
-#undef UNROLL
  }
  
  #endif /* SIMD_BONDEDS */
@@ -1514,125 +1513,123 @@ dih_angle_simd(const rvec *x,
                 const int *ai, const int *aj, const int *ak, const int *al,
                 const pbc_simd_t *pbc,
                 real *dr,
-               gmx_mm_pr *phi_S,
-               gmx_mm_pr *mx_S, gmx_mm_pr *my_S, gmx_mm_pr *mz_S,
-               gmx_mm_pr *nx_S, gmx_mm_pr *ny_S, gmx_mm_pr *nz_S,
-               gmx_mm_pr *nrkj_m2_S,
-               gmx_mm_pr *nrkj_n2_S,
+               gmx_simd_real_t *phi_S,
+               gmx_simd_real_t *mx_S, gmx_simd_real_t *my_S, gmx_simd_real_t *mz_S,
+               gmx_simd_real_t *nx_S, gmx_simd_real_t *ny_S, gmx_simd_real_t *nz_S,
+               gmx_simd_real_t *nrkj_m2_S,
+               gmx_simd_real_t *nrkj_n2_S,
                 real *p,
                 real *q)
  {
-#define UNROLL GMX_SIMD_WIDTH_HERE
-    int       s, m;
-    gmx_mm_pr rijx_S, rijy_S, rijz_S;
-    gmx_mm_pr rkjx_S, rkjy_S, rkjz_S;
-    gmx_mm_pr rklx_S, rkly_S, rklz_S;
-    gmx_mm_pr cx_S, cy_S, cz_S;
-    gmx_mm_pr cn_S;
-    gmx_mm_pr s_S;
-    gmx_mm_pr ipr_S;
-    gmx_mm_pr iprm_S, iprn_S;
-    gmx_mm_pr nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
-    gmx_mm_pr toler_S;
-    gmx_mm_pr p_S, q_S;
-    gmx_mm_pr nrkj2_min_S;
-    gmx_mm_pr real_eps_S;
+    int             s, m;
+    gmx_simd_real_t rijx_S, rijy_S, rijz_S;
+    gmx_simd_real_t rkjx_S, rkjy_S, rkjz_S;
+    gmx_simd_real_t rklx_S, rkly_S, rklz_S;
+    gmx_simd_real_t cx_S, cy_S, cz_S;
+    gmx_simd_real_t cn_S;
+    gmx_simd_real_t s_S;
+    gmx_simd_real_t ipr_S;
+    gmx_simd_real_t iprm_S, iprn_S;
+    gmx_simd_real_t nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
+    gmx_simd_real_t toler_S;
+    gmx_simd_real_t p_S, q_S;
+    gmx_simd_real_t nrkj2_min_S;
+    gmx_simd_real_t real_eps_S;
  
      /* Used to avoid division by zero.
       * We take into acount that we multiply the result by real_eps_S.
       */
-    nrkj2_min_S = gmx_set1_pr(GMX_REAL_MIN/(2*GMX_REAL_EPS));
+    nrkj2_min_S = gmx_simd_set1_r(GMX_REAL_MIN/(2*GMX_REAL_EPS));
  
      /* The value of the last significant bit (GMX_REAL_EPS is half of that) */
-    real_eps_S  = gmx_set1_pr(2*GMX_REAL_EPS);
+    real_eps_S  = gmx_simd_set1_r(2*GMX_REAL_EPS);
  
-    for (s = 0; s < UNROLL; s++)
+    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
      {
          /* If you can't use pbc_dx_simd below for PBC, e.g. because
           * you can't round in SIMD, use pbc_rvec_sub here.
           */
          for (m = 0; m < DIM; m++)
          {
-            dr[s + (0*DIM + m)*UNROLL] = x[ai[s]][m] - x[aj[s]][m];
-            dr[s + (1*DIM + m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m];
-            dr[s + (2*DIM + m)*UNROLL] = x[ak[s]][m] - x[al[s]][m];
+            dr[s + (0*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
+            dr[s + (1*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
+            dr[s + (2*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[al[s]][m];
          }
      }
  
-    rijx_S = gmx_load_pr(dr + 0*UNROLL);
-    rijy_S = gmx_load_pr(dr + 1*UNROLL);
-    rijz_S = gmx_load_pr(dr + 2*UNROLL);
-    rkjx_S = gmx_load_pr(dr + 3*UNROLL);
-    rkjy_S = gmx_load_pr(dr + 4*UNROLL);
-    rkjz_S = gmx_load_pr(dr + 5*UNROLL);
-    rklx_S = gmx_load_pr(dr + 6*UNROLL);
-    rkly_S = gmx_load_pr(dr + 7*UNROLL);
-    rklz_S = gmx_load_pr(dr + 8*UNROLL);
+    rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
+    rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
+    rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
+    rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
+    rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
+    rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
+    rklx_S = gmx_simd_load_r(dr + 6*GMX_SIMD_REAL_WIDTH);
+    rkly_S = gmx_simd_load_r(dr + 7*GMX_SIMD_REAL_WIDTH);
+    rklz_S = gmx_simd_load_r(dr + 8*GMX_SIMD_REAL_WIDTH);
  
      pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
      pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
      pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc);
  
-    gmx_cprod_pr(rijx_S, rijy_S, rijz_S,
-                 rkjx_S, rkjy_S, rkjz_S,
-                 mx_S, my_S, mz_S);
+    gmx_simd_cprod_r(rijx_S, rijy_S, rijz_S,
+                     rkjx_S, rkjy_S, rkjz_S,
+                     mx_S, my_S, mz_S);
  
-    gmx_cprod_pr(rkjx_S, rkjy_S, rkjz_S,
-                 rklx_S, rkly_S, rklz_S,
-                 nx_S, ny_S, nz_S);
+    gmx_simd_cprod_r(rkjx_S, rkjy_S, rkjz_S,
+                     rklx_S, rkly_S, rklz_S,
+                     nx_S, ny_S, nz_S);
  
-    gmx_cprod_pr(*mx_S, *my_S, *mz_S,
-                 *nx_S, *ny_S, *nz_S,
-                 &cx_S, &cy_S, &cz_S);
+    gmx_simd_cprod_r(*mx_S, *my_S, *mz_S,
+                     *nx_S, *ny_S, *nz_S,
+                     &cx_S, &cy_S, &cz_S);
  
-    cn_S       = gmx_sqrt_pr(gmx_norm2_pr(cx_S, cy_S, cz_S));
+    cn_S       = gmx_simd_sqrt_r(gmx_simd_norm2_r(cx_S, cy_S, cz_S));
  
-    s_S        = gmx_iprod_pr(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
+    s_S        = gmx_simd_iprod_r(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
  
      /* Determine the dihedral angle, the sign might need correction */
-    *phi_S     = gmx_atan2_pr(cn_S, s_S);
+    *phi_S     = gmx_simd_atan2_r(cn_S, s_S);
  
-    ipr_S      = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
-                              *nx_S, *ny_S, *nz_S);
+    ipr_S      = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                  *nx_S, *ny_S, *nz_S);
  
-    iprm_S     = gmx_norm2_pr(*mx_S, *my_S, *mz_S);
-    iprn_S     = gmx_norm2_pr(*nx_S, *ny_S, *nz_S);
+    iprm_S     = gmx_simd_norm2_r(*mx_S, *my_S, *mz_S);
+    iprn_S     = gmx_simd_norm2_r(*nx_S, *ny_S, *nz_S);
  
-    nrkj2_S    = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
+    nrkj2_S    = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S);
  
      /* Avoid division by zero. When zero, the result is multiplied by 0
       * anyhow, so the 3 max below do not affect the final result.
       */
-    nrkj2_S    = gmx_max_pr(nrkj2_S, nrkj2_min_S);
-    nrkj_1_S   = gmx_invsqrt_pr(nrkj2_S);
-    nrkj_2_S   = gmx_mul_pr(nrkj_1_S, nrkj_1_S);
-    nrkj_S     = gmx_mul_pr(nrkj2_S, nrkj_1_S);
+    nrkj2_S    = gmx_simd_max_r(nrkj2_S, nrkj2_min_S);
+    nrkj_1_S   = gmx_simd_invsqrt_r(nrkj2_S);
+    nrkj_2_S   = gmx_simd_mul_r(nrkj_1_S, nrkj_1_S);
+    nrkj_S     = gmx_simd_mul_r(nrkj2_S, nrkj_1_S);
  
-    toler_S    = gmx_mul_pr(nrkj2_S, real_eps_S);
+    toler_S    = gmx_simd_mul_r(nrkj2_S, real_eps_S);
  
      /* Here the plain-C code uses a conditional, but we can't do that in SIMD.
       * So we take a max with the tolerance instead. Since we multiply with
       * m or n later, the max does not affect the results.
       */
-    iprm_S     = gmx_max_pr(iprm_S, toler_S);
-    iprn_S     = gmx_max_pr(iprn_S, toler_S);
-    *nrkj_m2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprm_S));
-    *nrkj_n2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprn_S));
+    iprm_S     = gmx_simd_max_r(iprm_S, toler_S);
+    iprn_S     = gmx_simd_max_r(iprn_S, toler_S);
+    *nrkj_m2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprm_S));
+    *nrkj_n2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprn_S));
  
      /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
      *phi_S     = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S);
  
-    p_S        = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
-                              rkjx_S, rkjy_S, rkjz_S);
-    p_S        = gmx_mul_pr(p_S, nrkj_2_S);
+    p_S        = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                  rkjx_S, rkjy_S, rkjz_S);
+    p_S        = gmx_simd_mul_r(p_S, nrkj_2_S);
  
-    q_S        = gmx_iprod_pr(rklx_S, rkly_S, rklz_S,
-                              rkjx_S, rkjy_S, rkjz_S);
-    q_S        = gmx_mul_pr(q_S, nrkj_2_S);
+    q_S        = gmx_simd_iprod_r(rklx_S, rkly_S, rklz_S,
+                                  rkjx_S, rkjy_S, rkjz_S);
+    q_S        = gmx_simd_mul_r(q_S, nrkj_2_S);
  
-    gmx_store_pr(p, p_S);
-    gmx_store_pr(q, q_S);
-#undef UNROLL
+    gmx_simd_store_r(p, p_S);
+    gmx_simd_store_r(q, q_S);
  }
  
  #endif /* SIMD_BONDEDS */
@@ -1982,48 +1979,47 @@ pdihs_noener_simd(int nbonds,
                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
                    int gmx_unused *global_atom_index)
  {
-#define UNROLL GMX_SIMD_WIDTH_HERE
-    const int       nfa1 = 5;
-    int             i, iu, s;
-    int             type, ai[UNROLL], aj[UNROLL], ak[UNROLL], al[UNROLL];
-    int             t1[UNROLL], t2[UNROLL], t3[UNROLL];
-    real            ddphi;
-    real            dr_array[3*DIM*UNROLL+UNROLL], *dr;
-    real            buf_array[7*UNROLL+UNROLL], *buf;
-    real           *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
-    gmx_mm_pr       phi0_S, phi_S;
-    gmx_mm_pr       mx_S, my_S, mz_S;
-    gmx_mm_pr       nx_S, ny_S, nz_S;
-    gmx_mm_pr       nrkj_m2_S, nrkj_n2_S;
-    gmx_mm_pr       cp_S, mdphi_S, mult_S;
-    gmx_mm_pr       sin_S, cos_S;
-    gmx_mm_pr       mddphi_S;
-    gmx_mm_pr       sf_i_S, msf_l_S;
-    pbc_simd_t      pbc_simd;
+    const int             nfa1 = 5;
+    int                   i, iu, s;
+    int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
+    int                   t1[GMX_SIMD_REAL_WIDTH], t2[GMX_SIMD_REAL_WIDTH], t3[GMX_SIMD_REAL_WIDTH];
+    real                  ddphi;
+    real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                  buf_array[7*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
+    real                 *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
+    gmx_simd_real_t       phi0_S, phi_S;
+    gmx_simd_real_t       mx_S, my_S, mz_S;
+    gmx_simd_real_t       nx_S, ny_S, nz_S;
+    gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+    gmx_simd_real_t       cp_S, mdphi_S, mult_S;
+    gmx_simd_real_t       sin_S, cos_S;
+    gmx_simd_real_t       mddphi_S;
+    gmx_simd_real_t       sf_i_S, msf_l_S;
+    pbc_simd_t            pbc_simd;
  
      /* Ensure SIMD register alignment */
-    dr  = gmx_simd_align_real(dr_array);
-    buf = gmx_simd_align_real(buf_array);
+    dr  = gmx_simd_align_r(dr_array);
+    buf = gmx_simd_align_r(buf_array);
  
      /* Extract aligned pointer for parameters and variables */
-    cp    = buf + 0*UNROLL;
-    phi0  = buf + 1*UNROLL;
-    mult  = buf + 2*UNROLL;
-    p     = buf + 3*UNROLL;
-    q     = buf + 4*UNROLL;
-    sf_i  = buf + 5*UNROLL;
-    msf_l = buf + 6*UNROLL;
+    cp    = buf + 0*GMX_SIMD_REAL_WIDTH;
+    phi0  = buf + 1*GMX_SIMD_REAL_WIDTH;
+    mult  = buf + 2*GMX_SIMD_REAL_WIDTH;
+    p     = buf + 3*GMX_SIMD_REAL_WIDTH;
+    q     = buf + 4*GMX_SIMD_REAL_WIDTH;
+    sf_i  = buf + 5*GMX_SIMD_REAL_WIDTH;
+    msf_l = buf + 6*GMX_SIMD_REAL_WIDTH;
  
      set_pbc_simd(pbc, &pbc_simd);
  
-    /* nbonds is the number of dihedrals times nfa1, here we step UNROLL dihs */
-    for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+    /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
      {
-        /* Collect atoms quadruplets for UNROLL dihedrals.
+        /* Collect atom quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
           * iu indexes into forceatoms, we should not let iu go beyond nbonds.
           */
          iu = i;
-        for (s = 0; s < UNROLL; s++)
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
          {
              type  = forceatoms[iu];
              ai[s] = forceatoms[iu+1];
@@ -2042,7 +2038,7 @@ pdihs_noener_simd(int nbonds,
              }
          }
  
-        /* Caclulate UNROLL dihedral angles at once */
+        /* Calculate GMX_SIMD_REAL_WIDTH dihedral angles at once */
          dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
                         dr,
                         &phi_S,
@@ -2052,34 +2048,34 @@ pdihs_noener_simd(int nbonds,
                         &nrkj_n2_S,
                         p, q);
  
-        cp_S     = gmx_load_pr(cp);
-        phi0_S   = gmx_load_pr(phi0);
-        mult_S   = gmx_load_pr(mult);
+        cp_S     = gmx_simd_load_r(cp);
+        phi0_S   = gmx_simd_load_r(phi0);
+        mult_S   = gmx_simd_load_r(mult);
  
-        mdphi_S  = gmx_sub_pr(gmx_mul_pr(mult_S, phi_S), phi0_S);
+        mdphi_S  = gmx_simd_sub_r(gmx_simd_mul_r(mult_S, phi_S), phi0_S);
  
-        /* Calculate UNROLL sines at once */
-        gmx_sincos_pr(mdphi_S, &sin_S, &cos_S);
-        mddphi_S = gmx_mul_pr(gmx_mul_pr(cp_S, mult_S), sin_S);
-        sf_i_S   = gmx_mul_pr(mddphi_S, nrkj_m2_S);
-        msf_l_S  = gmx_mul_pr(mddphi_S, nrkj_n2_S);
+        /* Calculate GMX_SIMD_REAL_WIDTH sines at once */
+        gmx_simd_sincos_r(mdphi_S, &sin_S, &cos_S);
+        mddphi_S = gmx_simd_mul_r(gmx_simd_mul_r(cp_S, mult_S), sin_S);
+        sf_i_S   = gmx_simd_mul_r(mddphi_S, nrkj_m2_S);
+        msf_l_S  = gmx_simd_mul_r(mddphi_S, nrkj_n2_S);
  
          /* After this m?_S will contain f[i] */
-        mx_S     = gmx_mul_pr(sf_i_S, mx_S);
-        my_S     = gmx_mul_pr(sf_i_S, my_S);
-        mz_S     = gmx_mul_pr(sf_i_S, mz_S);
+        mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+        my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+        mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
  
          /* After this m?_S will contain -f[l] */
-        nx_S     = gmx_mul_pr(msf_l_S, nx_S);
-        ny_S     = gmx_mul_pr(msf_l_S, ny_S);
-        nz_S     = gmx_mul_pr(msf_l_S, nz_S);
+        nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+        ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+        nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
  
-        gmx_store_pr(dr + 0*UNROLL, mx_S);
-        gmx_store_pr(dr + 1*UNROLL, my_S);
-        gmx_store_pr(dr + 2*UNROLL, mz_S);
-        gmx_store_pr(dr + 3*UNROLL, nx_S);
-        gmx_store_pr(dr + 4*UNROLL, ny_S);
-        gmx_store_pr(dr + 5*UNROLL, nz_S);
+        gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+        gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+        gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+        gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+        gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+        gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
  
          iu = i;
          s  = 0;
@@ -2087,19 +2083,18 @@ pdihs_noener_simd(int nbonds,
          {
              do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
                                          p[s], q[s],
-                                        dr[     XX *UNROLL+s],
-                                        dr[     YY *UNROLL+s],
-                                        dr[     ZZ *UNROLL+s],
-                                        dr[(DIM+XX)*UNROLL+s],
-                                        dr[(DIM+YY)*UNROLL+s],
-                                        dr[(DIM+ZZ)*UNROLL+s],
+                                        dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
                                          f);
              s++;
              iu += nfa1;
          }
-        while (s < UNROLL && iu < nbonds);
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
      }
-#undef UNROLL
  }
  
  #endif /* SIMD_BONDEDS */
diff --git a/src/gromacs/gmxlib/copyrite.cpp b/src/gromacs/gmxlib/copyrite.cpp

index 144825abf5b49c4be4994c98290b0bebe1ad92e4..b59d759f1ee6e7d4c1caf6f085376f5e7d58dc1c 100644 (file)
--- a/src/gromacs/gmxlib/copyrite.cpp
+++ b/src/gromacs/gmxlib/copyrite.cpp
@@ -679,7 +679,7 @@ static void gmx_print_version_info(FILE *fp)
  #define gmx_stringify2(x) #x
  #define gmx_stringify(x) gmx_stringify2(x)
      fprintf(fp, "invsqrt routine:    %s\n", gmx_stringify(gmx_invsqrt(x)));
-    fprintf(fp, "CPU acceleration:   %s\n", GMX_CPU_ACCELERATION_STRING);
+    fprintf(fp, "SIMD instructions:  %s\n", GMX_SIMD_STRING);
  
      fprintf(fp, "FFT library:        %s\n", gmx_fft_get_version_info());
  #ifdef HAVE_RDTSCP
diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c

index 0824591f1cbbf88dced57ca3991a6f75ee18b2bc..ec1ce471d5820bf7e8689115166ce2be9158b826 100644 (file)
--- a/src/gromacs/gmxlib/gmx_cpuid.c
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@ -134,7 +134,7 @@ gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
  };
  
  const char *
-gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] =
+gmx_cpuid_simd_string[GMX_CPUID_NSIMD] =
  {
      "CannotDetect",
      "None",
@@ -222,38 +222,24 @@ gmx_cpuid_feature           (gmx_cpuid_t                cpuid,
  
  
  
-/* What type of acceleration was compiled in, if any?
+/* What type of SIMD was compiled in, if any?
   * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for
   * AVX too, so it is important that they appear last in the list.
   */
-#ifdef GMX_X86_AVX_256
-static const
-enum gmx_cpuid_acceleration
-    compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_256;
-#elif defined GMX_X86_AVX_128_FMA
-static const
-enum gmx_cpuid_acceleration
-    compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA;
-#elif defined GMX_X86_SSE4_1
-static const
-enum gmx_cpuid_acceleration
-    compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
-#elif defined GMX_X86_SSE2
-static const
-enum gmx_cpuid_acceleration
-    compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2;
-#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
-static const
-enum gmx_cpuid_acceleration
-    compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
-#elif defined GMX_CPU_ACCELERATION_IBM_QPX
-static const
-enum gmx_cpuid_acceleration
-    compiled_acc = GMX_CPUID_ACCELERATION_IBM_QPX;
+#ifdef GMX_SIMD_X86_AVX_256
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_256;
+#elif defined GMX_SIMD_X86_AVX_128_FMA
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_128_FMA;
+#elif defined GMX_SIMD_X86_SSE4_1
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE4_1;
+#elif defined GMX_SIMD_X86_SSE2
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE2;
+#elif defined GMX_SIMD_SPARC64_HPC_ACE
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
+#elif defined GMX_SIMD_IBM_QPX
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_QPX;
  #else
-static const
-enum gmx_cpuid_acceleration
-    compiled_acc = GMX_CPUID_ACCELERATION_NONE;
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_NONE;
  #endif
  
  
@@ -321,7 +307,7 @@ execute_x86cpuid(unsigned int   level,
      /* Death and horror!
       * Apparently this is an x86 platform where we don't know how to call cpuid.
       *
-     * This is REALLY bad, since we will lose all Gromacs acceleration.
+     * This is REALLY bad, since we will lose all Gromacs SIMD support.
       */
      *eax = 0;
      *ebx = 0;
@@ -1049,78 +1035,78 @@ gmx_cpuid_formatstring       (gmx_cpuid_t              cpuid,
  
  
  
-enum gmx_cpuid_acceleration
-gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                 cpuid)
+enum gmx_cpuid_simd
+gmx_cpuid_simd_suggest  (gmx_cpuid_t                 cpuid)
  {
-    enum gmx_cpuid_acceleration  tmpacc;
+    enum gmx_cpuid_simd  tmpsimd;
  
-    tmpacc = GMX_CPUID_ACCELERATION_NONE;
+    tmpsimd = GMX_CPUID_SIMD_NONE;
  
      if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_INTEL)
      {
          if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX2))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_X86_AVX2_256;
+            tmpsimd = GMX_CPUID_SIMD_X86_AVX2_256;
          }
          else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_256;
+            tmpsimd = GMX_CPUID_SIMD_X86_AVX_256;
          }
          else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
+            tmpsimd = GMX_CPUID_SIMD_X86_SSE4_1;
          }
          else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
+            tmpsimd = GMX_CPUID_SIMD_X86_SSE2;
          }
      }
      else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_AMD)
      {
          if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA;
+            tmpsimd = GMX_CPUID_SIMD_X86_AVX_128_FMA;
          }
          else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
+            tmpsimd = GMX_CPUID_SIMD_X86_SSE4_1;
          }
          else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
+            tmpsimd = GMX_CPUID_SIMD_X86_SSE2;
          }
      }
      else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_FUJITSU)
      {
          if (strstr(gmx_cpuid_brand(cpuid), "SPARC64"))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
+            tmpsimd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
          }
      }
      else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_IBM)
      {
          if (strstr(gmx_cpuid_brand(cpuid), "A2"))
          {
-            tmpacc = GMX_CPUID_ACCELERATION_IBM_QPX;
+            tmpsimd = GMX_CPUID_SIMD_IBM_QPX;
          }
      }
-    return tmpacc;
+    return tmpsimd;
  }
  
  
  
  int
-gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
-                             FILE *        log,
-                             int           print_to_stderr)
+gmx_cpuid_simd_check(gmx_cpuid_t   cpuid,
+                     FILE *        log,
+                     int           print_to_stderr)
  {
      int                           rc;
      char                          str[1024];
-    enum gmx_cpuid_acceleration   acc;
+    enum gmx_cpuid_simd           simd;
  
-    acc = gmx_cpuid_acceleration_suggest(cpuid);
+    simd = gmx_cpuid_simd_suggest(cpuid);
  
-    rc = (acc != compiled_acc);
+    rc = (simd != compiled_simd);
  
      gmx_cpuid_formatstring(cpuid, str, 1023);
      str[1023] = '\0';
@@ -1128,13 +1114,13 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
      if (log != NULL)
      {
          fprintf(log,
-                "\nDetecting CPU-specific acceleration.\nPresent hardware specification:\n"
+                "\nDetecting CPU SIMD instructions.\nPresent hardware specification:\n"
                  "%s"
-                "Acceleration most likely to fit this hardware: %s\n"
-                "Acceleration selected at GROMACS compile time: %s\n\n",
+                "SIMD instructions most likely to fit this hardware: %s\n"
+                "SIMD instructions selected at GROMACS compile time: %s\n\n",
                  str,
-                gmx_cpuid_acceleration_string[acc],
-                gmx_cpuid_acceleration_string[compiled_acc]);
+                gmx_cpuid_simd_string[simd],
+                gmx_cpuid_simd_string[compiled_simd]);
      }
  
      if (rc != 0)
@@ -1142,16 +1128,16 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
          if (log != NULL)
          {
              fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n"
-                    "Acceleration most likely to fit this hardware: %s\n"
-                    "Acceleration selected at GROMACS compile time: %s\n\n",
-                    gmx_cpuid_acceleration_string[acc],
-                    gmx_cpuid_acceleration_string[compiled_acc]);
+                    "SIMD instructions most likely to fit this hardware: %s\n"
+                    "SIMD instructions selected at GROMACS compile time: %s\n\n",
+                    gmx_cpuid_simd_string[simd],
+                    gmx_cpuid_simd_string[compiled_simd]);
          }
          if (print_to_stderr)
          {
-            fprintf(stderr, "Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n",
-                    gmx_cpuid_acceleration_string[compiled_acc],
-                    gmx_cpuid_acceleration_string[acc]);
+            fprintf(stderr, "Compiled SIMD instructions: %s (Gromacs could use %s on this machine, which is better)\n",
+                    gmx_cpuid_simd_string[compiled_simd],
+                    gmx_cpuid_simd_string[simd]);
          }
      }
      return rc;
@@ -1167,7 +1153,7 @@ int
  main(int argc, char **argv)
  {
      gmx_cpuid_t                   cpuid;
-    enum gmx_cpuid_acceleration   acc;
+    enum gmx_cpuid_simd           simd;
      int                           i, cnt;
  
      if (argc < 2)
@@ -1181,7 +1167,7 @@ main(int argc, char **argv)
                  "-model         Print CPU model version.\n"
                  "-stepping      Print CPU stepping version.\n"
                  "-features      Print CPU feature flags.\n"
-                "-acceleration  Print suggested GROMACS acceleration.\n",
+                "-simd          Print suggested GROMACS SIMD instructions.\n",
                  argv[0]);
          exit(0);
      }
@@ -1224,10 +1210,10 @@ main(int argc, char **argv)
          }
          printf("\n");
      }
-    else if (!strncmp(argv[1], "-acceleration", 3))
+    else if (!strncmp(argv[1], "-simd", 3))
      {
-        acc = gmx_cpuid_acceleration_suggest(cpuid);
-        fprintf(stdout, "%s\n", gmx_cpuid_acceleration_string[acc]);
+        simd = gmx_cpuid_simd_suggest(cpuid);
+        fprintf(stdout, "%s\n", gmx_cpuid_simd_string[simd]);
      }
  
      gmx_cpuid_done(cpuid);
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.c b/src/gromacs/gmxlib/gmx_detect_hardware.c

index 2e56119e473fd6697683cb67cd612f9b901c4d76..2ba16d4ce4650ce736c19b0bc6935ddb61f710d8 100644 (file)
--- a/src/gromacs/gmxlib/gmx_detect_hardware.c
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.c
@@ -251,11 +251,11 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
      bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
      bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
  
-    /* check the acceleration mdrun is compiled with against hardware
+    /* check the SIMD level mdrun is compiled with against hardware
         capabilities */
      /* TODO: Here we assume homogeneous hardware which is not necessarily
               the case! Might not hurt to add an extra check over MPI. */
-    gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog, SIMMASTER(cr));
+    gmx_cpuid_simd_check(hwinfo->cpuid_info, fplog, SIMMASTER(cr));
  
      /* NOTE: this print is only for and on one physical node */
      print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
diff --git a/src/gromacs/gmxlib/nonbonded/CMakeLists.txt b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt

index 862361b937868e2a61674978972a83f420fb121c..62939813affb5a39f9709ba276ecc21106ec656b 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
+++ b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -35,39 +35,39 @@
  # Sources that should always be built
  file(GLOB NONBONDED_SOURCES *.c nb_kernel_c/*.c)
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND NOT GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "SSE2" AND NOT GMX_DOUBLE)
      file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
      file(GLOB NONBONDED_SSE4_1_SINGLE_SOURCES nb_kernel_sse4_1_single/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "SSE2" AND GMX_DOUBLE)
      file(GLOB NONBONDED_SSE2_DOUBLE_SOURCES nb_kernel_sse2_double/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "SSE4.1" AND GMX_DOUBLE)
      file(GLOB NONBONDED_SSE4_1_DOUBLE_SOURCES nb_kernel_sse4_1_double/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "AVX_256" AND GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c)
  endif()
  
-if("${GMX_CPU_ACCELERATION}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE)
      file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c)
  endif()
  
diff --git a/src/gromacs/gmxlib/nonbonded/nonbonded.c b/src/gromacs/gmxlib/nonbonded/nonbonded.c

index fdd3b0111e61acba0eae92abe6e3d570c2488b71..42209eef7d5fda07ee98f9be7e51539fc9adf527 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nonbonded.c
+++ b/src/gromacs/gmxlib/nonbonded/nonbonded.c
@@ -71,31 +71,31 @@
  /* Different default (c) and accelerated interaction-specific kernels */
  #include "nb_kernel_c/nb_kernel_c.h"
  
-#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
  #    include "nb_kernel_sse2_single/nb_kernel_sse2_single.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE)
  #    include "nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
  #    include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
  #    include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
  #    include "nb_kernel_sse2_double/nb_kernel_sse2_double.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE)
  #    include "nb_kernel_sse4_1_double/nb_kernel_sse4_1_double.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
  #    include "nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
  #    include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
  #endif
-#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
  #    include "nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h"
  #endif
  
@@ -117,36 +117,36 @@ gmx_nonbonded_setup(t_forcerec *   fr,
              /* Add the generic kernels to the structure stored statically in nb_kernel.c */
              nb_kernel_list_add_kernels(kernellist_c, kernellist_c_size);
  
-            if (!(fr != NULL && fr->use_cpu_acceleration == FALSE))
+            if (!(fr != NULL && fr->use_simd_kernels == FALSE))
              {
                  /* Add interaction-specific kernels for different architectures */
                  /* Single precision */
-#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_sse2_single, kernellist_sse2_single_size);
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_sse4_1_single, kernellist_sse4_1_single_size);
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_128_fma_single, kernellist_avx_128_fma_single_size);
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_256_single, kernellist_avx_256_single_size);
  #endif
                  /* Double precision */
-#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_sse2_double, kernellist_sse2_double_size);
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_sse4_1_double, kernellist_sse4_1_double_size);
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_128_fma_double, kernellist_avx_128_fma_double_size);
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size);
  #endif
-#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_sparc64_hpc_ace_double, kernellist_sparc64_hpc_ace_double_size);
  #endif
                  ; /* empty statement to avoid a completely empty block */
@@ -181,38 +181,38 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
      arch_and_padding[] =
      {
          /* Single precision */
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
          { "avx_256_single", 8 },
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
          { "avx_128_fma_single", 4 },
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE)
          { "sse4_1_single", 4 },
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
          { "sse2_single", 4 },
  #endif
          /* Double precision */
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
          { "avx_256_double", 4 },
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
          /* Sic. Double precision 2-way SIMD does not require neighbor list padding,
           * since the kernels execute a loop unrolled a factor 2, followed by
           * a possible single odd-element epilogue.
           */
          { "avx_128_fma_double", 1 },
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
          /* No padding - see comment above */
          { "sse2_double", 1 },
  #endif
-#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE)
          /* No padding - see comment above */
          { "sse4_1_double", 1 },
  #endif
-#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
          /* No padding - see comment above */
          { "sparc64_hpc_ace_double", 1 },
  #endif
diff --git a/src/gromacs/gmxpreprocess/calc_verletbuf.c b/src/gromacs/gmxpreprocess/calc_verletbuf.c

index 7c28034583338e39f53e9626b15ce7f827b32920..13baaf0ce528de81f5cc63e71affef2a50525ec9 100644 (file)
--- a/src/gromacs/gmxpreprocess/calc_verletbuf.c
+++ b/src/gromacs/gmxpreprocess/calc_verletbuf.c
@@ -131,7 +131,7 @@ void verletbuf_get_list_setup(gmx_bool                bGPU,
  #ifndef GMX_NBNXN_SIMD
          list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
  #else
-        list_setup->cluster_size_j = GMX_SIMD_WIDTH_HERE;
+        list_setup->cluster_size_j = GMX_SIMD_REAL_WIDTH;
  #ifdef GMX_NBNXN_SIMD_2XNN
          /* We assume the smallest cluster size to be on the safe side */
          list_setup->cluster_size_j /= 2;
diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h

index 15c2b3bbf356a7696ad1d4cb336bce8a1838c4df..a0e1e0a8bb5793de7d6226e9f88c47b5af49c46c 100644 (file)
--- a/src/gromacs/legacyheaders/gmx_cpuid.h
+++ b/src/gromacs/legacyheaders/gmx_cpuid.h
@@ -115,23 +115,23 @@ enum gmx_cpuid_feature
  };
  
  
-/* Currently supported acceleration instruction sets, intrinsics or other similar combinations
+/* Currently supported SIMD instruction sets, intrinsics or other similar combinations
   * in Gromacs. There is not always a 1-to-1 correspondence with feature flags; on some AMD
   * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed),
   * and we still haven't written the AVX2 kernels.
   */
-enum gmx_cpuid_acceleration
+enum gmx_cpuid_simd
  {
-    GMX_CPUID_ACCELERATION_CANNOTDETECT,    /* Should only be used if something fails */
-    GMX_CPUID_ACCELERATION_NONE,
-    GMX_CPUID_ACCELERATION_X86_SSE2,
-    GMX_CPUID_ACCELERATION_X86_SSE4_1,
-    GMX_CPUID_ACCELERATION_X86_AVX_128_FMA,
-    GMX_CPUID_ACCELERATION_X86_AVX_256,
-    GMX_CPUID_ACCELERATION_X86_AVX2_256,
-    GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE,
-    GMX_CPUID_ACCELERATION_IBM_QPX,
-    GMX_CPUID_NACCELERATIONS
+    GMX_CPUID_SIMD_CANNOTDETECT,    /* Should only be used if something fails */
+    GMX_CPUID_SIMD_NONE,
+    GMX_CPUID_SIMD_X86_SSE2,
+    GMX_CPUID_SIMD_X86_SSE4_1,
+    GMX_CPUID_SIMD_X86_AVX_128_FMA,
+    GMX_CPUID_SIMD_X86_AVX_256,
+    GMX_CPUID_SIMD_X86_AVX2_256,
+    GMX_CPUID_SIMD_SPARC64_HPC_ACE,
+    GMX_CPUID_SIMD_IBM_QPX,
+    GMX_CPUID_NSIMD
  };
  
  /* Text strings corresponding to CPU vendors */
@@ -142,9 +142,9 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS];
  extern const char *
  gmx_cpuid_feature_string[GMX_CPUID_NFEATURES];
  
-/* Text strings for Gromacs acceleration/instruction sets */
+/* Text strings for Gromacs SIMD instruction sets */
  extern const char *
-gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS];
+gmx_cpuid_simd_string[GMX_CPUID_NSIMD];
  
  
  /* Abstract data type with CPU detection information. Set by gmx_cpuid_init(). */
@@ -281,22 +281,22 @@ gmx_cpuid_formatstring      (gmx_cpuid_t                cpuid,
                               int                        n);
  
  
-/* Suggests a suitable gromacs acceleration based on the support in the
+/* Suggests a suitable Gromacs SIMD instruction set based on the support in the
   * hardware.
   */
-enum gmx_cpuid_acceleration
-gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                    cpuid);
+enum gmx_cpuid_simd
+gmx_cpuid_simd_suggest  (gmx_cpuid_t                    cpuid);
  
  
-/* Check if this binary was compiled with the same acceleration as we
+/* Check if this binary was compiled with the same SIMD instructions as we
   * would suggest for the current hardware. Always print stats to the log file
   * if it is non-NULL, and if we don't have a match, print a warning in log
   * (if non-NULL) and if print_to_stderr!=0 also to stderr.
   */
  int
-gmx_cpuid_acceleration_check    (gmx_cpuid_t                cpuid,
-                                 FILE *                     log,
-                                 int                        print_to_stderr);
+gmx_cpuid_simd_check    (gmx_cpuid_t                cpuid,
+                         FILE *                     log,
+                         int                        print_to_stderr);
  
  
  /* Release resources used by data structure. Note that the pointer to the
diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h

index c936ccd0e4d99e046784caaf562806e74deb3c53..e1cf22ff95ae60a37857b9745bc8ae9f2f1dfff5 100644 (file)
--- a/src/gromacs/legacyheaders/types/forcerec.h
+++ b/src/gromacs/legacyheaders/types/forcerec.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -202,7 +202,7 @@ typedef struct {
  
      const gmx_hw_info_t *hwinfo;
      const gmx_gpu_opt_t *gpu_opt;
-    gmx_bool             use_cpu_acceleration;
+    gmx_bool             use_simd_kernels;
  
      /* Interaction for calculated in kernels. In many cases this is similar to
       * the electrostatics settings in the inputrecord, but the difference is that
diff --git a/src/gromacs/legacyheaders/types/nb_verlet.h b/src/gromacs/legacyheaders/types/nb_verlet.h

index 7099912d66b309ddeacb85244c1eab144f79215d..8c6228e2768288509c973a4fcee54091ae5fdf1f 100644 (file)
--- a/src/gromacs/legacyheaders/types/nb_verlet.h
+++ b/src/gromacs/legacyheaders/types/nb_verlet.h
@@ -43,11 +43,11 @@
  extern "C" {
  #endif
  
-#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+#ifdef GMX_SIMD_REFERENCE
  #define GMX_NBNXN_SIMD
  #endif
  
-#if (defined GMX_X86_SSE2) || (defined GMX_CPU_ACCELERATION_IBM_QPX)
+#if (defined GMX_SIMD_X86_SSE2_OR_HIGHER) || (defined GMX_SIMD_IBM_QPX)
  /* Use SIMD accelerated nbnxn search and kernels */
  #define GMX_NBNXN_SIMD
  
@@ -60,7 +60,7 @@ extern "C" {
   * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
   */
  #define GMX_NBNXN_SIMD_4XN
-#if defined GMX_X86_AVX_256 && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD)
+#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD)
  #define GMX_NBNXN_SIMD_2XNN
  #endif
  
diff --git a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h

index 3422968b38431f3cac7367f2715cee45df84b568..2de1dde4eae454e22b33dfe3412059af64f2fc60 100644 (file)
--- a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h
+++ b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h
@@ -79,7 +79,7 @@ typedef void nbnxn_free_t (void *ptr);
  typedef struct {
      int      cj;    /* The j-cluster                    */
      unsigned excl;  /* The exclusion (interaction) bits */
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#ifdef GMX_SIMD_IBM_QPX
      /* Indices into the arrays of SIMD interaction masks. */
      char     interaction_mask_indices[4];
  #endif
@@ -264,7 +264,7 @@ typedef struct {
       */
      unsigned                *simd_exclusion_filter1;
      unsigned                *simd_exclusion_filter2;
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#ifdef GMX_SIMD_IBM_QPX
      real                    *simd_interaction_array; /* Array of masks needed for exclusions on QPX */
  #endif
      int                      nout;                   /* The number of force arrays                         */
diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c

index decbeffd52dbc22690859ac3397d365dccbecf0f..8db994eb5fd5decec543d8a6119788c0d271671b 100644 (file)
--- a/src/gromacs/mdlib/forcerec.c
+++ b/src/gromacs/mdlib/forcerec.c
@@ -1495,7 +1495,7 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *
  
      if (bAllvsAll && fp && MASTER(cr))
      {
-        fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
+        fprintf(fp, "\nUsing SIMD all-vs-all kernels.\n\n");
      }
  
      return bAllvsAll;
@@ -1545,7 +1545,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
          *kernel_type = nbnxnk4xN_SIMD_2xNN;
  #endif
  
-#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+#if defined GMX_NBNXN_SIMD_4XN && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
          if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
          {
              /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
@@ -1578,7 +1578,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
           * of precision. In single precision, this is faster on
           * Bulldozer, and slightly faster on Sandy Bridge.
           */
-#if ((defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 || defined __MIC__) && !defined GMX_DOUBLE) || (defined GMX_CPU_ACCELERATION_IBM_QPX)
+#if ((defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__) && !defined GMX_DOUBLE) || (defined GMX_SIMD_IBM_QPX)
          *ewald_excl = ewaldexclAnalytical;
  #endif
          if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
@@ -1609,12 +1609,12 @@ const char *lookup_nbnxn_kernel_name(int kernel_type)
          case nbnxnk4xN_SIMD_4xN:
          case nbnxnk4xN_SIMD_2xNN:
  #ifdef GMX_NBNXN_SIMD
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
              /* We have x86 SSE2 compatible SIMD */
-#ifdef GMX_X86_AVX_128_FMA
+#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
              returnvalue = "AVX-128-FMA";
  #else
-#if defined GMX_X86_AVX_256 || defined __AVX__
+#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __AVX__
              /* x86 SIMD intrinsics can be converted to SSE or AVX depending
               * on compiler flags. As we use nearly identical intrinsics,
               * compiling for AVX without an AVX macros effectively results
@@ -1622,23 +1622,23 @@ const char *lookup_nbnxn_kernel_name(int kernel_type)
               * For gcc we check for __AVX__
               * At least a check for icc should be added (if there is a macro)
               */
-#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
+#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_NBNXN_HALF_WIDTH_SIMD
              returnvalue = "AVX-256";
  #else
              returnvalue = "AVX-128";
  #endif
  #else
-#ifdef GMX_X86_SSE4_1
+#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
              returnvalue  = "SSE4.1";
  #else
              returnvalue  = "SSE2";
  #endif
  #endif
  #endif
-#else   /* GMX_X86_SSE2 */
-            /* not GMX_X86_SSE2, but other SIMD */
+#else   /* GMX_SIMD_X86_SSE2_OR_HIGHER */
+            /* not GMX_SIMD_X86_SSE2_OR_HIGHER, but other SIMD */
              returnvalue  = "SIMD";
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
  #else  /* GMX_NBNXN_SIMD */
              returnvalue = "not available";
  #endif /* GMX_NBNXN_SIMD */
@@ -1657,7 +1657,7 @@ const char *lookup_nbnxn_kernel_name(int kernel_type)
  
  static void pick_nbnxn_kernel(FILE                *fp,
                                const t_commrec     *cr,
-                              gmx_bool             use_cpu_acceleration,
+                              gmx_bool             use_simd_kernels,
                                gmx_bool             bUseGPU,
                                gmx_bool             bEmulateGPU,
                                const t_inputrec    *ir,
@@ -1686,7 +1686,7 @@ static void pick_nbnxn_kernel(FILE                *fp,
  
      if (*kernel_type == nbnxnkNotSet)
      {
-        if (use_cpu_acceleration)
+        if (use_simd_kernels)
          {
              pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
          }
@@ -1985,7 +1985,7 @@ static void init_nb_verlet(FILE                *fp,
  
          if (i == 0) /* local */
          {
-            pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+            pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
                                nbv->bUseGPU, bEmulateGPU, ir,
                                &nbv->grp[i].kernel_type,
                                &nbv->grp[i].ewald_excl,
@@ -1996,7 +1996,7 @@ static void init_nb_verlet(FILE                *fp,
              if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
              {
                  /* Use GPU for local, select a CPU kernel for non-local */
-                pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+                pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
                                    FALSE, FALSE, ir,
                                    &nbv->grp[i].kernel_type,
                                    &nbv->grp[i].ewald_excl,
@@ -2133,8 +2133,8 @@ void init_forcerec(FILE              *fp,
          fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE);
      }
  
-    /* By default we turn acceleration on, but it might be turned off further down... */
-    fr->use_cpu_acceleration = TRUE;
+    /* By default we turn SIMD kernels on, but they might be turned off further down... */
+    fr->use_simd_kernels = TRUE;
  
      fr->bDomDec = DOMAINDECOMP(cr);
  
@@ -2274,12 +2274,12 @@ void init_forcerec(FILE              *fp,
  
      if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
      {
-        fr->use_cpu_acceleration = FALSE;
+        fr->use_simd_kernels = FALSE;
          if (fp != NULL)
          {
              fprintf(fp,
                      "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
-                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
+                    "Disabling the usage of most SIMD-specific kernel (e.g. SSE2/SSE4/AVX) routines.\n\n");
          }
      }
  
@@ -2291,12 +2291,12 @@ void init_forcerec(FILE              *fp,
      fr->AllvsAll_workgb = NULL;
  
      /* All-vs-all kernels have not been implemented in 4.6, and
-     * the SIMD group kernels are also buggy in this case. Non-accelerated
+     * the SIMD group kernels are also buggy in this case. Non-SIMD
       * group kernels are OK. See Redmine #1249. */
      if (fr->bAllvsAll)
      {
          fr->bAllvsAll            = FALSE;
-        fr->use_cpu_acceleration = FALSE;
+        fr->use_simd_kernels     = FALSE;
          if (fp != NULL)
          {
              fprintf(fp,
diff --git a/src/gromacs/mdlib/genborn.c b/src/gromacs/mdlib/genborn.c

index 582d432780c30ddbb70a1f254bc8a945ecc72533..d320af6bf839c03f6903d9d7b84492e18f9dec23 100644 (file)
--- a/src/gromacs/mdlib/genborn.c
+++ b/src/gromacs/mdlib/genborn.c
@@ -60,7 +60,7 @@
  
  #include "gromacs/utility/gmxmpi.h"
  
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
  #  ifdef GMX_DOUBLE
  #    include "genborn_sse2_double.h"
  #    include "genborn_allvsall_sse2_double.h"
@@ -1090,8 +1090,8 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
  
          if (ir->gb_algorithm == egbSTILL)
          {
-#if 0 && defined (GMX_X86_SSE2)
-            if (fr->use_acceleration)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
+            if (fr->use_simd_kernels)
              {
  #  ifdef GMX_DOUBLE
                  genborn_allvsall_calc_still_radii_sse2_double(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb);
@@ -1111,8 +1111,8 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
          }
          else if (ir->gb_algorithm == egbHCT || ir->gb_algorithm == egbOBC)
          {
-#if 0 && defined (GMX_X86_SSE2)
-            if (fr->use_acceleration)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
+            if (fr->use_simd_kernels)
              {
  #  ifdef GMX_DOUBLE
                  genborn_allvsall_calc_hct_obc_radii_sse2_double(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb);
@@ -1140,12 +1140,12 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
      /* Switch for determining which algorithm to use for Born radii calculation */
  #ifdef GMX_DOUBLE
  
-#if 0 && defined (GMX_X86_SSE2)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
      /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
      switch (ir->gb_algorithm)
      {
          case egbSTILL:
-            if (fr->use_acceleration)
+            if (fr->use_simd_kernels)
              {
                  calc_gb_rad_still_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born);
              }
@@ -1155,7 +1155,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
              }
              break;
          case egbHCT:
-            if (fr->use_acceleration)
+            if (fr->use_simd_kernels)
              {
                  calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm);
              }
@@ -1165,7 +1165,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
              }
              break;
          case egbOBC:
-            if (fr->use_acceleration)
+            if (fr->use_simd_kernels)
              {
                  calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm);
              }
@@ -1199,12 +1199,12 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
  
  #else
  
-#if 0 && defined (GMX_X86_SSE2)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
      /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
      switch (ir->gb_algorithm)
      {
          case egbSTILL:
-            if (fr->use_acceleration)
+            if (fr->use_simd_kernels)
              {
                  calc_gb_rad_still_sse2_single(cr, fr, born->nr, top, x[0], nl, born);
              }
@@ -1214,7 +1214,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
              }
              break;
          case egbHCT:
-            if (fr->use_acceleration)
+            if (fr->use_simd_kernels)
              {
                  calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm);
              }
@@ -1225,7 +1225,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
              break;
  
          case egbOBC:
-            if (fr->use_acceleration)
+            if (fr->use_simd_kernels)
              {
                  calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm);
              }
@@ -1654,8 +1654,8 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t
  
      if (fr->bAllvsAll)
      {
-#if 0 && defined (GMX_X86_SSE2)
-        if (fr->use_acceleration)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
+        if (fr->use_simd_kernels)
          {
  #  ifdef GMX_DOUBLE
              genborn_allvsall_calc_chainrule_sse2_double(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
@@ -1676,8 +1676,8 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t
          return;
      }
  
-#if 0 && defined (GMX_X86_SSE2)
-    if (fr->use_acceleration)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
+    if (fr->use_simd_kernels)
      {
  #  ifdef GMX_DOUBLE
          calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0],
diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_double.c b/src/gromacs/mdlib/genborn_allvsall_sse2_double.c

index bcfea3c33873e3eb995bc894fbc95502af8efe0b..b0e6cd3d823a72e042165bcacf7e42fa965af050 100644 (file)
--- a/src/gromacs/mdlib/genborn_allvsall_sse2_double.c
+++ b/src/gromacs/mdlib/genborn_allvsall_sse2_double.c
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2012, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -51,7 +51,7 @@
  #include "genborn_allvsall.h"
  
  
-#if 0 && defined (GMX_X86_SSE2)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
  
  #include <gmx_sse2_double.h>
  
diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c

index 2a96cc14222e7f9bb608c0df11d3e44ea897c97c..27d5c910c3d654e9f56a96a80ffa25d88d7fa36d 100644 (file)
--- a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c
+++ b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2012, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -50,7 +50,7 @@
  #include "genborn.h"
  #include "genborn_allvsall.h"
  
-#if 0 && defined (GMX_X86_SSE2)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
  
  #include <gmx_sse2_single.h>
  
diff --git a/src/gromacs/mdlib/genborn_sse2_double.c b/src/gromacs/mdlib/genborn_sse2_double.c

index cdb7ecaf15b35d13c81b5a090220dbfe977b7c0d..40407177e8fe2bde2f39cc701b8f93364fce8263 100644 (file)
--- a/src/gromacs/mdlib/genborn_sse2_double.c
+++ b/src/gromacs/mdlib/genborn_sse2_double.c
@@ -58,7 +58,7 @@
  #include "gromacs/utility/gmxmpi.h"
  
  /* Only compile this file if SSE2 intrinsics are available */
-#if 0 && defined (GMX_X86_SSE2)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
  #include <gmx_sse2_double.h>
  #include <emmintrin.h>
  
diff --git a/src/gromacs/mdlib/genborn_sse2_single.c b/src/gromacs/mdlib/genborn_sse2_single.c

index 1f62188b7ee1130a41ddda224dc08a42ce811e4f..74f700ba60d2bcdf7a560ac71c5f9fe4718a645f 100644 (file)
--- a/src/gromacs/mdlib/genborn_sse2_single.c
+++ b/src/gromacs/mdlib/genborn_sse2_single.c
@@ -59,7 +59,7 @@
  
  
  /* Only compile this file if SSE intrinsics are available */
-#if 0 && defined (GMX_X86_SSE2)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
  
  #include <gmx_sse2_single.h>
  #include <emmintrin.h>
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c

index 6ab0da6d50715eb884410d69cb8513532b10c4bc..1abfe492e8d220ebcaa3e81ae9779f558948f724 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -432,7 +432,7 @@ static void
  nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
  {
      int       i, j;
-    const int simd_width = GMX_SIMD_WIDTH_HERE;
+    const int simd_width = GMX_SIMD_REAL_WIDTH;
      int       simd_excl_size;
      /* Set the diagonal cluster pair exclusion mask setup data.
       * In the kernel we check 0 < j - i to generate the masks.
@@ -482,7 +482,7 @@ nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
          nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
      }
  
-#if (defined GMX_CPU_ACCELERATION_IBM_QPX)
+#if (defined GMX_SIMD_IBM_QPX)
      /* The QPX kernels shouldn't do the bit masking that is done on
       * x86, because the SIMD units lack bit-wise operations. Instead,
       * we generate a vector of all 2^4 possible ways an i atom
@@ -497,7 +497,7 @@ nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
       * indices are used in the kernels. */
  
      simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
-    const int qpx_simd_width = GMX_SIMD_WIDTH_HERE;
+    const int qpx_simd_width = GMX_SIMD_REAL_WIDTH;
      snew_aligned(simd_interaction_array, simd_excl_size * qpx_simd_width, NBNXN_MEM_ALIGN);
      for (j = 0; j < simd_excl_size; j++)
      {
@@ -1158,33 +1158,33 @@ nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest,
  /* The SIMD width here is actually independent of that in the kernels,
   * but we use the same width for simplicity (usually optimal anyhow).
   */
-    int       i, s;
-    gmx_mm_pr dest_SSE, src_SSE;
+    int             i, s;
+    gmx_simd_real_t dest_SSE, src_SSE;
  
      if (bDestSet)
      {
-        for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
+        for (i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
          {
-            dest_SSE = gmx_load_pr(dest+i);
+            dest_SSE = gmx_simd_load_r(dest+i);
              for (s = 0; s < nsrc; s++)
              {
-                src_SSE  = gmx_load_pr(src[s]+i);
-                dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
+                src_SSE  = gmx_simd_load_r(src[s]+i);
+                dest_SSE = gmx_simd_add_r(dest_SSE, src_SSE);
              }
-            gmx_store_pr(dest+i, dest_SSE);
+            gmx_simd_store_r(dest+i, dest_SSE);
          }
      }
      else
      {
-        for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
+        for (i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
          {
-            dest_SSE = gmx_load_pr(src[0]+i);
+            dest_SSE = gmx_simd_load_r(src[0]+i);
              for (s = 1; s < nsrc; s++)
              {
-                src_SSE  = gmx_load_pr(src[s]+i);
-                dest_SSE = gmx_add_pr(dest_SSE, src_SSE);
+                src_SSE  = gmx_simd_load_r(src[s]+i);
+                dest_SSE = gmx_simd_add_r(dest_SSE, src_SSE);
              }
-            gmx_store_pr(dest+i, dest_SSE);
+            gmx_simd_store_r(dest+i, dest_SSE);
          }
      }
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_internal.h b/src/gromacs/mdlib/nbnxn_internal.h

index 1e8891c129b82567f50a9b40194591507cf84f6d..e71921699faf38462fc0f05bd04981c19b8ee55e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_internal.h
+++ b/src/gromacs/mdlib/nbnxn_internal.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -70,7 +70,7 @@ extern "C" {
  
  #ifdef GMX_NBNXN_SIMD
  /* Memory alignment in bytes as required by SIMD aligned loads/stores */
-#define NBNXN_MEM_ALIGN  (GMX_SIMD_WIDTH_HERE*sizeof(real))
+#define NBNXN_MEM_ALIGN  (GMX_SIMD_REAL_WIDTH*sizeof(real))
  #else
  /* No alignment required, but set it so we can call the same routines */
  #define NBNXN_MEM_ALIGN  32
@@ -153,16 +153,16 @@ typedef struct {
  
  typedef struct nbnxn_x_ci_simd_4xn {
      /* The i-cluster coordinates for simple search */
-    gmx_mm_pr ix_S0, iy_S0, iz_S0;
-    gmx_mm_pr ix_S1, iy_S1, iz_S1;
-    gmx_mm_pr ix_S2, iy_S2, iz_S2;
-    gmx_mm_pr ix_S3, iy_S3, iz_S3;
+    gmx_simd_real_t ix_S0, iy_S0, iz_S0;
+    gmx_simd_real_t ix_S1, iy_S1, iz_S1;
+    gmx_simd_real_t ix_S2, iy_S2, iz_S2;
+    gmx_simd_real_t ix_S3, iy_S3, iz_S3;
  } nbnxn_x_ci_simd_4xn_t;
  
  typedef struct nbnxn_x_ci_simd_2xnn {
      /* The i-cluster coordinates for simple search */
-    gmx_mm_pr ix_S0, iy_S0, iz_S0;
-    gmx_mm_pr ix_S2, iy_S2, iz_S2;
+    gmx_simd_real_t ix_S0, iy_S0, iz_S0;
+    gmx_simd_real_t ix_S2, iy_S2, iz_S2;
  } nbnxn_x_ci_simd_2xnn_t;
  
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py

index cc109228c5deeca22480353dc24b7820ae091831..2d6c91824aec29abec6c233a995fc7b9d4373fb7 100755 (executable)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py
@@ -2,7 +2,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2013, by the GROMACS development team, led by
+# Copyright (c) 2013,2014, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -132,7 +132,7 @@ VerletKernelTypeDict = {
      '2xnn' : {
          'Define' : 'GMX_NBNXN_SIMD_2XNN',
          'WidthSetup' : '/* Include the full-width SIMD macros */\n',
-        'WidthCheck' : ('#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16)\n' \
+        'WidthCheck' : ('#if !(GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16)\n' \
                          '#error "unsupported SIMD width"\n' \
                          '#endif\n'),
          'UnrollSize' : 2,
@@ -142,7 +142,7 @@ VerletKernelTypeDict = {
          'WidthSetup' : ('#ifdef GMX_NBNXN_HALF_WIDTH_SIMD\n' \
                          '#define GMX_USE_HALF_WIDTH_SIMD_HERE\n' \
                          '#endif\n'),
-        'WidthCheck' : ('#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8)\n' \
+        'WidthCheck' : ('#if !(GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8)\n' \
                          '#error "unsupported SIMD width"\n' \
                          '#endif\n'),
          'UnrollSize' : 1,
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre

index f2478956fc686844e09daa5effdcdec5df36739c..2b5419414d7b3743f604625891dfa556400a47b4 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -69,7 +69,7 @@ reduce_group_energies(int ng, int ng_2log,
                        const real *VSvdw, const real *VSc,
                        real *Vvdw, real *Vc)
  {{
-    const int unrollj      = GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE;
+    const int unrollj      = GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE;
      const int unrollj_half = unrollj/2;
      int       ng_p2, i, j, j0, j1, c, s;
  
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h

index 75c1313f4f8d68b09a90b62c02b39b30339651d6..59be0fbd75944668917272fc3341aa076cb7e5d9 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@ -54,7 +54,7 @@
  #error "Must define an NBNxN kernel flavour before including NBNxN kernel utility functions"
  #endif
  
-#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+#ifdef GMX_SIMD_REFERENCE
  
  /* Align a stack-based thread-local working array. */
  static gmx_inline int *
@@ -65,9 +65,9 @@ prepare_table_load_buffer(const int gmx_unused *array)
  
  #include "nbnxn_kernel_simd_utils_ref.h"
  
-#else /* GMX_SIMD_REFERENCE_PLAIN_C */
+#else /* GMX_SIMD_REFERENCE */
  
-#if defined  GMX_X86_SSE2 && !defined __MIC__
+#if defined  GMX_SIMD_X86_SSE2_OR_HIGHER && !defined __MIC__
  /* Include x86 SSE2 compatible SIMD functions */
  
  /* Set the stride for the lookup of the two LJ parameters from their
@@ -85,30 +85,29 @@ static const int nbfp_stride = 4;
  static gmx_inline int *
  prepare_table_load_buffer(const int gmx_unused *array)
  {
-#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
-    return gmx_simd_align_int(array);
+#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+    return gmx_simd_align_i(array);
  #else
      return NULL;
  #endif
  }
  
-#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
  
  /* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
-#if GMX_SIMD_WIDTH_HERE == 8
+#if GMX_SIMD_REAL_WIDTH == 8
  #define TAB_FDV0
  #endif
-
  #ifdef GMX_DOUBLE
  #include "nbnxn_kernel_simd_utils_x86_256d.h"
  #else  /* GMX_DOUBLE */
  #include "nbnxn_kernel_simd_utils_x86_256s.h"
  #endif /* GMX_DOUBLE */
  
-#else  /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+#else  /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
  
  /* We use the FDV0 table layout when we can use aligned table loads */
-#if GMX_SIMD_WIDTH_HERE == 4
+#if GMX_SIMD_REAL_WIDTH == 4
  #define TAB_FDV0
  #endif
  
@@ -118,48 +117,48 @@ prepare_table_load_buffer(const int gmx_unused *array)
  #include "nbnxn_kernel_simd_utils_x86_128s.h"
  #endif /* GMX_DOUBLE */
  
-#endif /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+#endif /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
  
-#else  /* GMX_X86_SSE2 */
+#else  /* GMX_SIMD_X86_SSE2_OR_HIGHER */
  
-#if GMX_SIMD_WIDTH_HERE > 4
+#if GMX_SIMD_REAL_WIDTH > 4
  static const int nbfp_stride = 4;
  #else
-static const int nbfp_stride = GMX_SIMD_WIDTH_HERE;
+static const int nbfp_stride = GMX_SIMD_REAL_WIDTH;
  #endif
  
  /* We use the FDV0 table layout when we can use aligned table loads */
-#if GMX_SIMD_WIDTH_HERE == 4
+#if GMX_SIMD_REAL_WIDTH == 4
  #define TAB_FDV0
  #endif
  
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#ifdef GMX_SIMD_IBM_QPX
  #include "nbnxn_kernel_simd_utils_ibm_qpx.h"
-#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
+#endif /* GMX_SIMD_IBM_QPX */
  
  #ifdef __MIC__
  #include "nbnxn_kernel_simd_utils_x86_mic.h"
  #endif
  
-#endif /* GMX_X86_SSE2 */
-#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
+#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
+#endif /* GMX_SIMD_REFERENCE */
  
-#if GMX_SIMD_WIDTH_HERE == 4 && !defined GMX_SIMD_REFERENCE_PLAIN_C
-#define gmx_mm_pr4    gmx_mm_pr
-#define gmx_load_pr4  gmx_load_pr
-#define gmx_store_pr4 gmx_store_pr
-#define gmx_add_pr4   gmx_add_pr
+#if GMX_SIMD_REAL_WIDTH == 4 && !defined GMX_SIMD_REFERENCE
+#define gmx_mm_pr4    gmx_simd_real_t
+#define gmx_load_pr4  gmx_simd_load_r
+#define gmx_store_pr4 gmx_simd_store_r
+#define gmx_add_pr4   gmx_simd_add_r
  #endif
  
  #ifndef HAVE_GMX_SUM_SIMD /* should be defined for arch with hardware reduce */
  static gmx_inline real
-gmx_sum_simd2(gmx_mm_pr x, real* b)
+gmx_sum_simd2(gmx_simd_real_t x, real* b)
  {
-    gmx_store_pr(b, x);
+    gmx_simd_store_r(b, x);
      return b[0]+b[1];
  }
  
-#if GMX_SIMD_WIDTH_HERE >= 4
+#if GMX_SIMD_REAL_WIDTH >= 4
  static gmx_inline real
  gmx_sum_simd4(gmx_mm_pr4 x, real* b)
  {
@@ -168,31 +167,31 @@ gmx_sum_simd4(gmx_mm_pr4 x, real* b)
  }
  #endif
  
-#if GMX_SIMD_WIDTH_HERE == 2
-static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b)
+#if GMX_SIMD_REAL_WIDTH == 2
+static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
  {
-    gmx_store_pr(b, x);
+    gmx_simd_store_r(b, x);
      return b[0]+b[1];
  }
-#elif GMX_SIMD_WIDTH_HERE == 4
-static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b)
+#elif GMX_SIMD_REAL_WIDTH == 4
+static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
  {
-    gmx_store_pr(b, x);
+    gmx_simd_store_r(b, x);
      return b[0]+b[1]+b[2]+b[3];
  }
-#elif GMX_SIMD_WIDTH_HERE == 8
-static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b)
+#elif GMX_SIMD_REAL_WIDTH == 8
+static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
  {
-    gmx_store_pr(b, x);
+    gmx_simd_store_r(b, x);
      return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7];
  }
-#elif GMX_SIMD_WIDTH_HERE == 16
+#elif GMX_SIMD_REAL_WIDTH == 16
  /* This is getting ridiculous, SIMD horizontal adds would help,
   * but this is not performance critical (only used to reduce energies)
   */
-static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b)
+static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
  {
-    gmx_store_pr(b, x);
+    gmx_simd_store_r(b, x);
      return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7]+b[8]+b[9]+b[10]+b[11]+b[12]+b[13]+b[14]+b[15];
  }
  #else
@@ -202,7 +201,7 @@ static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b)
  
  #ifdef UNROLLJ
  /* Add energy register to possibly multiple terms in the energy array */
-static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
+static inline void add_ener_grp(gmx_simd_real_t e_S, real *v, const int *offset_jj)
  {
      int jj;
  
@@ -212,10 +211,10 @@ static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
       */
      for (jj = 0; jj < (UNROLLJ/2); jj++)
      {
-        gmx_mm_pr v_S;
+        gmx_simd_real_t v_S;
  
-        v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
-        gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S));
+        v_S = gmx_simd_load_r(v+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH);
+        gmx_simd_store_r(v+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH, gmx_simd_add_r(v_S, e_S));
      }
  }
  #endif
@@ -225,7 +224,7 @@ static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
   * a single SIMD register.
   */
  static inline void
-add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
+add_ener_grp_halves(gmx_simd_real_t e_S, real *v0, real *v1, const int *offset_jj)
  {
      gmx_mm_hpr e_S0, e_S1;
      int        jj;
@@ -236,15 +235,15 @@ add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
      {
          gmx_mm_hpr v_S;
  
-        gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
-        gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0));
+        gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2);
+        gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2, gmx_add_hpr(v_S, e_S0));
      }
      for (jj = 0; jj < (UNROLLJ/2); jj++)
      {
          gmx_mm_hpr v_S;
  
-        gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
-        gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1));
+        gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2);
+        gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2, gmx_add_hpr(v_S, e_S1));
      }
  }
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h

index bfbf9e24eecc43f61ef86e6a50f0738b269c6253..fd857475b4291108a59fb7891fde890995bb3598 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2013, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,11 +35,11 @@
  #ifndef _nbnxn_kernel_simd_utils_ibm_qpx_h_
  #define _nbnxn_kernel_simd_utils_ibm_qpx_h_
  
-typedef gmx_mm_pr gmx_exclfilter;
+typedef gmx_simd_real_t gmx_exclfilter;
  static const int filter_stride = 1;
  
  /* The 4xn kernel operates on 4-wide i-force registers */
-typedef gmx_mm_pr gmx_mm_pr4;
+typedef gmx_simd_real_t gmx_mm_pr4;
  
  /* This file contains all functions/macros for the SIMD kernels
   * which have explicit dependencies on the j-cluster size and/or SIMD-width.
@@ -51,9 +51,10 @@ typedef gmx_mm_pr gmx_mm_pr4;
  
  /* Collect all [0123] elements of the 4 inputs to out[0123], respectively */
  static gmx_inline void
-gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d,
-                   gmx_mm_pr *out0, gmx_mm_pr *out1,
-                   gmx_mm_pr *out2, gmx_mm_pr *out3)
+gmx_transpose_4_ps(gmx_simd_real_t a, gmx_simd_real_t b,
+                   gmx_simd_real_t c, gmx_simd_real_t d,
+                   gmx_simd_real_t *out0, gmx_simd_real_t *out1,
+                   gmx_simd_real_t *out2, gmx_simd_real_t *out3)
  {
      /* Prepare control vectors for swizzling. In its third input,
         vec_perm accepts indices into the effective 8-wide SIMD vector
@@ -63,14 +64,14 @@ gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d,
         vec_gpci() converts an octal literal of the indices into the
         correct form for vec_perm() to use. That form is an octal digit
         in bits 0-2 of the mantissa of each double. */
-    gmx_mm_pr p6420 = vec_gpci(06420);
-    gmx_mm_pr p7531 = vec_gpci(07531);
+    gmx_simd_real_t p6420 = vec_gpci(06420);
+    gmx_simd_real_t p7531 = vec_gpci(07531);
  
      /* Four-way swizzle (i.e. transpose) of vectors a = a0a1a2a3, etc. */
-    gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420);
-    gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531);
-    gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420);
-    gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531);
+    gmx_simd_real_t b2b0a2a0 = vec_perm(a, b, p6420);
+    gmx_simd_real_t b3b1a3a1 = vec_perm(a, b, p7531);
+    gmx_simd_real_t d2d0c2c0 = vec_perm(c, d, p6420);
+    gmx_simd_real_t d3d1c3c1 = vec_perm(c, d, p7531);
      *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531);
      *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531);
      *out2 = vec_perm(d2d0c2c0, b2b0a2a0, p6420);
@@ -79,30 +80,32 @@ gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d,
  
  /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
  static gmx_inline void
-gmx_shuffle_4_ps_fil01_to_2_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d,
-                               gmx_mm_pr *out0, gmx_mm_pr *out1)
+gmx_shuffle_4_ps_fil01_to_2_ps(gmx_simd_real_t a, gmx_simd_real_t b,
+                               gmx_simd_real_t c, gmx_simd_real_t d,
+                               gmx_simd_real_t *out0, gmx_simd_real_t *out1)
  {
-    gmx_mm_pr p6420 = vec_gpci(06420);
-    gmx_mm_pr p7531 = vec_gpci(07531);
+    gmx_simd_real_t p6420 = vec_gpci(06420);
+    gmx_simd_real_t p7531 = vec_gpci(07531);
  
      /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. */
-    gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420);
-    gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531);
-    gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420);
-    gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531);
+    gmx_simd_real_t b2b0a2a0 = vec_perm(a, b, p6420);
+    gmx_simd_real_t b3b1a3a1 = vec_perm(a, b, p7531);
+    gmx_simd_real_t d2d0c2c0 = vec_perm(c, d, p6420);
+    gmx_simd_real_t d3d1c3c1 = vec_perm(c, d, p7531);
      *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531);
      *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531);
  }
  
  /* Collect element 2 of the 4 inputs to out */
-static gmx_inline gmx_mm_pr
-gmx_shuffle_4_ps_fil2_to_1_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d)
+static gmx_inline gmx_simd_real_t
+gmx_shuffle_4_ps_fil2_to_1_ps(gmx_simd_real_t a, gmx_simd_real_t b,
+                              gmx_simd_real_t c, gmx_simd_real_t d)
  {
-    gmx_mm_pr p6420 = vec_gpci(06420);
+    gmx_simd_real_t p6420 = vec_gpci(06420);
  
      /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. */
-    gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420);
-    gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420);
+    gmx_simd_real_t b2b0a2a0 = vec_perm(a, b, p6420);
+    gmx_simd_real_t d2d0c2c0 = vec_perm(c, d, p6420);
      return vec_perm(d2d0c2c0, b2b0a2a0, p6420);
  }
  
@@ -112,12 +115,12 @@ gmx_shuffle_4_ps_fil2_to_1_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d
  static gmx_inline int *
  prepare_table_load_buffer(const int *array)
  {
-    return gmx_simd_align_int(array);
+    return gmx_simd_align_i(array);
  }
  
  static gmx_inline void
-load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
-             gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S)
+load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
+             gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
  {
  #ifdef NDEBUG
      /* Just like 256-bit AVX, we need to use memory to get indices
@@ -128,19 +131,19 @@ load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
  #endif
  
      /* Here we load 4 aligned reals, but we need just 2 elements of each */
-    gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride);
-    gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride);
-    gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride);
-    gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride);
+    gmx_simd_real_t a = gmx_simd_load_r(tab_coul_FDV0 + ti[0] * nbfp_stride);
+    gmx_simd_real_t b = gmx_simd_load_r(tab_coul_FDV0 + ti[1] * nbfp_stride);
+    gmx_simd_real_t c = gmx_simd_load_r(tab_coul_FDV0 + ti[2] * nbfp_stride);
+    gmx_simd_real_t d = gmx_simd_load_r(tab_coul_FDV0 + ti[3] * nbfp_stride);
  
      gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S);
  }
  
  static gmx_inline void
  load_table_f_v(const real *tab_coul_FDV0,
-               gmx_epi32 ti_S, int *ti,
-               gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S,
-               gmx_mm_pr *ctabv_S)
+               gmx_simd_int32_t ti_S, int *ti,
+               gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
+               gmx_simd_real_t *ctabv_S)
  {
  #ifdef NDEBUG
      /* Just like 256-bit AVX, we need to use memory to get indices
@@ -151,10 +154,10 @@ load_table_f_v(const real *tab_coul_FDV0,
  #endif
  
      /* Here we load 4 aligned reals, but we need just 3 elements of each. */
-    gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride);
-    gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride);
-    gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride);
-    gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride);
+    gmx_simd_real_t a = gmx_simd_load_r(tab_coul_FDV0 + ti[0] * nbfp_stride);
+    gmx_simd_real_t b = gmx_simd_load_r(tab_coul_FDV0 + ti[1] * nbfp_stride);
+    gmx_simd_real_t c = gmx_simd_load_r(tab_coul_FDV0 + ti[2] * nbfp_stride);
+    gmx_simd_real_t d = gmx_simd_load_r(tab_coul_FDV0 + ti[3] * nbfp_stride);
  
      gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S);
      *ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(a, b, c, d);
@@ -167,20 +170,20 @@ load_table_f_v(const real *tab_coul_FDV0,
  
  /* Sum the elements within each input register and store the sums in out.
   */
-static gmx_inline gmx_mm_pr
-gmx_mm_transpose_sum4_pr(gmx_mm_pr a, gmx_mm_pr b,
-                         gmx_mm_pr c, gmx_mm_pr d)
+static gmx_inline gmx_simd_real_t
+gmx_mm_transpose_sum4_pr(gmx_simd_real_t a, gmx_simd_real_t b,
+                         gmx_simd_real_t c, gmx_simd_real_t d)
  {
-    gmx_mm_pr a0b0c0d0, a1b1c1d1, a2b2c2d2, a3b3c3d3;
+    gmx_simd_real_t a0b0c0d0, a1b1c1d1, a2b2c2d2, a3b3c3d3;
      gmx_transpose_4_ps(a, b, c, d,
                         &a0b0c0d0,
                         &a1b1c1d1,
                         &a2b2c2d2,
                         &a3b3c3d3);
      /* Now reduce the transposed vectors */
-    gmx_mm_pr sum01 = gmx_add_pr(a0b0c0d0, a1b1c1d1);
-    gmx_mm_pr sim23 = gmx_add_pr(a2b2c2d2, a3b3c3d3);
-    return gmx_add_pr(sum01, sim23);
+    gmx_simd_real_t sum01 = gmx_simd_add_r(a0b0c0d0, a1b1c1d1);
+    gmx_simd_real_t sim23 = gmx_simd_add_r(a2b2c2d2, a3b3c3d3);
+    return gmx_simd_add_r(sum01, sim23);
  }
  
  #ifdef GMX_DOUBLE
@@ -191,23 +194,23 @@ gmx_mm_transpose_sum4_pr(gmx_mm_pr a, gmx_mm_pr b,
   * reciprocal square roots.
   */
  static gmx_inline void
-gmx_mm_invsqrt2_pd(gmx_mm_pr in0, gmx_mm_pr in1,
-                   gmx_mm_pr *out0, gmx_mm_pr *out1)
+gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1,
+                   gmx_simd_real_t *out0, gmx_simd_real_t *out1)
  {
-    *out0 = gmx_invsqrt_pr(in0);
-    *out1 = gmx_invsqrt_pr(in1);
+    *out0 = gmx_simd_invsqrt_r(in0);
+    *out1 = gmx_simd_invsqrt_r(in1);
  }
  #endif
  
  static gmx_inline void
  load_lj_pair_params(const real *nbfp, const int *type, int aj,
-                    gmx_mm_pr *c6_S, gmx_mm_pr *c12_S)
+                    gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
  {
      /* Here we load 4 aligned reals, but we need just 2 elements of each. */
-    gmx_mm_pr a = gmx_load_pr(nbfp + type[aj+0] * nbfp_stride);
-    gmx_mm_pr b = gmx_load_pr(nbfp + type[aj+1] * nbfp_stride);
-    gmx_mm_pr c = gmx_load_pr(nbfp + type[aj+2] * nbfp_stride);
-    gmx_mm_pr d = gmx_load_pr(nbfp + type[aj+3] * nbfp_stride);
+    gmx_simd_real_t a = gmx_simd_load_r(nbfp + type[aj+0] * nbfp_stride);
+    gmx_simd_real_t b = gmx_simd_load_r(nbfp + type[aj+1] * nbfp_stride);
+    gmx_simd_real_t c = gmx_simd_load_r(nbfp + type[aj+2] * nbfp_stride);
+    gmx_simd_real_t d = gmx_simd_load_r(nbfp + type[aj+3] * nbfp_stride);
  
      gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, c6_S, c12_S);
  }
@@ -234,7 +237,7 @@ static gmx_inline gmx_exclfilter gmx_load_exclusion_filter(const unsigned *a)
  /* Code for handling loading and applying exclusion masks. Note that
     parameter a is not treated like an array index; it is naively added
     to b, so should be in bytes. */
-static gmx_inline gmx_mm_pb gmx_load_interaction_mask_pb(long a, const real *b)
+static gmx_inline gmx_simd_bool_t gmx_load_interaction_mask_pb(long a, const real *b)
  {
  #ifdef NDEBUG
      return vec_ld(a, (real *) b);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h

index e757a8d56177069104c80d4c0c59ec8e028c390f..90f52c7a2f108fefe2d9f37a9629fc66af441705 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h
@@ -37,7 +37,7 @@
  
  typedef gmx_simd_ref_epi32      gmx_simd_ref_exclfilter;
  typedef gmx_simd_ref_exclfilter gmx_exclfilter;
-static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  
  /* Set the stride for the lookup of the two LJ parameters from their
     (padded) array. Only strides of 2 and 4 are currently supported. */
@@ -49,7 +49,7 @@ static const int nbfp_stride = 2;
  static const int nbfp_stride = 4;
  #endif
  
-#if GMX_SIMD_WIDTH_HERE > 4
+#if GMX_SIMD_REAL_WIDTH > 4
  /* The 4xn kernel operates on 4-wide i-force registers */
  
  /* float/double SIMD register type */
@@ -114,7 +114,7 @@ typedef gmx_simd_ref_pr gmx_mm_pr4;
  /* Half-width SIMD real type */
  /* float/double SIMD register type */
  typedef struct {
-    real r[GMX_SIMD_WIDTH_HERE/2];
+    real r[GMX_SIMD_REAL_WIDTH/2];
  } gmx_mm_hpr;
  
  /* Half-width SIMD operations */
@@ -125,7 +125,7 @@ gmx_load_hpr(gmx_mm_hpr *a, const real *b)
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          a->r[i] = b[i];
      }
@@ -137,7 +137,7 @@ gmx_set1_hpr(gmx_mm_hpr *a, real b)
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          a->r[i] = b;
      }
@@ -149,10 +149,10 @@ gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          a->r[                        i] = b[0];
-        a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[1];
+        a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[1];
      }
  }
  
@@ -162,10 +162,10 @@ gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          a->r[i]                         = b[i];
-        a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[i];
+        a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[i];
      }
  }
  
@@ -175,7 +175,7 @@ gmx_store_hpr(real *a, gmx_mm_hpr b)
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          a[i] = b.r[i];
      }
@@ -187,7 +187,7 @@ gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
      gmx_mm_hpr c;
      int        i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          c.r[i] = a.r[i] + b.r[i];
      }
@@ -201,7 +201,7 @@ gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
      gmx_mm_hpr c;
      int        i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          c.r[i] = a.r[i] - b.r[i];
      }
@@ -216,13 +216,13 @@ gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
      gmx_mm_hpr c;
      int        i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          c.r[i] =
              a.r[i] +
-            a.r[GMX_SIMD_WIDTH_HERE/2+i] +
+            a.r[GMX_SIMD_REAL_WIDTH/2+i] +
              b.r[i] +
-            b.r[GMX_SIMD_WIDTH_HERE/2+i];
+            b.r[GMX_SIMD_REAL_WIDTH/2+i];
      }
  
      return c;
@@ -241,12 +241,12 @@ gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
      sum.r[2] = 0;
      sum.r[3] = 0;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          sum.r[0] += a.r[i];
-        sum.r[1] += a.r[GMX_SIMD_WIDTH_HERE/2+i];
+        sum.r[1] += a.r[GMX_SIMD_REAL_WIDTH/2+i];
          sum.r[2] += b.r[i];
-        sum.r[3] += b.r[GMX_SIMD_WIDTH_HERE/2+i];
+        sum.r[3] += b.r[GMX_SIMD_REAL_WIDTH/2+i];
      }
  
      return sum;
@@ -258,10 +258,10 @@ gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          b->r[i] = a.r[i];
-        c->r[i] = a.r[GMX_SIMD_WIDTH_HERE/2 + i];
+        c->r[i] = a.r[GMX_SIMD_REAL_WIDTH/2 + i];
      }
  }
  static gmx_inline void
@@ -269,10 +269,10 @@ gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          c->r[i]                         = a.r[i];
-        c->r[GMX_SIMD_WIDTH_HERE/2 + i] = b.r[i];
+        c->r[GMX_SIMD_REAL_WIDTH/2 + i] = b.r[i];
      }
  }
  
@@ -287,13 +287,13 @@ load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S,
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
          ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
          ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
      }
  
-    *ctab1_S  = gmx_sub_pr(*ctab1_S, *ctab0_S);
+    *ctab1_S  = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
  }
  
  static gmx_inline void
@@ -306,7 +306,7 @@ load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
  
      load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
          ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
      }
@@ -320,7 +320,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
          ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
          ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
@@ -337,7 +337,7 @@ load_table_f_v(const real *tab_coul_FDV0,
  
      load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
          ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
      }
@@ -347,7 +347,7 @@ load_table_f_v(const real *tab_coul_FDV0,
  /* Sum the elements within each input register and store the sums in out.
   * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
   */
-#if GMX_SIMD_WIDTH_HERE == 2
+#if GMX_SIMD_REAL_WIDTH == 2
  static gmx_inline gmx_simd_ref_pr
  gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
  {
@@ -360,8 +360,8 @@ gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
  }
  #endif
  
-#if GMX_SIMD_WIDTH_HERE >= 4
-#if GMX_SIMD_WIDTH_HERE == 4
+#if GMX_SIMD_REAL_WIDTH >= 4
+#if GMX_SIMD_REAL_WIDTH == 4
  static gmx_inline gmx_simd_ref_pr
  #else
  static gmx_inline gmx_mm_pr4
@@ -369,7 +369,7 @@ static gmx_inline gmx_mm_pr4
  gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
                           gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
  {
-#if GMX_SIMD_WIDTH_HERE == 4
+#if GMX_SIMD_REAL_WIDTH == 4
      gmx_simd_ref_pr sum;
  #else
      gmx_mm_pr4      sum;
@@ -381,7 +381,7 @@ gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
      sum.r[2] = 0;
      sum.r[3] = 0;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
          sum.r[0] += in0.r[i];
          sum.r[1] += in1.r[i];
@@ -403,8 +403,8 @@ static gmx_inline void
  gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
                     gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
  {
-    *out0 = gmx_invsqrt_pr(in0);
-    *out1 = gmx_invsqrt_pr(in1);
+    *out0 = gmx_simd_invsqrt_r(in0);
+    *out1 = gmx_simd_invsqrt_r(in1);
  }
  #endif
  
@@ -414,7 +414,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj,
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
          c6_S->r[i]  = nbfp[type[aj+i]*nbfp_stride];
          c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
@@ -429,12 +429,12 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
  {
      int i;
  
-    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
      {
          c6_S->r[i]                          = nbfp0[type[aj+i]*nbfp_stride];
-        c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i]  = nbfp1[type[aj+i]*nbfp_stride];
+        c6_S->r[GMX_SIMD_REAL_WIDTH/2 + i]  = nbfp1[type[aj+i]*nbfp_stride];
          c12_S->r[i]                         = nbfp0[type[aj+i]*nbfp_stride+1];
-        c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
+        c12_S->r[GMX_SIMD_REAL_WIDTH/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
      }
  }
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h

index f866758218515c49e1bc8bfba5a9cfdbb6ed5292..92be81d99f3b073c1ad201d2ab201e12ae047926 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -45,8 +45,8 @@
   *   energy group pair energy storage
   */
  
-typedef gmx_epi32 gmx_exclfilter;
-static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+typedef gmx_simd_int32_t gmx_exclfilter;
+static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  
  /* Transpose 2 double precision registers */
  static gmx_inline void
@@ -130,7 +130,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj,
   * AVX_256. */
  
  static gmx_inline void
-load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int gmx_unused *ti,
+load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S, int gmx_unused *ti,
               __m128d *ctab0_S, __m128d *ctab1_S)
  {
      int     idx[2];
@@ -150,7 +150,7 @@ load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int gmx_unused *ti,
  
  static gmx_inline void
  load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
-               gmx_epi32 ti_S, int gmx_unused *ti,
+               gmx_simd_int32_t ti_S, int gmx_unused *ti,
                 __m128d *ctab0_S, __m128d *ctab1_S, __m128d *ctabv_S)
  {
      int     idx[2];
@@ -186,7 +186,7 @@ gmx_load_exclusion_filter(const unsigned *i)
      return _mm_load_si128((__m128i *) i);
  }
  
-static gmx_inline gmx_mm_pb
+static gmx_inline gmx_simd_bool_t
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
      return gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h

index 02c6ca1a5b08b148e2f23cff7236288e63ed39f2..0571a6cd1f836ff9449f1cec5d2be0f21efb77a3 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -45,8 +45,8 @@
   *   energy group pair energy storage
   */
  
-typedef gmx_epi32 gmx_exclfilter;
-static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+typedef gmx_simd_int32_t gmx_exclfilter;
+static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  
  /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
  static gmx_inline void
@@ -95,7 +95,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj,
      for (p = 0; p < UNROLLJ; p++)
      {
          /* Here we load 4 aligned floats, but we need just 2 */
-        clj_S[p] = gmx_load_pr(nbfp+type[aj+p]*nbfp_stride);
+        clj_S[p] = gmx_simd_load_r(nbfp+type[aj+p]*nbfp_stride);
      }
      gmx_shuffle_4_ps_fil01_to_2_ps(clj_S[0], clj_S[1], clj_S[2], clj_S[3], c6_S, c12_S);
  }
@@ -116,7 +116,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj,
   * AVX_256. */
  
  static gmx_inline void
-load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int gmx_unused *ti,
+load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
               __m128 *ctab0_S, __m128 *ctab1_S)
  {
      int    idx[4];
@@ -139,7 +139,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int gmx_unused *ti,
  }
  
  static gmx_inline void
-load_table_f_v(const real *tab_coul_FDV0, gmx_epi32 ti_S, int gmx_unused *ti,
+load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
                 __m128 *ctab0_S, __m128 *ctab1_S, __m128 *ctabv_S)
  {
      int    idx[4];
@@ -175,7 +175,7 @@ gmx_load_exclusion_filter(const unsigned *i)
      return _mm_load_si128((__m128i *) i);
  }
  
-static gmx_inline gmx_mm_pb
+static gmx_inline gmx_simd_bool_t
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
      return gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h

index 97f25fa3d452df531b880cc4523a65365cd48be5..d5a013d412163b111d95e3b76598f56af660871f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -43,7 +43,7 @@
   *   energy group pair energy storage
   */
  
-typedef gmx_mm_pr gmx_exclfilter;
+typedef gmx_simd_real_t gmx_exclfilter;
  static const int filter_stride = 2;
  
  /* Transpose 2 double precision registers */
@@ -193,10 +193,10 @@ gmx_load1_exclfilter(int e)
  static gmx_inline gmx_exclfilter
  gmx_load_exclusion_filter(const unsigned *i)
  {
-    return gmx_load_pr((real *) (i));
+    return gmx_simd_load_r((real *) (i));
  }
  
-static gmx_inline gmx_mm_pb
+static gmx_inline gmx_simd_bool_t
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
      /* With <= 16 bits used the cast and conversion should not be
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h

index a8be0686089bf20aba5ece5c08942be703a386f0..2c6fac5ba95f867bd388350543a18a87a40e453a 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -43,7 +43,7 @@
   *   energy group pair energy storage
   */
  
-typedef gmx_mm_pr gmx_exclfilter;
+typedef gmx_simd_real_t gmx_exclfilter;
  static const int filter_stride = 1;
  
  /* The 4xn kernel operates on 4-wide i-force registers */
@@ -76,7 +76,7 @@ static const int filter_stride = 1;
  #define gmx_sum4_hpr                 gmx_mm256_sum4h_m128
  
  static gmx_inline void
-gmx_pr_to_2hpr(gmx_mm_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
  {
      *b = _mm256_extractf128_ps(a, 0);
      *c = _mm256_extractf128_ps(a, 1);
@@ -84,7 +84,7 @@ gmx_pr_to_2hpr(gmx_mm_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
  
  /* Store half width SIMD registers a and b in full width register *c */
  static gmx_inline void
-gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_pr *c)
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
  {
      *c = _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 0x1);
  }
@@ -217,7 +217,7 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
   * AVX_256. */
  
  static gmx_inline void
-load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
+load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
               __m256 *ctab0_S, __m256 *ctab1_S)
  {
      __m128 ctab_S[8], ctabt_S[4];
@@ -239,7 +239,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
  }
  
  static gmx_inline void
-load_table_f_v(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
+load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
                 __m256 *ctab0_S, __m256 *ctab1_S, __m256 *ctabv_S)
  {
      __m128 ctab_S[8], ctabt_S[4], ctabvt_S[2];
@@ -276,10 +276,10 @@ gmx_load1_exclfilter(int e)
  static gmx_inline gmx_exclfilter
  gmx_load_exclusion_filter(const unsigned *i)
  {
-    return gmx_load_pr((real *) (i));
+    return gmx_simd_load_r((real *) (i));
  }
  
-static gmx_inline gmx_mm_pb
+static gmx_inline gmx_simd_bool_t
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
      return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h

index c54c6ae79a819309931060f52ba327dac74fa1bf..e50d5a9d61d840020932e03ca7fba8d1e7bc67eb 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2013, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,8 +35,8 @@
  #ifndef _nbnxn_kernel_simd_utils_x86_mic_h_
  #define _nbnxn_kernel_simd_utils_x86_mic_h_
  
-typedef gmx_epi32      gmx_exclfilter;
-static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+typedef gmx_simd_int32_t      gmx_exclfilter;
+static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  
  #define nbfp_stride 2
  
@@ -152,14 +152,14 @@ gmx_2hpr_high_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c)
  }
  
  static gmx_inline void
-gmx_2hepi_to_epi(gmx_epi32 a, gmx_epi32 b, gmx_epi32 *c)
+gmx_2hepi_to_epi(gmx_simd_int32_t a, gmx_simd_int32_t b, gmx_simd_int32_t *c)
  {
      *c = _mm512_mask_permute4f128_epi32(a, mask_hih, b, PERM_LOW2HIGH);
  }
  
  /* recombine the 2 high half into c */
  static gmx_inline void
-gmx_2hepi_high_to_epi(gmx_epi32 a, gmx_epi32 b, gmx_epi32 *c)
+gmx_2hepi_high_to_epi(gmx_simd_int32_t a, gmx_simd_int32_t b, gmx_simd_int32_t *c)
  {
      *c = _mm512_mask_permute4f128_epi32(b, mask_loh, a, PERM_HIGH2LOW);
  }
@@ -178,7 +178,7 @@ prepare_table_load_buffer(const int *array)
     instead of low/high.
   */
  static gmx_inline void
-load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti,
+load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S, int *ti,
               gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S)
  {
      __m512i idx;
@@ -190,12 +190,12 @@ load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti,
  
      gmx_2hpr_to_pr(tmp1, tmp2, ctab0_S);
      gmx_2hpr_high_to_pr(tmp1, tmp2, ctab1_S);
-    *ctab1_S  = gmx_sub_pr(*ctab1_S, *ctab0_S);
+    *ctab1_S  = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
  }
  
  static gmx_inline void
  load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
-               gmx_epi32 ti_S, int *ti,
+               gmx_simd_int32_t ti_S, int *ti,
                 gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S,
                 gmx_mm_ps *ctabv_S)
  {
@@ -241,12 +241,12 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
  
  #define HAVE_GMX_SUM_SIMD
  static gmx_inline real
-gmx_sum_simd(gmx_mm_pr x, real* b)
+gmx_sum_simd(gmx_simd_real_t x, real* b)
  {
      return _mm512_reduce_add_ps(x);
  }
  static gmx_inline real
-gmx_sum_simd4(gmx_mm_pr x, real* b)
+gmx_sum_simd4(gmx_simd_real_t x, real* b)
  {
      return _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x);
  }
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c

index 0f5190a50bc34fa76071a2422889d9d3d78270c5..660c12cfc27b8f6f7a7cec3862569e3d6520431a 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -49,7 +49,7 @@
  #include "gromacs/simd/macros.h"
  #include "gromacs/simd/vector_operations.h"
  
-#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16)
+#if !(GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16)
  #error "unsupported SIMD width"
  #endif
  
@@ -159,7 +159,7 @@ reduce_group_energies(int ng, int ng_2log,
                        const real *VSvdw, const real *VSc,
                        real *Vvdw, real *Vc)
  {
-    const int unrollj      = GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE;
+    const int unrollj      = GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE;
      const int unrollj_half = unrollj/2;
      int       ng_p2, i, j, j0, j1, c, s;
  
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h

index 9d4c7c60b02319cb9448ae5cbf6f5d732dd3b766..6dbfb975024e27cd46d23dd8abb336859623b3e6 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h
@@ -44,19 +44,19 @@
  #endif
  
  #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE)
+#define UNROLLJ    (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE)
  
  /* The stride of all the atom data arrays is equal to half the SIMD width */
-#define STRIDE     (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE)
+#define STRIDE     (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE)
  
  #include "../nbnxn_kernel_simd_utils.h"
  
  static gmx_inline void
-gmx_load_simd_2xnn_interactions(int            excl,
-                                gmx_exclfilter filter_S0,
-                                gmx_exclfilter filter_S2,
-                                gmx_mm_pb     *interact_S0,
-                                gmx_mm_pb     *interact_S2)
+gmx_load_simd_2xnn_interactions(int                  excl,
+                                gmx_exclfilter       filter_S0,
+                                gmx_exclfilter       filter_S2,
+                                gmx_simd_bool_t     *interact_S0,
+                                gmx_simd_bool_t     *interact_S2)
  {
      /* Load integer interaction mask */
      gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h

index bbff9ed3681729a90ff1267606618a6f8ab76470..8e8eae7966f856faa41506e1cdded014d0a8c99e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -70,14 +70,14 @@
   * With gcc this is slower, except for RF on Sandy Bridge.
   * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
   */
-#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER))
  #define CUTOFF_BLENDV
  #endif
  /* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
   * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
   * Tested with icc 13.
   */
-#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
  #define CUTOFF_BLENDV
  #endif
  #endif
@@ -92,99 +92,99 @@
  
  #ifdef CHECK_EXCLS
      /* Interaction (non-exclusion) mask of all 1's or 0's */
-    gmx_mm_pb  interact_S0;
-    gmx_mm_pb  interact_S2;
+    gmx_simd_bool_t  interact_S0;
+    gmx_simd_bool_t  interact_S2;
  #endif
  
-    gmx_mm_pr  jx_S, jy_S, jz_S;
-    gmx_mm_pr  dx_S0, dy_S0, dz_S0;
-    gmx_mm_pr  dx_S2, dy_S2, dz_S2;
-    gmx_mm_pr  tx_S0, ty_S0, tz_S0;
-    gmx_mm_pr  tx_S2, ty_S2, tz_S2;
-    gmx_mm_pr  rsq_S0, rinv_S0, rinvsq_S0;
-    gmx_mm_pr  rsq_S2, rinv_S2, rinvsq_S2;
+    gmx_simd_real_t  jx_S, jy_S, jz_S;
+    gmx_simd_real_t  dx_S0, dy_S0, dz_S0;
+    gmx_simd_real_t  dx_S2, dy_S2, dz_S2;
+    gmx_simd_real_t  tx_S0, ty_S0, tz_S0;
+    gmx_simd_real_t  tx_S2, ty_S2, tz_S2;
+    gmx_simd_real_t  rsq_S0, rinv_S0, rinvsq_S0;
+    gmx_simd_real_t  rsq_S2, rinv_S2, rinvsq_S2;
  #ifndef CUTOFF_BLENDV
      /* wco: within cut-off, mask of all 1's or 0's */
-    gmx_mm_pb  wco_S0;
-    gmx_mm_pb  wco_S2;
+    gmx_simd_bool_t  wco_S0;
+    gmx_simd_bool_t  wco_S2;
  #endif
  #ifdef VDW_CUTOFF_CHECK
-    gmx_mm_pb  wco_vdw_S0;
+    gmx_simd_bool_t  wco_vdw_S0;
  #ifndef HALF_LJ
-    gmx_mm_pb  wco_vdw_S2;
+    gmx_simd_bool_t  wco_vdw_S2;
  #endif
  #endif
  #ifdef CALC_COULOMB
  #ifdef CHECK_EXCLS
      /* 1/r masked with the interaction mask */
-    gmx_mm_pr  rinv_ex_S0;
-    gmx_mm_pr  rinv_ex_S2;
+    gmx_simd_real_t  rinv_ex_S0;
+    gmx_simd_real_t  rinv_ex_S2;
  #endif
-    gmx_mm_pr  jq_S;
-    gmx_mm_pr  qq_S0;
-    gmx_mm_pr  qq_S2;
+    gmx_simd_real_t  jq_S;
+    gmx_simd_real_t  qq_S0;
+    gmx_simd_real_t  qq_S2;
  #ifdef CALC_COUL_TAB
      /* The force (PME mesh force) we need to subtract from 1/r^2 */
-    gmx_mm_pr  fsub_S0;
-    gmx_mm_pr  fsub_S2;
+    gmx_simd_real_t  fsub_S0;
+    gmx_simd_real_t  fsub_S2;
  #endif
  #ifdef CALC_COUL_EWALD
-    gmx_mm_pr  brsq_S0, brsq_S2;
-    gmx_mm_pr  ewcorr_S0, ewcorr_S2;
+    gmx_simd_real_t  brsq_S0, brsq_S2;
+    gmx_simd_real_t  ewcorr_S0, ewcorr_S2;
  #endif
  
      /* frcoul = (1/r - fsub)*r */
-    gmx_mm_pr  frcoul_S0;
-    gmx_mm_pr  frcoul_S2;
+    gmx_simd_real_t  frcoul_S0;
+    gmx_simd_real_t  frcoul_S2;
  #ifdef CALC_COUL_TAB
      /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
-    gmx_mm_pr  r_S0, rs_S0, rf_S0, frac_S0;
-    gmx_mm_pr  r_S2, rs_S2, rf_S2, frac_S2;
+    gmx_simd_real_t         r_S0, rs_S0, rf_S0, frac_S0;
+    gmx_simd_real_t         r_S2, rs_S2, rf_S2, frac_S2;
      /* Table index: rs truncated to an int */
-    gmx_epi32  ti_S0, ti_S2;
+    gmx_simd_int32_t        ti_S0, ti_S2;
      /* Linear force table values */
-    gmx_mm_pr  ctab0_S0, ctab1_S0;
-    gmx_mm_pr  ctab0_S2, ctab1_S2;
+    gmx_simd_real_t         ctab0_S0, ctab1_S0;
+    gmx_simd_real_t         ctab0_S2, ctab1_S2;
  #ifdef CALC_ENERGIES
      /* Quadratic energy table value */
-    gmx_mm_pr  ctabv_S0;
-    gmx_mm_pr  ctabv_S2;
+    gmx_simd_real_t  ctabv_S0;
+    gmx_simd_real_t  ctabv_S2;
  #endif
  #endif
  #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
      /* The potential (PME mesh) we need to subtract from 1/r */
-    gmx_mm_pr  vc_sub_S0;
-    gmx_mm_pr  vc_sub_S2;
+    gmx_simd_real_t  vc_sub_S0;
+    gmx_simd_real_t  vc_sub_S2;
  #endif
  #ifdef CALC_ENERGIES
      /* Electrostatic potential */
-    gmx_mm_pr  vcoul_S0;
-    gmx_mm_pr  vcoul_S2;
+    gmx_simd_real_t  vcoul_S0;
+    gmx_simd_real_t  vcoul_S2;
  #endif
  #endif
      /* The force times 1/r */
-    gmx_mm_pr  fscal_S0;
-    gmx_mm_pr  fscal_S2;
+    gmx_simd_real_t  fscal_S0;
+    gmx_simd_real_t  fscal_S2;
  
  #ifdef CALC_LJ
  #ifdef LJ_COMB_LB
      /* LJ sigma_j/2 and sqrt(epsilon_j) */
-    gmx_mm_pr  hsig_j_S, seps_j_S;
+    gmx_simd_real_t  hsig_j_S, seps_j_S;
      /* LJ sigma_ij and epsilon_ij */
-    gmx_mm_pr  sig_S0, eps_S0;
+    gmx_simd_real_t  sig_S0, eps_S0;
  #ifndef HALF_LJ
-    gmx_mm_pr  sig_S2, eps_S2;
+    gmx_simd_real_t  sig_S2, eps_S2;
  #endif
  #ifdef CALC_ENERGIES
-    gmx_mm_pr  sig2_S0, sig6_S0;
+    gmx_simd_real_t  sig2_S0, sig6_S0;
  #ifndef HALF_LJ
-    gmx_mm_pr  sig2_S2, sig6_S2;
+    gmx_simd_real_t  sig2_S2, sig6_S2;
  #endif
  #endif /* LJ_COMB_LB */
  #endif /* CALC_LJ */
  
  #ifdef LJ_COMB_GEOM
-    gmx_mm_pr  c6s_j_S, c12s_j_S;
+    gmx_simd_real_t  c6s_j_S, c12s_j_S;
  #endif
  
  #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
@@ -194,34 +194,34 @@
  
  #ifndef FIX_LJ_C
      /* LJ C6 and C12 parameters, used with geometric comb. rule */
-    gmx_mm_pr  c6_S0, c12_S0;
+    gmx_simd_real_t  c6_S0, c12_S0;
  #ifndef HALF_LJ
-    gmx_mm_pr  c6_S2, c12_S2;
+    gmx_simd_real_t  c6_S2, c12_S2;
  #endif
  #endif
  
      /* Intermediate variables for LJ calculation */
  #ifndef LJ_COMB_LB
-    gmx_mm_pr  rinvsix_S0;
+    gmx_simd_real_t  rinvsix_S0;
  #ifndef HALF_LJ
-    gmx_mm_pr  rinvsix_S2;
+    gmx_simd_real_t  rinvsix_S2;
  #endif
  #endif
  #ifdef LJ_COMB_LB
-    gmx_mm_pr  sir_S0, sir2_S0, sir6_S0;
+    gmx_simd_real_t  sir_S0, sir2_S0, sir6_S0;
  #ifndef HALF_LJ
-    gmx_mm_pr  sir_S2, sir2_S2, sir6_S2;
+    gmx_simd_real_t  sir_S2, sir2_S2, sir6_S2;
  #endif
  #endif
  
-    gmx_mm_pr  FrLJ6_S0, FrLJ12_S0;
+    gmx_simd_real_t  FrLJ6_S0, FrLJ12_S0;
  #ifndef HALF_LJ
-    gmx_mm_pr  FrLJ6_S2, FrLJ12_S2;
+    gmx_simd_real_t  FrLJ6_S2, FrLJ12_S2;
  #endif
  #ifdef CALC_ENERGIES
-    gmx_mm_pr  VLJ6_S0, VLJ12_S0, VLJ_S0;
+    gmx_simd_real_t  VLJ6_S0, VLJ12_S0, VLJ_S0;
  #ifndef HALF_LJ
-    gmx_mm_pr  VLJ6_S2, VLJ12_S2, VLJ_S2;
+    gmx_simd_real_t  VLJ6_S2, VLJ12_S2, VLJ_S2;
  #endif
  #endif
  #endif /* CALC_LJ */
@@ -260,20 +260,20 @@
      gmx_loaddh_pr(&jz_S, x+ajz);
  
      /* Calculate distance */
-    dx_S0       = gmx_sub_pr(ix_S0, jx_S);
-    dy_S0       = gmx_sub_pr(iy_S0, jy_S);
-    dz_S0       = gmx_sub_pr(iz_S0, jz_S);
-    dx_S2       = gmx_sub_pr(ix_S2, jx_S);
-    dy_S2       = gmx_sub_pr(iy_S2, jy_S);
-    dz_S2       = gmx_sub_pr(iz_S2, jz_S);
+    dx_S0       = gmx_simd_sub_r(ix_S0, jx_S);
+    dy_S0       = gmx_simd_sub_r(iy_S0, jy_S);
+    dz_S0       = gmx_simd_sub_r(iz_S0, jz_S);
+    dx_S2       = gmx_simd_sub_r(ix_S2, jx_S);
+    dy_S2       = gmx_simd_sub_r(iy_S2, jy_S);
+    dz_S2       = gmx_simd_sub_r(iz_S2, jz_S);
  
      /* rsq = dx*dx+dy*dy+dz*dz */
-    rsq_S0      = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
-    rsq_S2      = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+    rsq_S0      = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+    rsq_S2      = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
  
  #ifndef CUTOFF_BLENDV
-    wco_S0      = gmx_cmplt_pr(rsq_S0, rc2_S);
-    wco_S2      = gmx_cmplt_pr(rsq_S2, rc2_S);
+    wco_S0      = gmx_simd_cmplt_r(rsq_S0, rc2_S);
+    wco_S2      = gmx_simd_cmplt_r(rsq_S2, rc2_S);
  #endif
  
  #ifdef CHECK_EXCLS
@@ -282,20 +282,20 @@
  #if UNROLLJ == UNROLLI
      if (cj == ci_sh)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask_S0);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask_S2);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask_S0);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask_S2);
      }
  #else
  #if UNROLLJ == 2*UNROLLI
      if (cj*2 == ci_sh)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask0_S0);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask0_S2);
      }
      else if (cj*2 + 1 == ci_sh)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask1_S0);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask1_S2);
      }
  #else
  #error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
@@ -303,19 +303,19 @@
  #endif
  #else /* EXCL_FORCES */
        /* No exclusion forces: remove all excluded atom pairs from the list */
-    wco_S0      = gmx_and_pb(wco_S0, interact_S0);
-    wco_S2      = gmx_and_pb(wco_S2, interact_S2);
+    wco_S0      = gmx_simd_and_b(wco_S0, interact_S0);
+    wco_S2      = gmx_simd_and_b(wco_S2, interact_S2);
  #endif
  #endif
  
  #ifdef COUNT_PAIRS
      {
          int  i, j;
-        real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
-        tmp = gmx_simd_align_real(tmpa);
+        real tmpa[2*GMX_SIMD_REAL_WIDTH], *tmp;
+        tmp = gmx_simd_align_r(tmpa);
          for (i = 0; i < UNROLLI; i += 2)
          {
-            gmx_store_pr(tmp, i == 0 ? wco_S0 : wco_S2);
+            gmx_simd_store_r(tmp, i == 0 ? wco_S0 : wco_S2);
              for (j = 0; j < 2*UNROLLJ; j++)
              {
                  if (!(tmp[j] == 0))
@@ -334,14 +334,14 @@
  #endif
  
      /* Calculate 1/r */
-    rinv_S0     = gmx_invsqrt_pr(rsq_S0);
-    rinv_S2     = gmx_invsqrt_pr(rsq_S2);
+    rinv_S0     = gmx_simd_invsqrt_r(rsq_S0);
+    rinv_S2     = gmx_simd_invsqrt_r(rsq_S2);
  
  #ifdef CALC_COULOMB
      /* Load parameters for j atom */
      gmx_loaddh_pr(&jq_S, q+aj);
-    qq_S0       = gmx_mul_pr(iq_S0, jq_S);
-    qq_S2       = gmx_mul_pr(iq_S2, jq_S);
+    qq_S0       = gmx_simd_mul_r(iq_S0, jq_S);
+    qq_S2       = gmx_simd_mul_r(iq_S2, jq_S);
  #endif
  
  #ifdef CALC_LJ
@@ -356,13 +356,13 @@
  #ifdef LJ_COMB_GEOM
      gmx_loaddh_pr(&c6s_j_S,  ljc+aj2+0);
      gmx_loaddh_pr(&c12s_j_S, ljc+aj2+STRIDE);
-    c6_S0       = gmx_mul_pr(c6s_S0, c6s_j_S );
+    c6_S0       = gmx_simd_mul_r(c6s_S0, c6s_j_S );
  #ifndef HALF_LJ
-    c6_S2       = gmx_mul_pr(c6s_S2, c6s_j_S );
+    c6_S2       = gmx_simd_mul_r(c6s_S2, c6s_j_S );
  #endif
-    c12_S0      = gmx_mul_pr(c12s_S0, c12s_j_S);
+    c12_S0      = gmx_simd_mul_r(c12s_S0, c12s_j_S);
  #ifndef HALF_LJ
-    c12_S2      = gmx_mul_pr(c12s_S2, c12s_j_S);
+    c12_S2      = gmx_simd_mul_r(c12s_S2, c12s_j_S);
  #endif
  #endif /* LJ_COMB_GEOM */
  
@@ -370,27 +370,27 @@
      gmx_loaddh_pr(&hsig_j_S, ljc+aj2+0);
      gmx_loaddh_pr(&seps_j_S, ljc+aj2+STRIDE);
  
-    sig_S0      = gmx_add_pr(hsig_i_S0, hsig_j_S);
-    eps_S0      = gmx_mul_pr(seps_i_S0, seps_j_S);
+    sig_S0      = gmx_simd_add_r(hsig_i_S0, hsig_j_S);
+    eps_S0      = gmx_simd_mul_r(seps_i_S0, seps_j_S);
  #ifndef HALF_LJ
-    sig_S2      = gmx_add_pr(hsig_i_S2, hsig_j_S);
-    eps_S2      = gmx_mul_pr(seps_i_S2, seps_j_S);
+    sig_S2      = gmx_simd_add_r(hsig_i_S2, hsig_j_S);
+    eps_S2      = gmx_simd_mul_r(seps_i_S2, seps_j_S);
  #endif
  #endif /* LJ_COMB_LB */
  
  #endif /* CALC_LJ */
  
  #ifndef CUTOFF_BLENDV
-    rinv_S0     = gmx_blendzero_pr(rinv_S0, wco_S0);
-    rinv_S2     = gmx_blendzero_pr(rinv_S2, wco_S2);
+    rinv_S0     = gmx_simd_blendzero_r(rinv_S0, wco_S0);
+    rinv_S2     = gmx_simd_blendzero_r(rinv_S2, wco_S2);
  #else
      /* We only need to mask for the cut-off: blendv is faster */
-    rinv_S0     = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
-    rinv_S2     = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
+    rinv_S0     = gmx_simd_blendv_r(rinv_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0));
+    rinv_S2     = gmx_simd_blendv_r(rinv_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2));
  #endif
  
-    rinvsq_S0   = gmx_mul_pr(rinv_S0, rinv_S0);
-    rinvsq_S2   = gmx_mul_pr(rinv_S2, rinv_S2);
+    rinvsq_S0   = gmx_simd_mul_r(rinv_S0, rinv_S0);
+    rinvsq_S2   = gmx_simd_mul_r(rinv_S2, rinv_S2);
  
  #ifdef CALC_COULOMB
      /* Note that here we calculate force*r, not the usual force/r.
@@ -401,8 +401,8 @@
  
  #ifdef EXCL_FORCES
      /* Only add 1/r for non-excluded atom pairs */
-    rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, interact_S0);
-    rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, interact_S2);
+    rinv_ex_S0  = gmx_simd_blendzero_r(rinv_S0, interact_S0);
+    rinv_ex_S2  = gmx_simd_blendzero_r(rinv_S2, interact_S2);
  #else
      /* No exclusion forces, we always need 1/r */
  #define     rinv_ex_S0    rinv_S0
@@ -411,12 +411,12 @@
  
  #ifdef CALC_COUL_RF
      /* Electrostatic interactions */
-    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
-    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
+    frcoul_S0   = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(rsq_S0, mrc_3_S, rinv_ex_S0));
+    frcoul_S2   = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(rsq_S2, mrc_3_S, rinv_ex_S2));
  
  #ifdef CALC_ENERGIES
-    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
-    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
+    vcoul_S0    = gmx_simd_mul_r(qq_S0, gmx_simd_add_r(rinv_ex_S0, gmx_simd_add_r(gmx_simd_mul_r(rsq_S0, hrc_3_S), moh_rc_S)));
+    vcoul_S2    = gmx_simd_mul_r(qq_S2, gmx_simd_add_r(rinv_ex_S2, gmx_simd_add_r(gmx_simd_mul_r(rsq_S2, hrc_3_S), moh_rc_S)));
  #endif
  #endif
  
@@ -425,44 +425,44 @@
       * as large distances can cause an overflow in gmx_pmecorrF/V.
       */
  #ifndef CUTOFF_BLENDV
-    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
-    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
+    brsq_S0     = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S0, wco_S0));
+    brsq_S2     = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S2, wco_S2));
  #else
      /* Strangely, putting mul on a separate line is slower (icc 13) */
-    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
-    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
+    brsq_S0     = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0)));
+    brsq_S2     = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2)));
  #endif
-    ewcorr_S0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
-    ewcorr_S2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
-    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
-    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
+    ewcorr_S0   = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S0), beta_S);
+    ewcorr_S2   = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S2), beta_S);
+    frcoul_S0   = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(ewcorr_S0, brsq_S0, rinv_ex_S0));
+    frcoul_S2   = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(ewcorr_S2, brsq_S2, rinv_ex_S2));
  
  #ifdef CALC_ENERGIES
-    vc_sub_S0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
-    vc_sub_S2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
+    vc_sub_S0   = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S0), beta_S);
+    vc_sub_S2   = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S2), beta_S);
  #endif
  
  #endif /* CALC_COUL_EWALD */
  
  #ifdef CALC_COUL_TAB
      /* Electrostatic interactions */
-    r_S0        = gmx_mul_pr(rsq_S0, rinv_S0);
-    r_S2        = gmx_mul_pr(rsq_S2, rinv_S2);
+    r_S0        = gmx_simd_mul_r(rsq_S0, rinv_S0);
+    r_S2        = gmx_simd_mul_r(rsq_S2, rinv_S2);
      /* Convert r to scaled table units */
-    rs_S0       = gmx_mul_pr(r_S0, invtsp_S);
-    rs_S2       = gmx_mul_pr(r_S2, invtsp_S);
+    rs_S0       = gmx_simd_mul_r(r_S0, invtsp_S);
+    rs_S2       = gmx_simd_mul_r(r_S2, invtsp_S);
      /* Truncate scaled r to an int */
-    ti_S0       = gmx_cvttpr_epi32(rs_S0);
-    ti_S2       = gmx_cvttpr_epi32(rs_S2);
+    ti_S0       = gmx_simd_cvtt_r2i(rs_S0);
+    ti_S2       = gmx_simd_cvtt_r2i(rs_S2);
  #ifdef GMX_SIMD_HAVE_FLOOR
-    rf_S0       = gmx_floor_pr(rs_S0);
-    rf_S2       = gmx_floor_pr(rs_S2);
+    rf_S0       = gmx_simd_floor_r(rs_S0);
+    rf_S2       = gmx_simd_floor_r(rs_S2);
  #else
-    rf_S0       = gmx_cvtepi32_pr(ti_S0);
-    rf_S2       = gmx_cvtepi32_pr(ti_S2);
+    rf_S0       = gmx_simd_cvt_i2r(ti_S0);
+    rf_S2       = gmx_simd_cvt_i2r(ti_S2);
  #endif
-    frac_S0     = gmx_sub_pr(rs_S0, rf_S0);
-    frac_S2     = gmx_sub_pr(rs_S2, rf_S2);
+    frac_S0     = gmx_simd_sub_r(rs_S0, rf_S0);
+    frac_S2     = gmx_simd_sub_r(rs_S2, rf_S2);
  
      /* Load and interpolate table forces and possibly energies.
       * Force and energy can be combined in one table, stride 4: FDV0
@@ -481,14 +481,14 @@
      load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
  #endif
  #endif
-    fsub_S0     = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
-    fsub_S2     = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
-    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
-    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
+    fsub_S0     = gmx_simd_add_r(ctab0_S0, gmx_simd_mul_r(frac_S0, ctab1_S0));
+    fsub_S2     = gmx_simd_add_r(ctab0_S2, gmx_simd_mul_r(frac_S2, ctab1_S2));
+    frcoul_S0   = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, gmx_simd_mul_r(fsub_S0, r_S0)));
+    frcoul_S2   = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, gmx_simd_mul_r(fsub_S2, r_S2)));
  
  #ifdef CALC_ENERGIES
-    vc_sub_S0   = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
-    vc_sub_S2   = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
+    vc_sub_S0   = gmx_simd_add_r(ctabv_S0, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S0), gmx_simd_add_r(ctab0_S0, fsub_S0)));
+    vc_sub_S2   = gmx_simd_add_r(ctabv_S2, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S2), gmx_simd_add_r(ctab0_S2, fsub_S2)));
  #endif
  #endif /* CALC_COUL_TAB */
  
@@ -496,22 +496,22 @@
  #ifndef NO_SHIFT_EWALD
      /* Add Ewald potential shift to vc_sub for convenience */
  #ifdef CHECK_EXCLS
-    vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
-    vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
+    vc_sub_S0   = gmx_simd_add_r(vc_sub_S0, gmx_simd_blendzero_r(sh_ewald_S, interact_S0));
+    vc_sub_S2   = gmx_simd_add_r(vc_sub_S2, gmx_simd_blendzero_r(sh_ewald_S, interact_S2));
  #else
-    vc_sub_S0   = gmx_add_pr(vc_sub_S0, sh_ewald_S);
-    vc_sub_S2   = gmx_add_pr(vc_sub_S2, sh_ewald_S);
+    vc_sub_S0   = gmx_simd_add_r(vc_sub_S0, sh_ewald_S);
+    vc_sub_S2   = gmx_simd_add_r(vc_sub_S2, sh_ewald_S);
  #endif
  #endif
  
-    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
-    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
+    vcoul_S0    = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, vc_sub_S0));
+    vcoul_S2    = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, vc_sub_S2));
  #endif
  
  #ifdef CALC_ENERGIES
      /* Mask energy for cut-off and diagonal */
-    vcoul_S0    = gmx_blendzero_pr(vcoul_S0, wco_S0);
-    vcoul_S2    = gmx_blendzero_pr(vcoul_S2, wco_S2);
+    vcoul_S0    = gmx_simd_blendzero_r(vcoul_S0, wco_S0);
+    vcoul_S2    = gmx_simd_blendzero_r(vcoul_S2, wco_S2);
  #endif
  
  #endif /* CALC_COULOMB */
@@ -520,9 +520,9 @@
      /* Lennard-Jones interaction */
  
  #ifdef VDW_CUTOFF_CHECK
-    wco_vdw_S0  = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
+    wco_vdw_S0  = gmx_simd_cmplt_r(rsq_S0, rcvdw2_S);
  #ifndef HALF_LJ
-    wco_vdw_S2  = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
+    wco_vdw_S2  = gmx_simd_cmplt_r(rsq_S2, rcvdw2_S);
  #endif
  #else
      /* Same cut-off for Coulomb and VdW, reuse the registers */
@@ -531,82 +531,82 @@
  #endif
  
  #ifndef LJ_COMB_LB
-    rinvsix_S0  = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
+    rinvsix_S0  = gmx_simd_mul_r(rinvsq_S0, gmx_simd_mul_r(rinvsq_S0, rinvsq_S0));
  #ifdef EXCL_FORCES
-    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, interact_S0);
+    rinvsix_S0  = gmx_simd_blendzero_r(rinvsix_S0, interact_S0);
  #endif
  #ifndef HALF_LJ
-    rinvsix_S2  = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
+    rinvsix_S2  = gmx_simd_mul_r(rinvsq_S2, gmx_simd_mul_r(rinvsq_S2, rinvsq_S2));
  #ifdef EXCL_FORCES
-    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, interact_S2);
+    rinvsix_S2  = gmx_simd_blendzero_r(rinvsix_S2, interact_S2);
  #endif
  #endif
  #ifdef VDW_CUTOFF_CHECK
-    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
+    rinvsix_S0  = gmx_simd_blendzero_r(rinvsix_S0, wco_vdw_S0);
  #ifndef HALF_LJ
-    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
+    rinvsix_S2  = gmx_simd_blendzero_r(rinvsix_S2, wco_vdw_S2);
  #endif
  #endif
-    FrLJ6_S0    = gmx_mul_pr(c6_S0, rinvsix_S0);
+    FrLJ6_S0    = gmx_simd_mul_r(c6_S0, rinvsix_S0);
  #ifndef HALF_LJ
-    FrLJ6_S2    = gmx_mul_pr(c6_S2, rinvsix_S2);
+    FrLJ6_S2    = gmx_simd_mul_r(c6_S2, rinvsix_S2);
  #endif
-    FrLJ12_S0   = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
+    FrLJ12_S0   = gmx_simd_mul_r(c12_S0, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0));
  #ifndef HALF_LJ
-    FrLJ12_S2   = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
+    FrLJ12_S2   = gmx_simd_mul_r(c12_S2, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2));
  #endif
  #endif /* not LJ_COMB_LB */
  
  #ifdef LJ_COMB_LB
-    sir_S0      = gmx_mul_pr(sig_S0, rinv_S0);
+    sir_S0      = gmx_simd_mul_r(sig_S0, rinv_S0);
  #ifndef HALF_LJ
-    sir_S2      = gmx_mul_pr(sig_S2, rinv_S2);
+    sir_S2      = gmx_simd_mul_r(sig_S2, rinv_S2);
  #endif
-    sir2_S0     = gmx_mul_pr(sir_S0, sir_S0);
+    sir2_S0     = gmx_simd_mul_r(sir_S0, sir_S0);
  #ifndef HALF_LJ
-    sir2_S2     = gmx_mul_pr(sir_S2, sir_S2);
+    sir2_S2     = gmx_simd_mul_r(sir_S2, sir_S2);
  #endif
-    sir6_S0     = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
+    sir6_S0     = gmx_simd_mul_r(sir2_S0, gmx_simd_mul_r(sir2_S0, sir2_S0));
  #ifdef EXCL_FORCES
-    sir6_S0     = gmx_blendzero_pr(sir6_S0, interact_S0);
+    sir6_S0     = gmx_simd_blendzero_r(sir6_S0, interact_S0);
  #endif
  #ifndef HALF_LJ
-    sir6_S2     = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
+    sir6_S2     = gmx_simd_mul_r(sir2_S2, gmx_simd_mul_r(sir2_S2, sir2_S2));
  #ifdef EXCL_FORCES
-    sir6_S2     = gmx_blendzero_pr(sir6_S2, interact_S2);
+    sir6_S2     = gmx_simd_blendzero_r(sir6_S2, interact_S2);
  #endif
  #endif
  #ifdef VDW_CUTOFF_CHECK
-    sir6_S0     = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
+    sir6_S0     = gmx_simd_blendzero_r(sir6_S0, wco_vdw_S0);
  #ifndef HALF_LJ
-    sir6_S2     = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
+    sir6_S2     = gmx_simd_blendzero_r(sir6_S2, wco_vdw_S2);
  #endif
  #endif
-    FrLJ6_S0    = gmx_mul_pr(eps_S0, sir6_S0);
+    FrLJ6_S0    = gmx_simd_mul_r(eps_S0, sir6_S0);
  #ifndef HALF_LJ
-    FrLJ6_S2    = gmx_mul_pr(eps_S2, sir6_S2);
+    FrLJ6_S2    = gmx_simd_mul_r(eps_S2, sir6_S2);
  #endif
-    FrLJ12_S0   = gmx_mul_pr(FrLJ6_S0, sir6_S0);
+    FrLJ12_S0   = gmx_simd_mul_r(FrLJ6_S0, sir6_S0);
  #ifndef HALF_LJ
-    FrLJ12_S2   = gmx_mul_pr(FrLJ6_S2, sir6_S2);
+    FrLJ12_S2   = gmx_simd_mul_r(FrLJ6_S2, sir6_S2);
  #endif
  #if defined CALC_ENERGIES
      /* We need C6 and C12 to calculate the LJ potential shift */
-    sig2_S0     = gmx_mul_pr(sig_S0, sig_S0);
+    sig2_S0     = gmx_simd_mul_r(sig_S0, sig_S0);
  #ifndef HALF_LJ
-    sig2_S2     = gmx_mul_pr(sig_S2, sig_S2);
+    sig2_S2     = gmx_simd_mul_r(sig_S2, sig_S2);
  #endif
-    sig6_S0     = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
+    sig6_S0     = gmx_simd_mul_r(sig2_S0, gmx_simd_mul_r(sig2_S0, sig2_S0));
  #ifndef HALF_LJ
-    sig6_S2     = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
+    sig6_S2     = gmx_simd_mul_r(sig2_S2, gmx_simd_mul_r(sig2_S2, sig2_S2));
  #endif
-    c6_S0       = gmx_mul_pr(eps_S0, sig6_S0);
+    c6_S0       = gmx_simd_mul_r(eps_S0, sig6_S0);
  #ifndef HALF_LJ
-    c6_S2       = gmx_mul_pr(eps_S2, sig6_S2);
+    c6_S2       = gmx_simd_mul_r(eps_S2, sig6_S2);
  #endif
-    c12_S0      = gmx_mul_pr(c6_S0, sig6_S0);
+    c12_S0      = gmx_simd_mul_r(c6_S0, sig6_S0);
  #ifndef HALF_LJ
-    c12_S2      = gmx_mul_pr(c6_S2, sig6_S2);
+    c12_S2      = gmx_simd_mul_r(c6_S2, sig6_S2);
  #endif
  #endif
  #endif /* LJ_COMB_LB */
@@ -642,7 +642,7 @@
  
  #ifdef CALC_COULOMB
  #ifndef ENERGY_GROUPS
-    vctot_S      = gmx_add_pr(vctot_S, gmx_add_pr(vcoul_S0, vcoul_S2));
+    vctot_S      = gmx_simd_add_r(vctot_S, gmx_simd_add_r(vcoul_S0, vcoul_S2));
  #else
      add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
      add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
@@ -651,39 +651,39 @@
  
  #ifdef CALC_LJ
      /* Calculate the LJ energies */
-    VLJ6_S0     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
+    VLJ6_S0     = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S0, gmx_simd_mul_r(c6_S0, sh_invrc6_S)));
  #ifndef HALF_LJ
-    VLJ6_S2     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
+    VLJ6_S2     = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S2, gmx_simd_mul_r(c6_S2, sh_invrc6_S)));
  #endif
-    VLJ12_S0    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
+    VLJ12_S0    = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S0, gmx_simd_mul_r(c12_S0, sh_invrc12_S)));
  #ifndef HALF_LJ
-    VLJ12_S2    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
+    VLJ12_S2    = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S2, gmx_simd_mul_r(c12_S2, sh_invrc12_S)));
  #endif
  
-    VLJ_S0      = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
+    VLJ_S0      = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0);
  #ifndef HALF_LJ
-    VLJ_S2      = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
+    VLJ_S2      = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2);
  #endif
      /* The potential shift should be removed for pairs beyond cut-off */
-    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
+    VLJ_S0      = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0);
  #ifndef HALF_LJ
-    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
+    VLJ_S2      = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2);
  #endif
  #ifdef CHECK_EXCLS
      /* The potential shift should be removed for excluded pairs */
-    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, interact_S0);
+    VLJ_S0      = gmx_simd_blendzero_r(VLJ_S0, interact_S0);
  #ifndef HALF_LJ
-    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, interact_S2);
+    VLJ_S2      = gmx_simd_blendzero_r(VLJ_S2, interact_S2);
  #endif
  #endif
  #ifndef ENERGY_GROUPS
-    Vvdwtot_S    = gmx_add_pr(Vvdwtot_S,
+    Vvdwtot_S    = gmx_simd_add_r(Vvdwtot_S,
  #ifndef HALF_LJ
-                              gmx_add_pr(VLJ_S0, VLJ_S2)
+                                  gmx_simd_add_r(VLJ_S0, VLJ_S2)
  #else
-                              VLJ_S0
+                                  VLJ_S0
  #endif
-                              );
+                                  );
  #else
      add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
  #ifndef HALF_LJ
@@ -695,47 +695,47 @@
  
  #ifdef CALC_LJ
  #ifdef CALC_COULOMB
-    fscal_S0    = gmx_mul_pr(rinvsq_S0,
-                             gmx_add_pr(frcoul_S0,
-                                        gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+    fscal_S0    = gmx_simd_mul_r(rinvsq_S0,
+                                 gmx_simd_add_r(frcoul_S0,
+                                                gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0)));
  #else
-    fscal_S0    = gmx_mul_pr(rinvsq_S0,
-                             (
-                                 gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+    fscal_S0    = gmx_simd_mul_r(rinvsq_S0,
+                                 (
+                                     gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0)));
  #endif
  #else
-    fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
+    fscal_S0    = gmx_simd_mul_r(rinvsq_S0, frcoul_S0);
  #endif /* CALC_LJ */
  #if defined CALC_LJ && !defined HALF_LJ
  #ifdef CALC_COULOMB
-    fscal_S2    = gmx_mul_pr(rinvsq_S2,
-                             gmx_add_pr(frcoul_S2,
-                                        gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+    fscal_S2    = gmx_simd_mul_r(rinvsq_S2,
+                                 gmx_simd_add_r(frcoul_S2,
+                                                gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2)));
  #else
-    fscal_S2    = gmx_mul_pr(rinvsq_S2,
-                             (
-                                 gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+    fscal_S2    = gmx_simd_mul_r(rinvsq_S2,
+                                 (
+                                     gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2)));
  #endif
  #else
      /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
-    fscal_S2    = gmx_mul_pr(rinvsq_S2, frcoul_S2);
+    fscal_S2    = gmx_simd_mul_r(rinvsq_S2, frcoul_S2);
  #endif
  
      /* Calculate temporary vectorial force */
-    tx_S0       = gmx_mul_pr(fscal_S0, dx_S0);
-    tx_S2       = gmx_mul_pr(fscal_S2, dx_S2);
-    ty_S0       = gmx_mul_pr(fscal_S0, dy_S0);
-    ty_S2       = gmx_mul_pr(fscal_S2, dy_S2);
-    tz_S0       = gmx_mul_pr(fscal_S0, dz_S0);
-    tz_S2       = gmx_mul_pr(fscal_S2, dz_S2);
+    tx_S0       = gmx_simd_mul_r(fscal_S0, dx_S0);
+    tx_S2       = gmx_simd_mul_r(fscal_S2, dx_S2);
+    ty_S0       = gmx_simd_mul_r(fscal_S0, dy_S0);
+    ty_S2       = gmx_simd_mul_r(fscal_S2, dy_S2);
+    tz_S0       = gmx_simd_mul_r(fscal_S0, dz_S0);
+    tz_S2       = gmx_simd_mul_r(fscal_S2, dz_S2);
  
      /* Increment i atom force */
-    fix_S0      = gmx_add_pr(fix_S0, tx_S0);
-    fix_S2      = gmx_add_pr(fix_S2, tx_S2);
-    fiy_S0      = gmx_add_pr(fiy_S0, ty_S0);
-    fiy_S2      = gmx_add_pr(fiy_S2, ty_S2);
-    fiz_S0      = gmx_add_pr(fiz_S0, tz_S0);
-    fiz_S2      = gmx_add_pr(fiz_S2, tz_S2);
+    fix_S0      = gmx_simd_add_r(fix_S0, tx_S0);
+    fix_S2      = gmx_simd_add_r(fix_S2, tx_S2);
+    fiy_S0      = gmx_simd_add_r(fiy_S0, ty_S0);
+    fiy_S2      = gmx_simd_add_r(fiy_S2, ty_S2);
+    fiz_S0      = gmx_simd_add_r(fiz_S0, tz_S0);
+    fiz_S2      = gmx_simd_add_r(fiz_S2, tz_S2);
  
      /* Decrement j atom force */
      gmx_load_hpr(&fjx_S, f+ajx);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h

index eb7409906c7f13bf02cccf98ebc54826fa49757d..0c3c7a9cadbf990beea8bfcd0a1437541194787d 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -89,101 +89,103 @@
      real       *vctp[UNROLLI];
  #endif
  
-    gmx_mm_pr  shX_S;
-    gmx_mm_pr  shY_S;
-    gmx_mm_pr  shZ_S;
-    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
-    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
-    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
-    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
+    gmx_simd_real_t  shX_S;
+    gmx_simd_real_t  shY_S;
+    gmx_simd_real_t  shZ_S;
+    gmx_simd_real_t  ix_S0, iy_S0, iz_S0;
+    gmx_simd_real_t  ix_S2, iy_S2, iz_S2;
+    gmx_simd_real_t  fix_S0, fiy_S0, fiz_S0;
+    gmx_simd_real_t  fix_S2, fiy_S2, fiz_S2;
      /* We use an i-force SIMD register width of 4 */
      /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
-    gmx_mm_pr4 fix_S, fiy_S, fiz_S;
+    gmx_mm_pr4       fix_S, fiy_S, fiz_S;
  
-    gmx_mm_pr  diagonal_jmi_S;
+    gmx_simd_real_t  diagonal_jmi_S;
  #if UNROLLI == UNROLLJ
-    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S2;
+    gmx_simd_bool_t  diagonal_mask_S0, diagonal_mask_S2;
  #else
-    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S2;
-    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S2;
+    gmx_simd_bool_t  diagonal_mask0_S0, diagonal_mask0_S2;
+    gmx_simd_bool_t  diagonal_mask1_S0, diagonal_mask1_S2;
  #endif
  
-    unsigned      *exclusion_filter;
-    gmx_exclfilter filter_S0, filter_S2;
+    unsigned            *exclusion_filter;
+    gmx_exclfilter       filter_S0, filter_S2;
  
-    gmx_mm_pr      zero_S = gmx_set1_pr(0);
+    gmx_simd_real_t      zero_S = gmx_simd_set1_r(0);
  
-    gmx_mm_pr      one_S = gmx_set1_pr(1.0);
-    gmx_mm_pr      iq_S0 = gmx_setzero_pr();
-    gmx_mm_pr      iq_S2 = gmx_setzero_pr();
-    gmx_mm_pr      mrc_3_S;
+    gmx_simd_real_t      one_S = gmx_simd_set1_r(1.0);
+    gmx_simd_real_t      iq_S0 = gmx_simd_setzero_r();
+    gmx_simd_real_t      iq_S2 = gmx_simd_setzero_r();
+    gmx_simd_real_t      mrc_3_S;
  #ifdef CALC_ENERGIES
-    gmx_mm_pr      hrc_3_S, moh_rc_S;
+    gmx_simd_real_t      hrc_3_S, moh_rc_S;
  #endif
  
  #ifdef CALC_COUL_TAB
      /* Coulomb table variables */
-    gmx_mm_pr   invtsp_S;
-    const real *tab_coul_F;
+    gmx_simd_real_t   invtsp_S;
+    const real       *tab_coul_F;
  #ifndef TAB_FDV0
-    const real *tab_coul_V;
+    const real       *tab_coul_V;
  #endif
-    int         ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0;
-    int         ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2;
+    int               ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0;
+    int               ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2;
  #ifdef CALC_ENERGIES
-    gmx_mm_pr   mhalfsp_S;
+    gmx_simd_real_t   mhalfsp_S;
  #endif
  #endif
  
  #ifdef CALC_COUL_EWALD
-    gmx_mm_pr beta2_S, beta_S;
+    gmx_simd_real_t beta2_S, beta_S;
  #endif
  
  #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
-    gmx_mm_pr  sh_ewald_S;
+    gmx_simd_real_t  sh_ewald_S;
  #endif
  
  #ifdef LJ_COMB_LB
-    const real *ljc;
+    const real       *ljc;
  
-    gmx_mm_pr   hsig_i_S0, seps_i_S0;
-    gmx_mm_pr   hsig_i_S2, seps_i_S2;
+    gmx_simd_real_t   hsig_i_S0, seps_i_S0;
+    gmx_simd_real_t   hsig_i_S2, seps_i_S2;
  #else
  #ifdef FIX_LJ_C
-    real        pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE];
-    real       *pvdw_c6, *pvdw_c12;
-    gmx_mm_pr   c6_S0, c12_S0;
-    gmx_mm_pr   c6_S2, c12_S2;
+    real              pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_REAL_WIDTH];
+    real             *pvdw_c6, *pvdw_c12;
+    gmx_simd_real_t   c6_S0, c12_S0;
+    gmx_simd_real_t   c6_S2, c12_S2;
  #endif
  
  #ifdef LJ_COMB_GEOM
-    const real *ljc;
+    const real       *ljc;
  
-    gmx_mm_pr   c6s_S0, c12s_S0;
-    gmx_mm_pr   c6s_S1, c12s_S1;
-    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
-    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
+    gmx_simd_real_t   c6s_S0, c12s_S0;
+    gmx_simd_real_t   c6s_S1, c12s_S1;
+    gmx_simd_real_t   c6s_S2  = gmx_simd_setzero_r();
+    gmx_simd_real_t   c12s_S2 = gmx_simd_setzero_r();
+    gmx_simd_real_t   c6s_S3  = gmx_simd_setzero_r();
+    gmx_simd_real_t   c12s_S3 = gmx_simd_setzero_r();
  #endif
  #endif /* LJ_COMB_LB */
  
-    gmx_mm_pr  vctot_S, Vvdwtot_S;
-    gmx_mm_pr  sixth_S, twelveth_S;
+    gmx_simd_real_t  vctot_S, Vvdwtot_S;
+    gmx_simd_real_t  sixth_S, twelveth_S;
  
-    gmx_mm_pr  avoid_sing_S;
-    gmx_mm_pr  rc2_S;
+    gmx_simd_real_t  avoid_sing_S;
+    gmx_simd_real_t  rc2_S;
  #ifdef VDW_CUTOFF_CHECK
-    gmx_mm_pr  rcvdw2_S;
+    gmx_simd_real_t  rcvdw2_S;
  #endif
  
  #ifdef CALC_ENERGIES
-    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
+    gmx_simd_real_t  sh_invrc6_S, sh_invrc12_S;
  
      /* cppcheck-suppress unassignedVariable */
-    real       tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum;
+    real       tmpsum_array[2*GMX_SIMD_REAL_WIDTH], *tmpsum;
  #endif
  #ifdef CALC_SHIFTFORCES
      /* cppcheck-suppress unassignedVariable */
-    real       shf_array[2*GMX_SIMD_WIDTH_HERE], *shf;
+    real       shf_array[2*GMX_SIMD_REAL_WIDTH], *shf;
  #endif
  
      int ninner;
@@ -200,25 +202,25 @@
  #endif
  
      /* Load j-i for the first i */
-    diagonal_jmi_S    = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i);
+    diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_2xnn_diagonal_j_minus_i);
      /* Generate all the diagonal masks as comparison results */
  #if UNROLLI == UNROLLJ
-    diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+    diagonal_mask_S0  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask_S2  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
  #else
  #if 2*UNROLLI == UNROLLJ
-    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+    diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
  #endif
  #endif
  
@@ -246,9 +248,9 @@
      ti0 = prepare_table_load_buffer(ti0_array);
      ti2 = prepare_table_load_buffer(ti2_array);
  
-    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
+    invtsp_S  = gmx_simd_set1_r(ic->tabq_scale);
  #ifdef CALC_ENERGIES
-    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
+    mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale);
  #endif
  
  #ifdef TAB_FDV0
@@ -260,12 +262,12 @@
  #endif /* CALC_COUL_TAB */
  
  #ifdef CALC_COUL_EWALD
-    beta2_S = gmx_set1_pr(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
-    beta_S  = gmx_set1_pr(ic->ewaldcoeff_q);
+    beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
+    beta_S  = gmx_simd_set1_r(ic->ewaldcoeff_q);
  #endif
  
  #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
-    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
+    sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald);
  #endif
  
      q                   = nbat->q;
@@ -274,39 +276,39 @@
      shiftvec            = shift_vec[0];
      x                   = nbat->x;
  
-    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+    avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC);
  
      /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
-    rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+    rc2_S    = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb);
  #ifdef VDW_CUTOFF_CHECK
-    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
+    rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw);
  #endif
  
  #ifdef CALC_ENERGIES
-    sixth_S      = gmx_set1_pr(1.0/6.0);
-    twelveth_S   = gmx_set1_pr(1.0/12.0);
+    sixth_S      = gmx_simd_set1_r(1.0/6.0);
+    twelveth_S   = gmx_simd_set1_r(1.0/12.0);
  
-    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
-    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+    sh_invrc6_S  = gmx_simd_set1_r(ic->sh_invrc6);
+    sh_invrc12_S = gmx_simd_set1_r(ic->sh_invrc6*ic->sh_invrc6);
  #endif
  
-    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
+    mrc_3_S  = gmx_simd_set1_r(-2*ic->k_rf);
  
  #ifdef CALC_ENERGIES
-    hrc_3_S  = gmx_set1_pr(ic->k_rf);
+    hrc_3_S  = gmx_simd_set1_r(ic->k_rf);
  
-    moh_rc_S = gmx_set1_pr(-ic->c_rf);
+    moh_rc_S = gmx_simd_set1_r(-ic->c_rf);
  #endif
  
  #ifdef CALC_ENERGIES
-    tmpsum   = gmx_simd_align_real(tmpsum_array);
+    tmpsum   = gmx_simd_align_r(tmpsum_array);
  #endif
  #ifdef CALC_SHIFTFORCES
-    shf      = gmx_simd_align_real(shf_array);
+    shf      = gmx_simd_align_r(shf_array);
  #endif
  
  #ifdef FIX_LJ_C
-    pvdw_c6  = gmx_simd_align_real(pvdw_array);
+    pvdw_c6  = gmx_simd_align_r(pvdw_array);
      pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
  
      for (jp = 0; jp < UNROLLJ; jp++)
@@ -321,15 +323,15 @@
          pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
          pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
      }
-    c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
-    c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
-    c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
-    c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
-
-    c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
-    c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
-    c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
-    c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+    c6_S0            = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ);
+    c6_S1            = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ);
+    c6_S2            = gmx_simd_load_r(pvdw_c6 +2*UNROLLJ);
+    c6_S3            = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ);
+
+    c12_S0           = gmx_simd_load_r(pvdw_c12+0*UNROLLJ);
+    c12_S1           = gmx_simd_load_r(pvdw_c12+1*UNROLLJ);
+    c12_S2           = gmx_simd_load_r(pvdw_c12+2*UNROLLJ);
+    c12_S3           = gmx_simd_load_r(pvdw_c12+3*UNROLLJ);
  #endif /* FIX_LJ_C */
  
  #ifdef ENERGY_GROUPS
@@ -356,9 +358,9 @@
          ci               = nbln->ci;
          ci_sh            = (ish == CENTRAL ? ci : -1);
  
-        shX_S = gmx_load1_pr(shiftvec+ish3);
-        shY_S = gmx_load1_pr(shiftvec+ish3+1);
-        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
+        shX_S = gmx_simd_load1_r(shiftvec+ish3);
+        shY_S = gmx_simd_load1_r(shiftvec+ish3+1);
+        shZ_S = gmx_simd_load1_r(shiftvec+ish3+2);
  
  #if UNROLLJ <= 4
          sci              = ci*STRIDE;
@@ -444,23 +446,23 @@
          gmx_load1p1_pr(&iy_S2, x+sciy+2);
          gmx_load1p1_pr(&iz_S0, x+sciz);
          gmx_load1p1_pr(&iz_S2, x+sciz+2);
-        ix_S0          = gmx_add_pr(ix_S0, shX_S);
-        ix_S2          = gmx_add_pr(ix_S2, shX_S);
-        iy_S0          = gmx_add_pr(iy_S0, shY_S);
-        iy_S2          = gmx_add_pr(iy_S2, shY_S);
-        iz_S0          = gmx_add_pr(iz_S0, shZ_S);
-        iz_S2          = gmx_add_pr(iz_S2, shZ_S);
+        ix_S0          = gmx_simd_add_r(ix_S0, shX_S);
+        ix_S2          = gmx_simd_add_r(ix_S2, shX_S);
+        iy_S0          = gmx_simd_add_r(iy_S0, shY_S);
+        iy_S2          = gmx_simd_add_r(iy_S2, shY_S);
+        iz_S0          = gmx_simd_add_r(iz_S0, shZ_S);
+        iz_S2          = gmx_simd_add_r(iz_S2, shZ_S);
  
          if (do_coul)
          {
-            gmx_mm_pr facel_S;
+            gmx_simd_real_t facel_S;
  
-            facel_S    = gmx_set1_pr(facel);
+            facel_S    = gmx_simd_set1_r(facel);
  
              gmx_load1p1_pr(&iq_S0, q+sci);
              gmx_load1p1_pr(&iq_S2, q+sci+2);
-            iq_S0      = gmx_mul_pr(facel_S, iq_S0);
-            iq_S2      = gmx_mul_pr(facel_S, iq_S2);
+            iq_S0      = gmx_simd_mul_r(facel_S, iq_S0);
+            iq_S2      = gmx_simd_mul_r(facel_S, iq_S2);
          }
  
  #ifdef LJ_COMB_LB
@@ -492,16 +494,16 @@
  #endif
  
          /* Zero the potential energy for this list */
-        Vvdwtot_S        = gmx_setzero_pr();
-        vctot_S          = gmx_setzero_pr();
+        Vvdwtot_S        = gmx_simd_setzero_r();
+        vctot_S          = gmx_simd_setzero_r();
  
          /* Clear i atom forces */
-        fix_S0           = gmx_setzero_pr();
-        fix_S2           = gmx_setzero_pr();
-        fiy_S0           = gmx_setzero_pr();
-        fiy_S2           = gmx_setzero_pr();
-        fiz_S0           = gmx_setzero_pr();
-        fiz_S2           = gmx_setzero_pr();
+        fix_S0           = gmx_simd_setzero_r();
+        fix_S2           = gmx_simd_setzero_r();
+        fiy_S0           = gmx_simd_setzero_r();
+        fiy_S2           = gmx_simd_setzero_r();
+        fiz_S0           = gmx_simd_setzero_r();
+        fiz_S2           = gmx_simd_setzero_r();
  
          cjind = cjind0;
  
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c

index 3e9245a426e5300d34199ff84aedff931d29850c..13810fa8807671134f6791aafece4714af85caf6 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -51,7 +51,7 @@
  #include "gromacs/simd/macros.h"
  #include "gromacs/simd/vector_operations.h"
  
-#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8)
+#if !(GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8)
  #error "unsupported SIMD width"
  #endif
  
@@ -161,7 +161,7 @@ reduce_group_energies(int ng, int ng_2log,
                        const real *VSvdw, const real *VSc,
                        real *Vvdw, real *Vc)
  {
-    const int unrollj      = GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE;
+    const int unrollj      = GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE;
      const int unrollj_half = unrollj/2;
      int       ng_p2, i, j, j0, j1, c, s;
  
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h

index 57b0e460994459225b9c4f77a2e1ded8a35cdade..e3b3380bc534e08340cb6f80d403574d16547da7 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h
@@ -44,11 +44,11 @@
  #endif
  
  #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE)
+#define UNROLLJ    (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE)
  
  /* The stride of all the atom data arrays is max(UNROLLI,unrollj) */
-#if GMX_SIMD_WIDTH_HERE >= UNROLLI
-#define STRIDE     (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE)
+#if GMX_SIMD_REAL_WIDTH >= UNROLLI
+#define STRIDE     (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE)
  #else
  #define STRIDE     (UNROLLI)
  #endif
@@ -56,19 +56,19 @@
  #include "../nbnxn_kernel_simd_utils.h"
  
  static inline void
-gmx_load_simd_4xn_interactions(int                    excl,
-                               gmx_exclfilter         filter_S0,
-                               gmx_exclfilter         filter_S1,
-                               gmx_exclfilter         filter_S2,
-                               gmx_exclfilter         filter_S3,
-                               const char gmx_unused *interaction_mask_indices,
-                               real gmx_unused       *simd_interaction_array,
-                               gmx_mm_pb             *interact_S0,
-                               gmx_mm_pb             *interact_S1,
-                               gmx_mm_pb             *interact_S2,
-                               gmx_mm_pb             *interact_S3)
+gmx_load_simd_4xn_interactions(int                          excl,
+                               gmx_exclfilter               filter_S0,
+                               gmx_exclfilter               filter_S1,
+                               gmx_exclfilter               filter_S2,
+                               gmx_exclfilter               filter_S3,
+                               const char gmx_unused       *interaction_mask_indices,
+                               real gmx_unused             *simd_interaction_array,
+                               gmx_simd_bool_t             *interact_S0,
+                               gmx_simd_bool_t             *interact_S1,
+                               gmx_simd_bool_t             *interact_S2,
+                               gmx_simd_bool_t             *interact_S3)
  {
-#if defined GMX_X86_SSE2 || defined GMX_SIMD_REFERENCE_PLAIN_C
+#if defined GMX_SIMD_X86_SSE2_OR_HIGHER || defined GMX_SIMD_REFERENCE
      /* Load integer interaction mask */
      gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl);
      *interact_S0  = gmx_checkbitmask_pb(mask_pr_S, filter_S0);
@@ -76,8 +76,8 @@ gmx_load_simd_4xn_interactions(int                    excl,
      *interact_S2  = gmx_checkbitmask_pb(mask_pr_S, filter_S2);
      *interact_S3  = gmx_checkbitmask_pb(mask_pr_S, filter_S3);
  #endif
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
-    const int size = GMX_SIMD_WIDTH_HERE * sizeof(real);
+#ifdef GMX_SIMD_IBM_QPX
+    const int size = GMX_SIMD_REAL_WIDTH * sizeof(real);
      *interact_S0  = gmx_load_interaction_mask_pb(size*interaction_mask_indices[0], simd_interaction_array);
      *interact_S1  = gmx_load_interaction_mask_pb(size*interaction_mask_indices[1], simd_interaction_array);
      *interact_S2  = gmx_load_interaction_mask_pb(size*interaction_mask_indices[2], simd_interaction_array);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h

index 0817d76793ce90f4c4ddf8891208bd5ae590af45..703b31740f56615b8da909d9ac7b83efb2545167 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -50,7 +50,7 @@
  #endif
  
  /* Without exclusions and energies we only need to mask the cut-off,
- * this can be faster when we have defined gmx_blendv_pr, i.e. an instruction
+ * this can be faster when we have defined gmx_simd_blendv_r, i.e. an instruction
   * that selects from two SIMD registers based on the contents of a third.
   */
  #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV
@@ -58,14 +58,14 @@
   * With gcc this is slower, except for RF on Sandy Bridge.
   * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
   */
-#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER))
  #define NBNXN_CUTOFF_USE_BLENDV
  #endif
  /* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
   * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
   * Tested with icc 13.
   */
-#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
  #define NBNXN_CUTOFF_USE_BLENDV
  #endif
  #endif
@@ -80,135 +80,135 @@
  
  #ifdef CHECK_EXCLS
      /* Interaction (non-exclusion) mask of all 1's or 0's */
-    gmx_mm_pb  interact_S0;
-    gmx_mm_pb  interact_S1;
-    gmx_mm_pb  interact_S2;
-    gmx_mm_pb  interact_S3;
-#endif
-
-    gmx_mm_pr  jx_S, jy_S, jz_S;
-    gmx_mm_pr  dx_S0, dy_S0, dz_S0;
-    gmx_mm_pr  dx_S1, dy_S1, dz_S1;
-    gmx_mm_pr  dx_S2, dy_S2, dz_S2;
-    gmx_mm_pr  dx_S3, dy_S3, dz_S3;
-    gmx_mm_pr  tx_S0, ty_S0, tz_S0;
-    gmx_mm_pr  tx_S1, ty_S1, tz_S1;
-    gmx_mm_pr  tx_S2, ty_S2, tz_S2;
-    gmx_mm_pr  tx_S3, ty_S3, tz_S3;
-    gmx_mm_pr  rsq_S0, rinv_S0, rinvsq_S0;
-    gmx_mm_pr  rsq_S1, rinv_S1, rinvsq_S1;
-    gmx_mm_pr  rsq_S2, rinv_S2, rinvsq_S2;
-    gmx_mm_pr  rsq_S3, rinv_S3, rinvsq_S3;
+    gmx_simd_bool_t  interact_S0;
+    gmx_simd_bool_t  interact_S1;
+    gmx_simd_bool_t  interact_S2;
+    gmx_simd_bool_t  interact_S3;
+#endif
+
+    gmx_simd_real_t  jx_S, jy_S, jz_S;
+    gmx_simd_real_t  dx_S0, dy_S0, dz_S0;
+    gmx_simd_real_t  dx_S1, dy_S1, dz_S1;
+    gmx_simd_real_t  dx_S2, dy_S2, dz_S2;
+    gmx_simd_real_t  dx_S3, dy_S3, dz_S3;
+    gmx_simd_real_t  tx_S0, ty_S0, tz_S0;
+    gmx_simd_real_t  tx_S1, ty_S1, tz_S1;
+    gmx_simd_real_t  tx_S2, ty_S2, tz_S2;
+    gmx_simd_real_t  tx_S3, ty_S3, tz_S3;
+    gmx_simd_real_t  rsq_S0, rinv_S0, rinvsq_S0;
+    gmx_simd_real_t  rsq_S1, rinv_S1, rinvsq_S1;
+    gmx_simd_real_t  rsq_S2, rinv_S2, rinvsq_S2;
+    gmx_simd_real_t  rsq_S3, rinv_S3, rinvsq_S3;
  #ifndef NBNXN_CUTOFF_USE_BLENDV
      /* wco: within cut-off, mask of all 1's or 0's */
-    gmx_mm_pb  wco_S0;
-    gmx_mm_pb  wco_S1;
-    gmx_mm_pb  wco_S2;
-    gmx_mm_pb  wco_S3;
+    gmx_simd_bool_t  wco_S0;
+    gmx_simd_bool_t  wco_S1;
+    gmx_simd_bool_t  wco_S2;
+    gmx_simd_bool_t  wco_S3;
  #endif
  #ifdef VDW_CUTOFF_CHECK
-    gmx_mm_pb  wco_vdw_S0;
-    gmx_mm_pb  wco_vdw_S1;
+    gmx_simd_bool_t  wco_vdw_S0;
+    gmx_simd_bool_t  wco_vdw_S1;
  #ifndef HALF_LJ
-    gmx_mm_pb  wco_vdw_S2;
-    gmx_mm_pb  wco_vdw_S3;
+    gmx_simd_bool_t  wco_vdw_S2;
+    gmx_simd_bool_t  wco_vdw_S3;
  #endif
  #endif
  #ifdef CALC_COULOMB
  #ifdef CHECK_EXCLS
      /* 1/r masked with the interaction mask */
-    gmx_mm_pr  rinv_ex_S0;
-    gmx_mm_pr  rinv_ex_S1;
-    gmx_mm_pr  rinv_ex_S2;
-    gmx_mm_pr  rinv_ex_S3;
-#endif
-    gmx_mm_pr  jq_S;
-    gmx_mm_pr  qq_S0;
-    gmx_mm_pr  qq_S1;
-    gmx_mm_pr  qq_S2;
-    gmx_mm_pr  qq_S3;
+    gmx_simd_real_t  rinv_ex_S0;
+    gmx_simd_real_t  rinv_ex_S1;
+    gmx_simd_real_t  rinv_ex_S2;
+    gmx_simd_real_t  rinv_ex_S3;
+#endif
+    gmx_simd_real_t  jq_S;
+    gmx_simd_real_t  qq_S0;
+    gmx_simd_real_t  qq_S1;
+    gmx_simd_real_t  qq_S2;
+    gmx_simd_real_t  qq_S3;
  #ifdef CALC_COUL_TAB
      /* The force (PME mesh force) we need to subtract from 1/r^2 */
-    gmx_mm_pr  fsub_S0;
-    gmx_mm_pr  fsub_S1;
-    gmx_mm_pr  fsub_S2;
-    gmx_mm_pr  fsub_S3;
+    gmx_simd_real_t  fsub_S0;
+    gmx_simd_real_t  fsub_S1;
+    gmx_simd_real_t  fsub_S2;
+    gmx_simd_real_t  fsub_S3;
  #endif
  #ifdef CALC_COUL_EWALD
-    gmx_mm_pr  brsq_S0, brsq_S1, brsq_S2, brsq_S3;
-    gmx_mm_pr  ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
+    gmx_simd_real_t  brsq_S0, brsq_S1, brsq_S2, brsq_S3;
+    gmx_simd_real_t  ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
  #endif
  
      /* frcoul = (1/r - fsub)*r */
-    gmx_mm_pr  frcoul_S0;
-    gmx_mm_pr  frcoul_S1;
-    gmx_mm_pr  frcoul_S2;
-    gmx_mm_pr  frcoul_S3;
+    gmx_simd_real_t  frcoul_S0;
+    gmx_simd_real_t  frcoul_S1;
+    gmx_simd_real_t  frcoul_S2;
+    gmx_simd_real_t  frcoul_S3;
  #ifdef CALC_COUL_TAB
      /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
-    gmx_mm_pr  r_S0, rs_S0, rf_S0, frac_S0;
-    gmx_mm_pr  r_S1, rs_S1, rf_S1, frac_S1;
-    gmx_mm_pr  r_S2, rs_S2, rf_S2, frac_S2;
-    gmx_mm_pr  r_S3, rs_S3, rf_S3, frac_S3;
+    gmx_simd_real_t         r_S0, rs_S0, rf_S0, frac_S0;
+    gmx_simd_real_t         r_S1, rs_S1, rf_S1, frac_S1;
+    gmx_simd_real_t         r_S2, rs_S2, rf_S2, frac_S2;
+    gmx_simd_real_t         r_S3, rs_S3, rf_S3, frac_S3;
      /* Table index: rs truncated to an int */
-    gmx_epi32  ti_S0, ti_S1, ti_S2, ti_S3;
+    gmx_simd_int32_t        ti_S0, ti_S1, ti_S2, ti_S3;
      /* Linear force table values */
-    gmx_mm_pr  ctab0_S0, ctab1_S0;
-    gmx_mm_pr  ctab0_S1, ctab1_S1;
-    gmx_mm_pr  ctab0_S2, ctab1_S2;
-    gmx_mm_pr  ctab0_S3, ctab1_S3;
+    gmx_simd_real_t         ctab0_S0, ctab1_S0;
+    gmx_simd_real_t         ctab0_S1, ctab1_S1;
+    gmx_simd_real_t         ctab0_S2, ctab1_S2;
+    gmx_simd_real_t         ctab0_S3, ctab1_S3;
  #ifdef CALC_ENERGIES
      /* Quadratic energy table value */
-    gmx_mm_pr  ctabv_S0;
-    gmx_mm_pr  ctabv_S1;
-    gmx_mm_pr  ctabv_S2;
-    gmx_mm_pr  ctabv_S3;
+    gmx_simd_real_t  ctabv_S0;
+    gmx_simd_real_t  ctabv_S1;
+    gmx_simd_real_t  ctabv_S2;
+    gmx_simd_real_t  ctabv_S3;
  #endif
  #endif
  #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
      /* The potential (PME mesh) we need to subtract from 1/r */
-    gmx_mm_pr  vc_sub_S0;
-    gmx_mm_pr  vc_sub_S1;
-    gmx_mm_pr  vc_sub_S2;
-    gmx_mm_pr  vc_sub_S3;
+    gmx_simd_real_t  vc_sub_S0;
+    gmx_simd_real_t  vc_sub_S1;
+    gmx_simd_real_t  vc_sub_S2;
+    gmx_simd_real_t  vc_sub_S3;
  #endif
  #ifdef CALC_ENERGIES
      /* Electrostatic potential */
-    gmx_mm_pr  vcoul_S0;
-    gmx_mm_pr  vcoul_S1;
-    gmx_mm_pr  vcoul_S2;
-    gmx_mm_pr  vcoul_S3;
+    gmx_simd_real_t  vcoul_S0;
+    gmx_simd_real_t  vcoul_S1;
+    gmx_simd_real_t  vcoul_S2;
+    gmx_simd_real_t  vcoul_S3;
  #endif
  #endif
      /* The force times 1/r */
-    gmx_mm_pr  fscal_S0;
-    gmx_mm_pr  fscal_S1;
-    gmx_mm_pr  fscal_S2;
-    gmx_mm_pr  fscal_S3;
+    gmx_simd_real_t  fscal_S0;
+    gmx_simd_real_t  fscal_S1;
+    gmx_simd_real_t  fscal_S2;
+    gmx_simd_real_t  fscal_S3;
  
  #ifdef CALC_LJ
  #ifdef LJ_COMB_LB
      /* LJ sigma_j/2 and sqrt(epsilon_j) */
-    gmx_mm_pr  hsig_j_S, seps_j_S;
+    gmx_simd_real_t  hsig_j_S, seps_j_S;
      /* LJ sigma_ij and epsilon_ij */
-    gmx_mm_pr  sig_S0, eps_S0;
-    gmx_mm_pr  sig_S1, eps_S1;
+    gmx_simd_real_t  sig_S0, eps_S0;
+    gmx_simd_real_t  sig_S1, eps_S1;
  #ifndef HALF_LJ
-    gmx_mm_pr  sig_S2, eps_S2;
-    gmx_mm_pr  sig_S3, eps_S3;
+    gmx_simd_real_t  sig_S2, eps_S2;
+    gmx_simd_real_t  sig_S3, eps_S3;
  #endif
  #ifdef CALC_ENERGIES
-    gmx_mm_pr  sig2_S0, sig6_S0;
-    gmx_mm_pr  sig2_S1, sig6_S1;
+    gmx_simd_real_t  sig2_S0, sig6_S0;
+    gmx_simd_real_t  sig2_S1, sig6_S1;
  #ifndef HALF_LJ
-    gmx_mm_pr  sig2_S2, sig6_S2;
-    gmx_mm_pr  sig2_S3, sig6_S3;
+    gmx_simd_real_t  sig2_S2, sig6_S2;
+    gmx_simd_real_t  sig2_S3, sig6_S3;
  #endif
  #endif /* LJ_COMB_LB */
  #endif /* CALC_LJ */
  
  #ifdef LJ_COMB_GEOM
-    gmx_mm_pr  c6s_j_S, c12s_j_S;
+    gmx_simd_real_t  c6s_j_S, c12s_j_S;
  #endif
  
  #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
@@ -218,44 +218,44 @@
  
  #ifndef FIX_LJ_C
      /* LJ C6 and C12 parameters, used with geometric comb. rule */
-    gmx_mm_pr  c6_S0, c12_S0;
-    gmx_mm_pr  c6_S1, c12_S1;
+    gmx_simd_real_t  c6_S0, c12_S0;
+    gmx_simd_real_t  c6_S1, c12_S1;
  #ifndef HALF_LJ
-    gmx_mm_pr  c6_S2, c12_S2;
-    gmx_mm_pr  c6_S3, c12_S3;
+    gmx_simd_real_t  c6_S2, c12_S2;
+    gmx_simd_real_t  c6_S3, c12_S3;
  #endif
  #endif
  
      /* Intermediate variables for LJ calculation */
  #ifndef LJ_COMB_LB
-    gmx_mm_pr  rinvsix_S0;
-    gmx_mm_pr  rinvsix_S1;
+    gmx_simd_real_t  rinvsix_S0;
+    gmx_simd_real_t  rinvsix_S1;
  #ifndef HALF_LJ
-    gmx_mm_pr  rinvsix_S2;
-    gmx_mm_pr  rinvsix_S3;
+    gmx_simd_real_t  rinvsix_S2;
+    gmx_simd_real_t  rinvsix_S3;
  #endif
  #endif
  #ifdef LJ_COMB_LB
-    gmx_mm_pr  sir_S0, sir2_S0, sir6_S0;
-    gmx_mm_pr  sir_S1, sir2_S1, sir6_S1;
+    gmx_simd_real_t  sir_S0, sir2_S0, sir6_S0;
+    gmx_simd_real_t  sir_S1, sir2_S1, sir6_S1;
  #ifndef HALF_LJ
-    gmx_mm_pr  sir_S2, sir2_S2, sir6_S2;
-    gmx_mm_pr  sir_S3, sir2_S3, sir6_S3;
+    gmx_simd_real_t  sir_S2, sir2_S2, sir6_S2;
+    gmx_simd_real_t  sir_S3, sir2_S3, sir6_S3;
  #endif
  #endif
  
-    gmx_mm_pr  FrLJ6_S0, FrLJ12_S0;
-    gmx_mm_pr  FrLJ6_S1, FrLJ12_S1;
+    gmx_simd_real_t  FrLJ6_S0, FrLJ12_S0;
+    gmx_simd_real_t  FrLJ6_S1, FrLJ12_S1;
  #ifndef HALF_LJ
-    gmx_mm_pr  FrLJ6_S2, FrLJ12_S2;
-    gmx_mm_pr  FrLJ6_S3, FrLJ12_S3;
+    gmx_simd_real_t  FrLJ6_S2, FrLJ12_S2;
+    gmx_simd_real_t  FrLJ6_S3, FrLJ12_S3;
  #endif
  #ifdef CALC_ENERGIES
-    gmx_mm_pr  VLJ6_S0, VLJ12_S0, VLJ_S0;
-    gmx_mm_pr  VLJ6_S1, VLJ12_S1, VLJ_S1;
+    gmx_simd_real_t  VLJ6_S0, VLJ12_S0, VLJ_S0;
+    gmx_simd_real_t  VLJ6_S1, VLJ12_S1, VLJ_S1;
  #ifndef HALF_LJ
-    gmx_mm_pr  VLJ6_S2, VLJ12_S2, VLJ_S2;
-    gmx_mm_pr  VLJ6_S3, VLJ12_S3, VLJ_S3;
+    gmx_simd_real_t  VLJ6_S2, VLJ12_S2, VLJ_S2;
+    gmx_simd_real_t  VLJ6_S3, VLJ12_S3, VLJ_S3;
  #endif
  #endif
  #endif /* CALC_LJ */
@@ -284,7 +284,7 @@
      gmx_load_simd_4xn_interactions(l_cj[cjind].excl,
                                     filter_S0, filter_S1,
                                     filter_S2, filter_S3,
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#ifdef GMX_SIMD_IBM_QPX
                                     l_cj[cjind].interaction_mask_indices,
                                     nbat->simd_interaction_array,
  #else
@@ -298,35 +298,35 @@
  #endif /* CHECK_EXCLS */
  
      /* load j atom coordinates */
-    jx_S        = gmx_load_pr(x+ajx);
-    jy_S        = gmx_load_pr(x+ajy);
-    jz_S        = gmx_load_pr(x+ajz);
+    jx_S        = gmx_simd_load_r(x+ajx);
+    jy_S        = gmx_simd_load_r(x+ajy);
+    jz_S        = gmx_simd_load_r(x+ajz);
  
      /* Calculate distance */
-    dx_S0       = gmx_sub_pr(ix_S0, jx_S);
-    dy_S0       = gmx_sub_pr(iy_S0, jy_S);
-    dz_S0       = gmx_sub_pr(iz_S0, jz_S);
-    dx_S1       = gmx_sub_pr(ix_S1, jx_S);
-    dy_S1       = gmx_sub_pr(iy_S1, jy_S);
-    dz_S1       = gmx_sub_pr(iz_S1, jz_S);
-    dx_S2       = gmx_sub_pr(ix_S2, jx_S);
-    dy_S2       = gmx_sub_pr(iy_S2, jy_S);
-    dz_S2       = gmx_sub_pr(iz_S2, jz_S);
-    dx_S3       = gmx_sub_pr(ix_S3, jx_S);
-    dy_S3       = gmx_sub_pr(iy_S3, jy_S);
-    dz_S3       = gmx_sub_pr(iz_S3, jz_S);
+    dx_S0       = gmx_simd_sub_r(ix_S0, jx_S);
+    dy_S0       = gmx_simd_sub_r(iy_S0, jy_S);
+    dz_S0       = gmx_simd_sub_r(iz_S0, jz_S);
+    dx_S1       = gmx_simd_sub_r(ix_S1, jx_S);
+    dy_S1       = gmx_simd_sub_r(iy_S1, jy_S);
+    dz_S1       = gmx_simd_sub_r(iz_S1, jz_S);
+    dx_S2       = gmx_simd_sub_r(ix_S2, jx_S);
+    dy_S2       = gmx_simd_sub_r(iy_S2, jy_S);
+    dz_S2       = gmx_simd_sub_r(iz_S2, jz_S);
+    dx_S3       = gmx_simd_sub_r(ix_S3, jx_S);
+    dy_S3       = gmx_simd_sub_r(iy_S3, jy_S);
+    dz_S3       = gmx_simd_sub_r(iz_S3, jz_S);
  
      /* rsq = dx*dx+dy*dy+dz*dz */
-    rsq_S0      = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
-    rsq_S1      = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
-    rsq_S2      = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
-    rsq_S3      = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+    rsq_S0      = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+    rsq_S1      = gmx_simd_calc_rsq_r(dx_S1, dy_S1, dz_S1);
+    rsq_S2      = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
+    rsq_S3      = gmx_simd_calc_rsq_r(dx_S3, dy_S3, dz_S3);
  
  #ifndef NBNXN_CUTOFF_USE_BLENDV
-    wco_S0      = gmx_cmplt_pr(rsq_S0, rc2_S);
-    wco_S1      = gmx_cmplt_pr(rsq_S1, rc2_S);
-    wco_S2      = gmx_cmplt_pr(rsq_S2, rc2_S);
-    wco_S3      = gmx_cmplt_pr(rsq_S3, rc2_S);
+    wco_S0      = gmx_simd_cmplt_r(rsq_S0, rc2_S);
+    wco_S1      = gmx_simd_cmplt_r(rsq_S1, rc2_S);
+    wco_S2      = gmx_simd_cmplt_r(rsq_S2, rc2_S);
+    wco_S3      = gmx_simd_cmplt_r(rsq_S3, rc2_S);
  #endif
  
  #ifdef CHECK_EXCLS
@@ -335,61 +335,61 @@
  #if UNROLLJ == UNROLLI
      if (cj == ci_sh)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask_S0);
-        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask_S1);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask_S2);
-        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask_S3);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask_S0);
+        wco_S1  = gmx_simd_and_b(wco_S1, diagonal_mask_S1);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask_S2);
+        wco_S3  = gmx_simd_and_b(wco_S3, diagonal_mask_S3);
      }
  #else
  #if UNROLLJ < UNROLLI
      if (cj == ci_sh*2)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
-        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask0_S1);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
-        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask0_S3);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask0_S0);
+        wco_S1  = gmx_simd_and_b(wco_S1, diagonal_mask0_S1);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask0_S2);
+        wco_S3  = gmx_simd_and_b(wco_S3, diagonal_mask0_S3);
      }
      if (cj == ci_sh*2 + 1)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
-        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask1_S1);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
-        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask1_S3);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask1_S0);
+        wco_S1  = gmx_simd_and_b(wco_S1, diagonal_mask1_S1);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask1_S2);
+        wco_S3  = gmx_simd_and_b(wco_S3, diagonal_mask1_S3);
      }
  #else
      if (cj*2 == ci_sh)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
-        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask0_S1);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
-        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask0_S3);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask0_S0);
+        wco_S1  = gmx_simd_and_b(wco_S1, diagonal_mask0_S1);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask0_S2);
+        wco_S3  = gmx_simd_and_b(wco_S3, diagonal_mask0_S3);
      }
      else if (cj*2 + 1 == ci_sh)
      {
-        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
-        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask1_S1);
-        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
-        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask1_S3);
+        wco_S0  = gmx_simd_and_b(wco_S0, diagonal_mask1_S0);
+        wco_S1  = gmx_simd_and_b(wco_S1, diagonal_mask1_S1);
+        wco_S2  = gmx_simd_and_b(wco_S2, diagonal_mask1_S2);
+        wco_S3  = gmx_simd_and_b(wco_S3, diagonal_mask1_S3);
      }
  #endif
  #endif
  #else /* EXCL_FORCES */
        /* No exclusion forces: remove all excluded atom pairs from the list */
-    wco_S0      = gmx_and_pb(wco_S0, interact_S0);
-    wco_S1      = gmx_and_pb(wco_S1, interact_S1);
-    wco_S2      = gmx_and_pb(wco_S2, interact_S2);
-    wco_S3      = gmx_and_pb(wco_S3, interact_S3);
+    wco_S0      = gmx_simd_and_b(wco_S0, interact_S0);
+    wco_S1      = gmx_simd_and_b(wco_S1, interact_S1);
+    wco_S2      = gmx_simd_and_b(wco_S2, interact_S2);
+    wco_S3      = gmx_simd_and_b(wco_S3, interact_S3);
  #endif
  #endif
  
  #ifdef COUNT_PAIRS
      {
          int  i, j;
-        real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
-        tmp = gmx_simd_align_real(tmpa);
+        real tmpa[2*GMX_SIMD_REAL_WIDTH], *tmp;
+        tmp = gmx_simd_align_r(tmpa);
          for (i = 0; i < UNROLLI; i++)
          {
-            gmx_store_pr(tmp, gmx_sub_pr(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
+            gmx_simd_store_r(tmp, gmx_simd_sub_r(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
              for (j = 0; j < UNROLLJ; j++)
              {
                  if (tmp[j] >= 0)
@@ -411,10 +411,10 @@
  
      /* Calculate 1/r */
  #ifndef GMX_DOUBLE
-    rinv_S0     = gmx_invsqrt_pr(rsq_S0);
-    rinv_S1     = gmx_invsqrt_pr(rsq_S1);
-    rinv_S2     = gmx_invsqrt_pr(rsq_S2);
-    rinv_S3     = gmx_invsqrt_pr(rsq_S3);
+    rinv_S0     = gmx_simd_invsqrt_r(rsq_S0);
+    rinv_S1     = gmx_simd_invsqrt_r(rsq_S1);
+    rinv_S2     = gmx_simd_invsqrt_r(rsq_S2);
+    rinv_S3     = gmx_simd_invsqrt_r(rsq_S3);
  #else
      gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
      gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
@@ -422,11 +422,11 @@
  
  #ifdef CALC_COULOMB
      /* Load parameters for j atom */
-    jq_S        = gmx_load_pr(q+aj);
-    qq_S0       = gmx_mul_pr(iq_S0, jq_S);
-    qq_S1       = gmx_mul_pr(iq_S1, jq_S);
-    qq_S2       = gmx_mul_pr(iq_S2, jq_S);
-    qq_S3       = gmx_mul_pr(iq_S3, jq_S);
+    jq_S        = gmx_simd_load_r(q+aj);
+    qq_S0       = gmx_simd_mul_r(iq_S0, jq_S);
+    qq_S1       = gmx_simd_mul_r(iq_S1, jq_S);
+    qq_S2       = gmx_simd_mul_r(iq_S2, jq_S);
+    qq_S3       = gmx_simd_mul_r(iq_S3, jq_S);
  #endif
  
  #ifdef CALC_LJ
@@ -441,57 +441,57 @@
  #endif /* not defined any LJ rule */
  
  #ifdef LJ_COMB_GEOM
-    c6s_j_S     = gmx_load_pr(ljc+aj2+0);
-    c12s_j_S    = gmx_load_pr(ljc+aj2+STRIDE);
-    c6_S0       = gmx_mul_pr(c6s_S0, c6s_j_S );
-    c6_S1       = gmx_mul_pr(c6s_S1, c6s_j_S );
+    c6s_j_S     = gmx_simd_load_r(ljc+aj2+0);
+    c12s_j_S    = gmx_simd_load_r(ljc+aj2+STRIDE);
+    c6_S0       = gmx_simd_mul_r(c6s_S0, c6s_j_S );
+    c6_S1       = gmx_simd_mul_r(c6s_S1, c6s_j_S );
  #ifndef HALF_LJ
-    c6_S2       = gmx_mul_pr(c6s_S2, c6s_j_S );
-    c6_S3       = gmx_mul_pr(c6s_S3, c6s_j_S );
+    c6_S2       = gmx_simd_mul_r(c6s_S2, c6s_j_S );
+    c6_S3       = gmx_simd_mul_r(c6s_S3, c6s_j_S );
  #endif
-    c12_S0      = gmx_mul_pr(c12s_S0, c12s_j_S);
-    c12_S1      = gmx_mul_pr(c12s_S1, c12s_j_S);
+    c12_S0      = gmx_simd_mul_r(c12s_S0, c12s_j_S);
+    c12_S1      = gmx_simd_mul_r(c12s_S1, c12s_j_S);
  #ifndef HALF_LJ
-    c12_S2      = gmx_mul_pr(c12s_S2, c12s_j_S);
-    c12_S3      = gmx_mul_pr(c12s_S3, c12s_j_S);
+    c12_S2      = gmx_simd_mul_r(c12s_S2, c12s_j_S);
+    c12_S3      = gmx_simd_mul_r(c12s_S3, c12s_j_S);
  #endif
  #endif /* LJ_COMB_GEOM */
  
  #ifdef LJ_COMB_LB
-    hsig_j_S    = gmx_load_pr(ljc+aj2+0);
-    seps_j_S    = gmx_load_pr(ljc+aj2+STRIDE);
+    hsig_j_S    = gmx_simd_load_r(ljc+aj2+0);
+    seps_j_S    = gmx_simd_load_r(ljc+aj2+STRIDE);
  
-    sig_S0      = gmx_add_pr(hsig_i_S0, hsig_j_S);
-    sig_S1      = gmx_add_pr(hsig_i_S1, hsig_j_S);
-    eps_S0      = gmx_mul_pr(seps_i_S0, seps_j_S);
-    eps_S1      = gmx_mul_pr(seps_i_S1, seps_j_S);
+    sig_S0      = gmx_simd_add_r(hsig_i_S0, hsig_j_S);
+    sig_S1      = gmx_simd_add_r(hsig_i_S1, hsig_j_S);
+    eps_S0      = gmx_simd_mul_r(seps_i_S0, seps_j_S);
+    eps_S1      = gmx_simd_mul_r(seps_i_S1, seps_j_S);
  #ifndef HALF_LJ
-    sig_S2      = gmx_add_pr(hsig_i_S2, hsig_j_S);
-    sig_S3      = gmx_add_pr(hsig_i_S3, hsig_j_S);
-    eps_S2      = gmx_mul_pr(seps_i_S2, seps_j_S);
-    eps_S3      = gmx_mul_pr(seps_i_S3, seps_j_S);
+    sig_S2      = gmx_simd_add_r(hsig_i_S2, hsig_j_S);
+    sig_S3      = gmx_simd_add_r(hsig_i_S3, hsig_j_S);
+    eps_S2      = gmx_simd_mul_r(seps_i_S2, seps_j_S);
+    eps_S3      = gmx_simd_mul_r(seps_i_S3, seps_j_S);
  #endif
  #endif /* LJ_COMB_LB */
  
  #endif /* CALC_LJ */
  
  #ifndef NBNXN_CUTOFF_USE_BLENDV
-    rinv_S0     = gmx_blendzero_pr(rinv_S0, wco_S0);
-    rinv_S1     = gmx_blendzero_pr(rinv_S1, wco_S1);
-    rinv_S2     = gmx_blendzero_pr(rinv_S2, wco_S2);
-    rinv_S3     = gmx_blendzero_pr(rinv_S3, wco_S3);
+    rinv_S0     = gmx_simd_blendzero_r(rinv_S0, wco_S0);
+    rinv_S1     = gmx_simd_blendzero_r(rinv_S1, wco_S1);
+    rinv_S2     = gmx_simd_blendzero_r(rinv_S2, wco_S2);
+    rinv_S3     = gmx_simd_blendzero_r(rinv_S3, wco_S3);
  #else
      /* We only need to mask for the cut-off: blendv is faster */
-    rinv_S0     = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
-    rinv_S1     = gmx_blendv_pr(rinv_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1));
-    rinv_S2     = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
-    rinv_S3     = gmx_blendv_pr(rinv_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3));
+    rinv_S0     = gmx_simd_blendv_r(rinv_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0));
+    rinv_S1     = gmx_simd_blendv_r(rinv_S1, zero_S, gmx_simd_sub_r(rc2_S, rsq_S1));
+    rinv_S2     = gmx_simd_blendv_r(rinv_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2));
+    rinv_S3     = gmx_simd_blendv_r(rinv_S3, zero_S, gmx_simd_sub_r(rc2_S, rsq_S3));
  #endif
  
-    rinvsq_S0   = gmx_mul_pr(rinv_S0, rinv_S0);
-    rinvsq_S1   = gmx_mul_pr(rinv_S1, rinv_S1);
-    rinvsq_S2   = gmx_mul_pr(rinv_S2, rinv_S2);
-    rinvsq_S3   = gmx_mul_pr(rinv_S3, rinv_S3);
+    rinvsq_S0   = gmx_simd_mul_r(rinv_S0, rinv_S0);
+    rinvsq_S1   = gmx_simd_mul_r(rinv_S1, rinv_S1);
+    rinvsq_S2   = gmx_simd_mul_r(rinv_S2, rinv_S2);
+    rinvsq_S3   = gmx_simd_mul_r(rinv_S3, rinv_S3);
  
  #ifdef CALC_COULOMB
      /* Note that here we calculate force*r, not the usual force/r.
@@ -502,10 +502,10 @@
  
  #ifdef EXCL_FORCES
      /* Only add 1/r for non-excluded atom pairs */
-    rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, interact_S0);
-    rinv_ex_S1  = gmx_blendzero_pr(rinv_S1, interact_S1);
-    rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, interact_S2);
-    rinv_ex_S3  = gmx_blendzero_pr(rinv_S3, interact_S3);
+    rinv_ex_S0  = gmx_simd_blendzero_r(rinv_S0, interact_S0);
+    rinv_ex_S1  = gmx_simd_blendzero_r(rinv_S1, interact_S1);
+    rinv_ex_S2  = gmx_simd_blendzero_r(rinv_S2, interact_S2);
+    rinv_ex_S3  = gmx_simd_blendzero_r(rinv_S3, interact_S3);
  #else
      /* No exclusion forces, we always need 1/r */
  #define     rinv_ex_S0    rinv_S0
@@ -516,16 +516,16 @@
  
  #ifdef CALC_COUL_RF
      /* Electrostatic interactions */
-    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
-    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_madd_pr(rsq_S1, mrc_3_S, rinv_ex_S1));
-    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
-    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_madd_pr(rsq_S3, mrc_3_S, rinv_ex_S3));
+    frcoul_S0   = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(rsq_S0, mrc_3_S, rinv_ex_S0));
+    frcoul_S1   = gmx_simd_mul_r(qq_S1, gmx_simd_fmadd_r(rsq_S1, mrc_3_S, rinv_ex_S1));
+    frcoul_S2   = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(rsq_S2, mrc_3_S, rinv_ex_S2));
+    frcoul_S3   = gmx_simd_mul_r(qq_S3, gmx_simd_fmadd_r(rsq_S3, mrc_3_S, rinv_ex_S3));
  
  #ifdef CALC_ENERGIES
-    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
-    vcoul_S1    = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_add_pr(gmx_mul_pr(rsq_S1, hrc_3_S), moh_rc_S)));
-    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
-    vcoul_S3    = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_add_pr(gmx_mul_pr(rsq_S3, hrc_3_S), moh_rc_S)));
+    vcoul_S0    = gmx_simd_mul_r(qq_S0, gmx_simd_add_r(rinv_ex_S0, gmx_simd_add_r(gmx_simd_mul_r(rsq_S0, hrc_3_S), moh_rc_S)));
+    vcoul_S1    = gmx_simd_mul_r(qq_S1, gmx_simd_add_r(rinv_ex_S1, gmx_simd_add_r(gmx_simd_mul_r(rsq_S1, hrc_3_S), moh_rc_S)));
+    vcoul_S2    = gmx_simd_mul_r(qq_S2, gmx_simd_add_r(rinv_ex_S2, gmx_simd_add_r(gmx_simd_mul_r(rsq_S2, hrc_3_S), moh_rc_S)));
+    vcoul_S3    = gmx_simd_mul_r(qq_S3, gmx_simd_add_r(rinv_ex_S3, gmx_simd_add_r(gmx_simd_mul_r(rsq_S3, hrc_3_S), moh_rc_S)));
  #endif
  #endif
  
@@ -534,67 +534,67 @@
       * as large distances can cause an overflow in gmx_pmecorrF/V.
       */
  #ifndef NBNXN_CUTOFF_USE_BLENDV
-    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
-    brsq_S1     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S1, wco_S1));
-    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
-    brsq_S3     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S3, wco_S3));
+    brsq_S0     = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S0, wco_S0));
+    brsq_S1     = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S1, wco_S1));
+    brsq_S2     = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S2, wco_S2));
+    brsq_S3     = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S3, wco_S3));
  #else
      /* Strangely, putting mul on a separate line is slower (icc 13) */
-    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
-    brsq_S1     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1)));
-    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
-    brsq_S3     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3)));
-#endif
-    ewcorr_S0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
-    ewcorr_S1   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S1), beta_S);
-    ewcorr_S2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
-    ewcorr_S3   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S3), beta_S);
-    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
-    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_madd_pr(ewcorr_S1, brsq_S1, rinv_ex_S1));
-    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
-    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_madd_pr(ewcorr_S3, brsq_S3, rinv_ex_S3));
+    brsq_S0     = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0)));
+    brsq_S1     = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S1, zero_S, gmx_simd_sub_r(rc2_S, rsq_S1)));
+    brsq_S2     = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2)));
+    brsq_S3     = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S3, zero_S, gmx_simd_sub_r(rc2_S, rsq_S3)));
+#endif
+    ewcorr_S0   = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S0), beta_S);
+    ewcorr_S1   = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S1), beta_S);
+    ewcorr_S2   = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S2), beta_S);
+    ewcorr_S3   = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S3), beta_S);
+    frcoul_S0   = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(ewcorr_S0, brsq_S0, rinv_ex_S0));
+    frcoul_S1   = gmx_simd_mul_r(qq_S1, gmx_simd_fmadd_r(ewcorr_S1, brsq_S1, rinv_ex_S1));
+    frcoul_S2   = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(ewcorr_S2, brsq_S2, rinv_ex_S2));
+    frcoul_S3   = gmx_simd_mul_r(qq_S3, gmx_simd_fmadd_r(ewcorr_S3, brsq_S3, rinv_ex_S3));
  
  #ifdef CALC_ENERGIES
-    vc_sub_S0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
-    vc_sub_S1   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S1), beta_S);
-    vc_sub_S2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
-    vc_sub_S3   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S3), beta_S);
+    vc_sub_S0   = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S0), beta_S);
+    vc_sub_S1   = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S1), beta_S);
+    vc_sub_S2   = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S2), beta_S);
+    vc_sub_S3   = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S3), beta_S);
  #endif
  
  #endif /* CALC_COUL_EWALD */
  
  #ifdef CALC_COUL_TAB
      /* Electrostatic interactions */
-    r_S0        = gmx_mul_pr(rsq_S0, rinv_S0);
-    r_S1        = gmx_mul_pr(rsq_S1, rinv_S1);
-    r_S2        = gmx_mul_pr(rsq_S2, rinv_S2);
-    r_S3        = gmx_mul_pr(rsq_S3, rinv_S3);
+    r_S0        = gmx_simd_mul_r(rsq_S0, rinv_S0);
+    r_S1        = gmx_simd_mul_r(rsq_S1, rinv_S1);
+    r_S2        = gmx_simd_mul_r(rsq_S2, rinv_S2);
+    r_S3        = gmx_simd_mul_r(rsq_S3, rinv_S3);
      /* Convert r to scaled table units */
-    rs_S0       = gmx_mul_pr(r_S0, invtsp_S);
-    rs_S1       = gmx_mul_pr(r_S1, invtsp_S);
-    rs_S2       = gmx_mul_pr(r_S2, invtsp_S);
-    rs_S3       = gmx_mul_pr(r_S3, invtsp_S);
+    rs_S0       = gmx_simd_mul_r(r_S0, invtsp_S);
+    rs_S1       = gmx_simd_mul_r(r_S1, invtsp_S);
+    rs_S2       = gmx_simd_mul_r(r_S2, invtsp_S);
+    rs_S3       = gmx_simd_mul_r(r_S3, invtsp_S);
      /* Truncate scaled r to an int */
-    ti_S0       = gmx_cvttpr_epi32(rs_S0);
-    ti_S1       = gmx_cvttpr_epi32(rs_S1);
-    ti_S2       = gmx_cvttpr_epi32(rs_S2);
-    ti_S3       = gmx_cvttpr_epi32(rs_S3);
+    ti_S0       = gmx_simd_cvtt_r2i(rs_S0);
+    ti_S1       = gmx_simd_cvtt_r2i(rs_S1);
+    ti_S2       = gmx_simd_cvtt_r2i(rs_S2);
+    ti_S3       = gmx_simd_cvtt_r2i(rs_S3);
  #ifdef GMX_SIMD_HAVE_FLOOR
      /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
-    rf_S0       = gmx_floor_pr(rs_S0);
-    rf_S1       = gmx_floor_pr(rs_S1);
-    rf_S2       = gmx_floor_pr(rs_S2);
-    rf_S3       = gmx_floor_pr(rs_S3);
+    rf_S0       = gmx_simd_floor_r(rs_S0);
+    rf_S1       = gmx_simd_floor_r(rs_S1);
+    rf_S2       = gmx_simd_floor_r(rs_S2);
+    rf_S3       = gmx_simd_floor_r(rs_S3);
  #else
-    rf_S0       = gmx_cvtepi32_pr(ti_S0);
-    rf_S1       = gmx_cvtepi32_pr(ti_S1);
-    rf_S2       = gmx_cvtepi32_pr(ti_S2);
-    rf_S3       = gmx_cvtepi32_pr(ti_S3);
+    rf_S0       = gmx_simd_cvt_i2r(ti_S0);
+    rf_S1       = gmx_simd_cvt_i2r(ti_S1);
+    rf_S2       = gmx_simd_cvt_i2r(ti_S2);
+    rf_S3       = gmx_simd_cvt_i2r(ti_S3);
  #endif
-    frac_S0     = gmx_sub_pr(rs_S0, rf_S0);
-    frac_S1     = gmx_sub_pr(rs_S1, rf_S1);
-    frac_S2     = gmx_sub_pr(rs_S2, rf_S2);
-    frac_S3     = gmx_sub_pr(rs_S3, rf_S3);
+    frac_S0     = gmx_simd_sub_r(rs_S0, rf_S0);
+    frac_S1     = gmx_simd_sub_r(rs_S1, rf_S1);
+    frac_S2     = gmx_simd_sub_r(rs_S2, rf_S2);
+    frac_S3     = gmx_simd_sub_r(rs_S3, rf_S3);
  
      /* Load and interpolate table forces and possibly energies.
       * Force and energy can be combined in one table, stride 4: FDV0
@@ -619,20 +619,20 @@
      load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
  #endif
  #endif
-    fsub_S0     = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
-    fsub_S1     = gmx_add_pr(ctab0_S1, gmx_mul_pr(frac_S1, ctab1_S1));
-    fsub_S2     = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
-    fsub_S3     = gmx_add_pr(ctab0_S3, gmx_mul_pr(frac_S3, ctab1_S3));
-    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
-    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, gmx_mul_pr(fsub_S1, r_S1)));
-    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
-    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, gmx_mul_pr(fsub_S3, r_S3)));
+    fsub_S0     = gmx_simd_add_r(ctab0_S0, gmx_simd_mul_r(frac_S0, ctab1_S0));
+    fsub_S1     = gmx_simd_add_r(ctab0_S1, gmx_simd_mul_r(frac_S1, ctab1_S1));
+    fsub_S2     = gmx_simd_add_r(ctab0_S2, gmx_simd_mul_r(frac_S2, ctab1_S2));
+    fsub_S3     = gmx_simd_add_r(ctab0_S3, gmx_simd_mul_r(frac_S3, ctab1_S3));
+    frcoul_S0   = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, gmx_simd_mul_r(fsub_S0, r_S0)));
+    frcoul_S1   = gmx_simd_mul_r(qq_S1, gmx_simd_sub_r(rinv_ex_S1, gmx_simd_mul_r(fsub_S1, r_S1)));
+    frcoul_S2   = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, gmx_simd_mul_r(fsub_S2, r_S2)));
+    frcoul_S3   = gmx_simd_mul_r(qq_S3, gmx_simd_sub_r(rinv_ex_S3, gmx_simd_mul_r(fsub_S3, r_S3)));
  
  #ifdef CALC_ENERGIES
-    vc_sub_S0   = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
-    vc_sub_S1   = gmx_add_pr(ctabv_S1, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S1), gmx_add_pr(ctab0_S1, fsub_S1)));
-    vc_sub_S2   = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
-    vc_sub_S3   = gmx_add_pr(ctabv_S3, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S3), gmx_add_pr(ctab0_S3, fsub_S3)));
+    vc_sub_S0   = gmx_simd_add_r(ctabv_S0, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S0), gmx_simd_add_r(ctab0_S0, fsub_S0)));
+    vc_sub_S1   = gmx_simd_add_r(ctabv_S1, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S1), gmx_simd_add_r(ctab0_S1, fsub_S1)));
+    vc_sub_S2   = gmx_simd_add_r(ctabv_S2, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S2), gmx_simd_add_r(ctab0_S2, fsub_S2)));
+    vc_sub_S3   = gmx_simd_add_r(ctabv_S3, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S3), gmx_simd_add_r(ctab0_S3, fsub_S3)));
  #endif
  #endif /* CALC_COUL_TAB */
  
@@ -640,31 +640,31 @@
  #ifndef NO_SHIFT_EWALD
      /* Add Ewald potential shift to vc_sub for convenience */
  #ifdef CHECK_EXCLS
-    vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
-    vc_sub_S1   = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, interact_S1));
-    vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
-    vc_sub_S3   = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, interact_S3));
+    vc_sub_S0   = gmx_simd_add_r(vc_sub_S0, gmx_simd_blendzero_r(sh_ewald_S, interact_S0));
+    vc_sub_S1   = gmx_simd_add_r(vc_sub_S1, gmx_simd_blendzero_r(sh_ewald_S, interact_S1));
+    vc_sub_S2   = gmx_simd_add_r(vc_sub_S2, gmx_simd_blendzero_r(sh_ewald_S, interact_S2));
+    vc_sub_S3   = gmx_simd_add_r(vc_sub_S3, gmx_simd_blendzero_r(sh_ewald_S, interact_S3));
  #else
-    vc_sub_S0   = gmx_add_pr(vc_sub_S0, sh_ewald_S);
-    vc_sub_S1   = gmx_add_pr(vc_sub_S1, sh_ewald_S);
-    vc_sub_S2   = gmx_add_pr(vc_sub_S2, sh_ewald_S);
-    vc_sub_S3   = gmx_add_pr(vc_sub_S3, sh_ewald_S);
+    vc_sub_S0   = gmx_simd_add_r(vc_sub_S0, sh_ewald_S);
+    vc_sub_S1   = gmx_simd_add_r(vc_sub_S1, sh_ewald_S);
+    vc_sub_S2   = gmx_simd_add_r(vc_sub_S2, sh_ewald_S);
+    vc_sub_S3   = gmx_simd_add_r(vc_sub_S3, sh_ewald_S);
  #endif
  #endif
  
-    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
-    vcoul_S1    = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, vc_sub_S1));
-    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
-    vcoul_S3    = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, vc_sub_S3));
+    vcoul_S0    = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, vc_sub_S0));
+    vcoul_S1    = gmx_simd_mul_r(qq_S1, gmx_simd_sub_r(rinv_ex_S1, vc_sub_S1));
+    vcoul_S2    = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, vc_sub_S2));
+    vcoul_S3    = gmx_simd_mul_r(qq_S3, gmx_simd_sub_r(rinv_ex_S3, vc_sub_S3));
  
  #endif
  
  #ifdef CALC_ENERGIES
      /* Mask energy for cut-off and diagonal */
-    vcoul_S0    = gmx_blendzero_pr(vcoul_S0, wco_S0);
-    vcoul_S1    = gmx_blendzero_pr(vcoul_S1, wco_S1);
-    vcoul_S2    = gmx_blendzero_pr(vcoul_S2, wco_S2);
-    vcoul_S3    = gmx_blendzero_pr(vcoul_S3, wco_S3);
+    vcoul_S0    = gmx_simd_blendzero_r(vcoul_S0, wco_S0);
+    vcoul_S1    = gmx_simd_blendzero_r(vcoul_S1, wco_S1);
+    vcoul_S2    = gmx_simd_blendzero_r(vcoul_S2, wco_S2);
+    vcoul_S3    = gmx_simd_blendzero_r(vcoul_S3, wco_S3);
  #endif
  
  #endif /* CALC_COULOMB */
@@ -673,11 +673,11 @@
      /* Lennard-Jones interaction */
  
  #ifdef VDW_CUTOFF_CHECK
-    wco_vdw_S0  = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
-    wco_vdw_S1  = gmx_cmplt_pr(rsq_S1, rcvdw2_S);
+    wco_vdw_S0  = gmx_simd_cmplt_r(rsq_S0, rcvdw2_S);
+    wco_vdw_S1  = gmx_simd_cmplt_r(rsq_S1, rcvdw2_S);
  #ifndef HALF_LJ
-    wco_vdw_S2  = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
-    wco_vdw_S3  = gmx_cmplt_pr(rsq_S3, rcvdw2_S);
+    wco_vdw_S2  = gmx_simd_cmplt_r(rsq_S2, rcvdw2_S);
+    wco_vdw_S3  = gmx_simd_cmplt_r(rsq_S3, rcvdw2_S);
  #endif
  #else
      /* Same cut-off for Coulomb and VdW, reuse the registers */
@@ -688,114 +688,114 @@
  #endif
  
  #ifndef LJ_COMB_LB
-    rinvsix_S0  = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
-    rinvsix_S1  = gmx_mul_pr(rinvsq_S1, gmx_mul_pr(rinvsq_S1, rinvsq_S1));
+    rinvsix_S0  = gmx_simd_mul_r(rinvsq_S0, gmx_simd_mul_r(rinvsq_S0, rinvsq_S0));
+    rinvsix_S1  = gmx_simd_mul_r(rinvsq_S1, gmx_simd_mul_r(rinvsq_S1, rinvsq_S1));
  #ifdef EXCL_FORCES
-    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, interact_S0);
-    rinvsix_S1  = gmx_blendzero_pr(rinvsix_S1, interact_S1);
+    rinvsix_S0  = gmx_simd_blendzero_r(rinvsix_S0, interact_S0);
+    rinvsix_S1  = gmx_simd_blendzero_r(rinvsix_S1, interact_S1);
  #endif
  #ifndef HALF_LJ
-    rinvsix_S2  = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
-    rinvsix_S3  = gmx_mul_pr(rinvsq_S3, gmx_mul_pr(rinvsq_S3, rinvsq_S3));
+    rinvsix_S2  = gmx_simd_mul_r(rinvsq_S2, gmx_simd_mul_r(rinvsq_S2, rinvsq_S2));
+    rinvsix_S3  = gmx_simd_mul_r(rinvsq_S3, gmx_simd_mul_r(rinvsq_S3, rinvsq_S3));
  #ifdef EXCL_FORCES
-    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, interact_S2);
-    rinvsix_S3  = gmx_blendzero_pr(rinvsix_S3, interact_S3);
+    rinvsix_S2  = gmx_simd_blendzero_r(rinvsix_S2, interact_S2);
+    rinvsix_S3  = gmx_simd_blendzero_r(rinvsix_S3, interact_S3);
  #endif
  #endif
  #ifdef VDW_CUTOFF_CHECK
-    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
-    rinvsix_S1  = gmx_blendzero_pr(rinvsix_S1, wco_vdw_S1);
+    rinvsix_S0  = gmx_simd_blendzero_r(rinvsix_S0, wco_vdw_S0);
+    rinvsix_S1  = gmx_simd_blendzero_r(rinvsix_S1, wco_vdw_S1);
  #ifndef HALF_LJ
-    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
-    rinvsix_S3  = gmx_blendzero_pr(rinvsix_S3, wco_vdw_S3);
+    rinvsix_S2  = gmx_simd_blendzero_r(rinvsix_S2, wco_vdw_S2);
+    rinvsix_S3  = gmx_simd_blendzero_r(rinvsix_S3, wco_vdw_S3);
  #endif
  #endif
-    FrLJ6_S0    = gmx_mul_pr(c6_S0, rinvsix_S0);
-    FrLJ6_S1    = gmx_mul_pr(c6_S1, rinvsix_S1);
+    FrLJ6_S0    = gmx_simd_mul_r(c6_S0, rinvsix_S0);
+    FrLJ6_S1    = gmx_simd_mul_r(c6_S1, rinvsix_S1);
  #ifndef HALF_LJ
-    FrLJ6_S2    = gmx_mul_pr(c6_S2, rinvsix_S2);
-    FrLJ6_S3    = gmx_mul_pr(c6_S3, rinvsix_S3);
+    FrLJ6_S2    = gmx_simd_mul_r(c6_S2, rinvsix_S2);
+    FrLJ6_S3    = gmx_simd_mul_r(c6_S3, rinvsix_S3);
  #endif
-    FrLJ12_S0   = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
-    FrLJ12_S1   = gmx_mul_pr(c12_S1, gmx_mul_pr(rinvsix_S1, rinvsix_S1));
+    FrLJ12_S0   = gmx_simd_mul_r(c12_S0, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0));
+    FrLJ12_S1   = gmx_simd_mul_r(c12_S1, gmx_simd_mul_r(rinvsix_S1, rinvsix_S1));
  #ifndef HALF_LJ
-    FrLJ12_S2   = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
-    FrLJ12_S3   = gmx_mul_pr(c12_S3, gmx_mul_pr(rinvsix_S3, rinvsix_S3));
+    FrLJ12_S2   = gmx_simd_mul_r(c12_S2, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2));
+    FrLJ12_S3   = gmx_simd_mul_r(c12_S3, gmx_simd_mul_r(rinvsix_S3, rinvsix_S3));
  #endif
  #endif /* not LJ_COMB_LB */
  
  #ifdef LJ_COMB_LB
-    sir_S0      = gmx_mul_pr(sig_S0, rinv_S0);
-    sir_S1      = gmx_mul_pr(sig_S1, rinv_S1);
+    sir_S0      = gmx_simd_mul_r(sig_S0, rinv_S0);
+    sir_S1      = gmx_simd_mul_r(sig_S1, rinv_S1);
  #ifndef HALF_LJ
-    sir_S2      = gmx_mul_pr(sig_S2, rinv_S2);
-    sir_S3      = gmx_mul_pr(sig_S3, rinv_S3);
+    sir_S2      = gmx_simd_mul_r(sig_S2, rinv_S2);
+    sir_S3      = gmx_simd_mul_r(sig_S3, rinv_S3);
  #endif
-    sir2_S0     = gmx_mul_pr(sir_S0, sir_S0);
-    sir2_S1     = gmx_mul_pr(sir_S1, sir_S1);
+    sir2_S0     = gmx_simd_mul_r(sir_S0, sir_S0);
+    sir2_S1     = gmx_simd_mul_r(sir_S1, sir_S1);
  #ifndef HALF_LJ
-    sir2_S2     = gmx_mul_pr(sir_S2, sir_S2);
-    sir2_S3     = gmx_mul_pr(sir_S3, sir_S3);
+    sir2_S2     = gmx_simd_mul_r(sir_S2, sir_S2);
+    sir2_S3     = gmx_simd_mul_r(sir_S3, sir_S3);
  #endif
-    sir6_S0     = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
-    sir6_S1     = gmx_mul_pr(sir2_S1, gmx_mul_pr(sir2_S1, sir2_S1));
+    sir6_S0     = gmx_simd_mul_r(sir2_S0, gmx_simd_mul_r(sir2_S0, sir2_S0));
+    sir6_S1     = gmx_simd_mul_r(sir2_S1, gmx_simd_mul_r(sir2_S1, sir2_S1));
  #ifdef EXCL_FORCES
-    sir6_S0     = gmx_blendzero_pr(sir6_S0, interact_S0);
-    sir6_S1     = gmx_blendzero_pr(sir6_S1, interact_S1);
+    sir6_S0     = gmx_simd_blendzero_r(sir6_S0, interact_S0);
+    sir6_S1     = gmx_simd_blendzero_r(sir6_S1, interact_S1);
  #endif
  #ifndef HALF_LJ
-    sir6_S2     = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
-    sir6_S3     = gmx_mul_pr(sir2_S3, gmx_mul_pr(sir2_S3, sir2_S3));
+    sir6_S2     = gmx_simd_mul_r(sir2_S2, gmx_simd_mul_r(sir2_S2, sir2_S2));
+    sir6_S3     = gmx_simd_mul_r(sir2_S3, gmx_simd_mul_r(sir2_S3, sir2_S3));
  #ifdef EXCL_FORCES
-    sir6_S2     = gmx_blendzero_pr(sir6_S2, interact_S2);
-    sir6_S3     = gmx_blendzero_pr(sir6_S3, interact_S3);
+    sir6_S2     = gmx_simd_blendzero_r(sir6_S2, interact_S2);
+    sir6_S3     = gmx_simd_blendzero_r(sir6_S3, interact_S3);
  #endif
  #endif
  #ifdef VDW_CUTOFF_CHECK
-    sir6_S0     = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
-    sir6_S1     = gmx_blendzero_pr(sir6_S1, wco_vdw_S1);
+    sir6_S0     = gmx_simd_blendzero_r(sir6_S0, wco_vdw_S0);
+    sir6_S1     = gmx_simd_blendzero_r(sir6_S1, wco_vdw_S1);
  #ifndef HALF_LJ
-    sir6_S2     = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
-    sir6_S3     = gmx_blendzero_pr(sir6_S3, wco_vdw_S3);
+    sir6_S2     = gmx_simd_blendzero_r(sir6_S2, wco_vdw_S2);
+    sir6_S3     = gmx_simd_blendzero_r(sir6_S3, wco_vdw_S3);
  #endif
  #endif
-    FrLJ6_S0    = gmx_mul_pr(eps_S0, sir6_S0);
-    FrLJ6_S1    = gmx_mul_pr(eps_S1, sir6_S1);
+    FrLJ6_S0    = gmx_simd_mul_r(eps_S0, sir6_S0);
+    FrLJ6_S1    = gmx_simd_mul_r(eps_S1, sir6_S1);
  #ifndef HALF_LJ
-    FrLJ6_S2    = gmx_mul_pr(eps_S2, sir6_S2);
-    FrLJ6_S3    = gmx_mul_pr(eps_S3, sir6_S3);
+    FrLJ6_S2    = gmx_simd_mul_r(eps_S2, sir6_S2);
+    FrLJ6_S3    = gmx_simd_mul_r(eps_S3, sir6_S3);
  #endif
-    FrLJ12_S0   = gmx_mul_pr(FrLJ6_S0, sir6_S0);
-    FrLJ12_S1   = gmx_mul_pr(FrLJ6_S1, sir6_S1);
+    FrLJ12_S0   = gmx_simd_mul_r(FrLJ6_S0, sir6_S0);
+    FrLJ12_S1   = gmx_simd_mul_r(FrLJ6_S1, sir6_S1);
  #ifndef HALF_LJ
-    FrLJ12_S2   = gmx_mul_pr(FrLJ6_S2, sir6_S2);
-    FrLJ12_S3   = gmx_mul_pr(FrLJ6_S3, sir6_S3);
+    FrLJ12_S2   = gmx_simd_mul_r(FrLJ6_S2, sir6_S2);
+    FrLJ12_S3   = gmx_simd_mul_r(FrLJ6_S3, sir6_S3);
  #endif
  #if defined CALC_ENERGIES
      /* We need C6 and C12 to calculate the LJ potential shift */
-    sig2_S0     = gmx_mul_pr(sig_S0, sig_S0);
-    sig2_S1     = gmx_mul_pr(sig_S1, sig_S1);
+    sig2_S0     = gmx_simd_mul_r(sig_S0, sig_S0);
+    sig2_S1     = gmx_simd_mul_r(sig_S1, sig_S1);
  #ifndef HALF_LJ
-    sig2_S2     = gmx_mul_pr(sig_S2, sig_S2);
-    sig2_S3     = gmx_mul_pr(sig_S3, sig_S3);
+    sig2_S2     = gmx_simd_mul_r(sig_S2, sig_S2);
+    sig2_S3     = gmx_simd_mul_r(sig_S3, sig_S3);
  #endif
-    sig6_S0     = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
-    sig6_S1     = gmx_mul_pr(sig2_S1, gmx_mul_pr(sig2_S1, sig2_S1));
+    sig6_S0     = gmx_simd_mul_r(sig2_S0, gmx_simd_mul_r(sig2_S0, sig2_S0));
+    sig6_S1     = gmx_simd_mul_r(sig2_S1, gmx_simd_mul_r(sig2_S1, sig2_S1));
  #ifndef HALF_LJ
-    sig6_S2     = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
-    sig6_S3     = gmx_mul_pr(sig2_S3, gmx_mul_pr(sig2_S3, sig2_S3));
+    sig6_S2     = gmx_simd_mul_r(sig2_S2, gmx_simd_mul_r(sig2_S2, sig2_S2));
+    sig6_S3     = gmx_simd_mul_r(sig2_S3, gmx_simd_mul_r(sig2_S3, sig2_S3));
  #endif
-    c6_S0       = gmx_mul_pr(eps_S0, sig6_S0);
-    c6_S1       = gmx_mul_pr(eps_S1, sig6_S1);
+    c6_S0       = gmx_simd_mul_r(eps_S0, sig6_S0);
+    c6_S1       = gmx_simd_mul_r(eps_S1, sig6_S1);
  #ifndef HALF_LJ
-    c6_S2       = gmx_mul_pr(eps_S2, sig6_S2);
-    c6_S3       = gmx_mul_pr(eps_S3, sig6_S3);
+    c6_S2       = gmx_simd_mul_r(eps_S2, sig6_S2);
+    c6_S3       = gmx_simd_mul_r(eps_S3, sig6_S3);
  #endif
-    c12_S0      = gmx_mul_pr(c6_S0, sig6_S0);
-    c12_S1      = gmx_mul_pr(c6_S1, sig6_S1);
+    c12_S0      = gmx_simd_mul_r(c6_S0, sig6_S0);
+    c12_S1      = gmx_simd_mul_r(c6_S1, sig6_S1);
  #ifndef HALF_LJ
-    c12_S2      = gmx_mul_pr(c6_S2, sig6_S2);
-    c12_S3      = gmx_mul_pr(c6_S3, sig6_S3);
+    c12_S2      = gmx_simd_mul_r(c6_S2, sig6_S2);
+    c12_S3      = gmx_simd_mul_r(c6_S3, sig6_S3);
  #endif
  #endif
  #endif /* LJ_COMB_LB */
@@ -831,7 +831,7 @@
  
  #ifdef CALC_COULOMB
  #ifndef ENERGY_GROUPS
-    vctot_S      = gmx_add_pr(vctot_S, gmx_sum4_pr(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3));
+    vctot_S      = gmx_simd_add_r(vctot_S, gmx_simd_sum4_r(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3));
  #else
      add_ener_grp(vcoul_S0, vctp[0], egp_jj);
      add_ener_grp(vcoul_S1, vctp[1], egp_jj);
@@ -842,50 +842,50 @@
  
  #ifdef CALC_LJ
      /* Calculate the LJ energies */
-    VLJ6_S0     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
-    VLJ6_S1     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S1, gmx_mul_pr(c6_S1, sh_invrc6_S)));
+    VLJ6_S0     = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S0, gmx_simd_mul_r(c6_S0, sh_invrc6_S)));
+    VLJ6_S1     = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S1, gmx_simd_mul_r(c6_S1, sh_invrc6_S)));
  #ifndef HALF_LJ
-    VLJ6_S2     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
-    VLJ6_S3     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S3, gmx_mul_pr(c6_S3, sh_invrc6_S)));
+    VLJ6_S2     = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S2, gmx_simd_mul_r(c6_S2, sh_invrc6_S)));
+    VLJ6_S3     = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S3, gmx_simd_mul_r(c6_S3, sh_invrc6_S)));
  #endif
-    VLJ12_S0    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
-    VLJ12_S1    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S1, gmx_mul_pr(c12_S1, sh_invrc12_S)));
+    VLJ12_S0    = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S0, gmx_simd_mul_r(c12_S0, sh_invrc12_S)));
+    VLJ12_S1    = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S1, gmx_simd_mul_r(c12_S1, sh_invrc12_S)));
  #ifndef HALF_LJ
-    VLJ12_S2    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
-    VLJ12_S3    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S3, gmx_mul_pr(c12_S3, sh_invrc12_S)));
+    VLJ12_S2    = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S2, gmx_simd_mul_r(c12_S2, sh_invrc12_S)));
+    VLJ12_S3    = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S3, gmx_simd_mul_r(c12_S3, sh_invrc12_S)));
  #endif
  
-    VLJ_S0      = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
-    VLJ_S1      = gmx_sub_pr(VLJ12_S1, VLJ6_S1);
+    VLJ_S0      = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0);
+    VLJ_S1      = gmx_simd_sub_r(VLJ12_S1, VLJ6_S1);
  #ifndef HALF_LJ
-    VLJ_S2      = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
-    VLJ_S3      = gmx_sub_pr(VLJ12_S3, VLJ6_S3);
+    VLJ_S2      = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2);
+    VLJ_S3      = gmx_simd_sub_r(VLJ12_S3, VLJ6_S3);
  #endif
      /* The potential shift should be removed for pairs beyond cut-off */
-    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
-    VLJ_S1      = gmx_blendzero_pr(VLJ_S1, wco_vdw_S1);
+    VLJ_S0      = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0);
+    VLJ_S1      = gmx_simd_blendzero_r(VLJ_S1, wco_vdw_S1);
  #ifndef HALF_LJ
-    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
-    VLJ_S3      = gmx_blendzero_pr(VLJ_S3, wco_vdw_S3);
+    VLJ_S2      = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2);
+    VLJ_S3      = gmx_simd_blendzero_r(VLJ_S3, wco_vdw_S3);
  #endif
  #ifdef CHECK_EXCLS
      /* The potential shift should be removed for excluded pairs */
-    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, interact_S0);
-    VLJ_S1      = gmx_blendzero_pr(VLJ_S1, interact_S1);
+    VLJ_S0      = gmx_simd_blendzero_r(VLJ_S0, interact_S0);
+    VLJ_S1      = gmx_simd_blendzero_r(VLJ_S1, interact_S1);
  #ifndef HALF_LJ
-    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, interact_S2);
-    VLJ_S3      = gmx_blendzero_pr(VLJ_S3, interact_S3);
+    VLJ_S2      = gmx_simd_blendzero_r(VLJ_S2, interact_S2);
+    VLJ_S3      = gmx_simd_blendzero_r(VLJ_S3, interact_S3);
  #endif
  #endif
  #ifndef ENERGY_GROUPS
  #ifndef HALF_LJ
-    Vvdwtot_S   = gmx_add_pr(Vvdwtot_S,
-                             gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
-                             );
+    Vvdwtot_S   = gmx_simd_add_r(Vvdwtot_S,
+                                 gmx_simd_sum4_r(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
+                                 );
  #else
-    Vvdwtot_S   = gmx_add_pr(Vvdwtot_S,
-                             gmx_add_pr(VLJ_S0, VLJ_S1)
-                             );
+    Vvdwtot_S   = gmx_simd_add_r(Vvdwtot_S,
+                                 gmx_simd_add_r(VLJ_S0, VLJ_S1)
+                                 );
  #endif
  #else
      add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
@@ -900,87 +900,87 @@
  
  #ifdef CALC_LJ
  #ifdef CALC_COULOMB
-    fscal_S0    = gmx_mul_pr(rinvsq_S0,
-                             gmx_add_pr(frcoul_S0,
-                                        gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+    fscal_S0    = gmx_simd_mul_r(rinvsq_S0,
+                                 gmx_simd_add_r(frcoul_S0,
+                                                gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0)));
  #else
-    fscal_S0    = gmx_mul_pr(rinvsq_S0,
-                             (
-                                 gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+    fscal_S0    = gmx_simd_mul_r(rinvsq_S0,
+                                 (
+                                     gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0)));
  #endif
  #ifdef CALC_COULOMB
-    fscal_S1    = gmx_mul_pr(rinvsq_S1,
-                             gmx_add_pr(frcoul_S1,
-                                        gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
+    fscal_S1    = gmx_simd_mul_r(rinvsq_S1,
+                                 gmx_simd_add_r(frcoul_S1,
+                                                gmx_simd_sub_r(FrLJ12_S1, FrLJ6_S1)));
  #else
-    fscal_S1    = gmx_mul_pr(rinvsq_S1,
-                             (
-                                 gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
+    fscal_S1    = gmx_simd_mul_r(rinvsq_S1,
+                                 (
+                                     gmx_simd_sub_r(FrLJ12_S1, FrLJ6_S1)));
  #endif
  #else
-    fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
-    fscal_S1    = gmx_mul_pr(rinvsq_S1, frcoul_S1);
+    fscal_S0    = gmx_simd_mul_r(rinvsq_S0, frcoul_S0);
+    fscal_S1    = gmx_simd_mul_r(rinvsq_S1, frcoul_S1);
  #endif /* CALC_LJ */
  #if defined CALC_LJ && !defined HALF_LJ
  #ifdef CALC_COULOMB
-    fscal_S2    = gmx_mul_pr(rinvsq_S2,
-                             gmx_add_pr(frcoul_S2,
-                                        gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+    fscal_S2    = gmx_simd_mul_r(rinvsq_S2,
+                                 gmx_simd_add_r(frcoul_S2,
+                                                gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2)));
  #else
-    fscal_S2    = gmx_mul_pr(rinvsq_S2,
-                             (
-                                 gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+    fscal_S2    = gmx_simd_mul_r(rinvsq_S2,
+                                 (
+                                     gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2)));
  #endif
  #ifdef CALC_COULOMB
-    fscal_S3    = gmx_mul_pr(rinvsq_S3,
-                             gmx_add_pr(frcoul_S3,
-                                        gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
+    fscal_S3    = gmx_simd_mul_r(rinvsq_S3,
+                                 gmx_simd_add_r(frcoul_S3,
+                                                gmx_simd_sub_r(FrLJ12_S3, FrLJ6_S3)));
  #else
-    fscal_S3    = gmx_mul_pr(rinvsq_S3,
-                             (
-                                 gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
+    fscal_S3    = gmx_simd_mul_r(rinvsq_S3,
+                                 (
+                                     gmx_simd_sub_r(FrLJ12_S3, FrLJ6_S3)));
  #endif
  #else
      /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
-    fscal_S2    = gmx_mul_pr(rinvsq_S2, frcoul_S2);
-    fscal_S3    = gmx_mul_pr(rinvsq_S3, frcoul_S3);
+    fscal_S2    = gmx_simd_mul_r(rinvsq_S2, frcoul_S2);
+    fscal_S3    = gmx_simd_mul_r(rinvsq_S3, frcoul_S3);
  #endif
  
      /* Calculate temporary vectorial force */
-    tx_S0       = gmx_mul_pr(fscal_S0, dx_S0);
-    tx_S1       = gmx_mul_pr(fscal_S1, dx_S1);
-    tx_S2       = gmx_mul_pr(fscal_S2, dx_S2);
-    tx_S3       = gmx_mul_pr(fscal_S3, dx_S3);
-    ty_S0       = gmx_mul_pr(fscal_S0, dy_S0);
-    ty_S1       = gmx_mul_pr(fscal_S1, dy_S1);
-    ty_S2       = gmx_mul_pr(fscal_S2, dy_S2);
-    ty_S3       = gmx_mul_pr(fscal_S3, dy_S3);
-    tz_S0       = gmx_mul_pr(fscal_S0, dz_S0);
-    tz_S1       = gmx_mul_pr(fscal_S1, dz_S1);
-    tz_S2       = gmx_mul_pr(fscal_S2, dz_S2);
-    tz_S3       = gmx_mul_pr(fscal_S3, dz_S3);
+    tx_S0       = gmx_simd_mul_r(fscal_S0, dx_S0);
+    tx_S1       = gmx_simd_mul_r(fscal_S1, dx_S1);
+    tx_S2       = gmx_simd_mul_r(fscal_S2, dx_S2);
+    tx_S3       = gmx_simd_mul_r(fscal_S3, dx_S3);
+    ty_S0       = gmx_simd_mul_r(fscal_S0, dy_S0);
+    ty_S1       = gmx_simd_mul_r(fscal_S1, dy_S1);
+    ty_S2       = gmx_simd_mul_r(fscal_S2, dy_S2);
+    ty_S3       = gmx_simd_mul_r(fscal_S3, dy_S3);
+    tz_S0       = gmx_simd_mul_r(fscal_S0, dz_S0);
+    tz_S1       = gmx_simd_mul_r(fscal_S1, dz_S1);
+    tz_S2       = gmx_simd_mul_r(fscal_S2, dz_S2);
+    tz_S3       = gmx_simd_mul_r(fscal_S3, dz_S3);
  
      /* Increment i atom force */
-    fix_S0      = gmx_add_pr(fix_S0, tx_S0);
-    fix_S1      = gmx_add_pr(fix_S1, tx_S1);
-    fix_S2      = gmx_add_pr(fix_S2, tx_S2);
-    fix_S3      = gmx_add_pr(fix_S3, tx_S3);
-    fiy_S0      = gmx_add_pr(fiy_S0, ty_S0);
-    fiy_S1      = gmx_add_pr(fiy_S1, ty_S1);
-    fiy_S2      = gmx_add_pr(fiy_S2, ty_S2);
-    fiy_S3      = gmx_add_pr(fiy_S3, ty_S3);
-    fiz_S0      = gmx_add_pr(fiz_S0, tz_S0);
-    fiz_S1      = gmx_add_pr(fiz_S1, tz_S1);
-    fiz_S2      = gmx_add_pr(fiz_S2, tz_S2);
-    fiz_S3      = gmx_add_pr(fiz_S3, tz_S3);
+    fix_S0      = gmx_simd_add_r(fix_S0, tx_S0);
+    fix_S1      = gmx_simd_add_r(fix_S1, tx_S1);
+    fix_S2      = gmx_simd_add_r(fix_S2, tx_S2);
+    fix_S3      = gmx_simd_add_r(fix_S3, tx_S3);
+    fiy_S0      = gmx_simd_add_r(fiy_S0, ty_S0);
+    fiy_S1      = gmx_simd_add_r(fiy_S1, ty_S1);
+    fiy_S2      = gmx_simd_add_r(fiy_S2, ty_S2);
+    fiy_S3      = gmx_simd_add_r(fiy_S3, ty_S3);
+    fiz_S0      = gmx_simd_add_r(fiz_S0, tz_S0);
+    fiz_S1      = gmx_simd_add_r(fiz_S1, tz_S1);
+    fiz_S2      = gmx_simd_add_r(fiz_S2, tz_S2);
+    fiz_S3      = gmx_simd_add_r(fiz_S3, tz_S3);
  
      /* Decrement j atom force */
-    gmx_store_pr(f+ajx,
-                 gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_S0, tx_S1, tx_S2, tx_S3) ));
-    gmx_store_pr(f+ajy,
-                 gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_S0, ty_S1, ty_S2, ty_S3) ));
-    gmx_store_pr(f+ajz,
-                 gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_S0, tz_S1, tz_S2, tz_S3) ));
+    gmx_simd_store_r(f+ajx,
+                     gmx_simd_sub_r( gmx_simd_load_r(f+ajx), gmx_simd_sum4_r(tx_S0, tx_S1, tx_S2, tx_S3) ));
+    gmx_simd_store_r(f+ajy,
+                     gmx_simd_sub_r( gmx_simd_load_r(f+ajy), gmx_simd_sum4_r(ty_S0, ty_S1, ty_S2, ty_S3) ));
+    gmx_simd_store_r(f+ajz,
+                     gmx_simd_sub_r( gmx_simd_load_r(f+ajz), gmx_simd_sum4_r(tz_S0, tz_S1, tz_S2, tz_S3) ));
  }
  
  #undef  rinv_ex_S0
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h

index 3ccc2daef14c24acc23c2a0412c791ffea2db142..28792136627ec4179560b9aeea20c2b24445c5af 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h
@@ -59,119 +59,121 @@
      real       *vctp[UNROLLI];
  #endif
  
-    gmx_mm_pr  shX_S;
-    gmx_mm_pr  shY_S;
-    gmx_mm_pr  shZ_S;
-    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
-    gmx_mm_pr  ix_S1, iy_S1, iz_S1;
-    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
-    gmx_mm_pr  ix_S3, iy_S3, iz_S3;
-    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
-    gmx_mm_pr  fix_S1, fiy_S1, fiz_S1;
-    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
-    gmx_mm_pr  fix_S3, fiy_S3, fiz_S3;
+    gmx_simd_real_t  shX_S;
+    gmx_simd_real_t  shY_S;
+    gmx_simd_real_t  shZ_S;
+    gmx_simd_real_t  ix_S0, iy_S0, iz_S0;
+    gmx_simd_real_t  ix_S1, iy_S1, iz_S1;
+    gmx_simd_real_t  ix_S2, iy_S2, iz_S2;
+    gmx_simd_real_t  ix_S3, iy_S3, iz_S3;
+    gmx_simd_real_t  fix_S0, fiy_S0, fiz_S0;
+    gmx_simd_real_t  fix_S1, fiy_S1, fiz_S1;
+    gmx_simd_real_t  fix_S2, fiy_S2, fiz_S2;
+    gmx_simd_real_t  fix_S3, fiy_S3, fiz_S3;
  #if UNROLLJ >= 4
      /* We use an i-force SIMD register width of 4 */
      gmx_mm_pr4 fix_S, fiy_S, fiz_S;
  #else
      /* We use an i-force SIMD register width of 2 */
-    gmx_mm_pr  fix0_S, fiy0_S, fiz0_S;
-    gmx_mm_pr  fix2_S, fiy2_S, fiz2_S;
+    gmx_simd_real_t  fix0_S, fiy0_S, fiz0_S;
+    gmx_simd_real_t  fix2_S, fiy2_S, fiz2_S;
  #endif
  
-    gmx_mm_pr  diagonal_jmi_S;
+    gmx_simd_real_t  diagonal_jmi_S;
  #if UNROLLI == UNROLLJ
-    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
+    gmx_simd_bool_t  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
  #else
-    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
-    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
+    gmx_simd_bool_t  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
+    gmx_simd_bool_t  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
  #endif
  
-    unsigned      *exclusion_filter;
-    gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3;
+    unsigned            *exclusion_filter;
+    gmx_exclfilter       filter_S0, filter_S1, filter_S2, filter_S3;
  
-    gmx_mm_pr      zero_S = gmx_set1_pr(0.0);
+    gmx_simd_real_t      zero_S = gmx_simd_set1_r(0.0);
  
-    gmx_mm_pr      one_S  = gmx_set1_pr(1.0);
-    gmx_mm_pr      iq_S0  = gmx_setzero_pr();
-    gmx_mm_pr      iq_S1  = gmx_setzero_pr();
-    gmx_mm_pr      iq_S2  = gmx_setzero_pr();
-    gmx_mm_pr      iq_S3  = gmx_setzero_pr();
-    gmx_mm_pr      mrc_3_S;
+    gmx_simd_real_t      one_S  = gmx_simd_set1_r(1.0);
+    gmx_simd_real_t      iq_S0  = gmx_simd_setzero_r();
+    gmx_simd_real_t      iq_S1  = gmx_simd_setzero_r();
+    gmx_simd_real_t      iq_S2  = gmx_simd_setzero_r();
+    gmx_simd_real_t      iq_S3  = gmx_simd_setzero_r();
+    gmx_simd_real_t      mrc_3_S;
  #ifdef CALC_ENERGIES
-    gmx_mm_pr      hrc_3_S, moh_rc_S;
+    gmx_simd_real_t      hrc_3_S, moh_rc_S;
  #endif
  
  #ifdef CALC_COUL_TAB
      /* Coulomb table variables */
-    gmx_mm_pr   invtsp_S;
-    const real *tab_coul_F;
+    gmx_simd_real_t   invtsp_S;
+    const real       *tab_coul_F;
  #ifndef TAB_FDV0
-    const real *tab_coul_V;
+    const real       *tab_coul_V;
  #endif
      /* Thread-local working buffers for force and potential lookups */
-    int         ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0 = NULL;
-    int         ti1_array[2*GMX_SIMD_WIDTH_HERE], *ti1 = NULL;
-    int         ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2 = NULL;
-    int         ti3_array[2*GMX_SIMD_WIDTH_HERE], *ti3 = NULL;
+    int               ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL;
+    int               ti1_array[2*GMX_SIMD_REAL_WIDTH], *ti1 = NULL;
+    int               ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2 = NULL;
+    int               ti3_array[2*GMX_SIMD_REAL_WIDTH], *ti3 = NULL;
  #ifdef CALC_ENERGIES
-    gmx_mm_pr   mhalfsp_S;
+    gmx_simd_real_t   mhalfsp_S;
  #endif
  #endif
  
  #ifdef CALC_COUL_EWALD
-    gmx_mm_pr beta2_S, beta_S;
+    gmx_simd_real_t beta2_S, beta_S;
  #endif
  
  #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
-    gmx_mm_pr  sh_ewald_S;
+    gmx_simd_real_t  sh_ewald_S;
  #endif
  
  #ifdef LJ_COMB_LB
-    const real *ljc;
+    const real       *ljc;
  
-    gmx_mm_pr   hsig_i_S0, seps_i_S0;
-    gmx_mm_pr   hsig_i_S1, seps_i_S1;
-    gmx_mm_pr   hsig_i_S2, seps_i_S2;
-    gmx_mm_pr   hsig_i_S3, seps_i_S3;
+    gmx_simd_real_t   hsig_i_S0, seps_i_S0;
+    gmx_simd_real_t   hsig_i_S1, seps_i_S1;
+    gmx_simd_real_t   hsig_i_S2, seps_i_S2;
+    gmx_simd_real_t   hsig_i_S3, seps_i_S3;
  #else
  #ifdef FIX_LJ_C
-    real        pvdw_array[2*UNROLLI*UNROLLJ+3];
-    real       *pvdw_c6, *pvdw_c12;
-    gmx_mm_pr   c6_S0, c12_S0;
-    gmx_mm_pr   c6_S1, c12_S1;
-    gmx_mm_pr   c6_S2, c12_S2;
-    gmx_mm_pr   c6_S3, c12_S3;
+    real              pvdw_array[2*UNROLLI*UNROLLJ+3];
+    real             *pvdw_c6, *pvdw_c12;
+    gmx_simd_real_t   c6_S0, c12_S0;
+    gmx_simd_real_t   c6_S1, c12_S1;
+    gmx_simd_real_t   c6_S2, c12_S2;
+    gmx_simd_real_t   c6_S3, c12_S3;
  #endif
  
  #ifdef LJ_COMB_GEOM
-    const real *ljc;
+    const real       *ljc;
  
-    gmx_mm_pr   c6s_S0, c12s_S0;
-    gmx_mm_pr   c6s_S1, c12s_S1;
-    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
-    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
+    gmx_simd_real_t   c6s_S0, c12s_S0;
+    gmx_simd_real_t   c6s_S1, c12s_S1;
+    gmx_simd_real_t   c6s_S2  = gmx_simd_setzero_r();
+    gmx_simd_real_t   c12s_S2 = gmx_simd_setzero_r();
+    gmx_simd_real_t   c6s_S3  = gmx_simd_setzero_r();
+    gmx_simd_real_t   c12s_S3 = gmx_simd_setzero_r();
  #endif
  #endif /* LJ_COMB_LB */
  
-    gmx_mm_pr  vctot_S, Vvdwtot_S;
-    gmx_mm_pr  sixth_S, twelveth_S;
+    gmx_simd_real_t  vctot_S, Vvdwtot_S;
+    gmx_simd_real_t  sixth_S, twelveth_S;
  
-    gmx_mm_pr  avoid_sing_S;
-    gmx_mm_pr  rc2_S;
+    gmx_simd_real_t  avoid_sing_S;
+    gmx_simd_real_t  rc2_S;
  #ifdef VDW_CUTOFF_CHECK
-    gmx_mm_pr  rcvdw2_S;
+    gmx_simd_real_t  rcvdw2_S;
  #endif
  
  #ifdef CALC_ENERGIES
-    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
+    gmx_simd_real_t  sh_invrc6_S, sh_invrc12_S;
  
      /* cppcheck-suppress unassignedVariable */
-    real       tmpsum_array[GMX_SIMD_WIDTH_HERE*2], *tmpsum;
+    real       tmpsum_array[GMX_SIMD_REAL_WIDTH*2], *tmpsum;
  #endif
  #ifdef CALC_SHIFTFORCES
      /* cppcheck-suppress unassignedVariable */
-    real       shf_array[GMX_SIMD_WIDTH_HERE*2], *shf;
+    real       shf_array[GMX_SIMD_REAL_WIDTH*2], *shf;
  #endif
  
      int ninner;
@@ -188,39 +190,39 @@
  #endif
  
      /* Load j-i for the first i */
-    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i);
+    diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i);
      /* Generate all the diagonal masks as comparison results */
  #if UNROLLI == UNROLLJ
-    diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask_S1  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask_S3  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+    diagonal_mask_S0  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask_S1  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask_S2  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask_S3  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
  #else
  #if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
-    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
+    diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask0_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask0_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
  
  #if UNROLLI == 2*UNROLLJ
      /* Load j-i for the second half of the j-cluster */
-    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
+    diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
  #endif
  
-    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
-    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
-    diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+    diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask1_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
+    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
+    diagonal_mask1_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
  #endif
  #endif
  
@@ -251,9 +253,9 @@
      ti2 = prepare_table_load_buffer(ti2_array);
      ti3 = prepare_table_load_buffer(ti3_array);
  
-    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
+    invtsp_S  = gmx_simd_set1_r(ic->tabq_scale);
  #ifdef CALC_ENERGIES
-    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
+    mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale);
  #endif
  
  #ifdef TAB_FDV0
@@ -265,12 +267,12 @@
  #endif /* CALC_COUL_TAB */
  
  #ifdef CALC_COUL_EWALD
-    beta2_S = gmx_set1_pr(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
-    beta_S  = gmx_set1_pr(ic->ewaldcoeff_q);
+    beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
+    beta_S  = gmx_simd_set1_r(ic->ewaldcoeff_q);
  #endif
  
  #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
-    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
+    sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald);
  #endif
  
      q                   = nbat->q;
@@ -279,39 +281,39 @@
      shiftvec            = shift_vec[0];
      x                   = nbat->x;
  
-    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+    avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC);
  
      /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
-    rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+    rc2_S    = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb);
  #ifdef VDW_CUTOFF_CHECK
-    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
+    rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw);
  #endif
  
  #ifdef CALC_ENERGIES
-    sixth_S      = gmx_set1_pr(1.0/6.0);
-    twelveth_S   = gmx_set1_pr(1.0/12.0);
+    sixth_S      = gmx_simd_set1_r(1.0/6.0);
+    twelveth_S   = gmx_simd_set1_r(1.0/12.0);
  
-    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
-    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+    sh_invrc6_S  = gmx_simd_set1_r(ic->sh_invrc6);
+    sh_invrc12_S = gmx_simd_set1_r(ic->sh_invrc6*ic->sh_invrc6);
  #endif
  
-    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
+    mrc_3_S  = gmx_simd_set1_r(-2*ic->k_rf);
  
  #ifdef CALC_ENERGIES
-    hrc_3_S  = gmx_set1_pr(ic->k_rf);
+    hrc_3_S  = gmx_simd_set1_r(ic->k_rf);
  
-    moh_rc_S = gmx_set1_pr(-ic->c_rf);
+    moh_rc_S = gmx_simd_set1_r(-ic->c_rf);
  #endif
  
  #ifdef CALC_ENERGIES
-    tmpsum   = gmx_simd_align_real(tmpsum_array);
+    tmpsum   = gmx_simd_align_r(tmpsum_array);
  #endif
  #ifdef CALC_SHIFTFORCES
-    shf      = gmx_simd_align_real(shf_array);
+    shf      = gmx_simd_align_r(shf_array);
  #endif
  
  #ifdef FIX_LJ_C
-    pvdw_c6  = gmx_simd_align_real(pvdw_array+3);
+    pvdw_c6  = gmx_simd_align_r(pvdw_array+3);
      pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
  
      for (jp = 0; jp < UNROLLJ; jp++)
@@ -326,15 +328,15 @@
          pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
          pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
      }
-    c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
-    c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
-    c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
-    c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
-
-    c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
-    c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
-    c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
-    c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+    c6_S0            = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ);
+    c6_S1            = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ);
+    c6_S2            = gmx_simd_load_r(pvdw_c6 +2*UNROLLJ);
+    c6_S3            = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ);
+
+    c12_S0           = gmx_simd_load_r(pvdw_c12+0*UNROLLJ);
+    c12_S1           = gmx_simd_load_r(pvdw_c12+1*UNROLLJ);
+    c12_S2           = gmx_simd_load_r(pvdw_c12+2*UNROLLJ);
+    c12_S3           = gmx_simd_load_r(pvdw_c12+3*UNROLLJ);
  #endif /* FIX_LJ_C */
  
  #ifdef ENERGY_GROUPS
@@ -361,9 +363,9 @@
          ci               = nbln->ci;
          ci_sh            = (ish == CENTRAL ? ci : -1);
  
-        shX_S = gmx_load1_pr(shiftvec+ish3);
-        shY_S = gmx_load1_pr(shiftvec+ish3+1);
-        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
+        shX_S = gmx_simd_load1_r(shiftvec+ish3);
+        shY_S = gmx_simd_load1_r(shiftvec+ish3+1);
+        shZ_S = gmx_simd_load1_r(shiftvec+ish3+2);
  
  #if UNROLLJ <= 4
          sci              = ci*STRIDE;
@@ -446,51 +448,51 @@
          /* Load i atom data */
          sciy             = scix + STRIDE;
          sciz             = sciy + STRIDE;
-        ix_S0            = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
-        ix_S1            = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
-        ix_S2            = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
-        ix_S3            = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
-        iy_S0            = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
-        iy_S1            = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
-        iy_S2            = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
-        iy_S3            = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
-        iz_S0            = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
-        iz_S1            = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
-        iz_S2            = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
-        iz_S3            = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
+        ix_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+scix), shX_S);
+        ix_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+1), shX_S);
+        ix_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+2), shX_S);
+        ix_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+3), shX_S);
+        iy_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy), shY_S);
+        iy_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+1), shY_S);
+        iy_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+2), shY_S);
+        iy_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+3), shY_S);
+        iz_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz), shZ_S);
+        iz_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+1), shZ_S);
+        iz_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+2), shZ_S);
+        iz_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+3), shZ_S);
  
          if (do_coul)
          {
-            iq_S0      = gmx_set1_pr(facel*q[sci]);
-            iq_S1      = gmx_set1_pr(facel*q[sci+1]);
-            iq_S2      = gmx_set1_pr(facel*q[sci+2]);
-            iq_S3      = gmx_set1_pr(facel*q[sci+3]);
+            iq_S0      = gmx_simd_set1_r(facel*q[sci]);
+            iq_S1      = gmx_simd_set1_r(facel*q[sci+1]);
+            iq_S2      = gmx_simd_set1_r(facel*q[sci+2]);
+            iq_S3      = gmx_simd_set1_r(facel*q[sci+3]);
          }
  
  #ifdef LJ_COMB_LB
-        hsig_i_S0      = gmx_load1_pr(ljc+sci2+0);
-        hsig_i_S1      = gmx_load1_pr(ljc+sci2+1);
-        hsig_i_S2      = gmx_load1_pr(ljc+sci2+2);
-        hsig_i_S3      = gmx_load1_pr(ljc+sci2+3);
-        seps_i_S0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
-        seps_i_S1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
-        seps_i_S2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
-        seps_i_S3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
+        hsig_i_S0      = gmx_simd_load1_r(ljc+sci2+0);
+        hsig_i_S1      = gmx_simd_load1_r(ljc+sci2+1);
+        hsig_i_S2      = gmx_simd_load1_r(ljc+sci2+2);
+        hsig_i_S3      = gmx_simd_load1_r(ljc+sci2+3);
+        seps_i_S0      = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
+        seps_i_S1      = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
+        seps_i_S2      = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
+        seps_i_S3      = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
  #else
  #ifdef LJ_COMB_GEOM
-        c6s_S0         = gmx_load1_pr(ljc+sci2+0);
-        c6s_S1         = gmx_load1_pr(ljc+sci2+1);
+        c6s_S0         = gmx_simd_load1_r(ljc+sci2+0);
+        c6s_S1         = gmx_simd_load1_r(ljc+sci2+1);
          if (!half_LJ)
          {
-            c6s_S2     = gmx_load1_pr(ljc+sci2+2);
-            c6s_S3     = gmx_load1_pr(ljc+sci2+3);
+            c6s_S2     = gmx_simd_load1_r(ljc+sci2+2);
+            c6s_S3     = gmx_simd_load1_r(ljc+sci2+3);
          }
-        c12s_S0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
-        c12s_S1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
+        c12s_S0        = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
+        c12s_S1        = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
          if (!half_LJ)
          {
-            c12s_S2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
-            c12s_S3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
+            c12s_S2    = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
+            c12s_S3    = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
          }
  #else
          nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
@@ -504,22 +506,22 @@
  #endif
  
          /* Zero the potential energy for this list */
-        Vvdwtot_S        = gmx_setzero_pr();
-        vctot_S          = gmx_setzero_pr();
+        Vvdwtot_S        = gmx_simd_setzero_r();
+        vctot_S          = gmx_simd_setzero_r();
  
          /* Clear i atom forces */
-        fix_S0           = gmx_setzero_pr();
-        fix_S1           = gmx_setzero_pr();
-        fix_S2           = gmx_setzero_pr();
-        fix_S3           = gmx_setzero_pr();
-        fiy_S0           = gmx_setzero_pr();
-        fiy_S1           = gmx_setzero_pr();
-        fiy_S2           = gmx_setzero_pr();
-        fiy_S3           = gmx_setzero_pr();
-        fiz_S0           = gmx_setzero_pr();
-        fiz_S1           = gmx_setzero_pr();
-        fiz_S2           = gmx_setzero_pr();
-        fiz_S3           = gmx_setzero_pr();
+        fix_S0           = gmx_simd_setzero_r();
+        fix_S1           = gmx_simd_setzero_r();
+        fix_S2           = gmx_simd_setzero_r();
+        fix_S3           = gmx_simd_setzero_r();
+        fiy_S0           = gmx_simd_setzero_r();
+        fiy_S1           = gmx_simd_setzero_r();
+        fiy_S2           = gmx_simd_setzero_r();
+        fiy_S3           = gmx_simd_setzero_r();
+        fiz_S0           = gmx_simd_setzero_r();
+        fiz_S1           = gmx_simd_setzero_r();
+        fiz_S2           = gmx_simd_setzero_r();
+        fiz_S3           = gmx_simd_setzero_r();
  
          cjind = cjind0;
  
@@ -594,24 +596,24 @@
  #endif
  #else
          fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
-        gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix)));
+        gmx_simd_store_r(f+scix, gmx_simd_add_r(fix0_S, gmx_simd_load_r(f+scix)));
          fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
-        gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2)));
+        gmx_simd_store_r(f+scix+2, gmx_simd_add_r(fix2_S, gmx_simd_load_r(f+scix+2)));
  
          fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
-        gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy)));
+        gmx_simd_store_r(f+sciy, gmx_simd_add_r(fiy0_S, gmx_simd_load_r(f+sciy)));
          fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
-        gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2)));
+        gmx_simd_store_r(f+sciy+2, gmx_simd_add_r(fiy2_S, gmx_simd_load_r(f+sciy+2)));
  
          fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
-        gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz)));
+        gmx_simd_store_r(f+sciz, gmx_simd_add_r(fiz0_S, gmx_simd_load_r(f+sciz)));
          fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
-        gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2)));
+        gmx_simd_store_r(f+sciz+2, gmx_simd_add_r(fiz2_S, gmx_simd_load_r(f+sciz+2)));
  
  #ifdef CALC_SHIFTFORCES
-        fshift[ish3+0] += gmx_sum_simd2(gmx_add_pr(fix0_S, fix2_S), shf);
-        fshift[ish3+1] += gmx_sum_simd2(gmx_add_pr(fiy0_S, fiy2_S), shf);
-        fshift[ish3+2] += gmx_sum_simd2(gmx_add_pr(fiz0_S, fiz2_S), shf);
+        fshift[ish3+0] += gmx_sum_simd2(gmx_simd_add_r(fix0_S, fix2_S), shf);
+        fshift[ish3+1] += gmx_sum_simd2(gmx_simd_add_r(fiy0_S, fiy2_S), shf);
+        fshift[ish3+2] += gmx_sum_simd2(gmx_simd_add_r(fiz0_S, fiz2_S), shf);
  #endif
  #endif
  
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c

index 1813a9c43c5937a5a979cdca24ec5eb54f3dfad4..457db913939dfb617239c95e2f79861b8ad472ca 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -105,17 +105,17 @@
  #define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
  
  /* The j-cluster size is matched to the SIMD width */
-#if GMX_SIMD_WIDTH_HERE == 2
+#if GMX_SIMD_REAL_WIDTH == 2
  #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
  #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
  #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
  #else
-#if GMX_SIMD_WIDTH_HERE == 4
+#if GMX_SIMD_REAL_WIDTH == 4
  #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
  #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
  #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
  #else
-#if GMX_SIMD_WIDTH_HERE == 8
+#if GMX_SIMD_REAL_WIDTH == 8
  #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
  #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
  #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
@@ -124,7 +124,7 @@
  #define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
  #define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
  #else
-#if GMX_SIMD_WIDTH_HERE == 16
+#if GMX_SIMD_REAL_WIDTH == 16
  #define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
  #define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
  #define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
@@ -258,7 +258,7 @@ int nbnxn_kernel_to_cj_size(int nb_kernel_type)
      int cj_size          = 0;
  
  #ifdef GMX_NBNXN_SIMD
-    nbnxn_simd_width = GMX_SIMD_WIDTH_HERE;
+    nbnxn_simd_width = GMX_SIMD_REAL_WIDTH;
  #endif
  
      switch (nb_kernel_type)
@@ -808,20 +808,20 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x,
           * so we don't need to treat special cases in the rest of the code.
           */
  #ifdef NBNXN_SEARCH_BB_SIMD4
-        gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0]));
-        gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0]));
+        gmx_simd4_store_r(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0]));
+        gmx_simd4_store_r(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0]));
  #else
          bbj[1] = bbj[0];
  #endif
      }
  
  #ifdef NBNXN_SEARCH_BB_SIMD4
-    gmx_simd4_store_pr(&bb->lower[0],
-                       gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bbj[0].lower[0]),
-                                        gmx_simd4_load_bb_pr(&bbj[1].lower[0])));
-    gmx_simd4_store_pr(&bb->upper[0],
-                       gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bbj[0].upper[0]),
-                                        gmx_simd4_load_bb_pr(&bbj[1].upper[0])));
+    gmx_simd4_store_r(&bb->lower[0],
+                      gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bbj[0].lower[0]),
+                                      gmx_simd4_load_bb_pr(&bbj[1].lower[0])));
+    gmx_simd4_store_r(&bb->upper[0],
+                      gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bbj[0].upper[0]),
+                                      gmx_simd4_load_bb_pr(&bbj[1].upper[0])));
  #else
      {
          int i;
@@ -877,10 +877,10 @@ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
  /* Coordinate order xyz?, bb order xyz0 */
  static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
  {
-    gmx_simd4_pr bb_0_S, bb_1_S;
-    gmx_simd4_pr x_S;
+    gmx_simd4_real_t bb_0_S, bb_1_S;
+    gmx_simd4_real_t x_S;
  
-    int          i;
+    int              i;
  
      bb_0_S = gmx_simd4_load_bb_pr(x);
      bb_1_S = bb_0_S;
@@ -888,12 +888,12 @@ static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
      for (i = 1; i < na; i++)
      {
          x_S    = gmx_simd4_load_bb_pr(x+i*NNBSBB_C);
-        bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S);
-        bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S);
+        bb_0_S = gmx_simd4_min_r(bb_0_S, x_S);
+        bb_1_S = gmx_simd4_max_r(bb_1_S, x_S);
      }
  
-    gmx_simd4_store_pr(&bb->lower[0], bb_0_S);
-    gmx_simd4_store_pr(&bb->upper[0], bb_1_S);
+    gmx_simd4_store_r(&bb->lower[0], bb_0_S);
+    gmx_simd4_store_r(&bb->upper[0], bb_1_S);
  }
  
  /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
@@ -928,14 +928,14 @@ static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
          for (c2 = sc2; c2 < sc2+nc2; c2++)
          {
  #ifdef NBNXN_SEARCH_BB_SIMD4
-            gmx_simd4_pr min_S, max_S;
-
-            min_S = gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]),
-                                     gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0]));
-            max_S = gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]),
-                                     gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0]));
-            gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S);
-            gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S);
+            gmx_simd4_real_t min_S, max_S;
+
+            min_S = gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]),
+                                    gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0]));
+            max_S = gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]),
+                                    gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0]));
+            gmx_simd4_store_r(&grid->bbj[c2].lower[0], min_S);
+            gmx_simd4_store_r(&grid->bbj[c2].upper[0], max_S);
  #else
              for (j = 0; j < NNBSBB_C; j++)
              {
@@ -1156,7 +1156,7 @@ void fill_cell(const nbnxn_search_t nbs,
          offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
          bb_ptr = grid->bb + offset;
  
-#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
+#if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2
          if (2*grid->na_cj == grid->na_c)
          {
              calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
@@ -2075,40 +2075,40 @@ static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
  static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
                                   int csj, const nbnxn_bb_t *bb_j_all)
  {
-    gmx_simd4_pr bb_i_S0, bb_i_S1;
-    gmx_simd4_pr bb_j_S0, bb_j_S1;
-    gmx_simd4_pr dl_S;
-    gmx_simd4_pr dh_S;
-    gmx_simd4_pr dm_S;
-    gmx_simd4_pr dm0_S;
+    gmx_simd4_real_t bb_i_S0, bb_i_S1;
+    gmx_simd4_real_t bb_j_S0, bb_j_S1;
+    gmx_simd4_real_t dl_S;
+    gmx_simd4_real_t dh_S;
+    gmx_simd4_real_t dm_S;
+    gmx_simd4_real_t dm0_S;
  
      bb_i_S0 = gmx_simd4_load_bb_pr(&bb_i_ci[si].lower[0]);
      bb_i_S1 = gmx_simd4_load_bb_pr(&bb_i_ci[si].upper[0]);
      bb_j_S0 = gmx_simd4_load_bb_pr(&bb_j_all[csj].lower[0]);
      bb_j_S1 = gmx_simd4_load_bb_pr(&bb_j_all[csj].upper[0]);
  
-    dl_S    = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1);
-    dh_S    = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1);
+    dl_S    = gmx_simd4_sub_r(bb_i_S0, bb_j_S1);
+    dh_S    = gmx_simd4_sub_r(bb_j_S0, bb_i_S1);
  
-    dm_S    = gmx_simd4_max_pr(dl_S, dh_S);
-    dm0_S   = gmx_simd4_max_pr(dm_S, gmx_simd4_setzero_pr());
+    dm_S    = gmx_simd4_max_r(dl_S, dh_S);
+    dm0_S   = gmx_simd4_max_r(dm_S, gmx_simd4_setzero_r());
  
-    return gmx_simd4_dotproduct3(dm0_S, dm0_S);
+    return gmx_simd4_dotproduct3_r(dm0_S, dm0_S);
  }
  
  /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
  #define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
      {                                                \
-        int          shi;                                  \
+        int              shi;                                  \
                                                   \
-        gmx_simd4_pr dx_0, dy_0, dz_0;                       \
-        gmx_simd4_pr dx_1, dy_1, dz_1;                       \
+        gmx_simd4_real_t dx_0, dy_0, dz_0;                       \
+        gmx_simd4_real_t dx_1, dy_1, dz_1;                       \
                                                   \
-        gmx_simd4_pr mx, my, mz;                             \
-        gmx_simd4_pr m0x, m0y, m0z;                          \
+        gmx_simd4_real_t mx, my, mz;                             \
+        gmx_simd4_real_t m0x, m0y, m0z;                          \
                                                   \
-        gmx_simd4_pr d2x, d2y, d2z;                          \
-        gmx_simd4_pr d2s, d2t;                              \
+        gmx_simd4_real_t d2x, d2y, d2z;                          \
+        gmx_simd4_real_t d2s, d2t;                              \
                                                   \
          shi = si*NNBSBB_D*DIM;                       \
                                                   \
@@ -2119,30 +2119,30 @@ static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
          yi_h = gmx_simd4_load_bb_pr(bb_i+shi+4*STRIDE_PBB);   \
          zi_h = gmx_simd4_load_bb_pr(bb_i+shi+5*STRIDE_PBB);   \
                                                   \
-        dx_0 = gmx_simd4_sub_pr(xi_l, xj_h);                \
-        dy_0 = gmx_simd4_sub_pr(yi_l, yj_h);                \
-        dz_0 = gmx_simd4_sub_pr(zi_l, zj_h);                \
+        dx_0 = gmx_simd4_sub_r(xi_l, xj_h);                \
+        dy_0 = gmx_simd4_sub_r(yi_l, yj_h);                \
+        dz_0 = gmx_simd4_sub_r(zi_l, zj_h);                \
                                                   \
-        dx_1 = gmx_simd4_sub_pr(xj_l, xi_h);                \
-        dy_1 = gmx_simd4_sub_pr(yj_l, yi_h);                \
-        dz_1 = gmx_simd4_sub_pr(zj_l, zi_h);                \
+        dx_1 = gmx_simd4_sub_r(xj_l, xi_h);                \
+        dy_1 = gmx_simd4_sub_r(yj_l, yi_h);                \
+        dz_1 = gmx_simd4_sub_r(zj_l, zi_h);                \
                                                   \
-        mx   = gmx_simd4_max_pr(dx_0, dx_1);                \
-        my   = gmx_simd4_max_pr(dy_0, dy_1);                \
-        mz   = gmx_simd4_max_pr(dz_0, dz_1);                \
+        mx   = gmx_simd4_max_r(dx_0, dx_1);                \
+        my   = gmx_simd4_max_r(dy_0, dy_1);                \
+        mz   = gmx_simd4_max_r(dz_0, dz_1);                \
                                                   \
-        m0x  = gmx_simd4_max_pr(mx, zero);                  \
-        m0y  = gmx_simd4_max_pr(my, zero);                  \
-        m0z  = gmx_simd4_max_pr(mz, zero);                  \
+        m0x  = gmx_simd4_max_r(mx, zero);                  \
+        m0y  = gmx_simd4_max_r(my, zero);                  \
+        m0z  = gmx_simd4_max_r(mz, zero);                  \
                                                   \
-        d2x  = gmx_simd4_mul_pr(m0x, m0x);                  \
-        d2y  = gmx_simd4_mul_pr(m0y, m0y);                  \
-        d2z  = gmx_simd4_mul_pr(m0z, m0z);                  \
+        d2x  = gmx_simd4_mul_r(m0x, m0x);                  \
+        d2y  = gmx_simd4_mul_r(m0y, m0y);                  \
+        d2z  = gmx_simd4_mul_r(m0z, m0z);                  \
                                                   \
-        d2s  = gmx_simd4_add_pr(d2x, d2y);                  \
-        d2t  = gmx_simd4_add_pr(d2s, d2z);                  \
+        d2s  = gmx_simd4_add_r(d2x, d2y);                  \
+        d2t  = gmx_simd4_add_r(d2s, d2z);                  \
                                                   \
-        gmx_simd4_store_pr(d2+si, d2t);                     \
+        gmx_simd4_store_r(d2+si, d2t);                     \
      }
  
  /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
@@ -2150,21 +2150,21 @@ static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
                                       int nsi, const float *bb_i,
                                       float *d2)
  {
-    gmx_simd4_pr xj_l, yj_l, zj_l;
-    gmx_simd4_pr xj_h, yj_h, zj_h;
-    gmx_simd4_pr xi_l, yi_l, zi_l;
-    gmx_simd4_pr xi_h, yi_h, zi_h;
+    gmx_simd4_real_t xj_l, yj_l, zj_l;
+    gmx_simd4_real_t xj_h, yj_h, zj_h;
+    gmx_simd4_real_t xi_l, yi_l, zi_l;
+    gmx_simd4_real_t xi_h, yi_h, zi_h;
  
-    gmx_simd4_pr zero;
+    gmx_simd4_real_t zero;
  
-    zero = gmx_simd4_setzero_pr();
+    zero = gmx_simd4_setzero_r();
  
-    xj_l = gmx_simd4_set1_pr(bb_j[0*STRIDE_PBB]);
-    yj_l = gmx_simd4_set1_pr(bb_j[1*STRIDE_PBB]);
-    zj_l = gmx_simd4_set1_pr(bb_j[2*STRIDE_PBB]);
-    xj_h = gmx_simd4_set1_pr(bb_j[3*STRIDE_PBB]);
-    yj_h = gmx_simd4_set1_pr(bb_j[4*STRIDE_PBB]);
-    zj_h = gmx_simd4_set1_pr(bb_j[5*STRIDE_PBB]);
+    xj_l = gmx_simd4_set1_r(bb_j[0*STRIDE_PBB]);
+    yj_l = gmx_simd4_set1_r(bb_j[1*STRIDE_PBB]);
+    zj_l = gmx_simd4_set1_r(bb_j[2*STRIDE_PBB]);
+    xj_h = gmx_simd4_set1_r(bb_j[3*STRIDE_PBB]);
+    yj_h = gmx_simd4_set1_r(bb_j[4*STRIDE_PBB]);
+    zj_h = gmx_simd4_set1_r(bb_j[5*STRIDE_PBB]);
  
      /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
       * But as we know the number of iterations is 1 or 2, we unroll manually.
@@ -2214,10 +2214,10 @@ static gmx_bool subc_in_range_x(int na_c,
  /* When we make seperate single/double precision SIMD vector operation
   * include files, this function should be moved there (also using FMA).
   */
-static inline gmx_simd4_pr
-gmx_simd4_calc_rsq_pr(gmx_simd4_pr x, gmx_simd4_pr y, gmx_simd4_pr z)
+static inline gmx_simd4_real_t
+gmx_simd4_calc_rsq_r(gmx_simd4_real_t x, gmx_simd4_real_t y, gmx_simd4_real_t z)
  {
-    return gmx_simd4_add_pr( gmx_simd4_add_pr( gmx_simd4_mul_pr(x, x), gmx_simd4_mul_pr(y, y) ), gmx_simd4_mul_pr(z, z) );
+    return gmx_simd4_add_r( gmx_simd4_add_r( gmx_simd4_mul_r(x, x), gmx_simd4_mul_r(y, y) ), gmx_simd4_mul_r(z, z) );
  }
  
  /* 4-wide SIMD function which determines if any atom pair between two cells,
@@ -2229,15 +2229,15 @@ static gmx_bool subc_in_range_simd4(int na_c,
                                      int csj, int stride, const real *x_j,
                                      real rl2)
  {
-    gmx_simd4_pr ix_S0, iy_S0, iz_S0;
-    gmx_simd4_pr ix_S1, iy_S1, iz_S1;
+    gmx_simd4_real_t ix_S0, iy_S0, iz_S0;
+    gmx_simd4_real_t ix_S1, iy_S1, iz_S1;
  
-    gmx_simd4_pr rc2_S;
+    gmx_simd4_real_t rc2_S;
  
-    int          dim_stride;
-    int          j0, j1;
+    int              dim_stride;
+    int              j0, j1;
  
-    rc2_S   = gmx_simd4_set1_pr(rl2);
+    rc2_S   = gmx_simd4_set1_r(rl2);
  
      dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM;
      ix_S0      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+0)*STRIDE_PBB);
@@ -2254,63 +2254,63 @@ static gmx_bool subc_in_range_simd4(int na_c,
      j1 = j0 + na_c - 1;
      while (j0 < j1)
      {
-        gmx_simd4_pr jx0_S, jy0_S, jz0_S;
-        gmx_simd4_pr jx1_S, jy1_S, jz1_S;
+        gmx_simd4_real_t jx0_S, jy0_S, jz0_S;
+        gmx_simd4_real_t jx1_S, jy1_S, jz1_S;
  
-        gmx_simd4_pr dx_S0, dy_S0, dz_S0;
-        gmx_simd4_pr dx_S1, dy_S1, dz_S1;
-        gmx_simd4_pr dx_S2, dy_S2, dz_S2;
-        gmx_simd4_pr dx_S3, dy_S3, dz_S3;
+        gmx_simd4_real_t dx_S0, dy_S0, dz_S0;
+        gmx_simd4_real_t dx_S1, dy_S1, dz_S1;
+        gmx_simd4_real_t dx_S2, dy_S2, dz_S2;
+        gmx_simd4_real_t dx_S3, dy_S3, dz_S3;
  
-        gmx_simd4_pr rsq_S0;
-        gmx_simd4_pr rsq_S1;
-        gmx_simd4_pr rsq_S2;
-        gmx_simd4_pr rsq_S3;
+        gmx_simd4_real_t rsq_S0;
+        gmx_simd4_real_t rsq_S1;
+        gmx_simd4_real_t rsq_S2;
+        gmx_simd4_real_t rsq_S3;
  
-        gmx_simd4_pb wco_S0;
-        gmx_simd4_pb wco_S1;
-        gmx_simd4_pb wco_S2;
-        gmx_simd4_pb wco_S3;
-        gmx_simd4_pb wco_any_S01, wco_any_S23, wco_any_S;
+        gmx_simd4_bool_t wco_S0;
+        gmx_simd4_bool_t wco_S1;
+        gmx_simd4_bool_t wco_S2;
+        gmx_simd4_bool_t wco_S3;
+        gmx_simd4_bool_t wco_any_S01, wco_any_S23, wco_any_S;
  
-        jx0_S = gmx_simd4_set1_pr(x_j[j0*stride+0]);
-        jy0_S = gmx_simd4_set1_pr(x_j[j0*stride+1]);
-        jz0_S = gmx_simd4_set1_pr(x_j[j0*stride+2]);
+        jx0_S = gmx_simd4_set1_r(x_j[j0*stride+0]);
+        jy0_S = gmx_simd4_set1_r(x_j[j0*stride+1]);
+        jz0_S = gmx_simd4_set1_r(x_j[j0*stride+2]);
  
-        jx1_S = gmx_simd4_set1_pr(x_j[j1*stride+0]);
-        jy1_S = gmx_simd4_set1_pr(x_j[j1*stride+1]);
-        jz1_S = gmx_simd4_set1_pr(x_j[j1*stride+2]);
+        jx1_S = gmx_simd4_set1_r(x_j[j1*stride+0]);
+        jy1_S = gmx_simd4_set1_r(x_j[j1*stride+1]);
+        jz1_S = gmx_simd4_set1_r(x_j[j1*stride+2]);
  
          /* Calculate distance */
-        dx_S0            = gmx_simd4_sub_pr(ix_S0, jx0_S);
-        dy_S0            = gmx_simd4_sub_pr(iy_S0, jy0_S);
-        dz_S0            = gmx_simd4_sub_pr(iz_S0, jz0_S);
-        dx_S1            = gmx_simd4_sub_pr(ix_S1, jx0_S);
-        dy_S1            = gmx_simd4_sub_pr(iy_S1, jy0_S);
-        dz_S1            = gmx_simd4_sub_pr(iz_S1, jz0_S);
-        dx_S2            = gmx_simd4_sub_pr(ix_S0, jx1_S);
-        dy_S2            = gmx_simd4_sub_pr(iy_S0, jy1_S);
-        dz_S2            = gmx_simd4_sub_pr(iz_S0, jz1_S);
-        dx_S3            = gmx_simd4_sub_pr(ix_S1, jx1_S);
-        dy_S3            = gmx_simd4_sub_pr(iy_S1, jy1_S);
-        dz_S3            = gmx_simd4_sub_pr(iz_S1, jz1_S);
+        dx_S0            = gmx_simd4_sub_r(ix_S0, jx0_S);
+        dy_S0            = gmx_simd4_sub_r(iy_S0, jy0_S);
+        dz_S0            = gmx_simd4_sub_r(iz_S0, jz0_S);
+        dx_S1            = gmx_simd4_sub_r(ix_S1, jx0_S);
+        dy_S1            = gmx_simd4_sub_r(iy_S1, jy0_S);
+        dz_S1            = gmx_simd4_sub_r(iz_S1, jz0_S);
+        dx_S2            = gmx_simd4_sub_r(ix_S0, jx1_S);
+        dy_S2            = gmx_simd4_sub_r(iy_S0, jy1_S);
+        dz_S2            = gmx_simd4_sub_r(iz_S0, jz1_S);
+        dx_S3            = gmx_simd4_sub_r(ix_S1, jx1_S);
+        dy_S3            = gmx_simd4_sub_r(iy_S1, jy1_S);
+        dz_S3            = gmx_simd4_sub_r(iz_S1, jz1_S);
  
          /* rsq = dx*dx+dy*dy+dz*dz */
-        rsq_S0           = gmx_simd4_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
-        rsq_S1           = gmx_simd4_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
-        rsq_S2           = gmx_simd4_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
-        rsq_S3           = gmx_simd4_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+        rsq_S0           = gmx_simd4_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+        rsq_S1           = gmx_simd4_calc_rsq_r(dx_S1, dy_S1, dz_S1);
+        rsq_S2           = gmx_simd4_calc_rsq_r(dx_S2, dy_S2, dz_S2);
+        rsq_S3           = gmx_simd4_calc_rsq_r(dx_S3, dy_S3, dz_S3);
  
-        wco_S0           = gmx_simd4_cmplt_pr(rsq_S0, rc2_S);
-        wco_S1           = gmx_simd4_cmplt_pr(rsq_S1, rc2_S);
-        wco_S2           = gmx_simd4_cmplt_pr(rsq_S2, rc2_S);
-        wco_S3           = gmx_simd4_cmplt_pr(rsq_S3, rc2_S);
+        wco_S0           = gmx_simd4_cmplt_r(rsq_S0, rc2_S);
+        wco_S1           = gmx_simd4_cmplt_r(rsq_S1, rc2_S);
+        wco_S2           = gmx_simd4_cmplt_r(rsq_S2, rc2_S);
+        wco_S3           = gmx_simd4_cmplt_r(rsq_S3, rc2_S);
  
-        wco_any_S01      = gmx_simd4_or_pb(wco_S0, wco_S1);
-        wco_any_S23      = gmx_simd4_or_pb(wco_S2, wco_S3);
-        wco_any_S        = gmx_simd4_or_pb(wco_any_S01, wco_any_S23);
+        wco_any_S01      = gmx_simd4_or_b(wco_S0, wco_S1);
+        wco_any_S23      = gmx_simd4_or_b(wco_S2, wco_S3);
+        wco_any_S        = gmx_simd4_or_b(wco_any_S01, wco_any_S23);
  
-        if (gmx_simd4_anytrue_pb(wco_any_S))
+        if (gmx_simd4_anytrue_b(wco_any_S))
          {
              return TRUE;
          }
@@ -2738,17 +2738,17 @@ static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
  }
  
  #ifdef GMX_NBNXN_SIMD
-#if GMX_SIMD_WIDTH_HERE == 2
+#if GMX_SIMD_REAL_WIDTH == 2
  #define get_imask_simd_4xn  get_imask_simd_j2
  #endif
-#if GMX_SIMD_WIDTH_HERE == 4
+#if GMX_SIMD_REAL_WIDTH == 4
  #define get_imask_simd_4xn  get_imask_simd_j4
  #endif
-#if GMX_SIMD_WIDTH_HERE == 8
+#if GMX_SIMD_REAL_WIDTH == 8
  #define get_imask_simd_4xn  get_imask_simd_j8
  #define get_imask_simd_2xnn get_imask_simd_j4
  #endif
-#if GMX_SIMD_WIDTH_HERE == 16
+#if GMX_SIMD_REAL_WIDTH == 16
  #define get_imask_simd_2xnn get_imask_simd_j8
  #endif
  #endif
@@ -3191,7 +3191,7 @@ static void set_ci_top_excls(const nbnxn_search_t nbs,
  /* The next code line is usually not needed. We do not want to version
   * away the above line, because there is logic that relies on being
   * able to detect easily whether any exclusions exist. */
-#if (defined GMX_CPU_ACCELERATION_IBM_QPX)
+#if (defined GMX_SIMD_IBM_QPX)
                          nbl->cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e);
  #endif
                      }
diff --git a/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h b/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h

index b383daf0b78c5b76c9425bdffde1669c5f4f5066..59760f4b94fd18072ab50e6e15c045a19e306666 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
+++ b/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,16 +37,16 @@
  #include "nbnxn_kernels/nbnxn_kernel_simd_utils.h"
  
  
-#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
-#define STRIDE_S  (GMX_SIMD_WIDTH_HERE/2)
+#if GMX_SIMD_REAL_WIDTH >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S  (GMX_SIMD_REAL_WIDTH/2)
  #else
  #define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
  #endif
  
-static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
+static gmx_inline gmx_simd_real_t gmx_load_hpr_hilo_pr(const real *a)
  {
-    gmx_mm_hpr a_S;
-    gmx_mm_pr  a_a_S;
+    gmx_mm_hpr       a_S;
+    gmx_simd_real_t  a_a_S;
  
      gmx_load_hpr(&a_S, a);
  
@@ -55,10 +55,10 @@ static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
      return a_a_S;
  }
  
-static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a, real shift)
+static gmx_inline gmx_simd_real_t gmx_set_2real_shift_pr(const real *a, real shift)
  {
-    gmx_mm_hpr a0_S, a1_S;
-    gmx_mm_pr  a0_a1_S;
+    gmx_mm_hpr       a0_S, a1_S;
+    gmx_simd_real_t  a0_a1_S;
  
      gmx_set1_hpr(&a0_S, a[0] + shift);
      gmx_set1_hpr(&a1_S, a[1] + shift);
@@ -105,26 +105,26 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
                              real rl2, float rbb2,
                              int *ndistc)
  {
-    const nbnxn_x_ci_simd_2xnn_t *work;
-    const nbnxn_bb_t             *bb_ci;
+    const nbnxn_x_ci_simd_2xnn_t       *work;
+    const nbnxn_bb_t                   *bb_ci;
  
-    gmx_mm_pr                     jx_S, jy_S, jz_S;
+    gmx_simd_real_t                     jx_S, jy_S, jz_S;
  
-    gmx_mm_pr                     dx_S0, dy_S0, dz_S0;
-    gmx_mm_pr                     dx_S2, dy_S2, dz_S2;
+    gmx_simd_real_t                     dx_S0, dy_S0, dz_S0;
+    gmx_simd_real_t                     dx_S2, dy_S2, dz_S2;
  
-    gmx_mm_pr                     rsq_S0;
-    gmx_mm_pr                     rsq_S2;
+    gmx_simd_real_t                     rsq_S0;
+    gmx_simd_real_t                     rsq_S2;
  
-    gmx_mm_pb                     wco_S0;
-    gmx_mm_pb                     wco_S2;
-    gmx_mm_pb                     wco_any_S;
+    gmx_simd_bool_t                     wco_S0;
+    gmx_simd_bool_t                     wco_S2;
+    gmx_simd_bool_t                     wco_any_S;
  
-    gmx_mm_pr                     rc2_S;
+    gmx_simd_real_t                     rc2_S;
  
-    gmx_bool                      InRange;
-    float                         d2;
-    int                           xind_f, xind_l, cj;
+    gmx_bool                            InRange;
+    float                               d2;
+    int                                 xind_f, xind_l, cj;
  
      cjf = CI_TO_CJ_SIMD_2XNN(cjf);
      cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
@@ -133,7 +133,7 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
  
      bb_ci = nbl->work->bb_ci;
  
-    rc2_S   = gmx_set1_pr(rl2);
+    rc2_S   = gmx_simd_set1_r(rl2);
  
      InRange = FALSE;
      while (!InRange && cjf <= cjl)
@@ -163,25 +163,25 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
              jz_S  = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
  
              /* Calculate distance */
-            dx_S0            = gmx_sub_pr(work->ix_S0, jx_S);
-            dy_S0            = gmx_sub_pr(work->iy_S0, jy_S);
-            dz_S0            = gmx_sub_pr(work->iz_S0, jz_S);
-            dx_S2            = gmx_sub_pr(work->ix_S2, jx_S);
-            dy_S2            = gmx_sub_pr(work->iy_S2, jy_S);
-            dz_S2            = gmx_sub_pr(work->iz_S2, jz_S);
+            dx_S0            = gmx_simd_sub_r(work->ix_S0, jx_S);
+            dy_S0            = gmx_simd_sub_r(work->iy_S0, jy_S);
+            dz_S0            = gmx_simd_sub_r(work->iz_S0, jz_S);
+            dx_S2            = gmx_simd_sub_r(work->ix_S2, jx_S);
+            dy_S2            = gmx_simd_sub_r(work->iy_S2, jy_S);
+            dz_S2            = gmx_simd_sub_r(work->iz_S2, jz_S);
  
              /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_S0           = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
-            rsq_S2           = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+            rsq_S0           = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+            rsq_S2           = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
  
-            wco_S0           = gmx_cmplt_pr(rsq_S0, rc2_S);
-            wco_S2           = gmx_cmplt_pr(rsq_S2, rc2_S);
+            wco_S0           = gmx_simd_cmplt_r(rsq_S0, rc2_S);
+            wco_S2           = gmx_simd_cmplt_r(rsq_S2, rc2_S);
  
-            wco_any_S        = gmx_or_pb(wco_S0, wco_S2);
+            wco_any_S        = gmx_simd_or_b(wco_S0, wco_S2);
  
-            InRange          = gmx_anytrue_pb(wco_any_S);
+            InRange          = gmx_simd_anytrue_b(wco_any_S);
  
-            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+            *ndistc += 2*GMX_SIMD_REAL_WIDTH;
          }
          if (!InRange)
          {
@@ -221,25 +221,25 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
              jz_S  = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
  
              /* Calculate distance */
-            dx_S0            = gmx_sub_pr(work->ix_S0, jx_S);
-            dy_S0            = gmx_sub_pr(work->iy_S0, jy_S);
-            dz_S0            = gmx_sub_pr(work->iz_S0, jz_S);
-            dx_S2            = gmx_sub_pr(work->ix_S2, jx_S);
-            dy_S2            = gmx_sub_pr(work->iy_S2, jy_S);
-            dz_S2            = gmx_sub_pr(work->iz_S2, jz_S);
+            dx_S0            = gmx_simd_sub_r(work->ix_S0, jx_S);
+            dy_S0            = gmx_simd_sub_r(work->iy_S0, jy_S);
+            dz_S0            = gmx_simd_sub_r(work->iz_S0, jz_S);
+            dx_S2            = gmx_simd_sub_r(work->ix_S2, jx_S);
+            dy_S2            = gmx_simd_sub_r(work->iy_S2, jy_S);
+            dz_S2            = gmx_simd_sub_r(work->iz_S2, jz_S);
  
              /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_S0           = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
-            rsq_S2           = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+            rsq_S0           = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+            rsq_S2           = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
  
-            wco_S0           = gmx_cmplt_pr(rsq_S0, rc2_S);
-            wco_S2           = gmx_cmplt_pr(rsq_S2, rc2_S);
+            wco_S0           = gmx_simd_cmplt_r(rsq_S0, rc2_S);
+            wco_S2           = gmx_simd_cmplt_r(rsq_S2, rc2_S);
  
-            wco_any_S        = gmx_or_pb(wco_S0, wco_S2);
+            wco_any_S        = gmx_simd_or_b(wco_S0, wco_S2);
  
-            InRange          = gmx_anytrue_pb(wco_any_S);
+            InRange          = gmx_simd_anytrue_b(wco_any_S);
  
-            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+            *ndistc += 2*GMX_SIMD_REAL_WIDTH;
          }
          if (!InRange)
          {
diff --git a/src/gromacs/mdlib/nbnxn_search_simd_4xn.h b/src/gromacs/mdlib/nbnxn_search_simd_4xn.h

index 12dd77fdb4bca7ee98f0f4d381e0bc77cc094e24..4931a1a4eb02087cc78991e7dfe421fa8e384827 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search_simd_4xn.h
+++ b/src/gromacs/mdlib/nbnxn_search_simd_4xn.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -34,8 +34,8 @@
   */
  
  
-#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
-#define STRIDE_S  (GMX_SIMD_WIDTH_HERE)
+#if GMX_SIMD_REAL_WIDTH >= NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S  (GMX_SIMD_REAL_WIDTH)
  #else
  #define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
  #endif
@@ -55,18 +55,18 @@ icell_set_x_simd_4xn(int ci,
  
      ia = X_IND_CI_SIMD_4XN(ci);
  
-    x_ci->ix_S0 = gmx_set1_pr(x[ia + 0*STRIDE_S    ] + shx);
-    x_ci->iy_S0 = gmx_set1_pr(x[ia + 1*STRIDE_S    ] + shy);
-    x_ci->iz_S0 = gmx_set1_pr(x[ia + 2*STRIDE_S    ] + shz);
-    x_ci->ix_S1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
-    x_ci->iy_S1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
-    x_ci->iz_S1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
-    x_ci->ix_S2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
-    x_ci->iy_S2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
-    x_ci->iz_S2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
-    x_ci->ix_S3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
-    x_ci->iy_S3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
-    x_ci->iz_S3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
+    x_ci->ix_S0 = gmx_simd_set1_r(x[ia + 0*STRIDE_S    ] + shx);
+    x_ci->iy_S0 = gmx_simd_set1_r(x[ia + 1*STRIDE_S    ] + shy);
+    x_ci->iz_S0 = gmx_simd_set1_r(x[ia + 2*STRIDE_S    ] + shz);
+    x_ci->ix_S1 = gmx_simd_set1_r(x[ia + 0*STRIDE_S + 1] + shx);
+    x_ci->iy_S1 = gmx_simd_set1_r(x[ia + 1*STRIDE_S + 1] + shy);
+    x_ci->iz_S1 = gmx_simd_set1_r(x[ia + 2*STRIDE_S + 1] + shz);
+    x_ci->ix_S2 = gmx_simd_set1_r(x[ia + 0*STRIDE_S + 2] + shx);
+    x_ci->iy_S2 = gmx_simd_set1_r(x[ia + 1*STRIDE_S + 2] + shy);
+    x_ci->iz_S2 = gmx_simd_set1_r(x[ia + 2*STRIDE_S + 2] + shz);
+    x_ci->ix_S3 = gmx_simd_set1_r(x[ia + 0*STRIDE_S + 3] + shx);
+    x_ci->iy_S3 = gmx_simd_set1_r(x[ia + 1*STRIDE_S + 3] + shy);
+    x_ci->iz_S3 = gmx_simd_set1_r(x[ia + 2*STRIDE_S + 3] + shz);
  }
  
  /* SIMD code for making a pair list of cell ci vs cell cjf-cjl
@@ -83,32 +83,32 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
                             real rl2, float rbb2,
                             int *ndistc)
  {
-    const nbnxn_x_ci_simd_4xn_t *work;
-    const nbnxn_bb_t            *bb_ci;
+    const nbnxn_x_ci_simd_4xn_t       *work;
+    const nbnxn_bb_t                  *bb_ci;
  
-    gmx_mm_pr                    jx_S, jy_S, jz_S;
+    gmx_simd_real_t                    jx_S, jy_S, jz_S;
  
-    gmx_mm_pr                    dx_S0, dy_S0, dz_S0;
-    gmx_mm_pr                    dx_S1, dy_S1, dz_S1;
-    gmx_mm_pr                    dx_S2, dy_S2, dz_S2;
-    gmx_mm_pr                    dx_S3, dy_S3, dz_S3;
+    gmx_simd_real_t                    dx_S0, dy_S0, dz_S0;
+    gmx_simd_real_t                    dx_S1, dy_S1, dz_S1;
+    gmx_simd_real_t                    dx_S2, dy_S2, dz_S2;
+    gmx_simd_real_t                    dx_S3, dy_S3, dz_S3;
  
-    gmx_mm_pr                    rsq_S0;
-    gmx_mm_pr                    rsq_S1;
-    gmx_mm_pr                    rsq_S2;
-    gmx_mm_pr                    rsq_S3;
+    gmx_simd_real_t                    rsq_S0;
+    gmx_simd_real_t                    rsq_S1;
+    gmx_simd_real_t                    rsq_S2;
+    gmx_simd_real_t                    rsq_S3;
  
-    gmx_mm_pb                    wco_S0;
-    gmx_mm_pb                    wco_S1;
-    gmx_mm_pb                    wco_S2;
-    gmx_mm_pb                    wco_S3;
-    gmx_mm_pb                    wco_any_S01, wco_any_S23, wco_any_S;
+    gmx_simd_bool_t                    wco_S0;
+    gmx_simd_bool_t                    wco_S1;
+    gmx_simd_bool_t                    wco_S2;
+    gmx_simd_bool_t                    wco_S3;
+    gmx_simd_bool_t                    wco_any_S01, wco_any_S23, wco_any_S;
  
-    gmx_mm_pr                    rc2_S;
+    gmx_simd_real_t                    rc2_S;
  
-    gmx_bool                     InRange;
-    float                        d2;
-    int                          xind_f, xind_l, cj;
+    gmx_bool                           InRange;
+    float                              d2;
+    int                                xind_f, xind_l, cj;
  
      cjf = CI_TO_CJ_SIMD_4XN(cjf);
      cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
@@ -117,7 +117,7 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
  
      bb_ci = nbl->work->bb_ci;
  
-    rc2_S   = gmx_set1_pr(rl2);
+    rc2_S   = gmx_simd_set1_r(rl2);
  
      InRange = FALSE;
      while (!InRange && cjf <= cjl)
@@ -142,43 +142,43 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
          {
              xind_f  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
  
-            jx_S  = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
-            jy_S  = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
-            jz_S  = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
+            jx_S  = gmx_simd_load_r(x_j+xind_f+0*STRIDE_S);
+            jy_S  = gmx_simd_load_r(x_j+xind_f+1*STRIDE_S);
+            jz_S  = gmx_simd_load_r(x_j+xind_f+2*STRIDE_S);
  
  
              /* Calculate distance */
-            dx_S0            = gmx_sub_pr(work->ix_S0, jx_S);
-            dy_S0            = gmx_sub_pr(work->iy_S0, jy_S);
-            dz_S0            = gmx_sub_pr(work->iz_S0, jz_S);
-            dx_S1            = gmx_sub_pr(work->ix_S1, jx_S);
-            dy_S1            = gmx_sub_pr(work->iy_S1, jy_S);
-            dz_S1            = gmx_sub_pr(work->iz_S1, jz_S);
-            dx_S2            = gmx_sub_pr(work->ix_S2, jx_S);
-            dy_S2            = gmx_sub_pr(work->iy_S2, jy_S);
-            dz_S2            = gmx_sub_pr(work->iz_S2, jz_S);
-            dx_S3            = gmx_sub_pr(work->ix_S3, jx_S);
-            dy_S3            = gmx_sub_pr(work->iy_S3, jy_S);
-            dz_S3            = gmx_sub_pr(work->iz_S3, jz_S);
+            dx_S0            = gmx_simd_sub_r(work->ix_S0, jx_S);
+            dy_S0            = gmx_simd_sub_r(work->iy_S0, jy_S);
+            dz_S0            = gmx_simd_sub_r(work->iz_S0, jz_S);
+            dx_S1            = gmx_simd_sub_r(work->ix_S1, jx_S);
+            dy_S1            = gmx_simd_sub_r(work->iy_S1, jy_S);
+            dz_S1            = gmx_simd_sub_r(work->iz_S1, jz_S);
+            dx_S2            = gmx_simd_sub_r(work->ix_S2, jx_S);
+            dy_S2            = gmx_simd_sub_r(work->iy_S2, jy_S);
+            dz_S2            = gmx_simd_sub_r(work->iz_S2, jz_S);
+            dx_S3            = gmx_simd_sub_r(work->ix_S3, jx_S);
+            dy_S3            = gmx_simd_sub_r(work->iy_S3, jy_S);
+            dz_S3            = gmx_simd_sub_r(work->iz_S3, jz_S);
  
              /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_S0           = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
-            rsq_S1           = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
-            rsq_S2           = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
-            rsq_S3           = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+            rsq_S0           = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+            rsq_S1           = gmx_simd_calc_rsq_r(dx_S1, dy_S1, dz_S1);
+            rsq_S2           = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
+            rsq_S3           = gmx_simd_calc_rsq_r(dx_S3, dy_S3, dz_S3);
  
-            wco_S0           = gmx_cmplt_pr(rsq_S0, rc2_S);
-            wco_S1           = gmx_cmplt_pr(rsq_S1, rc2_S);
-            wco_S2           = gmx_cmplt_pr(rsq_S2, rc2_S);
-            wco_S3           = gmx_cmplt_pr(rsq_S3, rc2_S);
+            wco_S0           = gmx_simd_cmplt_r(rsq_S0, rc2_S);
+            wco_S1           = gmx_simd_cmplt_r(rsq_S1, rc2_S);
+            wco_S2           = gmx_simd_cmplt_r(rsq_S2, rc2_S);
+            wco_S3           = gmx_simd_cmplt_r(rsq_S3, rc2_S);
  
-            wco_any_S01      = gmx_or_pb(wco_S0, wco_S1);
-            wco_any_S23      = gmx_or_pb(wco_S2, wco_S3);
-            wco_any_S        = gmx_or_pb(wco_any_S01, wco_any_S23);
+            wco_any_S01      = gmx_simd_or_b(wco_S0, wco_S1);
+            wco_any_S23      = gmx_simd_or_b(wco_S2, wco_S3);
+            wco_any_S        = gmx_simd_or_b(wco_any_S01, wco_any_S23);
  
-            InRange          = gmx_anytrue_pb(wco_any_S);
+            InRange          = gmx_simd_anytrue_b(wco_any_S);
  
-            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
+            *ndistc += 4*GMX_SIMD_REAL_WIDTH;
          }
          if (!InRange)
          {
@@ -213,42 +213,42 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
          {
              xind_l  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
  
-            jx_S  = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
-            jy_S  = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
-            jz_S  = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
+            jx_S  = gmx_simd_load_r(x_j+xind_l+0*STRIDE_S);
+            jy_S  = gmx_simd_load_r(x_j+xind_l+1*STRIDE_S);
+            jz_S  = gmx_simd_load_r(x_j+xind_l+2*STRIDE_S);
  
              /* Calculate distance */
-            dx_S0            = gmx_sub_pr(work->ix_S0, jx_S);
-            dy_S0            = gmx_sub_pr(work->iy_S0, jy_S);
-            dz_S0            = gmx_sub_pr(work->iz_S0, jz_S);
-            dx_S1            = gmx_sub_pr(work->ix_S1, jx_S);
-            dy_S1            = gmx_sub_pr(work->iy_S1, jy_S);
-            dz_S1            = gmx_sub_pr(work->iz_S1, jz_S);
-            dx_S2            = gmx_sub_pr(work->ix_S2, jx_S);
-            dy_S2            = gmx_sub_pr(work->iy_S2, jy_S);
-            dz_S2            = gmx_sub_pr(work->iz_S2, jz_S);
-            dx_S3            = gmx_sub_pr(work->ix_S3, jx_S);
-            dy_S3            = gmx_sub_pr(work->iy_S3, jy_S);
-            dz_S3            = gmx_sub_pr(work->iz_S3, jz_S);
+            dx_S0            = gmx_simd_sub_r(work->ix_S0, jx_S);
+            dy_S0            = gmx_simd_sub_r(work->iy_S0, jy_S);
+            dz_S0            = gmx_simd_sub_r(work->iz_S0, jz_S);
+            dx_S1            = gmx_simd_sub_r(work->ix_S1, jx_S);
+            dy_S1            = gmx_simd_sub_r(work->iy_S1, jy_S);
+            dz_S1            = gmx_simd_sub_r(work->iz_S1, jz_S);
+            dx_S2            = gmx_simd_sub_r(work->ix_S2, jx_S);
+            dy_S2            = gmx_simd_sub_r(work->iy_S2, jy_S);
+            dz_S2            = gmx_simd_sub_r(work->iz_S2, jz_S);
+            dx_S3            = gmx_simd_sub_r(work->ix_S3, jx_S);
+            dy_S3            = gmx_simd_sub_r(work->iy_S3, jy_S);
+            dz_S3            = gmx_simd_sub_r(work->iz_S3, jz_S);
  
              /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_S0           = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
-            rsq_S1           = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
-            rsq_S2           = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
-            rsq_S3           = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+            rsq_S0           = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+            rsq_S1           = gmx_simd_calc_rsq_r(dx_S1, dy_S1, dz_S1);
+            rsq_S2           = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
+            rsq_S3           = gmx_simd_calc_rsq_r(dx_S3, dy_S3, dz_S3);
  
-            wco_S0           = gmx_cmplt_pr(rsq_S0, rc2_S);
-            wco_S1           = gmx_cmplt_pr(rsq_S1, rc2_S);
-            wco_S2           = gmx_cmplt_pr(rsq_S2, rc2_S);
-            wco_S3           = gmx_cmplt_pr(rsq_S3, rc2_S);
+            wco_S0           = gmx_simd_cmplt_r(rsq_S0, rc2_S);
+            wco_S1           = gmx_simd_cmplt_r(rsq_S1, rc2_S);
+            wco_S2           = gmx_simd_cmplt_r(rsq_S2, rc2_S);
+            wco_S3           = gmx_simd_cmplt_r(rsq_S3, rc2_S);
  
-            wco_any_S01      = gmx_or_pb(wco_S0, wco_S1);
-            wco_any_S23      = gmx_or_pb(wco_S2, wco_S3);
-            wco_any_S        = gmx_or_pb(wco_any_S01, wco_any_S23);
+            wco_any_S01      = gmx_simd_or_b(wco_S0, wco_S1);
+            wco_any_S23      = gmx_simd_or_b(wco_S2, wco_S3);
+            wco_any_S        = gmx_simd_or_b(wco_any_S01, wco_any_S23);
  
-            InRange          = gmx_anytrue_pb(wco_any_S);
+            InRange          = gmx_simd_anytrue_b(wco_any_S);
  
-            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
+            *ndistc += 4*GMX_SIMD_REAL_WIDTH;
          }
          if (!InRange)
          {
@@ -263,7 +263,7 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
              /* Store cj and the interaction mask */
              nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
              nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj);
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#ifdef GMX_SIMD_IBM_QPX
              nbl->cj[nbl->ncj].interaction_mask_indices[0] = (nbl->cj[nbl->ncj].excl & 0x000F) >> (0 * 4);
              nbl->cj[nbl->ncj].interaction_mask_indices[1] = (nbl->cj[nbl->ncj].excl & 0x00F0) >> (1 * 4);
              nbl->cj[nbl->ncj].interaction_mask_indices[2] = (nbl->cj[nbl->ncj].excl & 0x0F00) >> (2 * 4);
diff --git a/src/gromacs/mdlib/pme.c b/src/gromacs/mdlib/pme.c

index ff52b5b9fe3dbe54bb993ca21d64914bbfeaa0cd..34ebe16bbe9fc3fa62c34a0e70785f4d191b5150 100644 (file)
--- a/src/gromacs/mdlib/pme.c
+++ b/src/gromacs/mdlib/pme.c
@@ -260,9 +260,9 @@ typedef struct {
  typedef struct {
  #ifdef PME_SIMD4_SPREAD_GATHER
      /* Masks for 4-wide SIMD aligned spreading and gathering */
-    gmx_simd4_pb mask_S0[6], mask_S1[6];
+    gmx_simd4_bool_t mask_S0[6], mask_S1[6];
  #else
-    int          dummy; /* C89 requires that struct has at least one member */
+    int              dummy; /* C89 requires that struct has at least one member */
  #endif
  } pme_spline_work_t;
  
@@ -1877,7 +1877,7 @@ static void realloc_work(pme_work_t *work, int nkx)
           * elements at the end for padding.
           */
  #ifdef PME_SIMD_SOLVE
-        simd_width = GMX_SIMD_WIDTH_HERE;
+        simd_width = GMX_SIMD_REAL_WIDTH;
  #else
          /* We can use any alignment, apart from 0, so we use 4 */
          simd_width = 4;
@@ -1914,24 +1914,24 @@ static void free_work(pme_work_t *work)
  inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
  {
      {
-        const gmx_mm_pr two = gmx_set1_pr(2.0);
-        gmx_mm_pr f_simd;
-        gmx_mm_pr lu;
-        gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
+        const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
+        gmx_simd_real_t f_simd;
+        gmx_simd_real_t lu;
+        gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
          int kx;
-        f_simd = gmx_set1_pr(f);
+        f_simd = gmx_simd_set1_r(f);
          /* We only need to calculate from start. But since start is 0 or 1
           * and we want to use aligned loads/stores, we always start from 0.
           */
-        for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+        for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
          {
-            tmp_d1   = gmx_load_pr(d_aligned+kx);
-            d_inv    = gmx_inv_pr(tmp_d1);
-            tmp_r    = gmx_load_pr(r_aligned+kx);
-            tmp_r    = gmx_exp_pr(tmp_r);
-            tmp_e    = gmx_mul_pr(f_simd, d_inv);
-            tmp_e    = gmx_mul_pr(tmp_e, tmp_r);
-            gmx_store_pr(e_aligned+kx, tmp_e);
+            tmp_d1   = gmx_simd_load_r(d_aligned+kx);
+            d_inv    = gmx_simd_inv_r(tmp_d1);
+            tmp_r    = gmx_simd_load_r(r_aligned+kx);
+            tmp_r    = gmx_simd_exp_r(tmp_r);
+            tmp_e    = gmx_simd_mul_r(f_simd, d_inv);
+            tmp_e    = gmx_simd_mul_r(tmp_e, tmp_r);
+            gmx_simd_store_r(e_aligned+kx, tmp_e);
          }
      }
  }
@@ -1958,23 +1958,23 @@ inline static void calc_exponentials_q(int start, int end, real f, real *d, real
  /* Calculate exponentials through SIMD */
  inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
  {
-    gmx_mm_pr tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
-    const gmx_mm_pr sqr_PI = gmx_sqrt_pr(gmx_set1_pr(M_PI));
+    gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
+    const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
      int kx;
-    for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+    for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
      {
          /* We only need to calculate from start. But since start is 0 or 1
           * and we want to use aligned loads/stores, we always start from 0.
           */
-        tmp_d = gmx_load_pr(d_aligned+kx);
-        d_inv = gmx_inv_pr(tmp_d);
-        gmx_store_pr(d_aligned+kx, d_inv);
-        tmp_r = gmx_load_pr(r_aligned+kx);
-        tmp_r = gmx_exp_pr(tmp_r);
-        gmx_store_pr(r_aligned+kx, tmp_r);
-        tmp_mk  = gmx_load_pr(factor_aligned+kx);
-        tmp_fac = gmx_mul_pr(sqr_PI, gmx_mul_pr(tmp_mk, gmx_erfc_pr(tmp_mk)));
-        gmx_store_pr(factor_aligned+kx, tmp_fac);
+        tmp_d = gmx_simd_load_r(d_aligned+kx);
+        d_inv = gmx_simd_inv_r(tmp_d);
+        gmx_simd_store_r(d_aligned+kx, d_inv);
+        tmp_r = gmx_simd_load_r(r_aligned+kx);
+        tmp_r = gmx_simd_exp_r(tmp_r);
+        gmx_simd_store_r(r_aligned+kx, tmp_r);
+        tmp_mk  = gmx_simd_load_r(factor_aligned+kx);
+        tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
+        gmx_simd_store_r(factor_aligned+kx, tmp_fac);
      }
  }
  #else
@@ -3400,15 +3400,15 @@ static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
  
  #ifdef PME_SIMD4_SPREAD_GATHER
      real         tmp[12], *tmp_aligned;
-    gmx_simd4_pr zero_S;
-    gmx_simd4_pr real_mask_S0, real_mask_S1;
+    gmx_simd4_real_t zero_S;
+    gmx_simd4_real_t real_mask_S0, real_mask_S1;
      int          of, i;
  
      snew_aligned(work, 1, SIMD4_ALIGNMENT);
  
      tmp_aligned = gmx_simd4_align_real(tmp);
  
-    zero_S = gmx_simd4_setzero_pr();
+    zero_S = gmx_simd4_setzero_r();
  
      /* Generate bit masks to mask out the unused grid entries,
       * as we only operate on order of the 8 grid entries that are
@@ -3420,10 +3420,10 @@ static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
          {
              tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
          }
-        real_mask_S0      = gmx_simd4_load_pr(tmp_aligned);
-        real_mask_S1      = gmx_simd4_load_pr(tmp_aligned+4);
-        work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S);
-        work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S);
+        real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
+        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+4);
+        work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
+        work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
      }
  #else
      work = NULL;
diff --git a/src/gromacs/mdlib/pme_simd4.h b/src/gromacs/mdlib/pme_simd4.h

index 1b6f0b0314ef5c3cd746d241a027635ae5c1e054..4cd2213c5ded5b16b2522c6d5a991219389ce8dc 100644 (file)
--- a/src/gromacs/mdlib/pme_simd4.h
+++ b/src/gromacs/mdlib/pme_simd4.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -45,44 +45,44 @@
   * This code does not assume any memory alignment for the grid.
   */
  {
-    gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3;
-    gmx_simd4_pr tz_S;
-    gmx_simd4_pr vx_S;
-    gmx_simd4_pr vx_tz_S;
-    gmx_simd4_pr sum_S0, sum_S1, sum_S2, sum_S3;
-    gmx_simd4_pr gri_S0, gri_S1, gri_S2, gri_S3;
-
-    ty_S0 = gmx_simd4_set1_pr(thy[0]);
-    ty_S1 = gmx_simd4_set1_pr(thy[1]);
-    ty_S2 = gmx_simd4_set1_pr(thy[2]);
-    ty_S3 = gmx_simd4_set1_pr(thy[3]);
+    gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3;
+    gmx_simd4_real_t tz_S;
+    gmx_simd4_real_t vx_S;
+    gmx_simd4_real_t vx_tz_S;
+    gmx_simd4_real_t sum_S0, sum_S1, sum_S2, sum_S3;
+    gmx_simd4_real_t gri_S0, gri_S1, gri_S2, gri_S3;
+
+    ty_S0 = gmx_simd4_set1_r(thy[0]);
+    ty_S1 = gmx_simd4_set1_r(thy[1]);
+    ty_S2 = gmx_simd4_set1_r(thy[2]);
+    ty_S3 = gmx_simd4_set1_r(thy[3]);
  
      /* With order 4 the z-spline is actually aligned */
-    tz_S  = gmx_simd4_load_pr(thz);
+    tz_S  = gmx_simd4_load_r(thz);
  
      for (ithx = 0; (ithx < 4); ithx++)
      {
          index_x = (i0+ithx)*pny*pnz;
          valx    = qn*thx[ithx];
  
-        vx_S   = gmx_simd4_set1_pr(valx);
+        vx_S   = gmx_simd4_set1_r(valx);
  
-        vx_tz_S = gmx_simd4_mul_pr(vx_S, tz_S);
+        vx_tz_S = gmx_simd4_mul_r(vx_S, tz_S);
  
-        gri_S0 = gmx_simd4_loadu_pr(grid+index_x+(j0+0)*pnz+k0);
-        gri_S1 = gmx_simd4_loadu_pr(grid+index_x+(j0+1)*pnz+k0);
-        gri_S2 = gmx_simd4_loadu_pr(grid+index_x+(j0+2)*pnz+k0);
-        gri_S3 = gmx_simd4_loadu_pr(grid+index_x+(j0+3)*pnz+k0);
+        gri_S0 = gmx_simd4_loadu_r(grid+index_x+(j0+0)*pnz+k0);
+        gri_S1 = gmx_simd4_loadu_r(grid+index_x+(j0+1)*pnz+k0);
+        gri_S2 = gmx_simd4_loadu_r(grid+index_x+(j0+2)*pnz+k0);
+        gri_S3 = gmx_simd4_loadu_r(grid+index_x+(j0+3)*pnz+k0);
  
-        sum_S0 = gmx_simd4_madd_pr(vx_tz_S, ty_S0, gri_S0);
-        sum_S1 = gmx_simd4_madd_pr(vx_tz_S, ty_S1, gri_S1);
-        sum_S2 = gmx_simd4_madd_pr(vx_tz_S, ty_S2, gri_S2);
-        sum_S3 = gmx_simd4_madd_pr(vx_tz_S, ty_S3, gri_S3);
+        sum_S0 = gmx_simd4_fmadd_r(vx_tz_S, ty_S0, gri_S0);
+        sum_S1 = gmx_simd4_fmadd_r(vx_tz_S, ty_S1, gri_S1);
+        sum_S2 = gmx_simd4_fmadd_r(vx_tz_S, ty_S2, gri_S2);
+        sum_S3 = gmx_simd4_fmadd_r(vx_tz_S, ty_S3, gri_S3);
  
-        gmx_simd4_storeu_pr(grid+index_x+(j0+0)*pnz+k0, sum_S0);
-        gmx_simd4_storeu_pr(grid+index_x+(j0+1)*pnz+k0, sum_S1);
-        gmx_simd4_storeu_pr(grid+index_x+(j0+2)*pnz+k0, sum_S2);
-        gmx_simd4_storeu_pr(grid+index_x+(j0+3)*pnz+k0, sum_S3);
+        gmx_simd4_storeu_r(grid+index_x+(j0+0)*pnz+k0, sum_S0);
+        gmx_simd4_storeu_r(grid+index_x+(j0+1)*pnz+k0, sum_S1);
+        gmx_simd4_storeu_r(grid+index_x+(j0+2)*pnz+k0, sum_S2);
+        gmx_simd4_storeu_r(grid+index_x+(j0+3)*pnz+k0, sum_S3);
      }
  }
  #undef PME_SPREAD_SIMD4_ORDER4
@@ -94,52 +94,52 @@
   * This code does not assume any memory alignment for the grid.
   */
  {
-    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];
+    real             fx_tmp[4], fy_tmp[4], fz_tmp[4];
  
-    gmx_simd4_pr fx_S, fy_S, fz_S;
+    gmx_simd4_real_t fx_S, fy_S, fz_S;
  
-    gmx_simd4_pr tx_S, ty_S, tz_S;
-    gmx_simd4_pr dx_S, dy_S, dz_S;
+    gmx_simd4_real_t tx_S, ty_S, tz_S;
+    gmx_simd4_real_t dx_S, dy_S, dz_S;
  
-    gmx_simd4_pr gval_S;
+    gmx_simd4_real_t gval_S;
  
-    gmx_simd4_pr fxy1_S;
-    gmx_simd4_pr fz1_S;
+    gmx_simd4_real_t fxy1_S;
+    gmx_simd4_real_t fz1_S;
  
-    fx_S = gmx_simd4_setzero_pr();
-    fy_S = gmx_simd4_setzero_pr();
-    fz_S = gmx_simd4_setzero_pr();
+    fx_S = gmx_simd4_setzero_r();
+    fy_S = gmx_simd4_setzero_r();
+    fz_S = gmx_simd4_setzero_r();
  
      /* With order 4 the z-spline is actually aligned */
-    tz_S  = gmx_simd4_load_pr(thz);
-    dz_S  = gmx_simd4_load_pr(dthz);
+    tz_S  = gmx_simd4_load_r(thz);
+    dz_S  = gmx_simd4_load_r(dthz);
  
      for (ithx = 0; (ithx < 4); ithx++)
      {
          index_x  = (i0+ithx)*pny*pnz;
-        tx_S     = gmx_simd4_set1_pr(thx[ithx]);
-        dx_S     = gmx_simd4_set1_pr(dthx[ithx]);
+        tx_S     = gmx_simd4_set1_r(thx[ithx]);
+        dx_S     = gmx_simd4_set1_r(dthx[ithx]);
  
          for (ithy = 0; (ithy < 4); ithy++)
          {
              index_xy = index_x+(j0+ithy)*pnz;
-            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
-            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
+            ty_S     = gmx_simd4_set1_r(thy[ithy]);
+            dy_S     = gmx_simd4_set1_r(dthy[ithy]);
  
-            gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0);
+            gval_S = gmx_simd4_loadu_r(grid+index_xy+k0);
  
-            fxy1_S = gmx_simd4_mul_pr(tz_S, gval_S);
-            fz1_S  = gmx_simd4_mul_pr(dz_S, gval_S);
+            fxy1_S = gmx_simd4_mul_r(tz_S, gval_S);
+            fz1_S  = gmx_simd4_mul_r(dz_S, gval_S);
  
-            fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
-            fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
-            fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
+            fx_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S);
+            fy_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S);
+            fz_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S);
          }
      }
  
-    gmx_simd4_storeu_pr(fx_tmp, fx_S);
-    gmx_simd4_storeu_pr(fy_tmp, fy_S);
-    gmx_simd4_storeu_pr(fz_tmp, fz_S);
+    gmx_simd4_storeu_r(fx_tmp, fx_S);
+    gmx_simd4_storeu_r(fy_tmp, fy_S);
+    gmx_simd4_storeu_r(fz_tmp, fz_S);
  
      fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
      fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
@@ -155,32 +155,32 @@
   * This code supports pme_order <= 5.
   */
  {
-    int          offset;
-    int          index;
-    gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
-    gmx_simd4_pr tz_S0;
-    gmx_simd4_pr tz_S1;
-    gmx_simd4_pr vx_S;
-    gmx_simd4_pr vx_tz_S0;
-    gmx_simd4_pr vx_tz_S1;
-    gmx_simd4_pr sum_S00, sum_S01, sum_S02, sum_S03, sum_S04;
-    gmx_simd4_pr sum_S10, sum_S11, sum_S12, sum_S13, sum_S14;
-    gmx_simd4_pr gri_S00, gri_S01, gri_S02, gri_S03, gri_S04;
-    gmx_simd4_pr gri_S10, gri_S11, gri_S12, gri_S13, gri_S14;
+    int              offset;
+    int              index;
+    gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
+    gmx_simd4_real_t tz_S0;
+    gmx_simd4_real_t tz_S1;
+    gmx_simd4_real_t vx_S;
+    gmx_simd4_real_t vx_tz_S0;
+    gmx_simd4_real_t vx_tz_S1;
+    gmx_simd4_real_t sum_S00, sum_S01, sum_S02, sum_S03, sum_S04;
+    gmx_simd4_real_t sum_S10, sum_S11, sum_S12, sum_S13, sum_S14;
+    gmx_simd4_real_t gri_S00, gri_S01, gri_S02, gri_S03, gri_S04;
+    gmx_simd4_real_t gri_S10, gri_S11, gri_S12, gri_S13, gri_S14;
  
      offset = k0 & 3;
  
-    ty_S0 = gmx_simd4_set1_pr(thy[0]);
-    ty_S1 = gmx_simd4_set1_pr(thy[1]);
-    ty_S2 = gmx_simd4_set1_pr(thy[2]);
-    ty_S3 = gmx_simd4_set1_pr(thy[3]);
+    ty_S0 = gmx_simd4_set1_r(thy[0]);
+    ty_S1 = gmx_simd4_set1_r(thy[1]);
+    ty_S2 = gmx_simd4_set1_r(thy[2]);
+    ty_S3 = gmx_simd4_set1_r(thy[3]);
  #if PME_ORDER == 5
-    ty_S4 = gmx_simd4_set1_pr(thy[4]);
+    ty_S4 = gmx_simd4_set1_r(thy[4]);
  #endif
  
  #ifdef GMX_SIMD4_HAVE_UNALIGNED
-    tz_S0 = gmx_simd4_loadu_pr(thz-offset);
-    tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
+    tz_S0 = gmx_simd4_loadu_r(thz-offset);
+    tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
  #else
      {
          int i;
@@ -189,66 +189,66 @@
          {
              thz_aligned[offset+i] = thz[i];
          }
-        tz_S0 = gmx_simd4_load_pr(thz_aligned);
-        tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
+        tz_S0 = gmx_simd4_load_r(thz_aligned);
+        tz_S1 = gmx_simd4_load_r(thz_aligned+4);
      }
  #endif
-    tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
-    tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
+    tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]);
+    tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]);
  
      for (ithx = 0; (ithx < PME_ORDER); ithx++)
      {
          index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
          valx  = qn*thx[ithx];
  
-        vx_S   = gmx_simd4_set1_pr(valx);
+        vx_S   = gmx_simd4_set1_r(valx);
  
-        vx_tz_S0 = gmx_simd4_mul_pr(vx_S, tz_S0);
-        vx_tz_S1 = gmx_simd4_mul_pr(vx_S, tz_S1);
+        vx_tz_S0 = gmx_simd4_mul_r(vx_S, tz_S0);
+        vx_tz_S1 = gmx_simd4_mul_r(vx_S, tz_S1);
  
-        gri_S00 = gmx_simd4_load_pr(grid+index+0*pnz);
-        gri_S01 = gmx_simd4_load_pr(grid+index+1*pnz);
-        gri_S02 = gmx_simd4_load_pr(grid+index+2*pnz);
-        gri_S03 = gmx_simd4_load_pr(grid+index+3*pnz);
+        gri_S00 = gmx_simd4_load_r(grid+index+0*pnz);
+        gri_S01 = gmx_simd4_load_r(grid+index+1*pnz);
+        gri_S02 = gmx_simd4_load_r(grid+index+2*pnz);
+        gri_S03 = gmx_simd4_load_r(grid+index+3*pnz);
  #if PME_ORDER == 5
-        gri_S04 = gmx_simd4_load_pr(grid+index+4*pnz);
+        gri_S04 = gmx_simd4_load_r(grid+index+4*pnz);
  #endif
-        gri_S10 = gmx_simd4_load_pr(grid+index+0*pnz+4);
-        gri_S11 = gmx_simd4_load_pr(grid+index+1*pnz+4);
-        gri_S12 = gmx_simd4_load_pr(grid+index+2*pnz+4);
-        gri_S13 = gmx_simd4_load_pr(grid+index+3*pnz+4);
+        gri_S10 = gmx_simd4_load_r(grid+index+0*pnz+4);
+        gri_S11 = gmx_simd4_load_r(grid+index+1*pnz+4);
+        gri_S12 = gmx_simd4_load_r(grid+index+2*pnz+4);
+        gri_S13 = gmx_simd4_load_r(grid+index+3*pnz+4);
  #if PME_ORDER == 5
-        gri_S14 = gmx_simd4_load_pr(grid+index+4*pnz+4);
+        gri_S14 = gmx_simd4_load_r(grid+index+4*pnz+4);
  #endif
  
-        sum_S00 = gmx_simd4_madd_pr(vx_tz_S0, ty_S0, gri_S00);
-        sum_S01 = gmx_simd4_madd_pr(vx_tz_S0, ty_S1, gri_S01);
-        sum_S02 = gmx_simd4_madd_pr(vx_tz_S0, ty_S2, gri_S02);
-        sum_S03 = gmx_simd4_madd_pr(vx_tz_S0, ty_S3, gri_S03);
+        sum_S00 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S0, gri_S00);
+        sum_S01 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S1, gri_S01);
+        sum_S02 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S2, gri_S02);
+        sum_S03 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S3, gri_S03);
  #if PME_ORDER == 5
-        sum_S04 = gmx_simd4_madd_pr(vx_tz_S0, ty_S4, gri_S04);
+        sum_S04 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S4, gri_S04);
  #endif
-        sum_S10 = gmx_simd4_madd_pr(vx_tz_S1, ty_S0, gri_S10);
-        sum_S11 = gmx_simd4_madd_pr(vx_tz_S1, ty_S1, gri_S11);
-        sum_S12 = gmx_simd4_madd_pr(vx_tz_S1, ty_S2, gri_S12);
-        sum_S13 = gmx_simd4_madd_pr(vx_tz_S1, ty_S3, gri_S13);
+        sum_S10 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S0, gri_S10);
+        sum_S11 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S1, gri_S11);
+        sum_S12 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S2, gri_S12);
+        sum_S13 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S3, gri_S13);
  #if PME_ORDER == 5
-        sum_S14 = gmx_simd4_madd_pr(vx_tz_S1, ty_S4, gri_S14);
+        sum_S14 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S4, gri_S14);
  #endif
  
-        gmx_simd4_store_pr(grid+index+0*pnz, sum_S00);
-        gmx_simd4_store_pr(grid+index+1*pnz, sum_S01);
-        gmx_simd4_store_pr(grid+index+2*pnz, sum_S02);
-        gmx_simd4_store_pr(grid+index+3*pnz, sum_S03);
+        gmx_simd4_store_r(grid+index+0*pnz, sum_S00);
+        gmx_simd4_store_r(grid+index+1*pnz, sum_S01);
+        gmx_simd4_store_r(grid+index+2*pnz, sum_S02);
+        gmx_simd4_store_r(grid+index+3*pnz, sum_S03);
  #if PME_ORDER == 5
-        gmx_simd4_store_pr(grid+index+4*pnz, sum_S04);
+        gmx_simd4_store_r(grid+index+4*pnz, sum_S04);
  #endif
-        gmx_simd4_store_pr(grid+index+0*pnz+4, sum_S10);
-        gmx_simd4_store_pr(grid+index+1*pnz+4, sum_S11);
-        gmx_simd4_store_pr(grid+index+2*pnz+4, sum_S12);
-        gmx_simd4_store_pr(grid+index+3*pnz+4, sum_S13);
+        gmx_simd4_store_r(grid+index+0*pnz+4, sum_S10);
+        gmx_simd4_store_r(grid+index+1*pnz+4, sum_S11);
+        gmx_simd4_store_r(grid+index+2*pnz+4, sum_S12);
+        gmx_simd4_store_r(grid+index+3*pnz+4, sum_S13);
  #if PME_ORDER == 5
-        gmx_simd4_store_pr(grid+index+4*pnz+4, sum_S14);
+        gmx_simd4_store_r(grid+index+4*pnz+4, sum_S14);
  #endif
      }
  }
@@ -263,36 +263,36 @@
   * This code supports pme_order <= 5.
   */
  {
-    int          offset;
+    int              offset;
  
-    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];
+    real             fx_tmp[4], fy_tmp[4], fz_tmp[4];
  
-    gmx_simd4_pr fx_S, fy_S, fz_S;
+    gmx_simd4_real_t fx_S, fy_S, fz_S;
  
-    gmx_simd4_pr tx_S, ty_S, tz_S0, tz_S1;
-    gmx_simd4_pr dx_S, dy_S, dz_S0, dz_S1;
+    gmx_simd4_real_t tx_S, ty_S, tz_S0, tz_S1;
+    gmx_simd4_real_t dx_S, dy_S, dz_S0, dz_S1;
  
-    gmx_simd4_pr gval_S0;
-    gmx_simd4_pr gval_S1;
+    gmx_simd4_real_t gval_S0;
+    gmx_simd4_real_t gval_S1;
  
-    gmx_simd4_pr fxy1_S0;
-    gmx_simd4_pr fz1_S0;
-    gmx_simd4_pr fxy1_S1;
-    gmx_simd4_pr fz1_S1;
-    gmx_simd4_pr fxy1_S;
-    gmx_simd4_pr fz1_S;
+    gmx_simd4_real_t fxy1_S0;
+    gmx_simd4_real_t fz1_S0;
+    gmx_simd4_real_t fxy1_S1;
+    gmx_simd4_real_t fz1_S1;
+    gmx_simd4_real_t fxy1_S;
+    gmx_simd4_real_t fz1_S;
  
      offset = k0 & 3;
  
-    fx_S = gmx_simd4_setzero_pr();
-    fy_S = gmx_simd4_setzero_pr();
-    fz_S = gmx_simd4_setzero_pr();
+    fx_S = gmx_simd4_setzero_r();
+    fy_S = gmx_simd4_setzero_r();
+    fz_S = gmx_simd4_setzero_r();
  
  #ifdef GMX_SIMD4_HAVE_UNALIGNED
-    tz_S0 = gmx_simd4_loadu_pr(thz-offset);
-    tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
-    dz_S0 = gmx_simd4_loadu_pr(dthz-offset);
-    dz_S1 = gmx_simd4_loadu_pr(dthz-offset+4);
+    tz_S0 = gmx_simd4_loadu_r(thz-offset);
+    tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
+    dz_S0 = gmx_simd4_loadu_r(dthz-offset);
+    dz_S1 = gmx_simd4_loadu_r(dthz-offset+4);
  #else
      {
          int i;
@@ -302,49 +302,49 @@
              thz_aligned[offset+i]  = thz[i];
              dthz_aligned[offset+i] = dthz[i];
          }
-        tz_S0 = gmx_simd4_load_pr(thz_aligned);
-        tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
-        dz_S0 = gmx_simd4_load_pr(dthz_aligned);
-        dz_S1 = gmx_simd4_load_pr(dthz_aligned+4);
+        tz_S0 = gmx_simd4_load_r(thz_aligned);
+        tz_S1 = gmx_simd4_load_r(thz_aligned+4);
+        dz_S0 = gmx_simd4_load_r(dthz_aligned);
+        dz_S1 = gmx_simd4_load_r(dthz_aligned+4);
      }
  #endif
-    tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
-    dz_S0 = gmx_simd4_blendzero_pr(dz_S0, work->mask_S0[offset]);
-    tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
-    dz_S1 = gmx_simd4_blendzero_pr(dz_S1, work->mask_S1[offset]);
+    tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]);
+    dz_S0 = gmx_simd4_blendzero_r(dz_S0, work->mask_S0[offset]);
+    tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]);
+    dz_S1 = gmx_simd4_blendzero_r(dz_S1, work->mask_S1[offset]);
  
      for (ithx = 0; (ithx < PME_ORDER); ithx++)
      {
          index_x  = (i0+ithx)*pny*pnz;
-        tx_S     = gmx_simd4_set1_pr(thx[ithx]);
-        dx_S     = gmx_simd4_set1_pr(dthx[ithx]);
+        tx_S     = gmx_simd4_set1_r(thx[ithx]);
+        dx_S     = gmx_simd4_set1_r(dthx[ithx]);
  
          for (ithy = 0; (ithy < PME_ORDER); ithy++)
          {
              index_xy = index_x+(j0+ithy)*pnz;
-            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
-            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
+            ty_S     = gmx_simd4_set1_r(thy[ithy]);
+            dy_S     = gmx_simd4_set1_r(dthy[ithy]);
  
-            gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset);
-            gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4);
+            gval_S0 = gmx_simd4_load_r(grid+index_xy+k0-offset);
+            gval_S1 = gmx_simd4_load_r(grid+index_xy+k0-offset+4);
  
-            fxy1_S0 = gmx_simd4_mul_pr(tz_S0, gval_S0);
-            fz1_S0  = gmx_simd4_mul_pr(dz_S0, gval_S0);
-            fxy1_S1 = gmx_simd4_mul_pr(tz_S1, gval_S1);
-            fz1_S1  = gmx_simd4_mul_pr(dz_S1, gval_S1);
+            fxy1_S0 = gmx_simd4_mul_r(tz_S0, gval_S0);
+            fz1_S0  = gmx_simd4_mul_r(dz_S0, gval_S0);
+            fxy1_S1 = gmx_simd4_mul_r(tz_S1, gval_S1);
+            fz1_S1  = gmx_simd4_mul_r(dz_S1, gval_S1);
  
-            fxy1_S = gmx_simd4_add_pr(fxy1_S0, fxy1_S1);
-            fz1_S  = gmx_simd4_add_pr(fz1_S0, fz1_S1);
+            fxy1_S = gmx_simd4_add_r(fxy1_S0, fxy1_S1);
+            fz1_S  = gmx_simd4_add_r(fz1_S0, fz1_S1);
  
-            fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
-            fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
-            fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
+            fx_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S);
+            fy_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S);
+            fz_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S);
          }
      }
  
-    gmx_simd4_store_pr(fx_tmp, fx_S);
-    gmx_simd4_store_pr(fy_tmp, fy_S);
-    gmx_simd4_store_pr(fz_tmp, fz_S);
+    gmx_simd4_store_r(fx_tmp, fx_S);
+    gmx_simd4_store_r(fy_tmp, fy_S);
+    gmx_simd4_store_r(fz_tmp, fz_S);
  
      fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
      fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
diff --git a/src/gromacs/mdlib/tpi.c b/src/gromacs/mdlib/tpi.c

index 2726173dd6f1c2e4e02a1c02f83f045c7d708cea..057b73c54fd215193b2e15c47c02f54f0bbc6e06 100644 (file)
--- a/src/gromacs/mdlib/tpi.c
+++ b/src/gromacs/mdlib/tpi.c
@@ -79,7 +79,7 @@
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/timing/walltime_accounting.h"
  
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
  #include "gromacs/simd/general_x86_sse2.h"
  #endif
  
@@ -439,7 +439,7 @@ double do_tpi(FILE *fplog, t_commrec *cr,
  
      refvolshift = log(det(rerun_fr.box));
  
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
      /* Make sure we don't detect SSE overflow generated before this point */
      gmx_mm_check_and_reset_overflow();
  #endif
@@ -631,7 +631,7 @@ double do_tpi(FILE *fplog, t_commrec *cr,
  
                  epot               = enerd->term[F_EPOT];
                  bEnergyOutOfBounds = FALSE;
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
                  /* With SSE the energy can overflow, check for this */
                  if (gmx_mm_check_and_reset_overflow())
                  {
diff --git a/src/gromacs/simd/four_wide_macros.h b/src/gromacs/simd/four_wide_macros.h

index 8f6f08cc49eedbc90ea6a3f9abe10f3979b60153..8ed1d3493290db0194c59dc3a40902ae3e9c9503 100644 (file)
--- a/src/gromacs/simd/four_wide_macros.h
+++ b/src/gromacs/simd/four_wide_macros.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -73,8 +73,8 @@ typedef float   gmx_simd4_real;
  #endif
  
  /* Uncomment the next line, without other SIMD active, for testing plain-C */
-/* #define GMX_SIMD4_REFERENCE_PLAIN_C */
-#ifdef GMX_SIMD4_REFERENCE_PLAIN_C
+/* #define GMX_SIMD4_REFERENCE */
+#ifdef GMX_SIMD4_REFERENCE
  /* Plain C SIMD reference implementation, also serves as documentation */
  #define GMX_HAVE_SIMD4_MACROS
  
@@ -82,51 +82,51 @@ typedef float   gmx_simd4_real;
  #include "four_wide_macros_ref.h"
  
  /* float/double SIMD register type */
-#define gmx_simd4_pr  gmx_simd4_ref_pr
+#define gmx_simd4_real_t  gmx_simd4_ref_pr
  
  /* boolean SIMD register type */
-#define gmx_simd4_pb  gmx_simd4_ref_pb
+#define gmx_simd4_bool_t  gmx_simd4_ref_pb
  
-#define gmx_simd4_load_pr       gmx_simd4_ref_load_pr
+#define gmx_simd4_load_r       gmx_simd4_ref_load_pr
  #define gmx_simd4_load_bb_pr    gmx_simd4_ref_load_pr
-#define gmx_simd4_set1_pr       gmx_simd4_ref_set1_pr
-#define gmx_simd4_setzero_pr    gmx_simd4_ref_setzero_pr
-#define gmx_simd4_store_pr      gmx_simd4_ref_store_pr
+#define gmx_simd4_set1_r       gmx_simd4_ref_set1_pr
+#define gmx_simd4_setzero_r    gmx_simd4_ref_setzero_pr
+#define gmx_simd4_store_r      gmx_simd4_ref_store_pr
  
  /* Unaligned load+store are not required,
   * but they can speed up the PME spread+gather operations.
   */
  #define GMX_SIMD4_HAVE_UNALIGNED
  #ifdef GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_pr      gmx_simd4_ref_load_pr
-#define gmx_simd4_storeu_pr     gmx_simd4_ref_store_pr
+#define gmx_simd4_loadu_r      gmx_simd4_ref_load_pr
+#define gmx_simd4_storeu_r     gmx_simd4_ref_store_pr
  #endif
  
-#define gmx_simd4_add_pr        gmx_simd4_ref_add_pr
-#define gmx_simd4_sub_pr        gmx_simd4_ref_sub_pr
-#define gmx_simd4_mul_pr        gmx_simd4_ref_mul_pr
+#define gmx_simd4_add_r        gmx_simd4_ref_add_pr
+#define gmx_simd4_sub_r        gmx_simd4_ref_sub_pr
+#define gmx_simd4_mul_r        gmx_simd4_ref_mul_pr
  /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
-#define gmx_simd4_madd_pr       gmx_simd4_ref_madd_pr
-#define gmx_simd4_nmsub_pr      gmx_simd4_ref_nmsub_pr
+#define gmx_simd4_fmadd_r       gmx_simd4_ref_madd_pr
+#define gmx_simd4_fnmadd_r      gmx_simd4_ref_nmsub_pr
  
-#define gmx_simd4_dotproduct3   gmx_simd4_ref_dotproduct3
+#define gmx_simd4_dotproduct3_r   gmx_simd4_ref_dotproduct3
  
-#define gmx_simd4_min_pr        gmx_simd4_ref_min_pr
-#define gmx_simd4_max_pr        gmx_simd4_ref_max_pr
+#define gmx_simd4_min_r        gmx_simd4_ref_min_pr
+#define gmx_simd4_max_r        gmx_simd4_ref_max_pr
  
-#define gmx_simd4_blendzero_pr  gmx_simd4_ref_blendzero_pr
+#define gmx_simd4_blendzero_r  gmx_simd4_ref_blendzero_pr
  
  /* Comparison */
-#define gmx_simd4_cmplt_pr      gmx_simd4_ref_cmplt_pr
+#define gmx_simd4_cmplt_r      gmx_simd4_ref_cmplt_pr
  
  /* Logical operations on SIMD booleans */
-#define gmx_simd4_and_pb        gmx_simd4_ref_and_pb
-#define gmx_simd4_or_pb         gmx_simd4_ref_or_pb
+#define gmx_simd4_and_b        gmx_simd4_ref_and_pb
+#define gmx_simd4_or_b         gmx_simd4_ref_or_pb
  
  /* Returns a single int (0/1) which tells if any of the 4 booleans is True */
-#define gmx_simd4_anytrue_pb    gmx_simd4_ref_anytrue_pb
+#define gmx_simd4_anytrue_b    gmx_simd4_ref_anytrue_pb
  
-#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */
+#endif /* GMX_SIMD4_REFERENCE */
  
  
  /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
@@ -139,7 +139,7 @@ typedef float   gmx_simd4_real;
   */
  
  
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
  /* This is for general x86 SIMD instruction sets that also support SSE2 */
  
  #ifdef GMX_SIMD4_SINGLE
@@ -147,17 +147,17 @@ typedef float   gmx_simd4_real;
  #endif
  
  #ifdef GMX_SIMD4_DOUBLE
-/* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA.
+/* Note that here we will use 256-bit SIMD with GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER.
   * This is inconsistent naming wise, but should give the best performance.
   */
-#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
+#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
  #define GMX_HAVE_SIMD4_MACROS
  #endif
  #endif
  
  #ifdef GMX_HAVE_SIMD4_MACROS
  
-#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
+#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
  
  #include <immintrin.h>
  #ifdef HAVE_X86INTRIN_H
@@ -168,7 +168,7 @@ typedef float   gmx_simd4_real;
  #endif
  
  #else
-#ifdef GMX_X86_SSE4_1
+#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
  #include <smmintrin.h>
  #else
  /* We only have SSE2 */
@@ -178,39 +178,39 @@ typedef float   gmx_simd4_real;
  
  #ifdef GMX_SIMD4_SINGLE
  
-#define gmx_simd4_pr  __m128
+#define gmx_simd4_real_t  __m128
  
-#define gmx_simd4_pb  __m128
+#define gmx_simd4_bool_t  __m128
  
-#define gmx_simd4_load_pr       _mm_load_ps
+#define gmx_simd4_load_r       _mm_load_ps
  #define gmx_simd4_load_bb_pr    _mm_load_ps
-#define gmx_simd4_set1_pr       _mm_set1_ps
-#define gmx_simd4_setzero_pr    _mm_setzero_ps
-#define gmx_simd4_store_pr      _mm_store_ps
+#define gmx_simd4_set1_r       _mm_set1_ps
+#define gmx_simd4_setzero_r    _mm_setzero_ps
+#define gmx_simd4_store_r      _mm_store_ps
  
  /* Some old AMD processors could have problems with unaligned loads+stores */
  #ifndef GMX_FAHCORE
  #define GMX_SIMD4_HAVE_UNALIGNED
  #endif
  #ifdef GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_pr      _mm_loadu_ps
-#define gmx_simd4_storeu_pr     _mm_storeu_ps
+#define gmx_simd4_loadu_r      _mm_loadu_ps
+#define gmx_simd4_storeu_r     _mm_storeu_ps
  #endif
  
-#define gmx_simd4_add_pr        _mm_add_ps
-#define gmx_simd4_sub_pr        _mm_sub_ps
-#define gmx_simd4_mul_pr        _mm_mul_ps
+#define gmx_simd4_add_r        _mm_add_ps
+#define gmx_simd4_sub_r        _mm_sub_ps
+#define gmx_simd4_mul_r        _mm_mul_ps
  
-#ifdef GMX_X86_AVX_128_FMA
-#define gmx_simd4_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
-#define gmx_simd4_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
+#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
+#define gmx_simd4_fmadd_r(a, b, c)   _mm_macc_ps(a, b, c)
+#define gmx_simd4_fnmadd_r(a, b, c)  _mm_nmacc_ps(a, b, c)
  #else
-#define gmx_simd4_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
-#define gmx_simd4_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd4_fmadd_r(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd4_fnmadd_r(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
  #endif
  
-static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b)
-#ifdef GMX_X86_SSE4_1
+static inline float gmx_simd4_dotproduct3_r(__m128 a, __m128 b)
+#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
  {
      float dp;
  
@@ -232,66 +232,66 @@ static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b)
  }
  #endif
  
-#define gmx_simd4_min_pr        _mm_min_ps
-#define gmx_simd4_max_pr        _mm_max_ps
+#define gmx_simd4_min_r        _mm_min_ps
+#define gmx_simd4_max_r        _mm_max_ps
  
-#define gmx_simd4_blendzero_pr  _mm_and_ps
+#define gmx_simd4_blendzero_r  _mm_and_ps
  
-#define gmx_simd4_cmplt_pr      _mm_cmplt_ps
-#define gmx_simd4_and_pb        _mm_and_ps
-#define gmx_simd4_or_pb         _mm_or_ps
+#define gmx_simd4_cmplt_r      _mm_cmplt_ps
+#define gmx_simd4_and_b        _mm_and_ps
+#define gmx_simd4_or_b         _mm_or_ps
  
-#define gmx_simd4_anytrue_pb    _mm_movemask_ps
+#define gmx_simd4_anytrue_b    _mm_movemask_ps
  
  #endif /* GMX_SIMD4_SINGLE */
  
  
  #ifdef GMX_SIMD4_DOUBLE
  
-#define gmx_simd4_pr  __m256d
+#define gmx_simd4_real_t  __m256d
  
-#define gmx_simd4_pb  __m256d
+#define gmx_simd4_bool_t  __m256d
  
-#define gmx_simd4_load_pr       _mm256_load_pd
+#define gmx_simd4_load_r       _mm256_load_pd
  #define gmx_simd4_load_bb_pr    _mm256_load_pd
-#define gmx_simd4_set1_pr       _mm256_set1_pd
-#define gmx_simd4_setzero_pr    _mm256_setzero_pd
-#define gmx_simd4_store_pr      _mm256_store_pd
+#define gmx_simd4_set1_r       _mm256_set1_pd
+#define gmx_simd4_setzero_r    _mm256_setzero_pd
+#define gmx_simd4_store_r      _mm256_store_pd
  
  #define GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_pr      _mm256_loadu_pd
-#define gmx_simd4_storeu_pr     _mm256_storeu_pd
-
-#define gmx_simd4_add_pr        _mm256_add_pd
-#define gmx_simd4_sub_pr        _mm256_sub_pd
-#define gmx_simd4_mul_pr        _mm256_mul_pd
-#ifdef GMX_X86_AVX_128_FMA
-#define gmx_simd4_madd_pr(a, b, c)   _mm256_macc_pd(a, b, c)
-#define gmx_simd4_nmsub_pr(a, b, c)  _mm256_nmacc_pd(a, b, c)
+#define gmx_simd4_loadu_r      _mm256_loadu_pd
+#define gmx_simd4_storeu_r     _mm256_storeu_pd
+
+#define gmx_simd4_add_r        _mm256_add_pd
+#define gmx_simd4_sub_r        _mm256_sub_pd
+#define gmx_simd4_mul_r        _mm256_mul_pd
+#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
+#define gmx_simd4_fmadd_r(a, b, c)   _mm256_macc_pd(a, b, c)
+#define gmx_simd4_fnmadd_r(a, b, c)  _mm256_nmacc_pd(a, b, c)
  #else
-#define gmx_simd4_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
-#define gmx_simd4_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
+#define gmx_simd4_fmadd_r(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
+#define gmx_simd4_fnmadd_r(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
  #endif
-#define gmx_simd4_min_pr        _mm256_min_pd
-#define gmx_simd4_max_pr        _mm256_max_pd
+#define gmx_simd4_min_r        _mm256_min_pd
+#define gmx_simd4_max_r        _mm256_max_pd
  
-#define gmx_simd4_blendzero_pr  _mm256_and_pd
+#define gmx_simd4_blendzero_r  _mm256_and_pd
  
  /* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_simd4_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
-#define gmx_simd4_and_pb        _mm256_and_pd
-#define gmx_simd4_or_pb         _mm256_or_pd
+#define gmx_simd4_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
+#define gmx_simd4_and_b        _mm256_and_pd
+#define gmx_simd4_or_b         _mm256_or_pd
  
-#define gmx_simd4_anytrue_pb    _mm256_movemask_pd
+#define gmx_simd4_anytrue_b    _mm256_movemask_pd
  
  #endif /* GMX_SIMD4_DOUBLE */
  
  
  #endif /* GMX_HAVE_SIMD4_MACROS */
  
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
  
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#ifdef GMX_SIMD_IBM_QPX
  /* i.e. BlueGene/Q */
  
  /* This hack works on the compilers that can reach this code. A real
@@ -302,8 +302,8 @@ static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b)
  #define GMX_HAVE_SIMD4_MACROS
  #endif
  
-typedef vector4double gmx_simd4_pr;
-typedef vector4double gmx_simd4_pb;
+typedef vector4double gmx_simd4_real_t;
+typedef vector4double gmx_simd4_bool_t;
  
  /* The declarations of vec_ld* use non-const pointers, and IBM
     can't/won't fix this any time soon. So GROMACS has to cast away the
@@ -316,10 +316,10 @@ typedef vector4double gmx_simd4_pb;
     always-float variables have to be done with a function that does
     the correct cast. Since functions cannot be overloaded by type in
     C, they have to have different names. Thus we have
-   gmx_simd4_load_pr and gmx_simd4_load_bb_pr.
+   gmx_simd4_load_r and gmx_simd4_load_bb_pr.
   */
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_r(const real *a)
  {
  #ifdef NDEBUG
      return vec_ld(0, (real *) a);
@@ -328,7 +328,7 @@ static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a
  #endif
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
  {
  #ifdef NDEBUG
      return vec_ld(0, (float *) a);
@@ -337,12 +337,12 @@ static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const floa
  #endif
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_set1_pr(const real a)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_set1_r(const real a)
  {
      return vec_splats(a);
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr()
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_setzero_r(void)
  {
      return vec_splats(0.0);
  }
@@ -350,7 +350,7 @@ static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr()
  /* TODO this will not yet work, because the function might be passed a
     pointer to a float when running in double precision.
   */
-static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_pr b)
+static gmx_inline void gmx_always_inline gmx_simd4_store_r(real *a, gmx_simd4_real_t b)
  {
  #ifdef NDEBUG
      vec_st(b, 0, a);
@@ -359,64 +359,64 @@ static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_p
  #endif
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_add_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
      return vec_add(a, b);
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_sub_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_sub_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
      return vec_sub(a, b);
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_mul_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_mul_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
      return vec_mul(a, b);
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_madd_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
  {
      return vec_madd(a, b, c);
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_nmsub_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fnmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
  {
      return vec_nmsub(a, b, c);
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_min_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_min_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
      /* Implemented the same way as max, but with the subtraction
         operands swapped. */
      return vec_sel(b, a, vec_sub(b, a));
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_max_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_max_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
      return vec_sel(b, a, vec_sub(a, b));
  }
  
-static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_blendzero_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_blendzero_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
-    return vec_sel(gmx_setzero_pr(), a, b);
+    return vec_sel(gmx_simd_setzero_r(), a, b);
  }
  
-static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_cmplt_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_cmplt_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
      return vec_cmplt(a, b);
  }
  
-static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_and_pb(gmx_simd4_pb a, gmx_simd4_pb b)
+static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_and_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
  {
      return vec_and(a, b);
  }
  
-static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_or_pb(gmx_simd4_pb a, gmx_simd4_pb b)
+static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_or_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
  {
      return vec_or(a, b);
  }
  
-static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a, gmx_simd4_pr b)
+static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
      /* The dot product is done solely on the QPX AXU (which is the
         only available FPU). This is awkward, because pretty much no
@@ -432,25 +432,25 @@ static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a,
         memory at all.
       */
  
-    gmx_simd4_pr dp_shifted_left_0 = vec_mul(a, b);
-    gmx_simd4_pr dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
-    gmx_simd4_pr dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
-    gmx_simd4_pr dp                = vec_add(dp_shifted_left_2,
-                                             vec_add(dp_shifted_left_0, dp_shifted_left_1));
+    gmx_simd4_real_t dp_shifted_left_0 = vec_mul(a, b);
+    gmx_simd4_real_t dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
+    gmx_simd4_real_t dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
+    gmx_simd4_real_t dp                = vec_add(dp_shifted_left_2,
+                                                 vec_add(dp_shifted_left_0, dp_shifted_left_1));
  
      /* See comment in nbnxn_make_pairlist_part() about how this should
         be able to return a double on PowerPC. */
      return (float) vec_extract(dp, 0);
  }
  
-static gmx_inline int gmx_always_inline gmx_simd4_anytrue_pb(gmx_simd4_pb a)
+static gmx_inline int gmx_always_inline gmx_simd4_anytrue_b(gmx_simd4_bool_t a)
  {
-    return gmx_anytrue_pb(a);
+    return gmx_simd_anytrue_b(a);
  }
  
  #undef gmx_always_inline
  
-#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
+#endif /* GMX_SIMD_IBM_QPX */
  
  #ifdef GMX_HAVE_SIMD4_MACROS
  /* Generic functions to extract a SIMD4 aligned pointer from a pointer x.
diff --git a/src/gromacs/simd/four_wide_macros_ref.h b/src/gromacs/simd/four_wide_macros_ref.h

index 002f3a96f9aa55852323fd2ceb7feab9f1ea9ece..8b47d64d7a1803eeece49cd771d4fa1d259eb719 100644 (file)
--- a/src/gromacs/simd/four_wide_macros_ref.h
+++ b/src/gromacs/simd/four_wide_macros_ref.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2013, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -284,7 +284,7 @@ gmx_simd4_ref_or_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b)
      return c;
  }
  
-/* gmx_anytrue_pb(x) returns if any of the boolean is x is True */
+/* gmx_simd_anytrue_b(x) returns whether any of the booleans in x is True */
  static gmx_inline int
  gmx_simd4_ref_anytrue_pb(gmx_simd4_ref_pb a)
  {
diff --git a/src/gromacs/simd/general_x86_avx_128_fma.h b/src/gromacs/simd/general_x86_avx_128_fma.h

index 5314d1c4a20695377c57bce67ee1ae80bcace45e..19ec986dd5b5ad11d1156de3ea60d8f2c6d90bdc 100644 (file)
--- a/src/gromacs/simd/general_x86_avx_128_fma.h
+++ b/src/gromacs/simd/general_x86_avx_128_fma.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -139,7 +139,7 @@ _mm_msub_pd(__m128d a, __m128d b, __m128d c)
  #endif /* AMD FMA emulation support */
  
  static void
-gmx_mm_printxmm_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -149,7 +149,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -159,7 +159,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmm_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -168,7 +168,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm)
  }
  
  static void
-gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -178,7 +178,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
  
  
  static void
-gmx_mm_printxmm_epi32(const char *s, __m128i xmmi)
+gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
  {
      int i[4];
  
@@ -211,7 +211,7 @@ static int gmx_mm_check_and_reset_overflow(void)
  }
  
  /* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
-#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
  #    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
  #    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
  #    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
diff --git a/src/gromacs/simd/general_x86_avx_256.h b/src/gromacs/simd/general_x86_avx_256.h

index d13bdeec8cd8b34a14ed4dec4e069afca4caaac1..b7b1c236e8b20f8e87ef0f711f8fb4ca18a565fd 100644 (file)
--- a/src/gromacs/simd/general_x86_avx_256.h
+++ b/src/gromacs/simd/general_x86_avx_256.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -167,7 +167,7 @@ static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
  
  
  static void
-gmx_mm_printxmm_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -177,7 +177,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -187,7 +187,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmm_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -196,7 +196,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm)
  }
  
  static void
-gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -206,7 +206,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
  
  
  static void
-gmx_mm_printxmm_epi32(const char *s, __m128i xmmi)
+gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
  {
      int i[4];
  
@@ -287,7 +287,7 @@ static int gmx_mm_check_and_reset_overflow(void)
  }
  
  /* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
-#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
  #    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
  #    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
  #    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
diff --git a/src/gromacs/simd/general_x86_mic.h b/src/gromacs/simd/general_x86_mic.h

index b41b42e09cb03ec6215697b2c4b7d25d421ab48c..9f4c19185599cba4c625d4a26ca920f04a7fb5a6 100644 (file)
--- a/src/gromacs/simd/general_x86_mic.h
+++ b/src/gromacs/simd/general_x86_mic.h
@@ -47,49 +47,49 @@
  #endif
  
  typedef __m512 gmx_mm_ps;
-typedef __m512 gmx_mm_pr;
+typedef __m512 gmx_simd_real_t;
  /* boolean SIMD register type */
-typedef __mmask16 gmx_mm_pb;
-typedef __m512i gmx_epi32;
+typedef __mmask16 gmx_simd_bool_t;
+typedef __m512i gmx_simd_int32_t;
  
  #define GMX_HAVE_SIMD_MACROS
-#define GMX_SIMD_WIDTH_HERE  16
-#define GMX_SIMD_EPI32_WIDTH 16
+#define GMX_SIMD_REAL_WIDTH  16
+#define GMX_SIMD_INT32_WIDTH 16
  
-#define gmx_load_pr _mm512_load_ps
+#define gmx_simd_load_r _mm512_load_ps
  
  /* Set all SIMD register elements to *r */
  static gmx_inline gmx_mm_ps
-gmx_load1_pr(const real *r)
+gmx_simd_load1_r(const real *r)
  {
      return _mm512_extload_ps(r, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
  }
  
-#define gmx_set1_pr _mm512_set1_ps
+#define gmx_simd_set1_r _mm512_set1_ps
  /* Set all SIMD register elements to 0 */
-#define gmx_setzero_pr _mm512_setzero_ps
-#define gmx_store_pr _mm512_store_ps
+#define gmx_simd_setzero_r _mm512_setzero_ps
+#define gmx_simd_store_r _mm512_store_ps
  
-#define gmx_add_pr _mm512_add_ps
-#define gmx_sub_pr _mm512_sub_ps
-#define gmx_mul_pr _mm512_mul_ps
+#define gmx_simd_add_r _mm512_add_ps
+#define gmx_simd_sub_r _mm512_sub_ps
+#define gmx_simd_mul_r _mm512_mul_ps
  
  #define GMX_SIMD_HAVE_FMA
-#define gmx_madd_pr _mm512_fmadd_ps
-#define gmx_nmsub_pr _mm512_fnmadd_ps
+#define gmx_simd_fmadd_r _mm512_fmadd_ps
+#define gmx_simd_fnmadd_r _mm512_fnmadd_ps
  
-#define gmx_max_pr _mm512_max_ps
+#define gmx_simd_max_r _mm512_max_ps
  
  static gmx_inline gmx_mm_ps
-gmx_blendzero_pr(gmx_mm_ps a, gmx_mm_pb b)
+gmx_simd_blendzero_r(gmx_mm_ps a, gmx_simd_bool_t b)
  {
      return _mm512_mask_mov_ps(_mm512_setzero_ps(), b, a);
  }
  
-#define gmx_round_pr _mm512_rint_ps
+#define gmx_simd_round_r _mm512_rint_ps
  
  #define GMX_SIMD_HAVE_FLOOR
-#define gmx_floor_pr _mm512_floor_ps
+#define gmx_simd_floor_r _mm512_floor_ps
  
  /* Copy the sign of a to b, assumes b >= 0 for efficiency */
  static gmx_inline gmx_mm_ps
@@ -104,49 +104,49 @@ gmx_cpsgn_nonneg_pr(gmx_mm_ps a, gmx_mm_ps b)
  
  /* Very specific operation required in the non-bonded kernels */
  static gmx_inline gmx_mm_ps
-gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_ps b, gmx_mm_ps c)
+gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_mm_ps b, gmx_mm_ps c)
  {
      return _mm512_mask_add_ps(b, _mm512_knot(a), b, c);
  }
  
  /* Comparison */
-#define gmx_cmplt_pr _mm512_cmplt_ps_mask
+#define gmx_simd_cmplt_r _mm512_cmplt_ps_mask
  
  /* Logical AND on SIMD booleans. */
-#define gmx_and_pb _mm512_kand
+#define gmx_simd_and_b _mm512_kand
  
  /* Logical OR on SIMD booleans. */
-#define gmx_or_pb _mm512_kor
+#define gmx_simd_or_b _mm512_kor
  
  /* Returns a single int (0/1) which tells if any of the booleans is True
     It returns the full mask (not 1 for True). But given that any non-zero is True this is OK. */
-#define gmx_anytrue_pb _mm512_mask2int
+#define gmx_simd_anytrue_b _mm512_mask2int
  
  /* Conversions only used for PME table lookup */
-static gmx_inline gmx_epi32
-gmx_cvttpr_epi32(gmx_mm_ps a)
+static gmx_inline gmx_simd_int32_t
+gmx_simd_cvtt_r2i(gmx_mm_ps a)
  {
      return _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_ROUND_MODE_DOWN, _MM_EXPADJ_NONE);
  };
  
  /* These two function only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
+ * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
   */
-#define gmx_rsqrt_pr _mm512_rsqrt23_ps
-#define gmx_rcp_pr _mm512_rcp23_ps
+#define gmx_simd_rsqrt_r _mm512_rsqrt23_ps
+#define gmx_simd_rcp_r _mm512_rcp23_ps
  
  #define GMX_SIMD_HAVE_EXP
-#define gmx_exp_pr _mm512_exp_ps
+#define gmx_simd_exp_r _mm512_exp_ps
  
  #define GMX_SIMD_HAVE_ERFC
-#define gmx_erfc_pr _mm512_erfc_ps
+#define gmx_simd_erfc_r _mm512_erfc_ps
  
  #define GMX_SIMD_HAVE_TRIGONOMETRIC
-#define gmx_sqrt_pr  _mm512_sqrt_ps
+#define gmx_simd_sqrt_r  _mm512_sqrt_ps
  
  static gmx_inline int
-gmx_sincos_pr(gmx_mm_ps a,
-              gmx_mm_ps *s, gmx_mm_ps *c)
+gmx_simd_sincos_r(gmx_mm_ps a,
+                  gmx_mm_ps *s, gmx_mm_ps *c)
  {
      /* TODO (only bond): optimize that both are calculated together.
         Or (if if that isn't fast on MIC) don't call sincos if only one is needed. */
@@ -155,7 +155,7 @@ gmx_sincos_pr(gmx_mm_ps a,
      return 0;
  }
  
-#define gmx_acos_pr _mm512_acos_ps
-#define gmx_atan2_pr _mm512_atan2_ps
+#define gmx_simd_acos_r _mm512_acos_ps
+#define gmx_simd_atan2_r _mm512_atan2_ps
  
  #endif /* _general_x86_mic_h_ */
diff --git a/src/gromacs/simd/general_x86_sse2.h b/src/gromacs/simd/general_x86_sse2.h

index c6c8b4d3ae564d4d3945b4f4799ca7e1b5f60964..8aa70852f7f85b573242da17e9e4d39003546d5e 100644 (file)
--- a/src/gromacs/simd/general_x86_sse2.h
+++ b/src/gromacs/simd/general_x86_sse2.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -94,7 +94,7 @@ static __m128i gmx_mm_castpd_si128(__m128d a)
  
  
  static void
-gmx_mm_printxmm_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -104,7 +104,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -114,7 +114,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmm_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -123,7 +123,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm)
  }
  
  static void
-gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -133,7 +133,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
  
  
  static void
-gmx_mm_printxmm_epi32(const char *s, __m128i xmmi)
+gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
  {
      int i[4];
  
diff --git a/src/gromacs/simd/general_x86_sse4_1.h b/src/gromacs/simd/general_x86_sse4_1.h

index be0eaa7fa2cce509b5f98168555c23e15cf97229..43b83ef90b7cf38ea96c99ef85539f7f5c0ace4d 100644 (file)
--- a/src/gromacs/simd/general_x86_sse4_1.h
+++ b/src/gromacs/simd/general_x86_sse4_1.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -94,7 +94,7 @@ static __m128i gmx_mm_castpd_si128(__m128d a)
  
  
  static void
-gmx_mm_printxmm_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -104,7 +104,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
+gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
  {
      float f[4];
  
@@ -114,7 +114,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm)
  
  
  static void
-gmx_mm_printxmm_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -123,7 +123,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm)
  }
  
  static void
-gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
+gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
  {
      double f[2];
  
@@ -133,7 +133,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm)
  
  
  static void
-gmx_mm_printxmm_epi32(const char *s, __m128i xmmi)
+gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
  {
      int i[4];
  
diff --git a/src/gromacs/simd/macros.h b/src/gromacs/simd/macros.h

index 4aad78beb3c6e9a4bc05f923cee3e4f3b8ae7a9d..a24cd56d4a143f17ec8bd7374f8fb0120207b10a 100644 (file)
--- a/src/gromacs/simd/macros.h
+++ b/src/gromacs/simd/macros.h
@@ -46,55 +46,55 @@
  
  /* NOTE: SSE2 acceleration does not include floor or blendv */
  
-#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+#ifdef GMX_SIMD_REFERENCE
  /* Plain C SIMD reference implementation, also serves as documentation */
  #define GMX_HAVE_SIMD_MACROS
  
  /* Include plain-C reference implementation, also serves as documentation */
  #include "gromacs/simd/macros_ref.h"
  
-#define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
+#define GMX_SIMD_REAL_WIDTH  GMX_SIMD_REF_WIDTH
  
  /* float/double SIMD register type */
-#define gmx_mm_pr  gmx_simd_ref_pr
+#define gmx_simd_real_t  gmx_simd_ref_pr
  
  /* boolean SIMD register type */
-#define gmx_mm_pb  gmx_simd_ref_pb
+#define gmx_simd_bool_t  gmx_simd_ref_pb
  
  /* integer SIMD register type, only for table indexing and exclusion masks */
-#define gmx_epi32  gmx_simd_ref_epi32
-#define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
+#define gmx_simd_int32_t  gmx_simd_ref_epi32
+#define GMX_SIMD_INT32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
  
-/* Load GMX_SIMD_WIDTH_HERE reals for memory starting at r */
-#define gmx_load_pr       gmx_simd_ref_load_pr
+/* Load GMX_SIMD_REAL_WIDTH reals for memory starting at r */
+#define gmx_simd_load_r       gmx_simd_ref_load_pr
  /* Set all SIMD register elements to *r */
-#define gmx_load1_pr      gmx_simd_ref_load1_pr
-#define gmx_set1_pr       gmx_simd_ref_set1_pr
-#define gmx_setzero_pr    gmx_simd_ref_setzero_pr
-#define gmx_store_pr      gmx_simd_ref_store_pr
-
-#define gmx_add_pr        gmx_simd_ref_add_pr
-#define gmx_sub_pr        gmx_simd_ref_sub_pr
-#define gmx_mul_pr        gmx_simd_ref_mul_pr
+#define gmx_simd_load1_r      gmx_simd_ref_load1_pr
+#define gmx_simd_set1_r       gmx_simd_ref_set1_pr
+#define gmx_simd_setzero_r    gmx_simd_ref_setzero_pr
+#define gmx_simd_store_r      gmx_simd_ref_store_pr
+
+#define gmx_simd_add_r        gmx_simd_ref_add_pr
+#define gmx_simd_sub_r        gmx_simd_ref_sub_pr
+#define gmx_simd_mul_r        gmx_simd_ref_mul_pr
  /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
-#define gmx_madd_pr       gmx_simd_ref_madd_pr
-#define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
+#define gmx_simd_fmadd_r       gmx_simd_ref_madd_pr
+#define gmx_simd_fnmadd_r      gmx_simd_ref_nmsub_pr
  
-#define gmx_max_pr        gmx_simd_ref_max_pr
-#define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr
+#define gmx_simd_max_r        gmx_simd_ref_max_pr
+#define gmx_simd_blendzero_r  gmx_simd_ref_blendzero_pr
  
-#define gmx_round_pr      gmx_simd_ref_round_pr
+#define gmx_simd_round_r      gmx_simd_ref_round_pr
  
  /* Not required, only used to speed up the nbnxn tabulated PME kernels */
  #define GMX_SIMD_HAVE_FLOOR
  #ifdef GMX_SIMD_HAVE_FLOOR
-#define gmx_floor_pr      gmx_simd_ref_floor_pr
+#define gmx_simd_floor_r      gmx_simd_ref_floor_pr
  #endif
  
  /* Not required, only used when blendv is faster than comparison */
  #define GMX_SIMD_HAVE_BLENDV
  #ifdef GMX_SIMD_HAVE_BLENDV
-#define gmx_blendv_pr     gmx_simd_ref_blendv_pr
+#define gmx_simd_blendv_r     gmx_simd_ref_blendv_pr
  #endif
  
  /* Copy the sign of a to b, assumes b >= 0 for efficiency */
@@ -104,39 +104,39 @@
  #define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
  
  /* Comparison */
-#define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr
+#define gmx_simd_cmplt_r      gmx_simd_ref_cmplt_pr
  
  /* Logical operations on SIMD booleans */
-#define gmx_and_pb        gmx_simd_ref_and_pb
-#define gmx_or_pb         gmx_simd_ref_or_pb
+#define gmx_simd_and_b        gmx_simd_ref_and_pb
+#define gmx_simd_or_b         gmx_simd_ref_or_pb
  
  /* Returns a single int (0/1) which tells if any of the 4 booleans is True */
-#define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
+#define gmx_simd_anytrue_b    gmx_simd_ref_anytrue_pb
  
  /* Conversions only used for PME table lookup */
-#define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
-#define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr
+#define gmx_simd_cvtt_r2i  gmx_simd_ref_cvttpr_epi32
+#define gmx_simd_cvt_i2r   gmx_simd_ref_cvtepi32_pr
  
  /* These two function only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
+ * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
   */
-#define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
-#define gmx_rcp_pr        gmx_simd_ref_rcp_pr
+#define gmx_simd_rsqrt_r      gmx_simd_ref_rsqrt_pr
+#define gmx_simd_rcp_r        gmx_simd_ref_rcp_pr
  
  /* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
  #define GMX_SIMD_HAVE_EXP
  #ifdef GMX_SIMD_HAVE_EXP
-#define gmx_exp_pr        gmx_simd_ref_exp_pr
+#define gmx_simd_exp_r        gmx_simd_ref_exp_pr
  #endif
  #define GMX_SIMD_HAVE_TRIGONOMETRIC
  #ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
-#define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
-#define gmx_sincos_pr     gmx_simd_ref_sincos_pr
-#define gmx_acos_pr       gmx_simd_ref_acos_pr
-#define gmx_atan2_pr      gmx_simd_ref_atan2_pr
+#define gmx_simd_sqrt_r       gmx_simd_ref_sqrt_pr
+#define gmx_simd_sincos_r     gmx_simd_ref_sincos_pr
+#define gmx_simd_acos_r       gmx_simd_ref_acos_pr
+#define gmx_simd_atan2_r      gmx_simd_ref_atan2_pr
  #endif
  
-#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
+#endif /* GMX_SIMD_REFERENCE */
  
  
  /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
@@ -150,7 +150,7 @@
  
  
  #ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
-#if defined GMX_X86_AVX_256 || defined __MIC__
+#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__
  /* We have half SIMD width support, continue */
  #else
  #error "half SIMD width intrinsics are not supported"
@@ -159,349 +159,349 @@
  
  #if defined GMX_TARGET_X86 && !defined __MIC__
  
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
  /* This is for general x86 SIMD instruction sets that also support SSE2 */
  #define GMX_HAVE_SIMD_MACROS
  
  /* Include the highest supported x86 SIMD intrisics + math functions */
-#ifdef GMX_X86_AVX_256
+#ifdef GMX_SIMD_X86_AVX_256_OR_HIGHER
  #include "general_x86_avx_256.h"
  #ifdef GMX_DOUBLE
  #include "math_x86_avx_256_double.h"
  #else  /* GMX_DOUBLE */
  #include "math_x86_avx_256_single.h"
  #endif /* GMX_DOUBLE */
-#else  /* GMX_X86_AVX_256 */
-#ifdef GMX_X86_AVX_128_FMA
+#else  /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
+#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
  #include "general_x86_avx_128_fma.h"
  #ifdef GMX_DOUBLE
  #include "math_x86_avx_128_fma_double.h"
  #else  /* GMX_DOUBLE */
  #include "math_x86_avx_128_fma_single.h"
  #endif /* GMX_DOUBLE */
-#else  /* GMX_X86_AVX_128_FMA */
-#ifdef GMX_X86_SSE4_1
+#else  /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
+#ifdef GMX_SIMD_X86_SSE4_1
  #include "general_x86_sse4_1.h"
  #ifdef GMX_DOUBLE
  #include "math_x86_sse4_1_double.h"
  #else  /* GMX_DOUBLE */
  #include "math_x86_sse4_1_single.h"
  #endif /* GMX_DOUBLE */
-#else  /* GMX_X86_SSE4_1 */
-#ifdef GMX_X86_SSE2
+#else  /* GMX_SIMD_X86_SSE4_1 */
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
  #include "general_x86_sse2.h"
  #ifdef GMX_DOUBLE
  #include "math_x86_sse2_double.h"
  #else  /* GMX_DOUBLE */
  #include "math_x86_sse2_single.h"
  #endif /* GMX_DOUBLE */
-#else  /* GMX_X86_SSE2 */
+#else  /* GMX_SIMD_X86_SSE2_OR_HIGHER */
  #error No x86 acceleration defined
-#endif /* GMX_X86_SSE2 */
-#endif /* GMX_X86_SSE4_1 */
-#endif /* GMX_X86_AVX_128_FMA */
-#endif /* GMX_X86_AVX_256 */
+#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
+#endif /* GMX_SIMD_X86_SSE4_1 */
+#endif /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
+#endif /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
  
  /* exp and trigonometric functions are included above */
  #define GMX_SIMD_HAVE_EXP
  #define GMX_SIMD_HAVE_ERFC
  #define GMX_SIMD_HAVE_TRIGONOMETRIC
  
-#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
+#if !defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined GMX_USE_HALF_WIDTH_SIMD_HERE
  
  #ifndef GMX_DOUBLE
  
-#define GMX_SIMD_WIDTH_HERE  4
+#define GMX_SIMD_REAL_WIDTH  4
  
-#define gmx_mm_pr  __m128
+#define gmx_simd_real_t  __m128
  
-#define gmx_mm_pb  __m128
+#define gmx_simd_bool_t  __m128
  
-#define gmx_epi32  __m128i
-#define GMX_SIMD_EPI32_WIDTH  4
+#define gmx_simd_int32_t  __m128i
+#define GMX_SIMD_INT32_WIDTH  4
  
-#define gmx_load_pr       _mm_load_ps
-#define gmx_load1_pr      _mm_load1_ps
-#define gmx_set1_pr       _mm_set1_ps
-#define gmx_setzero_pr    _mm_setzero_ps
-#define gmx_store_pr      _mm_store_ps
+#define gmx_simd_load_r       _mm_load_ps
+#define gmx_simd_load1_r      _mm_load1_ps
+#define gmx_simd_set1_r       _mm_set1_ps
+#define gmx_simd_setzero_r    _mm_setzero_ps
+#define gmx_simd_store_r      _mm_store_ps
  
-#define gmx_add_pr        _mm_add_ps
-#define gmx_sub_pr        _mm_sub_ps
-#define gmx_mul_pr        _mm_mul_ps
-#ifdef GMX_X86_AVX_128_FMA
+#define gmx_simd_add_r        _mm_add_ps
+#define gmx_simd_sub_r        _mm_sub_ps
+#define gmx_simd_mul_r        _mm_mul_ps
+#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
  #define GMX_SIMD_HAVE_FMA
-#define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
-#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
+#define gmx_simd_fmadd_r(a, b, c)   _mm_macc_ps(a, b, c)
+#define gmx_simd_fnmadd_r(a, b, c)  _mm_nmacc_ps(a, b, c)
  #else
-#define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
-#define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd_fmadd_r(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd_fnmadd_r(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
  #endif
-#define gmx_max_pr        _mm_max_ps
-#define gmx_blendzero_pr  _mm_and_ps
+#define gmx_simd_max_r        _mm_max_ps
+#define gmx_simd_blendzero_r  _mm_and_ps
  
-#define gmx_cmplt_pr      _mm_cmplt_ps
-#define gmx_and_pb        _mm_and_ps
-#define gmx_or_pb         _mm_or_ps
+#define gmx_simd_cmplt_r      _mm_cmplt_ps
+#define gmx_simd_and_b        _mm_and_ps
+#define gmx_simd_or_b         _mm_or_ps
  
-#ifdef GMX_X86_SSE4_1
-#define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
+#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
+#define gmx_simd_round_r(x)   _mm_round_ps(x, 0x0)
  #define GMX_SIMD_HAVE_FLOOR
-#define gmx_floor_pr      _mm_floor_ps
+#define gmx_simd_floor_r      _mm_floor_ps
  #else
-#define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
+#define gmx_simd_round_r(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
  #endif
  
-#ifdef GMX_X86_SSE4_1
+#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
  #define GMX_SIMD_HAVE_BLENDV
-#define gmx_blendv_pr     _mm_blendv_ps
+#define gmx_simd_blendv_r     _mm_blendv_ps
  #endif
  
-static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      /* The value -0.0 has only the sign-bit set */
-    gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
+    gmx_simd_real_t sign_mask = _mm_set1_ps(-0.0);
      return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
  };
  
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
      return _mm_add_ps(b, _mm_andnot_ps(a, c));
  };
  
-#define gmx_anytrue_pb    _mm_movemask_ps
+#define gmx_simd_anytrue_b    _mm_movemask_ps
  
-#define gmx_cvttpr_epi32  _mm_cvttps_epi32
-#define gmx_cvtepi32_pr   _mm_cvtepi32_ps
+#define gmx_simd_cvtt_r2i  _mm_cvttps_epi32
+#define gmx_simd_cvt_i2r   _mm_cvtepi32_ps
  
-#define gmx_rsqrt_pr      _mm_rsqrt_ps
-#define gmx_rcp_pr        _mm_rcp_ps
+#define gmx_simd_rsqrt_r      _mm_rsqrt_ps
+#define gmx_simd_rcp_r        _mm_rcp_ps
  
-#define gmx_exp_pr        gmx_mm_exp_ps
-#define gmx_sqrt_pr       gmx_mm_sqrt_ps
-#define gmx_sincos_pr     gmx_mm_sincos_ps
-#define gmx_acos_pr       gmx_mm_acos_ps
-#define gmx_atan2_pr      gmx_mm_atan2_ps
-#define gmx_erfc_pr       gmx_mm_erfc_ps
+#define gmx_simd_exp_r        gmx_mm_exp_ps
+#define gmx_simd_sqrt_r       gmx_mm_sqrt_ps
+#define gmx_simd_sincos_r     gmx_mm_sincos_ps
+#define gmx_simd_acos_r       gmx_mm_acos_ps
+#define gmx_simd_atan2_r      gmx_mm_atan2_ps
+#define gmx_simd_erfc_r       gmx_mm_erfc_ps
  
  #else /* ifndef GMX_DOUBLE */
  
-#define GMX_SIMD_WIDTH_HERE  2
+#define GMX_SIMD_REAL_WIDTH  2
  
-#define gmx_mm_pr  __m128d
+#define gmx_simd_real_t  __m128d
  
-#define gmx_mm_pb  __m128d
+#define gmx_simd_bool_t  __m128d
  
-#define gmx_epi32  __m128i
-#define GMX_SIMD_EPI32_WIDTH  4
+#define gmx_simd_int32_t  __m128i
+#define GMX_SIMD_INT32_WIDTH  4
  
-#define gmx_load_pr       _mm_load_pd
-#define gmx_load1_pr      _mm_load1_pd
-#define gmx_set1_pr       _mm_set1_pd
-#define gmx_setzero_pr    _mm_setzero_pd
-#define gmx_store_pr      _mm_store_pd
+#define gmx_simd_load_r       _mm_load_pd
+#define gmx_simd_load1_r      _mm_load1_pd
+#define gmx_simd_set1_r       _mm_set1_pd
+#define gmx_simd_setzero_r    _mm_setzero_pd
+#define gmx_simd_store_r      _mm_store_pd
  
-#define gmx_add_pr        _mm_add_pd
-#define gmx_sub_pr        _mm_sub_pd
-#define gmx_mul_pr        _mm_mul_pd
-#ifdef GMX_X86_AVX_128_FMA
+#define gmx_simd_add_r        _mm_add_pd
+#define gmx_simd_sub_r        _mm_sub_pd
+#define gmx_simd_mul_r        _mm_mul_pd
+#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
  #define GMX_SIMD_HAVE_FMA
-#define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
-#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
+#define gmx_simd_fmadd_r(a, b, c)   _mm_macc_pd(a, b, c)
+#define gmx_simd_fnmadd_r(a, b, c)  _mm_nmacc_pd(a, b, c)
  #else
-#define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
-#define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
+#define gmx_simd_fmadd_r(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
+#define gmx_simd_fnmadd_r(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
  #endif
-#define gmx_max_pr        _mm_max_pd
-#define gmx_blendzero_pr  _mm_and_pd
+#define gmx_simd_max_r        _mm_max_pd
+#define gmx_simd_blendzero_r  _mm_and_pd
  
-#ifdef GMX_X86_SSE4_1
-#define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
+#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
+#define gmx_simd_round_r(x)   _mm_round_pd(x, 0x0)
  #define GMX_SIMD_HAVE_FLOOR
-#define gmx_floor_pr      _mm_floor_pd
+#define gmx_simd_floor_r      _mm_floor_pd
  #else
-#define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
-/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
+#define gmx_simd_round_r(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
+/* gmx_simd_floor_r is not used in code for pre-SSE4_1 hardware */
  #endif
  
-#ifdef GMX_X86_SSE4_1
+#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
  #define GMX_SIMD_HAVE_BLENDV
-#define gmx_blendv_pr     _mm_blendv_pd
+#define gmx_simd_blendv_r     _mm_blendv_pd
  #endif
  
-static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
  {
-    gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
+    gmx_simd_real_t sign_mask = _mm_set1_pd(-0.0);
      return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
  };
  
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
      return _mm_add_pd(b, _mm_andnot_pd(a, c));
  };
  
-#define gmx_cmplt_pr      _mm_cmplt_pd
+#define gmx_simd_cmplt_r      _mm_cmplt_pd
  
-#define gmx_and_pb        _mm_and_pd
-#define gmx_or_pb         _mm_or_pd
+#define gmx_simd_and_b        _mm_and_pd
+#define gmx_simd_or_b         _mm_or_pd
  
-#define gmx_anytrue_pb    _mm_movemask_pd
+#define gmx_simd_anytrue_b    _mm_movemask_pd
  
-#define gmx_cvttpr_epi32  _mm_cvttpd_epi32
-#define gmx_cvtepi32_pr   _mm_cvtepi32_pd
+#define gmx_simd_cvtt_r2i  _mm_cvttpd_epi32
+#define gmx_simd_cvt_i2r   _mm_cvtepi32_pd
  
-#define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
-#define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
+#define gmx_simd_rsqrt_r(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
+#define gmx_simd_rcp_r(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
  
-#define gmx_exp_pr        gmx_mm_exp_pd
-#define gmx_sqrt_pr       gmx_mm_sqrt_pd
-#define gmx_sincos_pr     gmx_mm_sincos_pd
-#define gmx_acos_pr       gmx_mm_acos_pd
-#define gmx_atan2_pr      gmx_mm_atan2_pd
-#define gmx_erfc_pr       gmx_mm_erfc_pd
+#define gmx_simd_exp_r        gmx_mm_exp_pd
+#define gmx_simd_sqrt_r       gmx_mm_sqrt_pd
+#define gmx_simd_sincos_r     gmx_mm_sincos_pd
+#define gmx_simd_acos_r       gmx_mm_acos_pd
+#define gmx_simd_atan2_r      gmx_mm_atan2_pd
+#define gmx_simd_erfc_r       gmx_mm_erfc_pd
  
  #endif /* ifndef GMX_DOUBLE */
  
  #else
-/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
+/* We have GMX_SIMD_X86_AVX_256_OR_HIGHER and not GMX_USE_HALF_WIDTH_SIMD_HERE,
   * so we use 256-bit SIMD.
   */
  
  #ifndef GMX_DOUBLE
  
-#define GMX_SIMD_WIDTH_HERE  8
+#define GMX_SIMD_REAL_WIDTH  8
  
-#define gmx_mm_pr  __m256
+#define gmx_simd_real_t  __m256
  
-#define gmx_mm_pb  __m256
+#define gmx_simd_bool_t  __m256
  
-#define gmx_epi32  __m256i
-#define GMX_SIMD_EPI32_WIDTH  8
+#define gmx_simd_int32_t  __m256i
+#define GMX_SIMD_INT32_WIDTH  8
  
-#define gmx_load_pr       _mm256_load_ps
-#define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
-#define gmx_set1_pr       _mm256_set1_ps
-#define gmx_setzero_pr    _mm256_setzero_ps
-#define gmx_store_pr      _mm256_store_ps
+#define gmx_simd_load_r       _mm256_load_ps
+#define gmx_simd_load1_r(x)   _mm256_set1_ps((x)[0])
+#define gmx_simd_set1_r       _mm256_set1_ps
+#define gmx_simd_setzero_r    _mm256_setzero_ps
+#define gmx_simd_store_r      _mm256_store_ps
  
-#define gmx_add_pr        _mm256_add_ps
-#define gmx_sub_pr        _mm256_sub_ps
-#define gmx_mul_pr        _mm256_mul_ps
-#define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
-#define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
-#define gmx_max_pr        _mm256_max_ps
-#define gmx_blendzero_pr  _mm256_and_ps
+#define gmx_simd_add_r        _mm256_add_ps
+#define gmx_simd_sub_r        _mm256_sub_ps
+#define gmx_simd_mul_r        _mm256_mul_ps
+#define gmx_simd_fmadd_r(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
+#define gmx_simd_fnmadd_r(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
+#define gmx_simd_max_r        _mm256_max_ps
+#define gmx_simd_blendzero_r  _mm256_and_ps
  
-#define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
+#define gmx_simd_round_r(x)   _mm256_round_ps(x, 0x0)
  #define GMX_SIMD_HAVE_FLOOR
-#define gmx_floor_pr      _mm256_floor_ps
+#define gmx_simd_floor_r      _mm256_floor_ps
  
  #define GMX_SIMD_HAVE_BLENDV
-#define gmx_blendv_pr     _mm256_blendv_ps
+#define gmx_simd_blendv_r     _mm256_blendv_ps
  
-static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
  {
-    gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
+    gmx_simd_real_t sign_mask = _mm256_set1_ps(-0.0);
      return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
  };
  
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
      return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
  };
  
  /* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
-#define gmx_and_pb        _mm256_and_ps
-#define gmx_or_pb         _mm256_or_ps
+#define gmx_simd_cmplt_r(x, y) _mm256_cmp_ps(x, y, 0x11)
+#define gmx_simd_and_b        _mm256_and_ps
+#define gmx_simd_or_b         _mm256_or_ps
  
-#define gmx_anytrue_pb    _mm256_movemask_ps
+#define gmx_simd_anytrue_b    _mm256_movemask_ps
  
-#define gmx_cvttpr_epi32  _mm256_cvttps_epi32
+#define gmx_simd_cvtt_r2i  _mm256_cvttps_epi32
  
-#define gmx_rsqrt_pr      _mm256_rsqrt_ps
-#define gmx_rcp_pr        _mm256_rcp_ps
+#define gmx_simd_rsqrt_r      _mm256_rsqrt_ps
+#define gmx_simd_rcp_r        _mm256_rcp_ps
  
-#define gmx_exp_pr        gmx_mm256_exp_ps
-#define gmx_sqrt_pr       gmx_mm256_sqrt_ps
-#define gmx_sincos_pr     gmx_mm256_sincos_ps
-#define gmx_acos_pr       gmx_mm256_acos_ps
-#define gmx_atan2_pr      gmx_mm256_atan2_ps
-#define gmx_erfc_pr       gmx_mm256_erfc_ps
+#define gmx_simd_exp_r        gmx_mm256_exp_ps
+#define gmx_simd_sqrt_r       gmx_mm256_sqrt_ps
+#define gmx_simd_sincos_r     gmx_mm256_sincos_ps
+#define gmx_simd_acos_r       gmx_mm256_acos_ps
+#define gmx_simd_atan2_r      gmx_mm256_atan2_ps
+#define gmx_simd_erfc_r       gmx_mm256_erfc_ps
  
  #else /* ifndef GMX_DOUBLE */
  
-#define GMX_SIMD_WIDTH_HERE  4
+#define GMX_SIMD_REAL_WIDTH  4
  
-#define gmx_mm_pr  __m256d
+#define gmx_simd_real_t  __m256d
  
-#define gmx_mm_pb  __m256d
+#define gmx_simd_bool_t  __m256d
  
  /* We use 128-bit integer registers because of missing 256-bit operations */
-#define gmx_epi32  __m128i
-#define GMX_SIMD_EPI32_WIDTH  4
-
-#define gmx_load_pr       _mm256_load_pd
-#define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
-#define gmx_set1_pr       _mm256_set1_pd
-#define gmx_setzero_pr    _mm256_setzero_pd
-#define gmx_store_pr      _mm256_store_pd
-
-#define gmx_add_pr        _mm256_add_pd
-#define gmx_sub_pr        _mm256_sub_pd
-#define gmx_mul_pr        _mm256_mul_pd
-#define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
-#define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
-#define gmx_max_pr        _mm256_max_pd
-#define gmx_blendzero_pr  _mm256_and_pd
-
-#define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
+#define gmx_simd_int32_t  __m128i
+#define GMX_SIMD_INT32_WIDTH  4
+
+#define gmx_simd_load_r       _mm256_load_pd
+#define gmx_simd_load1_r(x)   _mm256_set1_pd((x)[0])
+#define gmx_simd_set1_r       _mm256_set1_pd
+#define gmx_simd_setzero_r    _mm256_setzero_pd
+#define gmx_simd_store_r      _mm256_store_pd
+
+#define gmx_simd_add_r        _mm256_add_pd
+#define gmx_simd_sub_r        _mm256_sub_pd
+#define gmx_simd_mul_r        _mm256_mul_pd
+#define gmx_simd_fmadd_r(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
+#define gmx_simd_fnmadd_r(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
+#define gmx_simd_max_r        _mm256_max_pd
+#define gmx_simd_blendzero_r  _mm256_and_pd
+
+#define gmx_simd_round_r(x)   _mm256_round_pd(x, 0x0)
  #define GMX_SIMD_HAVE_FLOOR
-#define gmx_floor_pr      _mm256_floor_pd
+#define gmx_simd_floor_r      _mm256_floor_pd
  
  #define GMX_SIMD_HAVE_BLENDV
-#define gmx_blendv_pr     _mm256_blendv_pd
+#define gmx_simd_blendv_r     _mm256_blendv_pd
  
-static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
  {
-    gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
+    gmx_simd_real_t sign_mask = _mm256_set1_pd(-0.0);
      return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
  };
  
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
      return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
  };
  
  /* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
+#define gmx_simd_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
  
-#define gmx_and_pb        _mm256_and_pd
-#define gmx_or_pb         _mm256_or_pd
+#define gmx_simd_and_b        _mm256_and_pd
+#define gmx_simd_or_b         _mm256_or_pd
  
-#define gmx_anytrue_pb    _mm256_movemask_pd
+#define gmx_simd_anytrue_b    _mm256_movemask_pd
  
-#define gmx_cvttpr_epi32  _mm256_cvttpd_epi32
+#define gmx_simd_cvtt_r2i  _mm256_cvttpd_epi32
  
-#define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
-#define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
+#define gmx_simd_rsqrt_r(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
+#define gmx_simd_rcp_r(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
  
-#define gmx_exp_pr        gmx_mm256_exp_pd
-#define gmx_sqrt_pr       gmx_mm256_sqrt_pd
-#define gmx_sincos_pr     gmx_mm256_sincos_pd
-#define gmx_acos_pr       gmx_mm256_acos_pd
-#define gmx_atan2_pr      gmx_mm256_atan2_pd
-#define gmx_erfc_pr       gmx_mm256_erfc_pd
+#define gmx_simd_exp_r        gmx_mm256_exp_pd
+#define gmx_simd_sqrt_r       gmx_mm256_sqrt_pd
+#define gmx_simd_sincos_r     gmx_mm256_sincos_pd
+#define gmx_simd_acos_r       gmx_mm256_acos_pd
+#define gmx_simd_atan2_r      gmx_mm256_atan2_pd
+#define gmx_simd_erfc_r       gmx_mm256_erfc_pd
  
  #endif /* ifndef GMX_DOUBLE */
  
  #endif /* 128- or 256-bit x86 SIMD */
  
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
  
  #endif /* GMX_TARGET_X86 */
  
-#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#ifdef GMX_SIMD_IBM_QPX
  
  /* This hack works on the compilers that can reach this code. A real
     solution with broader scope will be proposed in master branch. */
@@ -518,13 +518,13 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_
  /* No need to version the code by the precision, because the QPX AXU
     extends to and truncates from double precision for free. */
  
-#define GMX_SIMD_WIDTH_HERE  4
-typedef vector4double gmx_mm_pr;
-typedef vector4double gmx_mm_pb;
-typedef vector4double gmx_epi32;
-#define GMX_SIMD_EPI32_WIDTH  4
+#define GMX_SIMD_REAL_WIDTH  4
+typedef vector4double gmx_simd_real_t;
+typedef vector4double gmx_simd_bool_t;
+typedef vector4double gmx_simd_int32_t;
+#define GMX_SIMD_INT32_WIDTH  4
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_load_pr(const real *a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load_r(const real *a)
  {
  #ifdef NDEBUG
      return vec_ld(0, (real *) a);
@@ -533,22 +533,22 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_load_pr(const real *a)
  #endif
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_load1_pr(const real *a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load1_r(const real *a)
  {
      return vec_splats(*a);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_set1_pr(real a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_set1_r(real a)
  {
      return vec_splats(a);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_setzero_pr()
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_setzero_r()
  {
      return vec_splats(0.0);
  }
  
-static gmx_inline void gmx_always_inline gmx_store_pr(real *a, gmx_mm_pr b)
+static gmx_inline void gmx_always_inline gmx_simd_store_r(real *a, gmx_simd_real_t b)
  {
  #ifdef NDEBUG
      vec_st(b, 0, a);
@@ -557,81 +557,81 @@ static gmx_inline void gmx_always_inline gmx_store_pr(real *a, gmx_mm_pr b)
  #endif
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_add_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_add_r(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      return vec_add(a, b);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_sub_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sub_r(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      return vec_sub(a, b);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_mul_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_mul_r(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      return vec_mul(a, b);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_madd_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
      return vec_madd(a, b, c);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_nmsub_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fnmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
      return vec_nmsub(a, b, c);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_max_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_max_r(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      return vec_sel(b, a, vec_sub(a, b));
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendzero_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendzero_r(gmx_simd_real_t a, gmx_simd_real_t b)
  {
-    return vec_sel(gmx_setzero_pr(), a, b);
+    return vec_sel(gmx_simd_setzero_r(), a, b);
  }
  
-static gmx_inline gmx_mm_pb gmx_always_inline gmx_cmplt_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_cmplt_r(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      return vec_cmplt(a, b);
  }
  
-static gmx_inline gmx_mm_pb gmx_always_inline gmx_and_pb(gmx_mm_pb a, gmx_mm_pb b)
+static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_and_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
  {
      return vec_and(a, b);
  }
  
-static gmx_inline gmx_mm_pb gmx_always_inline gmx_or_pb(gmx_mm_pb a, gmx_mm_pb b)
+static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_or_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
  {
      return vec_or(a, b);
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_round_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_round_r(gmx_simd_real_t a)
  {
      return vec_round(a);
  }
  
  #define GMX_SIMD_HAVE_FLOOR
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_floor_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_floor_r(gmx_simd_real_t a)
  {
      return vec_floor(a);
  }
  
  #define GMX_SIMD_HAVE_BLENDV
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendv_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendv_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
-    return vec_sel(b, a, gmx_cmplt_pr(gmx_setzero_pr(), c));
+    return vec_sel(b, a, gmx_simd_cmplt_r(gmx_simd_setzero_r(), c));
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      return vec_cpsgn(a, b);
  };
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
  {
-    return vec_add(b, vec_sel(c, gmx_setzero_pr(), a));
+    return vec_add(b, vec_sel(c, gmx_simd_setzero_r(), a));
  };
  
  static gmx_inline gmx_bool gmx_always_inline
@@ -640,17 +640,17 @@ GMX_SIMD_IS_TRUE(real x)
      return x >= 0.0;
  }
  
-static gmx_inline gmx_epi32 gmx_always_inline gmx_cvttpr_epi32(gmx_mm_pr a)
+static gmx_inline gmx_simd_int32_t gmx_always_inline gmx_simd_cvtt_r2i(gmx_simd_real_t a)
  {
      return vec_ctiwuz(a);
  }
  /* Don't want this, we have floor */
-/* #define gmx_cvtepi32_pr   vec_cvtepi32 */
+/* #define gmx_simd_cvt_i2r   vec_cvtepi32 */
  
  /* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
     Architecture only promises 2^-8. So probably no need for
     Newton-Raphson iterates at single or double. */
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_rsqrt_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rsqrt_r(gmx_simd_real_t a)
  {
      return vec_rsqrte(a);
  }
@@ -658,7 +658,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_rsqrt_pr(gmx_mm_pr a)
  /* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
     Architecture only promises 2^-5. So probably no need for
     Newton-Raphson iterates at single or double. */
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_rcp_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rcp_r(gmx_simd_real_t a)
  {
      return vec_re(a);
  }
@@ -667,7 +667,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_rcp_pr(gmx_mm_pr a)
     compiling on BlueGene/Q with clang */
  
  #define GMX_SIMD_HAVE_EXP
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_exp_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_exp_r(gmx_simd_real_t a)
  {
  #ifdef __clang__
  #ifndef GMX_DOUBLE
@@ -684,7 +684,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_exp_pr(gmx_mm_pr a)
  #endif
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_sqrt_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sqrt_r(gmx_simd_real_t a)
  {
  #ifdef NDEBUG
      return vec_swsqrt_nochk(a);
@@ -694,7 +694,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_sqrt_pr(gmx_mm_pr a)
  }
  
  #define GMX_SIMD_HAVE_TRIGONOMETRIC
-static gmx_inline int gmx_always_inline gmx_sincos_pr(gmx_mm_pr a, gmx_mm_pr *b, gmx_mm_pr *c)
+static gmx_inline int gmx_always_inline gmx_simd_sincos_r(gmx_simd_real_t a, gmx_simd_real_t *b, gmx_simd_real_t *c)
  {
  #ifdef __clang__
  #ifndef GMX_DOUBLE
@@ -712,7 +712,7 @@ static gmx_inline int gmx_always_inline gmx_sincos_pr(gmx_mm_pr a, gmx_mm_pr *b,
      return 1;
  }
  
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_acos_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_acos_r(gmx_simd_real_t a)
  {
  #ifdef __clang__
  #ifndef GMX_DOUBLE
@@ -731,7 +731,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_acos_pr(gmx_mm_pr a)
  
  /* NB The order of parameters here is correct; the
     documentation of atan2[df]4 in SIMD MASS is wrong. */
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_pr b)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_atan2_r(gmx_simd_real_t a, gmx_simd_real_t b)
  {
  #ifdef __clang__
  #ifndef GMX_DOUBLE
@@ -749,7 +749,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_p
  }
  
  #define GMX_SIMD_HAVE_ERFC
-static gmx_inline gmx_mm_pr gmx_always_inline gmx_erfc_pr(gmx_mm_pr a)
+static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_erfc_r(gmx_simd_real_t a)
  {
      /* The BG/Q qpxmath.h vector math library intended for use with
         bgclang does not have erfc, so we need to use a function from
@@ -765,7 +765,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_erfc_pr(gmx_mm_pr a)
  /* TODO: gmx_mm_erfc_p[sd] should be generalized using gmx_*_pr, so that it just works on BlueGene */
  
  static gmx_inline int gmx_always_inline
-gmx_anytrue_pb(gmx_mm_pb a)
+gmx_simd_anytrue_b(gmx_simd_bool_t a)
  {
      /* The "anytrue" is done solely on the QPX AXU (which is the only
         available FPU). This is awkward, because pretty much no
@@ -780,19 +780,19 @@ gmx_anytrue_pb(gmx_mm_pb a)
         comparison on the zeroth vector element, which avoids needing
         memory at all.
       */
-    gmx_mm_pb vec_shifted_left_0 = a;
-    gmx_mm_pb vec_shifted_left_1 = vec_sldw(a, a, 1);
-    gmx_mm_pb vec_shifted_left_2 = vec_sldw(a, a, 2);
-    gmx_mm_pb vec_shifted_left_3 = vec_sldw(a, a, 3);
+    gmx_simd_bool_t vec_shifted_left_0 = a;
+    gmx_simd_bool_t vec_shifted_left_1 = vec_sldw(a, a, 1);
+    gmx_simd_bool_t vec_shifted_left_2 = vec_sldw(a, a, 2);
+    gmx_simd_bool_t vec_shifted_left_3 = vec_sldw(a, a, 3);
  
-    gmx_mm_pb vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3),
-                                  vec_or(vec_shifted_left_0, vec_shifted_left_1));
+    gmx_simd_bool_t vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3),
+                                        vec_or(vec_shifted_left_0, vec_shifted_left_1));
      return (0.0 < vec_extract(vec_return, 0));
  };
  
  #undef gmx_always_inline
  
-#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
+#endif /* GMX_SIMD_IBM_QPX */
  
  #ifdef __MIC__
  #include "general_x86_mic.h"
@@ -800,20 +800,20 @@ gmx_anytrue_pb(gmx_mm_pb a)
  
  #ifdef GMX_HAVE_SIMD_MACROS
  /* Generic functions to extract a SIMD aligned pointer from a pointer x.
- * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
+ * x should have at least GMX_SIMD_REAL_WIDTH elements extra compared
   * to how many you want to use, to avoid indexing outside the aligned region.
   */
  
  static gmx_inline real *
-gmx_simd_align_real(const real *x)
+gmx_simd_align_r(const real *x)
  {
-    return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
+    return (real *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(real)-1))));
  }
  
  static gmx_inline int *
-gmx_simd_align_int(const int *x)
+gmx_simd_align_i(const int *x)
  {
-    return (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
+    return (int  *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(int )-1))));
  }
  
  
diff --git a/src/gromacs/simd/macros_ref.h b/src/gromacs/simd/macros_ref.h

index 80021d0bb34735fbb2e7a5f53ec8f87d09c27568..2f11e04d99d376c87a13d1b3f829219a9fe4a664 100644 (file)
--- a/src/gromacs/simd/macros_ref.h
+++ b/src/gromacs/simd/macros_ref.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2013, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -410,7 +410,7 @@ gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a)
  };
  
  /* These two functions only need to be approximate; Newton-Raphson iteration
- * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
+ * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
   */
  static gmx_inline gmx_simd_ref_pr
  gmx_simd_ref_rsqrt_pr(gmx_simd_ref_pr a)
diff --git a/src/gromacs/simd/math_double.h b/src/gromacs/simd/math_double.h

index 76dcc95a12f7cda5c8280e31a0cdd79b290387b4..8e7d7331851b49dc5a1717da6223a012f4eb8fea 100644 (file)
--- a/src/gromacs/simd/math_double.h
+++ b/src/gromacs/simd/math_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,32 +37,32 @@
  
  
  /* 1.0/sqrt(x) */
-static gmx_inline gmx_mm_pr
-gmx_invsqrt_pr(gmx_mm_pr x)
+static gmx_inline gmx_simd_real_t
+gmx_simd_invsqrt_r(gmx_simd_real_t x)
  {
-    const gmx_mm_pr half  = gmx_set1_pr(0.5);
-    const gmx_mm_pr three = gmx_set1_pr(3.0);
+    const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
+    const gmx_simd_real_t three = gmx_simd_set1_r(3.0);
  
      /* Lookup instruction only exists in single precision, convert back and forth... */
-    gmx_mm_pr lu = gmx_rsqrt_pr(x);
+    gmx_simd_real_t lu = gmx_simd_rsqrt_r(x);
  
-    lu = gmx_mul_pr(gmx_mul_pr(half, lu), gmx_nmsub_pr(gmx_mul_pr(lu, lu), x, three));
-    return gmx_mul_pr(gmx_mul_pr(half, lu), gmx_nmsub_pr(gmx_mul_pr(lu, lu), x, three));
+    lu = gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three));
+    return gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three));
  }
  
  
  /* 1.0/x */
-static gmx_inline gmx_mm_pr
-gmx_inv_pr(gmx_mm_pr x)
+static gmx_inline gmx_simd_real_t
+gmx_simd_inv_r(gmx_simd_real_t x)
  {
-    const gmx_mm_pr two  = gmx_set1_pr(2.0);
+    const gmx_simd_real_t two  = gmx_simd_set1_r(2.0);
  
      /* Lookup instruction only exists in single precision, convert back and forth... */
-    gmx_mm_pr lu = gmx_rcp_pr(x);
+    gmx_simd_real_t lu = gmx_simd_rcp_r(x);
  
      /* Perform two N-R steps for double precision */
-    lu         = gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two));
-    return gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two));
+    lu         = gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
+    return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
  }
  
  
@@ -134,54 +134,54 @@ gmx_inv_pr(gmx_mm_pr x)
   *    vectorial force to add to the particles.
   *
   */
-static gmx_mm_pr
-gmx_pmecorrF_pr(gmx_mm_pr z2)
+static gmx_simd_real_t
+gmx_simd_pmecorrF_r(gmx_simd_real_t z2)
  {
-    const gmx_mm_pr  FN10     = gmx_set1_pr(-8.0072854618360083154e-14);
-    const gmx_mm_pr  FN9      = gmx_set1_pr(1.1859116242260148027e-11);
-    const gmx_mm_pr  FN8      = gmx_set1_pr(-8.1490406329798423616e-10);
-    const gmx_mm_pr  FN7      = gmx_set1_pr(3.4404793543907847655e-8);
-    const gmx_mm_pr  FN6      = gmx_set1_pr(-9.9471420832602741006e-7);
-    const gmx_mm_pr  FN5      = gmx_set1_pr(0.000020740315999115847456);
-    const gmx_mm_pr  FN4      = gmx_set1_pr(-0.00031991745139313364005);
-    const gmx_mm_pr  FN3      = gmx_set1_pr(0.0035074449373659008203);
-    const gmx_mm_pr  FN2      = gmx_set1_pr(-0.031750380176100813405);
-    const gmx_mm_pr  FN1      = gmx_set1_pr(0.13884101728898463426);
-    const gmx_mm_pr  FN0      = gmx_set1_pr(-0.75225277815249618847);
+    const gmx_simd_real_t  FN10     = gmx_simd_set1_r(-8.0072854618360083154e-14);
+    const gmx_simd_real_t  FN9      = gmx_simd_set1_r(1.1859116242260148027e-11);
+    const gmx_simd_real_t  FN8      = gmx_simd_set1_r(-8.1490406329798423616e-10);
+    const gmx_simd_real_t  FN7      = gmx_simd_set1_r(3.4404793543907847655e-8);
+    const gmx_simd_real_t  FN6      = gmx_simd_set1_r(-9.9471420832602741006e-7);
+    const gmx_simd_real_t  FN5      = gmx_simd_set1_r(0.000020740315999115847456);
+    const gmx_simd_real_t  FN4      = gmx_simd_set1_r(-0.00031991745139313364005);
+    const gmx_simd_real_t  FN3      = gmx_simd_set1_r(0.0035074449373659008203);
+    const gmx_simd_real_t  FN2      = gmx_simd_set1_r(-0.031750380176100813405);
+    const gmx_simd_real_t  FN1      = gmx_simd_set1_r(0.13884101728898463426);
+    const gmx_simd_real_t  FN0      = gmx_simd_set1_r(-0.75225277815249618847);
  
-    const gmx_mm_pr  FD5      = gmx_set1_pr(0.000016009278224355026701);
-    const gmx_mm_pr  FD4      = gmx_set1_pr(0.00051055686934806966046);
-    const gmx_mm_pr  FD3      = gmx_set1_pr(0.0081803507497974289008);
-    const gmx_mm_pr  FD2      = gmx_set1_pr(0.077181146026670287235);
-    const gmx_mm_pr  FD1      = gmx_set1_pr(0.41543303143712535988);
-    const gmx_mm_pr  FD0      = gmx_set1_pr(1.0);
+    const gmx_simd_real_t  FD5      = gmx_simd_set1_r(0.000016009278224355026701);
+    const gmx_simd_real_t  FD4      = gmx_simd_set1_r(0.00051055686934806966046);
+    const gmx_simd_real_t  FD3      = gmx_simd_set1_r(0.0081803507497974289008);
+    const gmx_simd_real_t  FD2      = gmx_simd_set1_r(0.077181146026670287235);
+    const gmx_simd_real_t  FD1      = gmx_simd_set1_r(0.41543303143712535988);
+    const gmx_simd_real_t  FD0      = gmx_simd_set1_r(1.0);
  
-    gmx_mm_pr        z4;
-    gmx_mm_pr        polyFN0, polyFN1, polyFD0, polyFD1;
+    gmx_simd_real_t        z4;
+    gmx_simd_real_t        polyFN0, polyFN1, polyFD0, polyFD1;
  
-    z4             = gmx_mul_pr(z2, z2);
+    z4             = gmx_simd_mul_r(z2, z2);
  
-    polyFD1        = gmx_madd_pr(FD5, z4, FD3);
-    polyFD1        = gmx_madd_pr(polyFD1, z4, FD1);
-    polyFD1        = gmx_mul_pr(polyFD1, z2);
-    polyFD0        = gmx_madd_pr(FD4, z4, FD2);
-    polyFD0        = gmx_madd_pr(polyFD0, z4, FD0);
-    polyFD0        = gmx_add_pr(polyFD0, polyFD1);
+    polyFD1        = gmx_simd_fmadd_r(FD5, z4, FD3);
+    polyFD1        = gmx_simd_fmadd_r(polyFD1, z4, FD1);
+    polyFD1        = gmx_simd_mul_r(polyFD1, z2);
+    polyFD0        = gmx_simd_fmadd_r(FD4, z4, FD2);
+    polyFD0        = gmx_simd_fmadd_r(polyFD0, z4, FD0);
+    polyFD0        = gmx_simd_add_r(polyFD0, polyFD1);
  
-    polyFD0        = gmx_inv_pr(polyFD0);
+    polyFD0        = gmx_simd_inv_r(polyFD0);
  
-    polyFN0        = gmx_madd_pr(FN10, z4, FN8);
-    polyFN0        = gmx_madd_pr(polyFN0, z4, FN6);
-    polyFN0        = gmx_madd_pr(polyFN0, z4, FN4);
-    polyFN0        = gmx_madd_pr(polyFN0, z4, FN2);
-    polyFN0        = gmx_madd_pr(polyFN0, z4, FN0);
-    polyFN1        = gmx_madd_pr(FN9, z4, FN7);
-    polyFN1        = gmx_madd_pr(polyFN1, z4, FN5);
-    polyFN1        = gmx_madd_pr(polyFN1, z4, FN3);
-    polyFN1        = gmx_madd_pr(polyFN1, z4, FN1);
-    polyFN0        = gmx_madd_pr(polyFN1, z2, polyFN0);
+    polyFN0        = gmx_simd_fmadd_r(FN10, z4, FN8);
+    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN6);
+    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN4);
+    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN2);
+    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN0);
+    polyFN1        = gmx_simd_fmadd_r(FN9, z4, FN7);
+    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN5);
+    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN3);
+    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN1);
+    polyFN0        = gmx_simd_fmadd_r(polyFN1, z2, polyFN0);
  
-    return gmx_mul_pr(polyFN0, polyFD0);
+    return gmx_simd_mul_r(polyFN0, polyFD0);
  }
  
  
@@ -212,51 +212,51 @@ gmx_pmecorrF_pr(gmx_mm_pr z2)
   *    and you have your potential.
   *
   */
-static gmx_mm_pr
-gmx_pmecorrV_pr(gmx_mm_pr z2)
+static gmx_simd_real_t
+gmx_simd_pmecorrV_r(gmx_simd_real_t z2)
  {
-    const gmx_mm_pr  VN9      = gmx_set1_pr(-9.3723776169321855475e-13);
-    const gmx_mm_pr  VN8      = gmx_set1_pr(1.2280156762674215741e-10);
-    const gmx_mm_pr  VN7      = gmx_set1_pr(-7.3562157912251309487e-9);
-    const gmx_mm_pr  VN6      = gmx_set1_pr(2.6215886208032517509e-7);
-    const gmx_mm_pr  VN5      = gmx_set1_pr(-4.9532491651265819499e-6);
-    const gmx_mm_pr  VN4      = gmx_set1_pr(0.00025907400778966060389);
-    const gmx_mm_pr  VN3      = gmx_set1_pr(0.0010585044856156469792);
-    const gmx_mm_pr  VN2      = gmx_set1_pr(0.045247661136833092885);
-    const gmx_mm_pr  VN1      = gmx_set1_pr(0.11643931522926034421);
-    const gmx_mm_pr  VN0      = gmx_set1_pr(1.1283791671726767970);
+    const gmx_simd_real_t  VN9      = gmx_simd_set1_r(-9.3723776169321855475e-13);
+    const gmx_simd_real_t  VN8      = gmx_simd_set1_r(1.2280156762674215741e-10);
+    const gmx_simd_real_t  VN7      = gmx_simd_set1_r(-7.3562157912251309487e-9);
+    const gmx_simd_real_t  VN6      = gmx_simd_set1_r(2.6215886208032517509e-7);
+    const gmx_simd_real_t  VN5      = gmx_simd_set1_r(-4.9532491651265819499e-6);
+    const gmx_simd_real_t  VN4      = gmx_simd_set1_r(0.00025907400778966060389);
+    const gmx_simd_real_t  VN3      = gmx_simd_set1_r(0.0010585044856156469792);
+    const gmx_simd_real_t  VN2      = gmx_simd_set1_r(0.045247661136833092885);
+    const gmx_simd_real_t  VN1      = gmx_simd_set1_r(0.11643931522926034421);
+    const gmx_simd_real_t  VN0      = gmx_simd_set1_r(1.1283791671726767970);
  
-    const gmx_mm_pr  VD5      = gmx_set1_pr(0.000021784709867336150342);
-    const gmx_mm_pr  VD4      = gmx_set1_pr(0.00064293662010911388448);
-    const gmx_mm_pr  VD3      = gmx_set1_pr(0.0096311444822588683504);
-    const gmx_mm_pr  VD2      = gmx_set1_pr(0.085608012351550627051);
-    const gmx_mm_pr  VD1      = gmx_set1_pr(0.43652499166614811084);
-    const gmx_mm_pr  VD0      = gmx_set1_pr(1.0);
+    const gmx_simd_real_t  VD5      = gmx_simd_set1_r(0.000021784709867336150342);
+    const gmx_simd_real_t  VD4      = gmx_simd_set1_r(0.00064293662010911388448);
+    const gmx_simd_real_t  VD3      = gmx_simd_set1_r(0.0096311444822588683504);
+    const gmx_simd_real_t  VD2      = gmx_simd_set1_r(0.085608012351550627051);
+    const gmx_simd_real_t  VD1      = gmx_simd_set1_r(0.43652499166614811084);
+    const gmx_simd_real_t  VD0      = gmx_simd_set1_r(1.0);
  
-    gmx_mm_pr        z4;
-    gmx_mm_pr        polyVN0, polyVN1, polyVD0, polyVD1;
+    gmx_simd_real_t        z4;
+    gmx_simd_real_t        polyVN0, polyVN1, polyVD0, polyVD1;
  
-    z4             = gmx_mul_pr(z2, z2);
+    z4             = gmx_simd_mul_r(z2, z2);
  
-    polyVD1        = gmx_madd_pr(VD5, z4, VD3);
-    polyVD0        = gmx_madd_pr(VD4, z4, VD2);
-    polyVD1        = gmx_madd_pr(polyVD1, z4, VD1);
-    polyVD0        = gmx_madd_pr(polyVD0, z4, VD0);
-    polyVD0        = gmx_madd_pr(polyVD1, z2, polyVD0);
+    polyVD1        = gmx_simd_fmadd_r(VD5, z4, VD3);
+    polyVD0        = gmx_simd_fmadd_r(VD4, z4, VD2);
+    polyVD1        = gmx_simd_fmadd_r(polyVD1, z4, VD1);
+    polyVD0        = gmx_simd_fmadd_r(polyVD0, z4, VD0);
+    polyVD0        = gmx_simd_fmadd_r(polyVD1, z2, polyVD0);
  
-    polyVD0        = gmx_inv_pr(polyVD0);
+    polyVD0        = gmx_simd_inv_r(polyVD0);
  
-    polyVN1        = gmx_madd_pr(VN9, z4, VN7);
-    polyVN0        = gmx_madd_pr(VN8, z4, VN6);
-    polyVN1        = gmx_madd_pr(polyVN1, z4, VN5);
-    polyVN0        = gmx_madd_pr(polyVN0, z4, VN4);
-    polyVN1        = gmx_madd_pr(polyVN1, z4, VN3);
-    polyVN0        = gmx_madd_pr(polyVN0, z4, VN2);
-    polyVN1        = gmx_madd_pr(polyVN1, z4, VN1);
-    polyVN0        = gmx_madd_pr(polyVN0, z4, VN0);
-    polyVN0        = gmx_madd_pr(polyVN1, z2, polyVN0);
+    polyVN1        = gmx_simd_fmadd_r(VN9, z4, VN7);
+    polyVN0        = gmx_simd_fmadd_r(VN8, z4, VN6);
+    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN5);
+    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN4);
+    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN3);
+    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN2);
+    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN1);
+    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN0);
+    polyVN0        = gmx_simd_fmadd_r(polyVN1, z2, polyVN0);
  
-    return gmx_mul_pr(polyVN0, polyVD0);
+    return gmx_simd_mul_r(polyVN0, polyVD0);
  }
  
  
diff --git a/src/gromacs/simd/math_single.h b/src/gromacs/simd/math_single.h

index 377855c549b9bbca892a5399539a2575ef52c427..c956b9ad86735734115395a554e3553f715e349c 100644 (file)
--- a/src/gromacs/simd/math_single.h
+++ b/src/gromacs/simd/math_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,40 +37,40 @@
  
  
  /* 1.0/sqrt(x) */
-static gmx_inline gmx_mm_pr
-gmx_invsqrt_pr(gmx_mm_pr x)
+static gmx_inline gmx_simd_real_t
+gmx_simd_invsqrt_r(gmx_simd_real_t x)
  {
      /* This is one of the few cases where FMA adds a FLOP, but ends up with
       * fewer instructions in total when FMA is available in hardware.
       * Usually we would not optimize this far, but invsqrt is used often.
       */
  #ifdef GMX_SIMD_HAVE_FMA
-    const gmx_mm_pr half  = gmx_set1_pr(0.5);
-    const gmx_mm_pr one   = gmx_set1_pr(1.0);
+    const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
+    const gmx_simd_real_t one   = gmx_simd_set1_r(1.0);
  
-    gmx_mm_pr       lu = gmx_rsqrt_pr(x);
+    gmx_simd_real_t       lu = gmx_simd_rsqrt_r(x);
  
-    return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu);
+    return gmx_simd_fmadd_r(gmx_simd_fnmadd_r(x, gmx_simd_mul_r(lu, lu), one), gmx_simd_mul_r(lu, half), lu);
  #else
-    const gmx_mm_pr half  = gmx_set1_pr(0.5);
-    const gmx_mm_pr three = gmx_set1_pr(3.0);
+    const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
+    const gmx_simd_real_t three = gmx_simd_set1_r(3.0);
  
-    gmx_mm_pr       lu = gmx_rsqrt_pr(x);
+    gmx_simd_real_t       lu = gmx_simd_rsqrt_r(x);
  
-    return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu));
+    return gmx_simd_mul_r(half, gmx_simd_mul_r(gmx_simd_sub_r(three, gmx_simd_mul_r(gmx_simd_mul_r(lu, lu), x)), lu));
  #endif
  }
  
  
  /* 1.0/x */
-static gmx_inline gmx_mm_pr
-gmx_inv_pr(gmx_mm_pr x)
+static gmx_inline gmx_simd_real_t
+gmx_simd_inv_r(gmx_simd_real_t x)
  {
-    const gmx_mm_pr two = gmx_set1_pr(2.0);
+    const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
  
-    gmx_mm_pr       lu = gmx_rcp_pr(x);
+    gmx_simd_real_t       lu = gmx_simd_rcp_r(x);
  
-    return gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two));
+    return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
  }
  
  
@@ -142,49 +142,49 @@ gmx_inv_pr(gmx_mm_pr x)
   *    vectorial force to add to the particles.
   *
   */
-static gmx_mm_pr
-gmx_pmecorrF_pr(gmx_mm_pr z2)
+static gmx_simd_real_t
+gmx_simd_pmecorrF_r(gmx_simd_real_t z2)
  {
-    const gmx_mm_pr  FN6      = gmx_set1_pr(-1.7357322914161492954e-8f);
-    const gmx_mm_pr  FN5      = gmx_set1_pr(1.4703624142580877519e-6f);
-    const gmx_mm_pr  FN4      = gmx_set1_pr(-0.000053401640219807709149f);
-    const gmx_mm_pr  FN3      = gmx_set1_pr(0.0010054721316683106153f);
-    const gmx_mm_pr  FN2      = gmx_set1_pr(-0.019278317264888380590f);
-    const gmx_mm_pr  FN1      = gmx_set1_pr(0.069670166153766424023f);
-    const gmx_mm_pr  FN0      = gmx_set1_pr(-0.75225204789749321333f);
-
-    const gmx_mm_pr  FD4      = gmx_set1_pr(0.0011193462567257629232f);
-    const gmx_mm_pr  FD3      = gmx_set1_pr(0.014866955030185295499f);
-    const gmx_mm_pr  FD2      = gmx_set1_pr(0.11583842382862377919f);
-    const gmx_mm_pr  FD1      = gmx_set1_pr(0.50736591960530292870f);
-    const gmx_mm_pr  FD0      = gmx_set1_pr(1.0f);
-
-    gmx_mm_pr        z4;
-    gmx_mm_pr        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = gmx_mul_pr(z2, z2);
-
-    polyFD0        = gmx_madd_pr(FD4, z4, FD2);
-    polyFD1        = gmx_madd_pr(FD3, z4, FD1);
-    polyFD0        = gmx_madd_pr(polyFD0, z4, FD0);
-    polyFD0        = gmx_madd_pr(polyFD1, z2, polyFD0);
-
-    polyFD0        = gmx_inv_pr(polyFD0);
-
-    polyFN0        = gmx_madd_pr(FN6, z4, FN4);
-    polyFN1        = gmx_madd_pr(FN5, z4, FN3);
-    polyFN0        = gmx_madd_pr(polyFN0, z4, FN2);
-    polyFN1        = gmx_madd_pr(polyFN1, z4, FN1);
-    polyFN0        = gmx_madd_pr(polyFN0, z4, FN0);
-    polyFN0        = gmx_madd_pr(polyFN1, z2, polyFN0);
-
-    return gmx_mul_pr(polyFN0, polyFD0);
+    const gmx_simd_real_t  FN6      = gmx_simd_set1_r(-1.7357322914161492954e-8f);
+    const gmx_simd_real_t  FN5      = gmx_simd_set1_r(1.4703624142580877519e-6f);
+    const gmx_simd_real_t  FN4      = gmx_simd_set1_r(-0.000053401640219807709149f);
+    const gmx_simd_real_t  FN3      = gmx_simd_set1_r(0.0010054721316683106153f);
+    const gmx_simd_real_t  FN2      = gmx_simd_set1_r(-0.019278317264888380590f);
+    const gmx_simd_real_t  FN1      = gmx_simd_set1_r(0.069670166153766424023f);
+    const gmx_simd_real_t  FN0      = gmx_simd_set1_r(-0.75225204789749321333f);
+
+    const gmx_simd_real_t  FD4      = gmx_simd_set1_r(0.0011193462567257629232f);
+    const gmx_simd_real_t  FD3      = gmx_simd_set1_r(0.014866955030185295499f);
+    const gmx_simd_real_t  FD2      = gmx_simd_set1_r(0.11583842382862377919f);
+    const gmx_simd_real_t  FD1      = gmx_simd_set1_r(0.50736591960530292870f);
+    const gmx_simd_real_t  FD0      = gmx_simd_set1_r(1.0f);
+
+    gmx_simd_real_t        z4;
+    gmx_simd_real_t        polyFN0, polyFN1, polyFD0, polyFD1;
+
+    z4             = gmx_simd_mul_r(z2, z2);
+
+    polyFD0        = gmx_simd_fmadd_r(FD4, z4, FD2);
+    polyFD1        = gmx_simd_fmadd_r(FD3, z4, FD1);
+    polyFD0        = gmx_simd_fmadd_r(polyFD0, z4, FD0);
+    polyFD0        = gmx_simd_fmadd_r(polyFD1, z2, polyFD0);
+
+    polyFD0        = gmx_simd_inv_r(polyFD0);
+
+    polyFN0        = gmx_simd_fmadd_r(FN6, z4, FN4);
+    polyFN1        = gmx_simd_fmadd_r(FN5, z4, FN3);
+    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN2);
+    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN1);
+    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN0);
+    polyFN0        = gmx_simd_fmadd_r(polyFN1, z2, polyFN0);
+
+    return gmx_simd_mul_r(polyFN0, polyFD0);
  }
  
  
  /* Calculate the potential correction due to PME analytically.
   *
- * See gmx_pmecorrF_pr() for details about the approximation.
+ * See gmx_simd_pmecorrF_r() for details about the approximation.
   *
   * This routine calculates Erf(z)/z, although you should provide z^2
   * as the input argument.
@@ -210,41 +210,41 @@ gmx_pmecorrF_pr(gmx_mm_pr z2)
   * 6. Add the result to 1/r, multiply by the product of the charges,
   *    and you have your potential.
   */
-static gmx_mm_pr
-gmx_pmecorrV_pr(gmx_mm_pr z2)
+static gmx_simd_real_t
+gmx_simd_pmecorrV_r(gmx_simd_real_t z2)
  {
-    const gmx_mm_pr  VN6      = gmx_set1_pr(1.9296833005951166339e-8f);
-    const gmx_mm_pr  VN5      = gmx_set1_pr(-1.4213390571557850962e-6f);
-    const gmx_mm_pr  VN4      = gmx_set1_pr(0.000041603292906656984871f);
-    const gmx_mm_pr  VN3      = gmx_set1_pr(-0.00013134036773265025626f);
-    const gmx_mm_pr  VN2      = gmx_set1_pr(0.038657983986041781264f);
-    const gmx_mm_pr  VN1      = gmx_set1_pr(0.11285044772717598220f);
-    const gmx_mm_pr  VN0      = gmx_set1_pr(1.1283802385263030286f);
-
-    const gmx_mm_pr  VD3      = gmx_set1_pr(0.0066752224023576045451f);
-    const gmx_mm_pr  VD2      = gmx_set1_pr(0.078647795836373922256f);
-    const gmx_mm_pr  VD1      = gmx_set1_pr(0.43336185284710920150f);
-    const gmx_mm_pr  VD0      = gmx_set1_pr(1.0f);
-
-    gmx_mm_pr        z4;
-    gmx_mm_pr        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = gmx_mul_pr(z2, z2);
-
-    polyVD1        = gmx_madd_pr(VD3, z4, VD1);
-    polyVD0        = gmx_madd_pr(VD2, z4, VD0);
-    polyVD0        = gmx_madd_pr(polyVD1, z2, polyVD0);
-
-    polyVD0        = gmx_inv_pr(polyVD0);
-
-    polyVN0        = gmx_madd_pr(VN6, z4, VN4);
-    polyVN1        = gmx_madd_pr(VN5, z4, VN3);
-    polyVN0        = gmx_madd_pr(polyVN0, z4, VN2);
-    polyVN1        = gmx_madd_pr(polyVN1, z4, VN1);
-    polyVN0        = gmx_madd_pr(polyVN0, z4, VN0);
-    polyVN0        = gmx_madd_pr(polyVN1, z2, polyVN0);
-
-    return gmx_mul_pr(polyVN0, polyVD0);
+    const gmx_simd_real_t  VN6      = gmx_simd_set1_r(1.9296833005951166339e-8f);
+    const gmx_simd_real_t  VN5      = gmx_simd_set1_r(-1.4213390571557850962e-6f);
+    const gmx_simd_real_t  VN4      = gmx_simd_set1_r(0.000041603292906656984871f);
+    const gmx_simd_real_t  VN3      = gmx_simd_set1_r(-0.00013134036773265025626f);
+    const gmx_simd_real_t  VN2      = gmx_simd_set1_r(0.038657983986041781264f);
+    const gmx_simd_real_t  VN1      = gmx_simd_set1_r(0.11285044772717598220f);
+    const gmx_simd_real_t  VN0      = gmx_simd_set1_r(1.1283802385263030286f);
+
+    const gmx_simd_real_t  VD3      = gmx_simd_set1_r(0.0066752224023576045451f);
+    const gmx_simd_real_t  VD2      = gmx_simd_set1_r(0.078647795836373922256f);
+    const gmx_simd_real_t  VD1      = gmx_simd_set1_r(0.43336185284710920150f);
+    const gmx_simd_real_t  VD0      = gmx_simd_set1_r(1.0f);
+
+    gmx_simd_real_t        z4;
+    gmx_simd_real_t        polyVN0, polyVN1, polyVD0, polyVD1;
+
+    z4             = gmx_simd_mul_r(z2, z2);
+
+    polyVD1        = gmx_simd_fmadd_r(VD3, z4, VD1);
+    polyVD0        = gmx_simd_fmadd_r(VD2, z4, VD0);
+    polyVD0        = gmx_simd_fmadd_r(polyVD1, z2, polyVD0);
+
+    polyVD0        = gmx_simd_inv_r(polyVD0);
+
+    polyVN0        = gmx_simd_fmadd_r(VN6, z4, VN4);
+    polyVN1        = gmx_simd_fmadd_r(VN5, z4, VN3);
+    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN2);
+    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN1);
+    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN0);
+    polyVN0        = gmx_simd_fmadd_r(polyVN1, z2, polyVN0);
+
+    return gmx_simd_mul_r(polyVN0, polyVD0);
  }
  
  
diff --git a/src/gromacs/simd/vector_operations.h b/src/gromacs/simd/vector_operations.h

index 42448ada0d8549ed9efd316fb1cc104fbf26e696..1fb9a142e1f83ea3ae895d0279377ce8ba54f789 100644 (file)
--- a/src/gromacs/simd/vector_operations.h
+++ b/src/gromacs/simd/vector_operations.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -52,60 +52,60 @@
  
  
  /* x^2 + y^2 + z^2 */
-static gmx_inline gmx_mm_pr
-gmx_calc_rsq_pr(gmx_mm_pr x, gmx_mm_pr y, gmx_mm_pr z)
+static gmx_inline gmx_simd_real_t
+gmx_simd_calc_rsq_r(gmx_simd_real_t x, gmx_simd_real_t y, gmx_simd_real_t z)
  {
-    return gmx_madd_pr(z, z, gmx_madd_pr(y, y, gmx_mul_pr(x, x)));
+    return gmx_simd_fmadd_r(z, z, gmx_simd_fmadd_r(y, y, gmx_simd_mul_r(x, x)));
  }
  
  /* inner-product of multiple vectors */
-static gmx_inline gmx_mm_pr
-gmx_iprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
-             gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz)
+static gmx_inline gmx_simd_real_t
+gmx_simd_iprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az,
+                 gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz)
  {
-    gmx_mm_pr ret;
+    gmx_simd_real_t ret;
  
-    ret = gmx_mul_pr(ax, bx);
-    ret = gmx_madd_pr(ay, by, ret);
-    ret = gmx_madd_pr(az, bz, ret);
+    ret = gmx_simd_mul_r(ax, bx);
+    ret = gmx_simd_fmadd_r(ay, by, ret);
+    ret = gmx_simd_fmadd_r(az, bz, ret);
  
      return ret;
  }
  
  /* norm squared of multiple vectors */
-static gmx_inline gmx_mm_pr
-gmx_norm2_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az)
+static gmx_inline gmx_simd_real_t
+gmx_simd_norm2_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az)
  {
-    gmx_mm_pr ret;
+    gmx_simd_real_t ret;
  
-    ret = gmx_mul_pr(ax, ax);
-    ret = gmx_madd_pr(ay, ay, ret);
-    ret = gmx_madd_pr(az, az, ret);
+    ret = gmx_simd_mul_r(ax, ax);
+    ret = gmx_simd_fmadd_r(ay, ay, ret);
+    ret = gmx_simd_fmadd_r(az, az, ret);
  
      return ret;
  }
  
  /* cross-product of multiple vectors */
  static gmx_inline void
-gmx_cprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
-             gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz,
-             gmx_mm_pr *cx, gmx_mm_pr *cy, gmx_mm_pr *cz)
+gmx_simd_cprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az,
+                 gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz,
+                 gmx_simd_real_t *cx, gmx_simd_real_t *cy, gmx_simd_real_t *cz)
  {
-    *cx = gmx_mul_pr(ay, bz);
-    *cx = gmx_nmsub_pr(az, by, *cx);
+    *cx = gmx_simd_mul_r(ay, bz);
+    *cx = gmx_simd_fnmadd_r(az, by, *cx);
  
-    *cy = gmx_mul_pr(az, bx);
-    *cy = gmx_nmsub_pr(ax, bz, *cy);
+    *cy = gmx_simd_mul_r(az, bx);
+    *cy = gmx_simd_fnmadd_r(ax, bz, *cy);
  
-    *cz = gmx_mul_pr(ax, by);
-    *cz = gmx_nmsub_pr(ay, bx, *cz);
+    *cz = gmx_simd_mul_r(ax, by);
+    *cz = gmx_simd_fnmadd_r(ay, bx, *cz);
  }
  
  /* a + b + c + d (not really a vector operation, but where else to put this?) */
-static gmx_inline gmx_mm_pr
-gmx_sum4_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d)
+static gmx_inline gmx_simd_real_t
+gmx_simd_sum4_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c, gmx_simd_real_t d)
  {
-    return gmx_add_pr(gmx_add_pr(a, b), gmx_add_pr(c, d));
+    return gmx_simd_add_r(gmx_simd_add_r(a, b), gmx_simd_add_r(c, d));
  }
  
  
diff --git a/src/gromacs/utility/gmxomp.h b/src/gromacs/utility/gmxomp.h

index 178eaac4bf3ee2a8912373eb9aafb0d6f44d084f..4b4ec6fd59c35bcb79d820a9aa618e5dce6a118a 100644 (file)
--- a/src/gromacs/utility/gmxomp.h
+++ b/src/gromacs/utility/gmxomp.h
@@ -53,7 +53,7 @@
  #include "config.h"
  #endif
  
-#ifdef GMX_X86_SSE2
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
  #include <xmmintrin.h>
  #endif
  
@@ -113,7 +113,7 @@ void gmx_omp_check_thread_affinity(FILE *fplog, const t_commrec *cr,
  static gmx_inline void gmx_pause()
  {
      /* Replace with tbb::internal::atomic_backoff when/if we use TBB */
-#if defined GMX_X86_SSE2
+#if defined GMX_SIMD_X86_SSE2_OR_HIGHER
      _mm_pause();
  #elif defined __MIC__
      _mm_delay_32(32);
author	Erik Lindahl <erik@kth.se>
	Fri, 24 Jan 2014 20:04:47 +0000 (21:04 +0100)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Tue, 11 Feb 2014 13:02:23 +0000 (14:02 +0100)
CMakeLists.txt		patch \| blob \| history
admin/installguide/installguide.tex		patch \| blob \| history
cmake/Platform/BlueGeneL-static-XL-C.cmake		patch \| blob \| history
cmake/Platform/BlueGeneP-static-XL-C.cmake		patch \| blob \| history
cmake/TestAVXMaskload.c		patch \| blob \| history
cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake		patch \| blob \| history
cmake/Toolchain-Fujitsu-Sparc64.cmake		patch \| blob \| history
cmake/gmxBuildTypeReference.cmake		patch \| blob \| history
cmake/gmxDetectSimd.cmake	[moved from cmake/gmxDetectAcceleration.cmake with 71% similarity]	patch \| blob \| history
cmake/gmxDetectTargetArchitecture.cmake		patch \| blob \| history
cmake/gmxFindFlagsForSource.cmake		patch \| blob \| history
cmake/gmxManageFFTLibraries.cmake		patch \| blob \| history
cmake/gmxTestAVXMaskload.cmake		patch \| blob \| history
cmake/gmxTestSimd.cmake	[moved from cmake/gmxTestCPUAcceleration.cmake with 72% similarity]	patch \| blob \| history
src/config.h.cmakein		patch \| blob \| history
src/contrib/fftw/CMakeLists.txt		patch \| blob \| history
src/gromacs/gmxlib/bondfree.c		patch \| blob \| history
src/gromacs/gmxlib/copyrite.cpp		patch \| blob \| history
src/gromacs/gmxlib/gmx_cpuid.c		patch \| blob \| history
src/gromacs/gmxlib/gmx_detect_hardware.c		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/CMakeLists.txt		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nonbonded.c		patch \| blob \| history
src/gromacs/gmxpreprocess/calc_verletbuf.c		patch \| blob \| history
src/gromacs/legacyheaders/gmx_cpuid.h		patch \| blob \| history
src/gromacs/legacyheaders/types/forcerec.h		patch \| blob \| history
src/gromacs/legacyheaders/types/nb_verlet.h		patch \| blob \| history
src/gromacs/legacyheaders/types/nbnxn_pairlist.h		patch \| blob \| history
src/gromacs/mdlib/forcerec.c		patch \| blob \| history
src/gromacs/mdlib/genborn.c		patch \| blob \| history
src/gromacs/mdlib/genborn_allvsall_sse2_double.c		patch \| blob \| history
src/gromacs/mdlib/genborn_allvsall_sse2_single.c		patch \| blob \| history
src/gromacs/mdlib/genborn_sse2_double.c		patch \| blob \| history
src/gromacs/mdlib/genborn_sse2_single.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_atomdata.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_internal.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search_simd_2xnn.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search_simd_4xn.h		patch \| blob \| history
src/gromacs/mdlib/pme.c		patch \| blob \| history
src/gromacs/mdlib/pme_simd4.h		patch \| blob \| history
src/gromacs/mdlib/tpi.c		patch \| blob \| history
src/gromacs/simd/four_wide_macros.h		patch \| blob \| history
src/gromacs/simd/four_wide_macros_ref.h		patch \| blob \| history
src/gromacs/simd/general_x86_avx_128_fma.h		patch \| blob \| history
src/gromacs/simd/general_x86_avx_256.h		patch \| blob \| history
src/gromacs/simd/general_x86_mic.h		patch \| blob \| history
src/gromacs/simd/general_x86_sse2.h		patch \| blob \| history
src/gromacs/simd/general_x86_sse4_1.h		patch \| blob \| history
src/gromacs/simd/macros.h		patch \| blob \| history
src/gromacs/simd/macros_ref.h		patch \| blob \| history
src/gromacs/simd/math_double.h		patch \| blob \| history
src/gromacs/simd/math_single.h		patch \| blob \| history
src/gromacs/simd/vector_operations.h		patch \| blob \| history
src/gromacs/utility/gmxomp.h		patch \| blob \| history