From: Erik Lindahl
Date: Fri, 24 Jan 2014 20:04:47 +0000 (+0100)
Subject: First part of commit for redesigned SIMD module - name changes.
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=e5e37f2c62c3ec8bf75b2e90774b3f603c6ce098;p=alexxy%2Fgromacs.git

First part of commit for redesigned SIMD module - name changes.

This patch contains the bulk of the trivial name changes in
preparation for the new SIMD module.

- Add the gmx_simd_ prefix, and the _r suffix for real SIMD data.
  This makes the module more generic and better separates the GROMACS
  SIMD layer from the x86 implementation.
- epi32 (the Intel suffix for packed 32-bit integers in "extended"
  128-bit registers) has been changed to int32 in types and defines.
- Negated-fused-multiply-add nomenclature has been changed so
  "-a*b+c" is fnmadd. This is the standard on both Intel and AMD,
  where most people have SIMD coding experience. My apologies for
  first recommending Berk to use the opposite.
- GMX_CPU_ACCELERATION_ strings have been changed to GMX_SIMD_, both
  in the code and in CMake.
- Defines like GMX_X86_SSE2 that indicate instruction set availability
  have received the _OR_HIGHER suffix to clarify that they are
  cumulative, in contrast to the SIMD setting, which only indicates
  the highest level. These will be moved from config.h to the SIMD
  headers in the next patch.
- GMX_SIMD_WIDTH_HERE has been changed to GMX_SIMD_REAL_WIDTH.

Change-Id: I7a2567f7ddaf8ecd95a4f3b1162bbf03342b2b63
---

diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a3ce3c3ce..61eb7d20a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,22 +185,22 @@ gmx_add_cache_dependency(GMX_COOL_QUOTES BOOL "NOT GMX_FAHCORE" OFF) include(gmxManageGPU) # Detect the architecture the compiler is targetting, detect -# acceleration possibilities on that hardware, suggest an acceleration +# SIMD instructions possibilities on that hardware, suggest SIMD instruction set # to use if none is specified, and populate the cache option for CPU -# accleration. +# SIMD.
include(gmxDetectTargetArchitecture) gmx_detect_target_architecture() -include(gmxDetectAcceleration) -gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION) -if("${GMX_SUGGESTED_CPU_ACCELERATION}" STREQUAL "AVX2_256") +include(gmxDetectSimd) +gmx_detect_simd(GMX_SUGGESTED_SIMD) +if("${GMX_SUGGESTED_SIMD}" STREQUAL "AVX2_256") message(STATUS "Changing acceleration from AVX2 to AVX (until AVX2 patches commited).") - set(GMX_SUGGESTED_CPU_ACCELERATION "AVX_256") + set(GMX_SUGGESTED_SIMD "AVX_256") endif() gmx_option_multichoice( - GMX_CPU_ACCELERATION - "Acceleration for CPU kernels and compiler optimization" - "${GMX_SUGGESTED_CPU_ACCELERATION}" + GMX_SIMD + "SIMD instruction set for CPU kernels and compiler optimization" + "${GMX_SUGGESTED_SIMD}" None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 IBM_QPX Sparc64_HPC_ACE Reference) gmx_option_multichoice( @@ -227,8 +227,8 @@ gmx_option_multichoice( None none gaussian mopac gamess orca) -gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_TYPE "Reference kernel type (4xn or 2xnn)" STRING "4xn" "GMX_CPU_ACCELERATION STREQUAL REFERENCE") -gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_WIDTH "Reference kernel width" STRING "4" "GMX_CPU_ACCELERATION STREQUAL REFERENCE") +gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_TYPE "Reference kernel type (4xn or 2xnn)" STRING "4xn" "GMX_SIMD STREQUAL REFERENCE") +gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_WIDTH "Reference kernel width" STRING "4" "GMX_SIMD STREQUAL REFERENCE") option(GMX_BROKEN_CALLOC "Work around broken calloc()" OFF) mark_as_advanced(GMX_BROKEN_CALLOC) @@ -286,7 +286,7 @@ include(gmxCFlags) gmx_c_flags() # This variable should be used for additional compiler flags which are not -# generated in gmxCFlags nor are acceleration or MPI related. +# generated in gmxCFlags nor are SIMD or MPI related. set(EXTRA_C_FLAGS "") set(EXTRA_CXX_FLAGS "") @@ -576,13 +576,13 @@ endif(NOT GMX_SYSTEM_XDR) ################################################## -# Process CPU acceleration settings +# Process SIMD instruction settings ################################################## # This checks what flags to add in order to # support the SIMD instructions we need, and sets -# correct defines for the acceleration supported. -include(gmxTestCPUAcceleration) -gmx_test_cpu_acceleration() +# correct defines for the SIMD instructions supported. +include(gmxTestSimd) +gmx_test_simd() # Process QM/MM Settings @@ -689,16 +689,16 @@ endif() # # # # # # # # # # NO MORE TESTS AFTER THIS LINE! 
# # # # # # # # # # # # these are set after everything else if (NOT GMX_SKIP_DEFAULT_CFLAGS) - set(CMAKE_C_FLAGS "${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "${SIMD_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${SIMD_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}") else() message("Recommended flags which are not added because GMX_SKIP_DEFAULT_CFLAGS=yes:") - message("CMAKE_C_FLAGS: ${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${GMXC_CFLAGS}") + message("CMAKE_C_FLAGS: ${SIMD_C_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_C_FLAGS} ${GMXC_CFLAGS}") message("CMAKE_C_FLAGS_RELEASE: ${GMXC_CFLAGS_RELEASE}") message("CMAKE_C_FLAGS_DEBUG: ${GMXC_CFLAGS_DEBUG}") - message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${GMXC_CXXFLAGS}") + message("CMAKE_CXX_FLAGS: ${SIMD_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${EXTRA_CXX_FLAGS} ${GMXC_CXXFLAGS}") message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}") message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}") message("CMAKE_EXE_LINKER_FLAGS: ${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS}") diff --git a/admin/installguide/installguide.tex b/admin/installguide/installguide.tex index 953ee9f442..4368cb539a 100644 --- a/admin/installguide/installguide.tex +++ b/admin/installguide/installguide.tex @@ -1,7 +1,7 @@ % % This file is part of the GROMACS molecular simulation package. % -% Copyright (c) 2013, by the GROMACS development team, led by +% Copyright (c) 2013,2014, by the GROMACS development team, led by % Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, % and including many others, as listed in the AUTHORS file in the % top-level source directory and at http://www.gromacs.org. @@ -425,6 +425,45 @@ also do this kind of thing with \verb+ccmake+, but you should avoid this, because the options set with '\verb+-D+' will not be able to be changed interactively in that run of \verb+ccmake+. +\subsection{SIMD support} +\gromacs{} has extensive support for detecting and using the SIMD +capabilities of nearly all modern HPC CPUs. If you are building +\gromacs{} on the same hardware you will run it on, then you don't +need to read more about this. Otherwise, you may wish to choose the +value of \verb+GMX_SIMD+ to match the execution environment. If you +make no choice, the default will be based on the computer on which you +are running \cmake{}. Valid values are listed below, and the +applicable value lowest on the list is generally the one you should +choose: +\begin{enumerate} +\item \verb+None+ For use only on an architecture either lacking SIMD, + or to which \gromacs{} has not yet been ported and none of the + options below are applicable.
+\item \verb+SSE2+ Essentially all x86 machines in existence have this +\item \verb+SSE4.1+ More recent x86 have this +\item \verb+AVX_128_FMA+ More recent AMD x86 have this +\item \verb+AVX_256+ More recent Intel x86 have this +\item \verb+AVX2_256+ Yet more recent Intel x86 have this +\item \verb+IBM_QPX+ BlueGene/Q A2 cores have this +\item \verb+Sparc64_HPC_ACE+ Fujitsu machines like the K computer have this +\end{enumerate} +The \cmake{} configure system will check that the compiler you have +chosen can target the architecture you have chosen. mdrun will check +further at runtime, so if in doubt, choose the lowest setting you +think might work, and see what mdrun says. The configure system also +works around many known issues in many versions of common HPC +compilers. + +A further \verb+GMX_SIMD=Reference+ option exists, which is a special +SIMD-like implementation written in plain C that developers can use +when developing support in GROMACS for new SIMD architectures. It is +not designed for use in production simulations, but if you are using +an architecture with SIMD support to which \gromacs{} has not yet been +ported, you may wish to try the performance of this option, in case +the auto-vectorization in your compiler does a good job. And post on +the \gromacs{} mailing lists, because \gromacs{} can probably be +ported for new SIMD architectures in a few days. + \subsection{CMake advanced options} The options that can be seen with \verb+ccmake+ are ones that we think a reasonable number of users might want to consider diff --git a/cmake/Platform/BlueGeneL-static-XL-C.cmake b/cmake/Platform/BlueGeneL-static-XL-C.cmake index c1de08aa19..ddc87997fe 100644 --- a/cmake/Platform/BlueGeneL-static-XL-C.cmake +++ b/cmake/Platform/BlueGeneL-static-XL-C.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2010,2012,2013, by the GROMACS development team, led by +# Copyright (c) 2010,2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -82,4 +82,4 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -set(GMX_CPU_ACCELERATION "BlueGene" CACHE STRING "Forcing BlueGene acceleration when using BlueGene toolchain") +set(GMX_SIMD "BlueGene" CACHE STRING "Forcing BlueGene SIMD when using BlueGene toolchain") diff --git a/cmake/Platform/BlueGeneP-static-XL-C.cmake b/cmake/Platform/BlueGeneP-static-XL-C.cmake index 6558fd415a..b2ad826a06 100644 --- a/cmake/Platform/BlueGeneP-static-XL-C.cmake +++ b/cmake/Platform/BlueGeneP-static-XL-C.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2012,2013, by the GROMACS development team, led by +# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org.
@@ -39,4 +39,4 @@ set(CMAKE_SYSTEM_NAME BlueGeneP-static CACHE STRING "Cross-compiling for BlueGen set(CMAKE_C_COMPILER mpixlc_r) set(CMAKE_CXX_COMPILER mpixlcxx_r) -set(GMX_CPU_ACCELERATION "BlueGene" CACHE STRING "Forcing BlueGene acceleration when using BlueGene toolchain") +set(GMX_SIMD "BlueGene" CACHE STRING "Forcing BlueGene SIMD when using BlueGene toolchain") diff --git a/cmake/TestAVXMaskload.c b/cmake/TestAVXMaskload.c index 61777b077f..508c2ec174 100644 --- a/cmake/TestAVXMaskload.c +++ b/cmake/TestAVXMaskload.c @@ -8,7 +8,7 @@ int main() a = _mm256_setzero_pd(); mask = _mm256_castpd_si256(a); -#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG +#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask)); #else a = _mm256_maskload_pd(d,mask); diff --git a/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake b/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake index 0ed9b51b6b..2b0a180b56 100644 --- a/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake +++ b/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2012,2013, by the GROMACS development team, led by +# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -52,4 +52,4 @@ set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to H set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE) set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE) -set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain") +set(GMX_SIMD "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE SIMD when using Fujitsu Sparc64 toolchain") diff --git a/cmake/Toolchain-Fujitsu-Sparc64.cmake b/cmake/Toolchain-Fujitsu-Sparc64.cmake index 923869f88d..3f301eed64 100644 --- a/cmake/Toolchain-Fujitsu-Sparc64.cmake +++ b/cmake/Toolchain-Fujitsu-Sparc64.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2012,2013, by the GROMACS development team, led by +# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -51,4 +51,4 @@ set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to H set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE) set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE) -set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain") +set(GMX_SIMD "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE SIMD when using Fujitsu Sparc64 toolchain") diff --git a/cmake/gmxBuildTypeReference.cmake b/cmake/gmxBuildTypeReference.cmake index 5323924408..2a6f7fac19 100644 --- a/cmake/gmxBuildTypeReference.cmake +++ b/cmake/gmxBuildTypeReference.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. 
# -# Copyright (c) 2012,2013, by the GROMACS development team, led by +# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -42,7 +42,7 @@ mark_as_advanced( CMAKE_CXX_FLAGS_REFERENCE CMAKE_C_FLAGS_REFERENCE) if("${CMAKE_BUILD_TYPE}" STREQUAL "Reference") set(GMX_GPU OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE) set(GMX_OPENMP OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE) - set(GMX_CPU_ACCELERATION "None" CACHE STRING "Disabled for regressiontests reference builds" FORCE) + set(GMX_SIMD "None" CACHE STRING "Disabled for regressiontests reference builds" FORCE) set(GMX_FFT_LIBRARY "fftpack" CACHE STRING "Use fftpack for regressiontests reference builds" FORCE) set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE) set(GMX_THREAD_MPI OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE) diff --git a/cmake/gmxDetectAcceleration.cmake b/cmake/gmxDetectSimd.cmake similarity index 71% rename from cmake/gmxDetectAcceleration.cmake rename to cmake/gmxDetectSimd.cmake index 884368ffb2..550a9ac161 100644 --- a/cmake/gmxDetectAcceleration.cmake +++ b/cmake/gmxDetectSimd.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2012,2013, by the GROMACS development team, led by +# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -34,22 +34,22 @@ # - Check the username performing the build, as well as date and time # -# gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION) +# gmx_detect_simd(GMX_SUGGESTED_SIMD) # -# Try to detect CPU information and suggest an acceleration option +# Try to detect CPU information and suggest SIMD instructions # (such as SSE/AVX) that fits the current CPU. These functions assume # that gmx_detect_target_architecture() has already been run, so that # things like GMX_TARGET_X86 are already available. # -# Sets ${GMX_SUGGESTED_CPU_ACCELERATION} in the parent scope if -# GMX_CPU_ACCELERATION is not set (e.g. by the user, or a previous run +# Sets ${GMX_SUGGESTED_SIMD} in the parent scope if +# GMX_SIMD is not set (e.g. by the user, or a previous run # of CMake). # # we rely on inline asm support for GNU! 
include(gmxTestInlineASM) -function(gmx_suggest_x86_acceleration _suggested_acceleration) +function(gmx_suggest_x86_simd _suggested_simd) gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM) @@ -59,47 +59,47 @@ function(gmx_suggest_x86_acceleration _suggested_acceleration) set(GCC_INLINE_ASM_DEFINE "") endif(GMX_X86_GCC_INLINE_ASM) - message(STATUS "Detecting best acceleration for this CPU") + message(STATUS "Detecting best SIMD instructions for this CPU") - # Get CPU acceleration information + # Get CPU SIMD properties information set(_compile_definitions "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders -DGMX_CPUID_STANDALONE") if(GMX_TARGET_X86) set(_compile_definitions "${_compile_definitions} -DGMX_TARGET_X86") endif() - try_run(GMX_CPUID_RUN_ACC GMX_CPUID_COMPILED + try_run(GMX_CPUID_RUN_SIMD GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_cpuid.c COMPILE_DEFINITIONS ${_compile_definitions} RUN_OUTPUT_VARIABLE OUTPUT_TMP COMPILE_OUTPUT_VARIABLE GMX_CPUID_COMPILE_OUTPUT - ARGS "-acceleration") + ARGS "-simd") if(NOT GMX_CPUID_COMPILED) - message(WARNING "Cannot compile CPUID code, which means no CPU-specific acceleration.") + message(WARNING "Cannot compile CPUID code, which means no SIMD instructions.") message(STATUS "Compile output: ${GMX_CPUID_COMPILE_OUTPUT}") set(OUTPUT_TMP "None") - elseif(NOT GMX_CPUID_RUN_ACC EQUAL 0) - message(WARNING "Cannot run CPUID code, which means no CPU-specific optimization.") + elseif(NOT GMX_CPUID_RUN_SIMD EQUAL 0) + message(WARNING "Cannot run CPUID code, which means no SIMD instructions.") message(STATUS "Run output: ${OUTPUT_TMP}") set(OUTPUT_TMP "None") endif(NOT GMX_CPUID_COMPILED) - string(STRIP "@OUTPUT_TMP@" OUTPUT_ACC) + string(STRIP "@OUTPUT_TMP@" OUTPUT_SIMD) - set(${_suggested_acceleration} "@OUTPUT_ACC@" PARENT_SCOPE) - message(STATUS "Detected best acceleration for this CPU - @OUTPUT_ACC@") + set(${_suggested_simd} "@OUTPUT_SIMD@" PARENT_SCOPE) + message(STATUS "Detected best SIMD instructions for this CPU - @OUTPUT_SIMD@") endfunction() -function(gmx_detect_acceleration _suggested_acceleration) - if(NOT DEFINED GMX_CPU_ACCELERATION) +function(gmx_detect_simd _suggested_simd) + if(NOT DEFINED GMX_SIMD) if(GMX_TARGET_BGQ) - set(${_suggested_acceleration} "IBM_QPX") + set(${_suggested_simd} "IBM_QPX") elseif(GMX_TARGET_X86) - gmx_suggest_x86_acceleration(${_suggested_acceleration}) + gmx_suggest_x86_simd(${_suggested_simd}) else() - set(${_suggested_acceleration} "None") + set(${_suggested_simd} "None") endif() - set(${_suggested_acceleration} ${${_suggested_acceleration}} PARENT_SCOPE) + set(${_suggested_simd} ${${_suggested_simd}} PARENT_SCOPE) endif() endfunction() diff --git a/cmake/gmxDetectTargetArchitecture.cmake b/cmake/gmxDetectTargetArchitecture.cmake index e888732e77..45a2a47fd5 100644 --- a/cmake/gmxDetectTargetArchitecture.cmake +++ b/cmake/gmxDetectTargetArchitecture.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2013, by the GROMACS development team, led by +# Copyright (c) 2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -34,7 +34,7 @@ # - Define function to detect whether the compiler's target # - architecture is one for which GROMACS has special treatment -# - (e.g. kernel acceleration) +# (e.g. 
SIMD instructions) # # Sets GMX_TARGET_X86 or GMX_TARGET_BGQ if targetting that # architecture. May set other such variables if/when there is future diff --git a/cmake/gmxFindFlagsForSource.cmake b/cmake/gmxFindFlagsForSource.cmake index 3ebb57b6b8..6a569b22b7 100644 --- a/cmake/gmxFindFlagsForSource.cmake +++ b/cmake/gmxFindFlagsForSource.cmake @@ -38,7 +38,7 @@ # SOURCE Source code to test # The compiler is chosen based on the extension of this file # FLAGSVAR Variable (string) to which we should add the correct flag -# Args 5 through N Multiple strings with acceleration flags to test +# Args 5 through N Multiple strings with optimization flags to test FUNCTION(GMX_FIND_CFLAG_FOR_SOURCE VARIABLE DESCRIPTION SOURCE CFLAGSVAR) IF(NOT DEFINED ${VARIABLE}) # Insert a blank element last in the list (try without any flags too) @@ -71,7 +71,7 @@ ENDFUNCTION(GMX_FIND_CFLAG_FOR_SOURCE VARIABLE DESCRIPTION SOURCE CFLAGSVAR) # SOURCE Source code to test # The compiler is chosen based on the extension of this file # FLAGSVAR Variable (string) to which we should add the correct flag -# Args 5 through N Multiple strings with acceleration flags to test +# Args 5 through N Multiple strings with optimization flags to test FUNCTION(GMX_FIND_CXXFLAG_FOR_SOURCE VARIABLE DESCRIPTION SOURCE CXXFLAGSVAR) IF(NOT DEFINED ${VARIABLE}) # Insert a blank element last in the list (try without any flags too) diff --git a/cmake/gmxManageFFTLibraries.cmake b/cmake/gmxManageFFTLibraries.cmake index 653321ddc8..35c600001e 100644 --- a/cmake/gmxManageFFTLibraries.cmake +++ b/cmake/gmxManageFFTLibraries.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2012,2013, by the GROMACS development team, led by +# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -81,12 +81,12 @@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3") set(FFT_LIBRARIES ${${FFTW}_LIBRARIES}) set(GMX_FFT_FFTW3 1) - if ((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD) + if ((${GMX_SIMD} MATCHES "SSE" OR ${GMX_SIMD} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD) message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin") endif() - if((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND ${FFTW}_HAVE_AVX) - # If we're not doing CPU acceleration, we don't care about FFTW performance on x86 either + if((${GMX_SIMD} MATCHES "SSE" OR ${GMX_SIMD} MATCHES "AVX") AND ${FFTW}_HAVE_AVX) + # If we're not using SIMD instructions, we don't care about FFTW performance on x86 either message(WARNING "The FFTW library was compiled with --enable-avx to enable AVX SIMD instructions. That might sound like a good idea for your processor, but for FFTW versions up to 3.3.3, these are slower than the SSE/SSE2 SIMD instructions for the way GROMACS uses FFTs. Limitations in the way FFTW allows GROMACS to measure performance make it awkward for either GROMACS or FFTW to make the decision for you based on runtime performance. You should compile a different FFTW library with --enable-sse or --enable-sse2. If you have a more recent FFTW, you may like to compare the performance of GROMACS with FFTW libraries compiled with and without --enable-avx. 
However, the GROMACS developers do not really expect the FFTW AVX optimization to help, because the performance is limited by memory access, not computation.") endif() diff --git a/cmake/gmxTestAVXMaskload.cmake b/cmake/gmxTestAVXMaskload.cmake index 8b05b12979..40d76c78ac 100644 --- a/cmake/gmxTestAVXMaskload.cmake +++ b/cmake/gmxTestAVXMaskload.cmake @@ -55,7 +55,7 @@ MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE AVX_CFLAGS) ELSE() TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}" "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c" - COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" ) + COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG" ) IF(${VARIABLE}_COMPILEOK) SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE) MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around") diff --git a/cmake/gmxTestCPUAcceleration.cmake b/cmake/gmxTestSimd.cmake similarity index 72% rename from cmake/gmxTestCPUAcceleration.cmake rename to cmake/gmxTestSimd.cmake index 537379dd68..9d910073dc 100644 --- a/cmake/gmxTestCPUAcceleration.cmake +++ b/cmake/gmxTestSimd.cmake @@ -44,75 +44,75 @@ macro(gmx_use_clang_as_with_gnu_compilers_on_osx) # compilers assembler instead - and this has to happen before we detect AVX # flags. if(APPLE AND ${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - gmx_test_cflag(GNU_C_USE_CLANG_AS "-Wa,-q" ACCELERATION_C_FLAGS) + gmx_test_cflag(GNU_C_USE_CLANG_AS "-Wa,-q" SIMD_C_FLAGS) endif() if(APPLE AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") - gmx_test_cxxflag(GNU_CXX_USE_CLANG_AS "-Wa,-q" ACCELERATION_CXX_FLAGS) + gmx_test_cxxflag(GNU_CXX_USE_CLANG_AS "-Wa,-q" SIMD_CXX_FLAGS) endif() endmacro() -macro(gmx_test_cpu_acceleration) +macro(gmx_test_simd) # # To improve backward compatibility on x86 SIMD architectures, -# we set the flags for all accelerations that are supported, not only +# we set the flags for all SIMD instructions that are supported, not only # the most recent instruction set. I.e., if your machine supports AVX2_256, # we will set flags both for AVX2_256, AVX_256, SSE4.1, and SSE2 support. -if(${GMX_CPU_ACCELERATION} STREQUAL "NONE") +if(${GMX_SIMD} STREQUAL "NONE") # nothing to do configuration-wise - set(ACCELERATION_STATUS_MESSAGE "CPU SIMD acceleration disabled") -elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2") + set(SIMD_STATUS_MESSAGE "SIMD instructions disabled") +elseif(${GMX_SIMD} STREQUAL "SSE2") gmx_find_cflag_for_source(CFLAGS_SSE2 "C compiler SSE2 flag" "#include int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_rsqrt_ps(x);return 0;}" - ACCELERATION_C_FLAGS + SIMD_C_FLAGS "-msse2" "/arch:SSE2") gmx_find_cxxflag_for_source(CXXFLAGS_SSE2 "C++ compiler SSE2 flag" "#include int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_rsqrt_ps(x);return 0;}" - ACCELERATION_CXX_FLAGS + SIMD_CXX_FLAGS "-msse2" "/arch:SSE2") if(NOT CFLAGS_SSE2 OR NOT CXXFLAGS_SSE2) - message(FATAL_ERROR "Cannot find SSE2 compiler flag. Use a newer compiler, or disable acceleration (slower).") + message(FATAL_ERROR "Cannot find SSE2 compiler flag. 
Use a newer compiler, or disable SIMD (slower).") endif() - set(GMX_CPU_ACCELERATION_X86_SSE2 1) - set(GMX_X86_SSE2 1) + set(GMX_SIMD_X86_SSE2 1) + set(GMX_SIMD_X86_SSE2_OR_HIGHER 1) - set(ACCELERATION_STATUS_MESSAGE "Enabling SSE2 SIMD Gromacs acceleration") + set(SIMD_STATUS_MESSAGE "Enabling SSE2 SIMD instructions") -elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1") +elseif(${GMX_SIMD} STREQUAL "SSE4.1") # Note: MSVC enables SSE4.1 with the SSE2 flag, so we include that in testing. gmx_find_cflag_for_source(CFLAGS_SSE4_1 "C compiler SSE4.1 flag" "#include int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_dp_ps(x,x,0x77);return 0;}" - ACCELERATION_C_FLAGS + SIMD_C_FLAGS "-msse4.1" "/arch:SSE4.1" "/arch:SSE2") gmx_find_cxxflag_for_source(CXXFLAGS_SSE4_1 "C++ compiler SSE4.1 flag" "#include int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_dp_ps(x,x,0x77);return 0;}" - ACCELERATION_CXX_FLAGS + SIMD_CXX_FLAGS "-msse4.1" "/arch:SSE4.1" "/arch:SSE2") if(NOT CFLAGS_SSE4_1 OR NOT CXXFLAGS_SSE4_1) message(FATAL_ERROR "Cannot find SSE4.1 compiler flag. " - "Use a newer compiler, or choose SSE2 acceleration (slower).") + "Use a newer compiler, or choose SSE2 SIMD (slower).") endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel" AND CMAKE_C_COMPILER_VERSION VERSION_EQUAL "11.1") - message(FATAL_ERROR "You are using Intel compiler version 11.1, which produces incorrect results with SSE4.1 acceleration. You need to use a newer compiler (e.g. icc >= 12.0) or in worst case try a lower level of acceleration if performance is not critical.") + message(FATAL_ERROR "You are using Intel compiler version 11.1, which produces incorrect results with SSE4.1 SIMD. You need to use a newer compiler (e.g. icc >= 12.0) or in worst case try a lower level of SIMD if performance is not critical.") endif() - set(GMX_CPU_ACCELERATION_X86_SSE4_1 1) - set(GMX_X86_SSE4_1 1) - set(GMX_X86_SSE2 1) - set(ACCELERATION_STATUS_MESSAGE "Enabling SSE4.1 SIMD Gromacs acceleration") + set(GMX_SIMD_X86_SSE4_1 1) + set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1) + set(GMX_SIMD_X86_SSE2_OR_HIGHER 1) + set(SIMD_STATUS_MESSAGE "Enabling SSE4.1 SIMD instructions") -elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA") +elseif(${GMX_SIMD} STREQUAL "AVX_128_FMA") gmx_use_clang_as_with_gnu_compilers_on_osx() @@ -125,18 +125,18 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA") gmx_find_cflag_for_source(CFLAGS_AVX_128 "C compiler AVX (128 bit) flag" "#include int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_permute_ps(x,1);return 0;}" - ACCELERATION_C_FLAGS + SIMD_C_FLAGS "-mavx" "/arch:AVX") gmx_find_cxxflag_for_source(CXXFLAGS_AVX_128 "C++ compiler AVX (128 bit) flag" "#include int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_permute_ps(x,1);return 0;}" - ACCELERATION_CXX_FLAGS + SIMD_CXX_FLAGS "-mavx" "/arch:AVX") ### STAGE 2: Find the fused-multiply add flag. # GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support. 
- check_include_file(x86intrin.h HAVE_X86INTRIN_H ${ACCELERATION_C_FLAGS}) - check_include_file(intrin.h HAVE_INTRIN_H ${ACCELERATION_C_FLAGS}) + check_include_file(x86intrin.h HAVE_X86INTRIN_H ${SIMD_C_FLAGS}) + check_include_file(intrin.h HAVE_INTRIN_H ${SIMD_C_FLAGS}) if(HAVE_X86INTRIN_H) set(INCLUDE_X86INTRIN_H "#include ") endif() @@ -149,19 +149,19 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA") ${INCLUDE_X86INTRIN_H} ${INCLUDE_INTRIN_H} int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_macc_ps(x,x,x);return 0;}" - ACCELERATION_C_FLAGS + SIMD_C_FLAGS "-mfma4") gmx_find_cxxflag_for_source(CXXFLAGS_AVX_128_FMA "C++ compiler AVX (128 bit) FMA4 flag" "#include ${INCLUDE_X86INTRIN_H} ${INCLUDE_INTRIN_H} int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_macc_ps(x,x,x);return 0;}" - ACCELERATION_CXX_FLAGS + SIMD_CXX_FLAGS "-mfma4") # We only need to check the last (FMA) test; that will always fail if the basic AVX128 test failed if(NOT CFLAGS_AVX_128_FMA OR NOT CXXFLAGS_AVX_128_FMA) - message(FATAL_ERROR "Cannot find compiler flags for 128 bit AVX with FMA support. Use a newer compiler, or choose SSE4.1 acceleration (slower).") + message(FATAL_ERROR "Cannot find compiler flags for 128 bit AVX with FMA support. Use a newer compiler, or choose SSE4.1 SIMD (slower).") endif() ### STAGE 3: Optional: Find the XOP instruction flag (No point in yelling if this does not work) @@ -170,14 +170,14 @@ int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_macc_ps(x,x,x);return 0;}" ${INCLUDE_X86INTRIN_H} ${INCLUDE_INTRIN_H} int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}" - ACCELERATION_C_FLAGS + SIMD_C_FLAGS "-mxop") gmx_find_cxxflag_for_source(CXXFLAGS_AVX_128_XOP "C++ compiler AVX (128 bit) XOP flag" "#include ${INCLUDE_X86INTRIN_H} ${INCLUDE_INTRIN_H} int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}" - ACCELERATION_CXX_FLAGS + SIMD_CXX_FLAGS "-mxop") # We don't have the full compiler version string yet (BUILD_C_COMPILER), @@ -185,7 +185,7 @@ int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}" # hackintoshes is not worth the effort. if (APPLE AND (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")) - message(WARNING "Due to a known compiler bug, Clang up to version 3.2 (and Apple Clang up to version 4.1) produces incorrect code with AVX_128_FMA acceleration. As we cannot work around this bug on OS X, you will have to select a different compiler or CPU acceleration.") + message(WARNING "Due to a known compiler bug, Clang up to version 3.2 (and Apple Clang up to version 4.1) produces incorrect code with AVX_128_FMA SIMD. 
As we cannot work around this bug on OS X, you will have to select a different compiler or SIMD instruction set.") endif() @@ -200,44 +200,44 @@ int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}" set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -no-integrated-as") endif() - gmx_test_avx_gcc_maskload_bug(GMX_X86_AVX_GCC_MASKLOAD_BUG "${ACCELERATION_C_FLAGS}") + gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}") - set(GMX_CPU_ACCELERATION_X86_AVX_128_FMA 1) - set(GMX_X86_AVX_128_FMA 1) - set(GMX_X86_SSE4_1 1) - set(GMX_X86_SSE2 1) + set(GMX_SIMD_X86_AVX_128_FMA 1) + set(GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER 1) + set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1) + set(GMX_SIMD_X86_SSE2_OR_HIGHER 1) - set(ACCELERATION_STATUS_MESSAGE "Enabling 128-bit AVX SIMD Gromacs acceleration (with fused-multiply add)") + set(SIMD_STATUS_MESSAGE "Enabling 128-bit AVX SIMD Gromacs SIMD (with fused-multiply add)") -elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_256") +elseif(${GMX_SIMD} STREQUAL "AVX_256") gmx_use_clang_as_with_gnu_compilers_on_osx() gmx_find_cflag_for_source(CFLAGS_AVX "C compiler AVX (256 bit) flag" "#include int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_add_ps(x,x);return 0;}" - ACCELERATION_C_FLAGS + SIMD_C_FLAGS "-mavx" "/arch:AVX") gmx_find_cxxflag_for_source(CXXFLAGS_AVX "C++ compiler AVX (256 bit) flag" "#include int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_add_ps(x,x);return 0;}" - ACCELERATION_CXX_FLAGS + SIMD_CXX_FLAGS "-mavx" "/arch:AVX") if(NOT CFLAGS_AVX OR NOT CXXFLAGS_AVX) - message(FATAL_ERROR "Cannot find AVX compiler flag. Use a newer compiler, or choose SSE4.1 acceleration (slower).") + message(FATAL_ERROR "Cannot find AVX compiler flag. Use a newer compiler, or choose SSE4.1 SIMD (slower).") endif() - gmx_test_avx_gcc_maskload_bug(GMX_X86_AVX_GCC_MASKLOAD_BUG "${ACCELERATION_C_FLAGS}") + gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}") - set(GMX_CPU_ACCELERATION_X86_AVX_256 1) - set(GMX_X86_AVX_256 1) - set(GMX_X86_SSE4_1 1) - set(GMX_X86_SSE2 1) + set(GMX_SIMD_X86_AVX_256 1) + set(GMX_SIMD_X86_AVX_256_OR_HIGHER 1) + set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1) + set(GMX_SIMD_X86_SSE2_OR_HIGHER 1) - set(ACCELERATION_STATUS_MESSAGE "Enabling 256-bit AVX SIMD Gromacs acceleration") + set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX SIMD instructions") -elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX2_256") +elseif(${GMX_SIMD} STREQUAL "AVX2_256") # Comment out this line for AVX2 development message(FATAL_ERROR "AVX2_256 is disabled until the implementation has been commited.") @@ -247,50 +247,50 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX2_256") gmx_find_cflag_for_source(CFLAGS_AVX2 "C compiler AVX2 flag" "#include int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_fmadd_ps(x,x,x);return 0;}" - ACCELERATION_C_FLAGS + SIMD_C_FLAGS "-march=core-avx2" "-mavx2" "/arch:AVX") # no AVX2-specific flag for MSVC yet gmx_find_cxxflag_for_source(CXXFLAGS_AVX2 "C++ compiler AVX2 flag" "#include int main(){__m256 x=_mm256_set1_ps(0.5);x=_mm256_fmadd_ps(x,x,x);return 0;}" - ACCELERATION_CXX_FLAGS + SIMD_CXX_FLAGS "-march=core-avx2" "-mavx2" "/arch:AVX") # no AVX2-specific flag for MSVC yet if(NOT CFLAGS_AVX2 OR NOT CXXFLAGS_AVX2) - message(FATAL_ERROR "Cannot find AVX2 compiler flag. Use a newer compiler, or choose AVX acceleration (slower).") + message(FATAL_ERROR "Cannot find AVX2 compiler flag. 
Use a newer compiler, or choose AVX SIMD (slower).") endif() # No need to test for Maskload bug - it was fixed before gcc added AVX2 support - set(GMX_CPU_ACCELERATION_X86_AVX2_256 1) - set(GMX_X86_AVX2_256 1) - set(GMX_X86_AVX_256 1) - set(GMX_X86_SSE4_1 1) - set(GMX_X86_SSE2 1) + set(GMX_SIMD_X86_AVX2_256 1) + set(GMX_SIMD_X86_AVX2_256_OR_HIGHER 1) + set(GMX_SIMD_X86_AVX_256_OR_HIGHER 1) + set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1) + set(GMX_SIMD_X86_SSE2_OR_HIGHER 1) - set(ACCELERATION_STATUS_MESSAGE "Enabling 256-bit AVX2 Gromacs acceleration") + set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions") -elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX") +elseif(${GMX_SIMD} STREQUAL "IBM_QPX") try_compile(TEST_QPX ${CMAKE_BINARY_DIR} "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c") if (TEST_QPX) - message(WARNING "IBM QPX acceleration was selected. This will work, but SIMD-accelerated kernels are only available for the Verlet cut-off scheme. The plain C kernels that are used for the group cut-off scheme kernels will be slow, so please consider using the Verlet cut-off scheme.") - set(GMX_CPU_ACCELERATION_IBM_QPX 1) - set(ACCELERATION_STATUS_MESSAGE "Enabling IBM QPX SIMD acceleration") + message(WARNING "IBM QPX SIMD instructions selected. This will work, but SIMD kernels are only available for the Verlet cut-off scheme. The plain C kernels that are used for the group cut-off scheme kernels will be slow, so please consider using the Verlet cut-off scheme.") + set(GMX_SIMD_IBM_QPX 1) + set(SIMD_STATUS_MESSAGE "Enabling IBM QPX SIMD instructions") else() message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics. If you are compiling for BlueGene/Q with the XL compilers, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-C' to set up the tool chain.") endif() -elseif(${GMX_CPU_ACCELERATION} STREQUAL "SPARC64_HPC_ACE") +elseif(${GMX_SIMD} STREQUAL "SPARC64_HPC_ACE") - set(GMX_CPU_ACCELERATION_SPARC64_HPC_ACE 1) - set(ACCELERATION_STATUS_MESSAGE "Enabling Sparc64 HPC-ACE SIMD acceleration") + set(GMX_SIMD_SPARC64_HPC_ACE 1) + set(SIMD_STATUS_MESSAGE "Enabling Sparc64 HPC-ACE SIMD instructions") -elseif(${GMX_CPU_ACCELERATION} STREQUAL "REFERENCE") +elseif(${GMX_SIMD} STREQUAL "REFERENCE") - add_definitions(-DGMX_SIMD_REFERENCE_PLAIN_C) + add_definitions(-DGMX_SIMD_REFERENCE) if(${GMX_NBNXN_REF_KERNEL_TYPE} STREQUAL "4xn") if(${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "2" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "4" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "8") add_definitions(-DGMX_NBNXN_SIMD_4XN -DGMX_SIMD_REF_WIDTH=${GMX_NBNXN_REF_KERNEL_WIDTH}) @@ -308,13 +308,13 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "REFERENCE") endif() else() - gmx_invalid_option_value(GMX_CPU_ACCELERATION) + gmx_invalid_option_value(GMX_SIMD) endif() -gmx_check_if_changed(ACCELERATION_CHANGED GMX_CPU_ACCELERATION) -if (ACCELERATION_CHANGED AND DEFINED ACCELERATION_STATUS_MESSAGE) - message(STATUS "${ACCELERATION_STATUS_MESSAGE}") +gmx_check_if_changed(SIMD_CHANGED GMX_SIMD) +if (SIMD_CHANGED AND DEFINED SIMD_STATUS_MESSAGE) + message(STATUS "${SIMD_STATUS_MESSAGE}") endif() endmacro() diff --git a/src/config.h.cmakein b/src/config.h.cmakein index 1804b63879..1c09d5b41b 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -110,40 +110,40 @@ #cmakedefine GMX_TARGET_BGQ /* SSE2 instructions available */ -#cmakedefine GMX_X86_SSE2 +#cmakedefine GMX_SIMD_X86_SSE2_OR_HIGHER /* SSE4.1 instructions available */ -#cmakedefine GMX_X86_SSE4_1 +#cmakedefine GMX_SIMD_X86_SSE4_1_OR_HIGHER -/* AVX 
128-bit FMA instructions available */ -#cmakedefine GMX_X86_AVX_128_FMA +/* AVX 128-bit FMA instructions available (AMD side of the AVX world) */ +#cmakedefine GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER -/* AVX 256-bit instructions available */ -#cmakedefine GMX_X86_AVX_256 +/* AVX 256-bit instructions available (Intel side of the AVX world) */ +#cmakedefine GMX_SIMD_X86_AVX_256_OR_HIGHER /* GCC bug in AVX maskload/maskstore arguments - worked around internally */ -#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG +#cmakedefine GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG -/* SSE2 was selected as CPU acceleration level */ -#cmakedefine GMX_CPU_ACCELERATION_X86_SSE2 +/* SSE2 was selected for SIMD instruction set level */ +#cmakedefine GMX_SIMD_X86_SSE2 -/* SSE4.1 was selected as CPU acceleration level */ -#cmakedefine GMX_CPU_ACCELERATION_X86_SSE4_1 +/* SSE4.1 was selected as SIMD instructions */ +#cmakedefine GMX_SIMD_X86_SSE4_1 -/* AVX 128-bit FMA was selected as CPU acceleration level */ -#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_128_FMA +/* AVX 128-bit FMA was selected as SIMD instructions */ +#cmakedefine GMX_SIMD_X86_AVX_128_FMA -/* AVX 256-bit was selected as CPU acceleration level */ -#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_256 +/* AVX 256-bit was selected as SIMD instructions */ +#cmakedefine GMX_SIMD_X86_AVX_256 -/* IBM QPX was selected as CPU acceleration type (e.g. BlueGene/Q) */ -#cmakedefine GMX_CPU_ACCELERATION_IBM_QPX +/* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */ +#cmakedefine GMX_SIMD_IBM_QPX /* Fujitsu Sparc64 HPC-ACE SIMD acceleration */ -#cmakedefine GMX_CPU_ACCELERATION_SPARC64_HPC_ACE +#cmakedefine GMX_SIMD_SPARC64_HPC_ACE -/* String for CPU acceleration choice (for writing to log files and stdout) */ -#define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@" +/* String for SIMD instruction choice (for writing to log files and stdout) */ +#define GMX_SIMD_STRING "@GMX_SIMD@" /* Integer byte order is big endian. */ #cmakedefine GMX_INTEGER_BIG_ENDIAN diff --git a/src/contrib/fftw/CMakeLists.txt b/src/contrib/fftw/CMakeLists.txt index e02a0c1ac1..1365dda4a7 100644 --- a/src/contrib/fftw/CMakeLists.txt +++ b/src/contrib/fftw/CMakeLists.txt @@ -56,7 +56,7 @@ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND BUILD_SHARED_LIBS) # FFTW doesn endif() # Testing shows FFTW configured with --enable-avx is never better than --enable-sse2, so we do the latter always. 
-if(${GMX_CPU_ACCELERATION} MATCHES "^(SSE|AVX)") +if(${GMX_SIMD} MATCHES "^(SSE|AVX)") set(GMX_BUILD_OWN_FFTW_OPTIMIZATION_CONFIGURATION --enable-sse2 CACHE INTERNAL "Optimization flags for FFTW compilation") endif() diff --git a/src/gromacs/gmxlib/bondfree.c b/src/gromacs/gmxlib/bondfree.c index 40989ed89c..006221a92b 100644 --- a/src/gromacs/gmxlib/bondfree.c +++ b/src/gromacs/gmxlib/bondfree.c @@ -120,15 +120,15 @@ static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx) /* SIMD PBC data structure, containing 1/boxdiag and the box vectors */ typedef struct { - gmx_mm_pr inv_bzz; - gmx_mm_pr inv_byy; - gmx_mm_pr inv_bxx; - gmx_mm_pr bzx; - gmx_mm_pr bzy; - gmx_mm_pr bzz; - gmx_mm_pr byx; - gmx_mm_pr byy; - gmx_mm_pr bxx; + gmx_simd_real_t inv_bzz; + gmx_simd_real_t inv_byy; + gmx_simd_real_t inv_bxx; + gmx_simd_real_t bzx; + gmx_simd_real_t bzy; + gmx_simd_real_t bzz; + gmx_simd_real_t byx; + gmx_simd_real_t byy; + gmx_simd_real_t bxx; } pbc_simd_t; /* Set the SIMD pbc data from a normal t_pbc struct */ @@ -147,48 +147,48 @@ static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd) } } - pbc_simd->inv_bzz = gmx_set1_pr(inv_bdiag[ZZ]); - pbc_simd->inv_byy = gmx_set1_pr(inv_bdiag[YY]); - pbc_simd->inv_bxx = gmx_set1_pr(inv_bdiag[XX]); + pbc_simd->inv_bzz = gmx_simd_set1_r(inv_bdiag[ZZ]); + pbc_simd->inv_byy = gmx_simd_set1_r(inv_bdiag[YY]); + pbc_simd->inv_bxx = gmx_simd_set1_r(inv_bdiag[XX]); if (pbc != NULL) { - pbc_simd->bzx = gmx_set1_pr(pbc->box[ZZ][XX]); - pbc_simd->bzy = gmx_set1_pr(pbc->box[ZZ][YY]); - pbc_simd->bzz = gmx_set1_pr(pbc->box[ZZ][ZZ]); - pbc_simd->byx = gmx_set1_pr(pbc->box[YY][XX]); - pbc_simd->byy = gmx_set1_pr(pbc->box[YY][YY]); - pbc_simd->bxx = gmx_set1_pr(pbc->box[XX][XX]); + pbc_simd->bzx = gmx_simd_set1_r(pbc->box[ZZ][XX]); + pbc_simd->bzy = gmx_simd_set1_r(pbc->box[ZZ][YY]); + pbc_simd->bzz = gmx_simd_set1_r(pbc->box[ZZ][ZZ]); + pbc_simd->byx = gmx_simd_set1_r(pbc->box[YY][XX]); + pbc_simd->byy = gmx_simd_set1_r(pbc->box[YY][YY]); + pbc_simd->bxx = gmx_simd_set1_r(pbc->box[XX][XX]); } else { - pbc_simd->bzx = gmx_setzero_pr(); - pbc_simd->bzy = gmx_setzero_pr(); - pbc_simd->bzz = gmx_setzero_pr(); - pbc_simd->byx = gmx_setzero_pr(); - pbc_simd->byy = gmx_setzero_pr(); - pbc_simd->bxx = gmx_setzero_pr(); + pbc_simd->bzx = gmx_simd_setzero_r(); + pbc_simd->bzy = gmx_simd_setzero_r(); + pbc_simd->bzz = gmx_simd_setzero_r(); + pbc_simd->byx = gmx_simd_setzero_r(); + pbc_simd->byy = gmx_simd_setzero_r(); + pbc_simd->bxx = gmx_simd_setzero_r(); } } /* Correct distance vector *dx,*dy,*dz for PBC using SIMD */ static gmx_inline void -pbc_dx_simd(gmx_mm_pr *dx, gmx_mm_pr *dy, gmx_mm_pr *dz, +pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz, const pbc_simd_t *pbc) { - gmx_mm_pr sh; + gmx_simd_real_t sh; - sh = gmx_round_pr(gmx_mul_pr(*dz, pbc->inv_bzz)); - *dx = gmx_nmsub_pr(sh, pbc->bzx, *dx); - *dy = gmx_nmsub_pr(sh, pbc->bzy, *dy); - *dz = gmx_nmsub_pr(sh, pbc->bzz, *dz); + sh = gmx_simd_round_r(gmx_simd_mul_r(*dz, pbc->inv_bzz)); + *dx = gmx_simd_fnmadd_r(sh, pbc->bzx, *dx); + *dy = gmx_simd_fnmadd_r(sh, pbc->bzy, *dy); + *dz = gmx_simd_fnmadd_r(sh, pbc->bzz, *dz); - sh = gmx_round_pr(gmx_mul_pr(*dy, pbc->inv_byy)); - *dx = gmx_nmsub_pr(sh, pbc->byx, *dx); - *dy = gmx_nmsub_pr(sh, pbc->byy, *dy); + sh = gmx_simd_round_r(gmx_simd_mul_r(*dy, pbc->inv_byy)); + *dx = gmx_simd_fnmadd_r(sh, pbc->byx, *dx); + *dy = gmx_simd_fnmadd_r(sh, pbc->byy, *dy); - sh = gmx_round_pr(gmx_mul_pr(*dx, pbc->inv_bxx)); - *dx = 
gmx_nmsub_pr(sh, pbc->bxx, *dx); + sh = gmx_simd_round_r(gmx_simd_mul_r(*dx, pbc->inv_bxx)); + *dx = gmx_simd_fnmadd_r(sh, pbc->bxx, *dx); } #endif /* SIMD_BONDEDS */ @@ -1051,57 +1051,57 @@ angles_noener_simd(int nbonds, const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, int gmx_unused *global_atom_index) { -#define UNROLL GMX_SIMD_WIDTH_HERE - const int nfa1 = 4; - int i, iu, s, m; - int type, ai[UNROLL], aj[UNROLL], ak[UNROLL]; - real coeff_array[2*UNROLL+UNROLL], *coeff; - real dr_array[2*DIM*UNROLL+UNROLL], *dr; - real f_buf_array[6*UNROLL+UNROLL], *f_buf; - gmx_mm_pr k_S, theta0_S; - gmx_mm_pr rijx_S, rijy_S, rijz_S; - gmx_mm_pr rkjx_S, rkjy_S, rkjz_S; - gmx_mm_pr one_S; - gmx_mm_pr min_one_plus_eps_S; - gmx_mm_pr rij_rkj_S; - gmx_mm_pr nrij2_S, nrij_1_S; - gmx_mm_pr nrkj2_S, nrkj_1_S; - gmx_mm_pr cos_S, invsin_S; - gmx_mm_pr theta_S; - gmx_mm_pr st_S, sth_S; - gmx_mm_pr cik_S, cii_S, ckk_S; - gmx_mm_pr f_ix_S, f_iy_S, f_iz_S; - gmx_mm_pr f_kx_S, f_ky_S, f_kz_S; - pbc_simd_t pbc_simd; + const int nfa1 = 4; + int i, iu, s, m; + int type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH]; + int ak[GMX_SIMD_REAL_WIDTH]; + real coeff_array[2*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *coeff; + real dr_array[2*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr; + real f_buf_array[6*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *f_buf; + gmx_simd_real_t k_S, theta0_S; + gmx_simd_real_t rijx_S, rijy_S, rijz_S; + gmx_simd_real_t rkjx_S, rkjy_S, rkjz_S; + gmx_simd_real_t one_S; + gmx_simd_real_t min_one_plus_eps_S; + gmx_simd_real_t rij_rkj_S; + gmx_simd_real_t nrij2_S, nrij_1_S; + gmx_simd_real_t nrkj2_S, nrkj_1_S; + gmx_simd_real_t cos_S, invsin_S; + gmx_simd_real_t theta_S; + gmx_simd_real_t st_S, sth_S; + gmx_simd_real_t cik_S, cii_S, ckk_S; + gmx_simd_real_t f_ix_S, f_iy_S, f_iz_S; + gmx_simd_real_t f_kx_S, f_ky_S, f_kz_S; + pbc_simd_t pbc_simd; /* Ensure register memory alignment */ - coeff = gmx_simd_align_real(coeff_array); - dr = gmx_simd_align_real(dr_array); - f_buf = gmx_simd_align_real(f_buf_array); + coeff = gmx_simd_align_r(coeff_array); + dr = gmx_simd_align_r(dr_array); + f_buf = gmx_simd_align_r(f_buf_array); set_pbc_simd(pbc, &pbc_simd); - one_S = gmx_set1_pr(1.0); + one_S = gmx_simd_set1_r(1.0); /* The smallest number > -1 */ - min_one_plus_eps_S = gmx_set1_pr(-1.0 + 2*GMX_REAL_EPS); + min_one_plus_eps_S = gmx_simd_set1_r(-1.0 + 2*GMX_REAL_EPS); - /* nbonds is the number of angles times nfa1, here we step UNROLL angles */ - for (i = 0; (i < nbonds); i += UNROLL*nfa1) + /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */ + for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1) { - /* Collect atoms for UNROLL angles. + /* Collect atoms for GMX_SIMD_REAL_WIDTH angles. * iu indexes into forceatoms, we should not let iu go beyond nbonds. */ iu = i; - for (s = 0; s < UNROLL; s++) + for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++) { type = forceatoms[iu]; ai[s] = forceatoms[iu+1]; aj[s] = forceatoms[iu+2]; ak[s] = forceatoms[iu+3]; - coeff[s] = forceparams[type].harmonic.krA; - coeff[UNROLL+s] = forceparams[type].harmonic.rA*DEG2RAD; + coeff[s] = forceparams[type].harmonic.krA; + coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA*DEG2RAD; /* If you can't use pbc_dx_simd below for PBC, e.g. because * you can't round in SIMD, use pbc_rvec_sub here. 
@@ -1109,8 +1109,8 @@ angles_noener_simd(int nbonds, /* Store the non PBC corrected distances packed and aligned */ for (m = 0; m < DIM; m++) { - dr[s + m *UNROLL] = x[ai[s]][m] - x[aj[s]][m]; - dr[s + (DIM+m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m]; + dr[s + m *GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m]; + dr[s + (DIM+m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m]; } /* At the end fill the arrays with identical entries */ @@ -1120,70 +1120,70 @@ angles_noener_simd(int nbonds, } } - k_S = gmx_load_pr(coeff); - theta0_S = gmx_load_pr(coeff+UNROLL); + k_S = gmx_simd_load_r(coeff); + theta0_S = gmx_simd_load_r(coeff+GMX_SIMD_REAL_WIDTH); - rijx_S = gmx_load_pr(dr + 0*UNROLL); - rijy_S = gmx_load_pr(dr + 1*UNROLL); - rijz_S = gmx_load_pr(dr + 2*UNROLL); - rkjx_S = gmx_load_pr(dr + 3*UNROLL); - rkjy_S = gmx_load_pr(dr + 4*UNROLL); - rkjz_S = gmx_load_pr(dr + 5*UNROLL); + rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH); + rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH); + rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH); + rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH); + rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH); + rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH); pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd); pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd); - rij_rkj_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S, - rkjx_S, rkjy_S, rkjz_S); + rij_rkj_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S, + rkjx_S, rkjy_S, rkjz_S); - nrij2_S = gmx_norm2_pr(rijx_S, rijy_S, rijz_S); - nrkj2_S = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S); + nrij2_S = gmx_simd_norm2_r(rijx_S, rijy_S, rijz_S); + nrkj2_S = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S); - nrij_1_S = gmx_invsqrt_pr(nrij2_S); - nrkj_1_S = gmx_invsqrt_pr(nrkj2_S); + nrij_1_S = gmx_simd_invsqrt_r(nrij2_S); + nrkj_1_S = gmx_simd_invsqrt_r(nrkj2_S); - cos_S = gmx_mul_pr(rij_rkj_S, gmx_mul_pr(nrij_1_S, nrkj_1_S)); + cos_S = gmx_simd_mul_r(rij_rkj_S, gmx_simd_mul_r(nrij_1_S, nrkj_1_S)); /* To allow for 180 degrees, we take the max of cos and -1 + 1bit, * so we can safely get the 1/sin from 1/sqrt(1 - cos^2). * This also ensures that rounding errors would cause the argument - * of gmx_acos_pr to be < -1. + * of gmx_simd_acos_r to be < -1. * Note that we do not take precautions for cos(0)=1, so the outer * atoms in an angle should not be on top of each other. 
*/ - cos_S = gmx_max_pr(cos_S, min_one_plus_eps_S); - - theta_S = gmx_acos_pr(cos_S); - - invsin_S = gmx_invsqrt_pr(gmx_sub_pr(one_S, gmx_mul_pr(cos_S, cos_S))); - - st_S = gmx_mul_pr(gmx_mul_pr(k_S, gmx_sub_pr(theta0_S, theta_S)), - invsin_S); - sth_S = gmx_mul_pr(st_S, cos_S); - - cik_S = gmx_mul_pr(st_S, gmx_mul_pr(nrij_1_S, nrkj_1_S)); - cii_S = gmx_mul_pr(sth_S, gmx_mul_pr(nrij_1_S, nrij_1_S)); - ckk_S = gmx_mul_pr(sth_S, gmx_mul_pr(nrkj_1_S, nrkj_1_S)); - - f_ix_S = gmx_mul_pr(cii_S, rijx_S); - f_ix_S = gmx_nmsub_pr(cik_S, rkjx_S, f_ix_S); - f_iy_S = gmx_mul_pr(cii_S, rijy_S); - f_iy_S = gmx_nmsub_pr(cik_S, rkjy_S, f_iy_S); - f_iz_S = gmx_mul_pr(cii_S, rijz_S); - f_iz_S = gmx_nmsub_pr(cik_S, rkjz_S, f_iz_S); - f_kx_S = gmx_mul_pr(ckk_S, rkjx_S); - f_kx_S = gmx_nmsub_pr(cik_S, rijx_S, f_kx_S); - f_ky_S = gmx_mul_pr(ckk_S, rkjy_S); - f_ky_S = gmx_nmsub_pr(cik_S, rijy_S, f_ky_S); - f_kz_S = gmx_mul_pr(ckk_S, rkjz_S); - f_kz_S = gmx_nmsub_pr(cik_S, rijz_S, f_kz_S); - - gmx_store_pr(f_buf + 0*UNROLL, f_ix_S); - gmx_store_pr(f_buf + 1*UNROLL, f_iy_S); - gmx_store_pr(f_buf + 2*UNROLL, f_iz_S); - gmx_store_pr(f_buf + 3*UNROLL, f_kx_S); - gmx_store_pr(f_buf + 4*UNROLL, f_ky_S); - gmx_store_pr(f_buf + 5*UNROLL, f_kz_S); + cos_S = gmx_simd_max_r(cos_S, min_one_plus_eps_S); + + theta_S = gmx_simd_acos_r(cos_S); + + invsin_S = gmx_simd_invsqrt_r(gmx_simd_sub_r(one_S, gmx_simd_mul_r(cos_S, cos_S))); + + st_S = gmx_simd_mul_r(gmx_simd_mul_r(k_S, gmx_simd_sub_r(theta0_S, theta_S)), + invsin_S); + sth_S = gmx_simd_mul_r(st_S, cos_S); + + cik_S = gmx_simd_mul_r(st_S, gmx_simd_mul_r(nrij_1_S, nrkj_1_S)); + cii_S = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrij_1_S, nrij_1_S)); + ckk_S = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrkj_1_S, nrkj_1_S)); + + f_ix_S = gmx_simd_mul_r(cii_S, rijx_S); + f_ix_S = gmx_simd_fnmadd_r(cik_S, rkjx_S, f_ix_S); + f_iy_S = gmx_simd_mul_r(cii_S, rijy_S); + f_iy_S = gmx_simd_fnmadd_r(cik_S, rkjy_S, f_iy_S); + f_iz_S = gmx_simd_mul_r(cii_S, rijz_S); + f_iz_S = gmx_simd_fnmadd_r(cik_S, rkjz_S, f_iz_S); + f_kx_S = gmx_simd_mul_r(ckk_S, rkjx_S); + f_kx_S = gmx_simd_fnmadd_r(cik_S, rijx_S, f_kx_S); + f_ky_S = gmx_simd_mul_r(ckk_S, rkjy_S); + f_ky_S = gmx_simd_fnmadd_r(cik_S, rijy_S, f_ky_S); + f_kz_S = gmx_simd_mul_r(ckk_S, rkjz_S); + f_kz_S = gmx_simd_fnmadd_r(cik_S, rijz_S, f_kz_S); + + gmx_simd_store_r(f_buf + 0*GMX_SIMD_REAL_WIDTH, f_ix_S); + gmx_simd_store_r(f_buf + 1*GMX_SIMD_REAL_WIDTH, f_iy_S); + gmx_simd_store_r(f_buf + 2*GMX_SIMD_REAL_WIDTH, f_iz_S); + gmx_simd_store_r(f_buf + 3*GMX_SIMD_REAL_WIDTH, f_kx_S); + gmx_simd_store_r(f_buf + 4*GMX_SIMD_REAL_WIDTH, f_ky_S); + gmx_simd_store_r(f_buf + 5*GMX_SIMD_REAL_WIDTH, f_kz_S); iu = i; s = 0; @@ -1191,16 +1191,15 @@ angles_noener_simd(int nbonds, { for (m = 0; m < DIM; m++) { - f[ai[s]][m] += f_buf[s + m*UNROLL]; - f[aj[s]][m] -= f_buf[s + m*UNROLL] + f_buf[s + (DIM+m)*UNROLL]; - f[ak[s]][m] += f_buf[s + (DIM+m)*UNROLL]; + f[ai[s]][m] += f_buf[s + m*GMX_SIMD_REAL_WIDTH]; + f[aj[s]][m] -= f_buf[s + m*GMX_SIMD_REAL_WIDTH] + f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH]; + f[ak[s]][m] += f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH]; } s++; iu += nfa1; } - while (s < UNROLL && iu < nbonds); + while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds); } -#undef UNROLL } #endif /* SIMD_BONDEDS */ @@ -1514,125 +1513,123 @@ dih_angle_simd(const rvec *x, const int *ai, const int *aj, const int *ak, const int *al, const pbc_simd_t *pbc, real *dr, - gmx_mm_pr *phi_S, - gmx_mm_pr *mx_S, gmx_mm_pr *my_S, gmx_mm_pr *mz_S, - gmx_mm_pr *nx_S, gmx_mm_pr *ny_S, gmx_mm_pr 
*nz_S, - gmx_mm_pr *nrkj_m2_S, - gmx_mm_pr *nrkj_n2_S, + gmx_simd_real_t *phi_S, + gmx_simd_real_t *mx_S, gmx_simd_real_t *my_S, gmx_simd_real_t *mz_S, + gmx_simd_real_t *nx_S, gmx_simd_real_t *ny_S, gmx_simd_real_t *nz_S, + gmx_simd_real_t *nrkj_m2_S, + gmx_simd_real_t *nrkj_n2_S, real *p, real *q) { -#define UNROLL GMX_SIMD_WIDTH_HERE - int s, m; - gmx_mm_pr rijx_S, rijy_S, rijz_S; - gmx_mm_pr rkjx_S, rkjy_S, rkjz_S; - gmx_mm_pr rklx_S, rkly_S, rklz_S; - gmx_mm_pr cx_S, cy_S, cz_S; - gmx_mm_pr cn_S; - gmx_mm_pr s_S; - gmx_mm_pr ipr_S; - gmx_mm_pr iprm_S, iprn_S; - gmx_mm_pr nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S; - gmx_mm_pr toler_S; - gmx_mm_pr p_S, q_S; - gmx_mm_pr nrkj2_min_S; - gmx_mm_pr real_eps_S; + int s, m; + gmx_simd_real_t rijx_S, rijy_S, rijz_S; + gmx_simd_real_t rkjx_S, rkjy_S, rkjz_S; + gmx_simd_real_t rklx_S, rkly_S, rklz_S; + gmx_simd_real_t cx_S, cy_S, cz_S; + gmx_simd_real_t cn_S; + gmx_simd_real_t s_S; + gmx_simd_real_t ipr_S; + gmx_simd_real_t iprm_S, iprn_S; + gmx_simd_real_t nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S; + gmx_simd_real_t toler_S; + gmx_simd_real_t p_S, q_S; + gmx_simd_real_t nrkj2_min_S; + gmx_simd_real_t real_eps_S; /* Used to avoid division by zero. * We take into acount that we multiply the result by real_eps_S. */ - nrkj2_min_S = gmx_set1_pr(GMX_REAL_MIN/(2*GMX_REAL_EPS)); + nrkj2_min_S = gmx_simd_set1_r(GMX_REAL_MIN/(2*GMX_REAL_EPS)); /* The value of the last significant bit (GMX_REAL_EPS is half of that) */ - real_eps_S = gmx_set1_pr(2*GMX_REAL_EPS); + real_eps_S = gmx_simd_set1_r(2*GMX_REAL_EPS); - for (s = 0; s < UNROLL; s++) + for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++) { /* If you can't use pbc_dx_simd below for PBC, e.g. because * you can't round in SIMD, use pbc_rvec_sub here. */ for (m = 0; m < DIM; m++) { - dr[s + (0*DIM + m)*UNROLL] = x[ai[s]][m] - x[aj[s]][m]; - dr[s + (1*DIM + m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m]; - dr[s + (2*DIM + m)*UNROLL] = x[ak[s]][m] - x[al[s]][m]; + dr[s + (0*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m]; + dr[s + (1*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m]; + dr[s + (2*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[al[s]][m]; } } - rijx_S = gmx_load_pr(dr + 0*UNROLL); - rijy_S = gmx_load_pr(dr + 1*UNROLL); - rijz_S = gmx_load_pr(dr + 2*UNROLL); - rkjx_S = gmx_load_pr(dr + 3*UNROLL); - rkjy_S = gmx_load_pr(dr + 4*UNROLL); - rkjz_S = gmx_load_pr(dr + 5*UNROLL); - rklx_S = gmx_load_pr(dr + 6*UNROLL); - rkly_S = gmx_load_pr(dr + 7*UNROLL); - rklz_S = gmx_load_pr(dr + 8*UNROLL); + rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH); + rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH); + rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH); + rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH); + rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH); + rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH); + rklx_S = gmx_simd_load_r(dr + 6*GMX_SIMD_REAL_WIDTH); + rkly_S = gmx_simd_load_r(dr + 7*GMX_SIMD_REAL_WIDTH); + rklz_S = gmx_simd_load_r(dr + 8*GMX_SIMD_REAL_WIDTH); pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc); pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc); pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc); - gmx_cprod_pr(rijx_S, rijy_S, rijz_S, - rkjx_S, rkjy_S, rkjz_S, - mx_S, my_S, mz_S); + gmx_simd_cprod_r(rijx_S, rijy_S, rijz_S, + rkjx_S, rkjy_S, rkjz_S, + mx_S, my_S, mz_S); - gmx_cprod_pr(rkjx_S, rkjy_S, rkjz_S, - rklx_S, rkly_S, rklz_S, - nx_S, ny_S, nz_S); + gmx_simd_cprod_r(rkjx_S, rkjy_S, rkjz_S, + rklx_S, rkly_S, rklz_S, + nx_S, ny_S, nz_S); - gmx_cprod_pr(*mx_S, *my_S, *mz_S, - *nx_S, 
*ny_S, *nz_S, - &cx_S, &cy_S, &cz_S); + gmx_simd_cprod_r(*mx_S, *my_S, *mz_S, + *nx_S, *ny_S, *nz_S, + &cx_S, &cy_S, &cz_S); - cn_S = gmx_sqrt_pr(gmx_norm2_pr(cx_S, cy_S, cz_S)); + cn_S = gmx_simd_sqrt_r(gmx_simd_norm2_r(cx_S, cy_S, cz_S)); - s_S = gmx_iprod_pr(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S); + s_S = gmx_simd_iprod_r(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S); /* Determine the dihedral angle, the sign might need correction */ - *phi_S = gmx_atan2_pr(cn_S, s_S); + *phi_S = gmx_simd_atan2_r(cn_S, s_S); - ipr_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S, - *nx_S, *ny_S, *nz_S); + ipr_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S, + *nx_S, *ny_S, *nz_S); - iprm_S = gmx_norm2_pr(*mx_S, *my_S, *mz_S); - iprn_S = gmx_norm2_pr(*nx_S, *ny_S, *nz_S); + iprm_S = gmx_simd_norm2_r(*mx_S, *my_S, *mz_S); + iprn_S = gmx_simd_norm2_r(*nx_S, *ny_S, *nz_S); - nrkj2_S = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S); + nrkj2_S = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S); /* Avoid division by zero. When zero, the result is multiplied by 0 * anyhow, so the 3 max below do not affect the final result. */ - nrkj2_S = gmx_max_pr(nrkj2_S, nrkj2_min_S); - nrkj_1_S = gmx_invsqrt_pr(nrkj2_S); - nrkj_2_S = gmx_mul_pr(nrkj_1_S, nrkj_1_S); - nrkj_S = gmx_mul_pr(nrkj2_S, nrkj_1_S); + nrkj2_S = gmx_simd_max_r(nrkj2_S, nrkj2_min_S); + nrkj_1_S = gmx_simd_invsqrt_r(nrkj2_S); + nrkj_2_S = gmx_simd_mul_r(nrkj_1_S, nrkj_1_S); + nrkj_S = gmx_simd_mul_r(nrkj2_S, nrkj_1_S); - toler_S = gmx_mul_pr(nrkj2_S, real_eps_S); + toler_S = gmx_simd_mul_r(nrkj2_S, real_eps_S); /* Here the plain-C code uses a conditional, but we can't do that in SIMD. * So we take a max with the tolerance instead. Since we multiply with * m or n later, the max does not affect the results. */ - iprm_S = gmx_max_pr(iprm_S, toler_S); - iprn_S = gmx_max_pr(iprn_S, toler_S); - *nrkj_m2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprm_S)); - *nrkj_n2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprn_S)); + iprm_S = gmx_simd_max_r(iprm_S, toler_S); + iprn_S = gmx_simd_max_r(iprn_S, toler_S); + *nrkj_m2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprm_S)); + *nrkj_n2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprn_S)); /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */ *phi_S = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S); - p_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S, - rkjx_S, rkjy_S, rkjz_S); - p_S = gmx_mul_pr(p_S, nrkj_2_S); + p_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S, + rkjx_S, rkjy_S, rkjz_S); + p_S = gmx_simd_mul_r(p_S, nrkj_2_S); - q_S = gmx_iprod_pr(rklx_S, rkly_S, rklz_S, - rkjx_S, rkjy_S, rkjz_S); - q_S = gmx_mul_pr(q_S, nrkj_2_S); + q_S = gmx_simd_iprod_r(rklx_S, rkly_S, rklz_S, + rkjx_S, rkjy_S, rkjz_S); + q_S = gmx_simd_mul_r(q_S, nrkj_2_S); - gmx_store_pr(p, p_S); - gmx_store_pr(q, q_S); -#undef UNROLL + gmx_simd_store_r(p, p_S); + gmx_simd_store_r(q, q_S); } #endif /* SIMD_BONDEDS */ @@ -1982,48 +1979,47 @@ pdihs_noener_simd(int nbonds, const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, int gmx_unused *global_atom_index) { -#define UNROLL GMX_SIMD_WIDTH_HERE - const int nfa1 = 5; - int i, iu, s; - int type, ai[UNROLL], aj[UNROLL], ak[UNROLL], al[UNROLL]; - int t1[UNROLL], t2[UNROLL], t3[UNROLL]; - real ddphi; - real dr_array[3*DIM*UNROLL+UNROLL], *dr; - real buf_array[7*UNROLL+UNROLL], *buf; - real *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l; - gmx_mm_pr phi0_S, phi_S; - gmx_mm_pr mx_S, my_S, mz_S; - gmx_mm_pr nx_S, ny_S, nz_S; - gmx_mm_pr nrkj_m2_S, nrkj_n2_S; - gmx_mm_pr cp_S, mdphi_S, mult_S; - gmx_mm_pr sin_S, cos_S; - gmx_mm_pr mddphi_S; - gmx_mm_pr 
sf_i_S, msf_l_S; - pbc_simd_t pbc_simd; + const int nfa1 = 5; + int i, iu, s; + int type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH]; + int t1[GMX_SIMD_REAL_WIDTH], t2[GMX_SIMD_REAL_WIDTH], t3[GMX_SIMD_REAL_WIDTH]; + real ddphi; + real dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr; + real buf_array[7*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf; + real *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l; + gmx_simd_real_t phi0_S, phi_S; + gmx_simd_real_t mx_S, my_S, mz_S; + gmx_simd_real_t nx_S, ny_S, nz_S; + gmx_simd_real_t nrkj_m2_S, nrkj_n2_S; + gmx_simd_real_t cp_S, mdphi_S, mult_S; + gmx_simd_real_t sin_S, cos_S; + gmx_simd_real_t mddphi_S; + gmx_simd_real_t sf_i_S, msf_l_S; + pbc_simd_t pbc_simd; /* Ensure SIMD register alignment */ - dr = gmx_simd_align_real(dr_array); - buf = gmx_simd_align_real(buf_array); + dr = gmx_simd_align_r(dr_array); + buf = gmx_simd_align_r(buf_array); /* Extract aligned pointer for parameters and variables */ - cp = buf + 0*UNROLL; - phi0 = buf + 1*UNROLL; - mult = buf + 2*UNROLL; - p = buf + 3*UNROLL; - q = buf + 4*UNROLL; - sf_i = buf + 5*UNROLL; - msf_l = buf + 6*UNROLL; + cp = buf + 0*GMX_SIMD_REAL_WIDTH; + phi0 = buf + 1*GMX_SIMD_REAL_WIDTH; + mult = buf + 2*GMX_SIMD_REAL_WIDTH; + p = buf + 3*GMX_SIMD_REAL_WIDTH; + q = buf + 4*GMX_SIMD_REAL_WIDTH; + sf_i = buf + 5*GMX_SIMD_REAL_WIDTH; + msf_l = buf + 6*GMX_SIMD_REAL_WIDTH; set_pbc_simd(pbc, &pbc_simd); - /* nbonds is the number of dihedrals times nfa1, here we step UNROLL dihs */ - for (i = 0; (i < nbonds); i += UNROLL*nfa1) + /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */ + for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1) { - /* Collect atoms quadruplets for UNROLL dihedrals. + /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals. * iu indexes into forceatoms, we should not let iu go beyond nbonds. 
*/ iu = i; - for (s = 0; s < UNROLL; s++) + for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++) { type = forceatoms[iu]; ai[s] = forceatoms[iu+1]; @@ -2042,7 +2038,7 @@ pdihs_noener_simd(int nbonds, } } - /* Caclulate UNROLL dihedral angles at once */ + /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */ dih_angle_simd(x, ai, aj, ak, al, &pbc_simd, dr, &phi_S, @@ -2052,34 +2048,34 @@ pdihs_noener_simd(int nbonds, &nrkj_n2_S, p, q); - cp_S = gmx_load_pr(cp); - phi0_S = gmx_load_pr(phi0); - mult_S = gmx_load_pr(mult); + cp_S = gmx_simd_load_r(cp); + phi0_S = gmx_simd_load_r(phi0); + mult_S = gmx_simd_load_r(mult); - mdphi_S = gmx_sub_pr(gmx_mul_pr(mult_S, phi_S), phi0_S); + mdphi_S = gmx_simd_sub_r(gmx_simd_mul_r(mult_S, phi_S), phi0_S); - /* Calculate UNROLL sines at once */ - gmx_sincos_pr(mdphi_S, &sin_S, &cos_S); - mddphi_S = gmx_mul_pr(gmx_mul_pr(cp_S, mult_S), sin_S); - sf_i_S = gmx_mul_pr(mddphi_S, nrkj_m2_S); - msf_l_S = gmx_mul_pr(mddphi_S, nrkj_n2_S); + /* Calculate GMX_SIMD_REAL_WIDTH sines at once */ + gmx_simd_sincos_r(mdphi_S, &sin_S, &cos_S); + mddphi_S = gmx_simd_mul_r(gmx_simd_mul_r(cp_S, mult_S), sin_S); + sf_i_S = gmx_simd_mul_r(mddphi_S, nrkj_m2_S); + msf_l_S = gmx_simd_mul_r(mddphi_S, nrkj_n2_S); /* After this m?_S will contain f[i] */ - mx_S = gmx_mul_pr(sf_i_S, mx_S); - my_S = gmx_mul_pr(sf_i_S, my_S); - mz_S = gmx_mul_pr(sf_i_S, mz_S); + mx_S = gmx_simd_mul_r(sf_i_S, mx_S); + my_S = gmx_simd_mul_r(sf_i_S, my_S); + mz_S = gmx_simd_mul_r(sf_i_S, mz_S); /* After this m?_S will contain -f[l] */ - nx_S = gmx_mul_pr(msf_l_S, nx_S); - ny_S = gmx_mul_pr(msf_l_S, ny_S); - nz_S = gmx_mul_pr(msf_l_S, nz_S); + nx_S = gmx_simd_mul_r(msf_l_S, nx_S); + ny_S = gmx_simd_mul_r(msf_l_S, ny_S); + nz_S = gmx_simd_mul_r(msf_l_S, nz_S); - gmx_store_pr(dr + 0*UNROLL, mx_S); - gmx_store_pr(dr + 1*UNROLL, my_S); - gmx_store_pr(dr + 2*UNROLL, mz_S); - gmx_store_pr(dr + 3*UNROLL, nx_S); - gmx_store_pr(dr + 4*UNROLL, ny_S); - gmx_store_pr(dr + 5*UNROLL, nz_S); + gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S); + gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S); + gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S); + gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S); + gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S); + gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S); iu = i; s = 0; @@ -2087,19 +2083,18 @@ pdihs_noener_simd(int nbonds, { do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s], p[s], q[s], - dr[ XX *UNROLL+s], - dr[ YY *UNROLL+s], - dr[ ZZ *UNROLL+s], - dr[(DIM+XX)*UNROLL+s], - dr[(DIM+YY)*UNROLL+s], - dr[(DIM+ZZ)*UNROLL+s], + dr[ XX *GMX_SIMD_REAL_WIDTH+s], + dr[ YY *GMX_SIMD_REAL_WIDTH+s], + dr[ ZZ *GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s], f); s++; iu += nfa1; } - while (s < UNROLL && iu < nbonds); + while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds); } -#undef UNROLL } #endif /* SIMD_BONDEDS */ diff --git a/src/gromacs/gmxlib/copyrite.cpp b/src/gromacs/gmxlib/copyrite.cpp index 144825abf5..b59d759f1e 100644 --- a/src/gromacs/gmxlib/copyrite.cpp +++ b/src/gromacs/gmxlib/copyrite.cpp @@ -679,7 +679,7 @@ static void gmx_print_version_info(FILE *fp) #define gmx_stringify2(x) #x #define gmx_stringify(x) gmx_stringify2(x) fprintf(fp, "invsqrt routine: %s\n", gmx_stringify(gmx_invsqrt(x))); - fprintf(fp, "CPU acceleration: %s\n", GMX_CPU_ACCELERATION_STRING); + fprintf(fp, "SIMD instructions: %s\n", GMX_SIMD_STRING); fprintf(fp, "FFT library: %s\n", gmx_fft_get_version_info()); #ifdef 
HAVE_RDTSCP diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c index 0824591f1c..ec1ce471d5 100644 --- a/src/gromacs/gmxlib/gmx_cpuid.c +++ b/src/gromacs/gmxlib/gmx_cpuid.c @@ -134,7 +134,7 @@ gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] = }; const char * -gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] = +gmx_cpuid_simd_string[GMX_CPUID_NSIMD] = { "CannotDetect", "None", @@ -222,38 +222,24 @@ gmx_cpuid_feature (gmx_cpuid_t cpuid, -/* What type of acceleration was compiled in, if any? +/* What type of SIMD was compiled in, if any? * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for * AVX too, so it is important that they appear last in the list. */ -#ifdef GMX_X86_AVX_256 -static const -enum gmx_cpuid_acceleration - compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_256; -#elif defined GMX_X86_AVX_128_FMA -static const -enum gmx_cpuid_acceleration - compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA; -#elif defined GMX_X86_SSE4_1 -static const -enum gmx_cpuid_acceleration - compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE4_1; -#elif defined GMX_X86_SSE2 -static const -enum gmx_cpuid_acceleration - compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2; -#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE -static const -enum gmx_cpuid_acceleration - compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE; -#elif defined GMX_CPU_ACCELERATION_IBM_QPX -static const -enum gmx_cpuid_acceleration - compiled_acc = GMX_CPUID_ACCELERATION_IBM_QPX; +#ifdef GMX_SIMD_X86_AVX_256 +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_256; +#elif defined GMX_SIMD_X86_AVX_128_FMA +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_128_FMA; +#elif defined GMX_SIMD_X86_SSE4_1 +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE4_1; +#elif defined GMX_SIMD_X86_SSE2 +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE2; +#elif defined GMX_SIMD_SPARC64_HPC_ACE +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE; +#elif defined GMX_SIMD_IBM_QPX +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_QPX; #else -static const -enum gmx_cpuid_acceleration - compiled_acc = GMX_CPUID_ACCELERATION_NONE; +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_NONE; #endif @@ -321,7 +307,7 @@ execute_x86cpuid(unsigned int level, /* Death and horror! * Apparently this is an x86 platform where we don't know how to call cpuid. * - * This is REALLY bad, since we will lose all Gromacs acceleration. + * This is REALLY bad, since we will lose all Gromacs SIMD support. 
*/ *eax = 0; *ebx = 0; @@ -1049,78 +1035,78 @@ gmx_cpuid_formatstring (gmx_cpuid_t cpuid, -enum gmx_cpuid_acceleration -gmx_cpuid_acceleration_suggest (gmx_cpuid_t cpuid) +enum gmx_cpuid_simd +gmx_cpuid_simd_suggest (gmx_cpuid_t cpuid) { - enum gmx_cpuid_acceleration tmpacc; + enum gmx_cpuid_simd tmpsimd; - tmpacc = GMX_CPUID_ACCELERATION_NONE; + tmpsimd = GMX_CPUID_SIMD_NONE; if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_INTEL) { if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX2)) { - tmpacc = GMX_CPUID_ACCELERATION_X86_AVX2_256; + tmpsimd = GMX_CPUID_SIMD_X86_AVX2_256; } else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX)) { - tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_256; + tmpsimd = GMX_CPUID_SIMD_X86_AVX_256; } else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1)) { - tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1; + tmpsimd = GMX_CPUID_SIMD_X86_SSE4_1; } else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2)) { - tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2; + tmpsimd = GMX_CPUID_SIMD_X86_SSE2; } } else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_AMD) { if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX)) { - tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA; + tmpsimd = GMX_CPUID_SIMD_X86_AVX_128_FMA; } else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1)) { - tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1; + tmpsimd = GMX_CPUID_SIMD_X86_SSE4_1; } else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2)) { - tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2; + tmpsimd = GMX_CPUID_SIMD_X86_SSE2; } } else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_FUJITSU) { if (strstr(gmx_cpuid_brand(cpuid), "SPARC64")) { - tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE; + tmpsimd = GMX_CPUID_SIMD_SPARC64_HPC_ACE; } } else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_IBM) { if (strstr(gmx_cpuid_brand(cpuid), "A2")) { - tmpacc = GMX_CPUID_ACCELERATION_IBM_QPX; + tmpsimd = GMX_CPUID_SIMD_IBM_QPX; } } - return tmpacc; + return tmpsimd; } int -gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, - FILE * log, - int print_to_stderr) +gmx_cpuid_simd_check(gmx_cpuid_t cpuid, + FILE * log, + int print_to_stderr) { int rc; char str[1024]; - enum gmx_cpuid_acceleration acc; + enum gmx_cpuid_simd simd; - acc = gmx_cpuid_acceleration_suggest(cpuid); + simd = gmx_cpuid_simd_suggest(cpuid); - rc = (acc != compiled_acc); + rc = (simd != compiled_simd); gmx_cpuid_formatstring(cpuid, str, 1023); str[1023] = '\0'; @@ -1128,13 +1114,13 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, if (log != NULL) { fprintf(log, - "\nDetecting CPU-specific acceleration.\nPresent hardware specification:\n" + "\nDetecting CPU SIMD instructions.\nPresent hardware specification:\n" "%s" - "Acceleration most likely to fit this hardware: %s\n" - "Acceleration selected at GROMACS compile time: %s\n\n", + "SIMD instructions most likely to fit this hardware: %s\n" + "SIMD instructions selected at GROMACS compile time: %s\n\n", str, - gmx_cpuid_acceleration_string[acc], - gmx_cpuid_acceleration_string[compiled_acc]); + gmx_cpuid_simd_string[simd], + gmx_cpuid_simd_string[compiled_simd]); } if (rc != 0) @@ -1142,16 +1128,16 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, if (log != NULL) { fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n" - "Acceleration most likely to fit this hardware: %s\n" - "Acceleration selected at GROMACS compile time: %s\n\n", - gmx_cpuid_acceleration_string[acc], - gmx_cpuid_acceleration_string[compiled_acc]); + "SIMD instructions most likely 
to fit this hardware: %s\n" + "SIMD instructions selected at GROMACS compile time: %s\n\n", + gmx_cpuid_simd_string[simd], + gmx_cpuid_simd_string[compiled_simd]); } if (print_to_stderr) { - fprintf(stderr, "Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n", - gmx_cpuid_acceleration_string[compiled_acc], - gmx_cpuid_acceleration_string[acc]); + fprintf(stderr, "Compiled SIMD instructions: %s (Gromacs could use %s on this machine, which is better)\n", + gmx_cpuid_simd_string[compiled_simd], + gmx_cpuid_simd_string[simd]); } } return rc; @@ -1167,7 +1153,7 @@ int main(int argc, char **argv) { gmx_cpuid_t cpuid; - enum gmx_cpuid_acceleration acc; + enum gmx_cpuid_simd simd; int i, cnt; if (argc < 2) @@ -1181,7 +1167,7 @@ main(int argc, char **argv) "-model Print CPU model version.\n" "-stepping Print CPU stepping version.\n" "-features Print CPU feature flags.\n" - "-acceleration Print suggested GROMACS acceleration.\n", + "-simd Print suggested GROMACS SIMD instructions.\n", argv[0]); exit(0); } @@ -1224,10 +1210,10 @@ main(int argc, char **argv) } printf("\n"); } - else if (!strncmp(argv[1], "-acceleration", 3)) + else if (!strncmp(argv[1], "-simd", 3)) { - acc = gmx_cpuid_acceleration_suggest(cpuid); - fprintf(stdout, "%s\n", gmx_cpuid_acceleration_string[acc]); + simd = gmx_cpuid_simd_suggest(cpuid); + fprintf(stdout, "%s\n", gmx_cpuid_simd_string[simd]); } gmx_cpuid_done(cpuid); diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.c b/src/gromacs/gmxlib/gmx_detect_hardware.c index 2e56119e47..2ba16d4ce4 100644 --- a/src/gromacs/gmxlib/gmx_detect_hardware.c +++ b/src/gromacs/gmxlib/gmx_detect_hardware.c @@ -251,11 +251,11 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL); bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL); - /* check the acceleration mdrun is compiled with against hardware + /* check the SIMD level mdrun is compiled with against hardware capabilities */ /* TODO: Here we assume homogeneous hardware which is not necessarily the case! Might not hurt to add an extra check over MPI. */ - gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog, SIMMASTER(cr)); + gmx_cpuid_simd_check(hwinfo->cpuid_info, fplog, SIMMASTER(cr)); /* NOTE: this print is only for and on one physical node */ print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr); diff --git a/src/gromacs/gmxlib/nonbonded/CMakeLists.txt b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt index 862361b937..62939813af 100644 --- a/src/gromacs/gmxlib/nonbonded/CMakeLists.txt +++ b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2012,2013, by the GROMACS development team, led by +# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. 
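
For readers following the rename of the detection API above (gmx_cpuid_acceleration_suggest/_check becoming gmx_cpuid_simd_suggest/_check, called from gmx_detect_hardware.c), a minimal usage sketch may help. It is not part of this patch; it assumes gmx_cpuid_init() takes a gmx_cpuid_t pointer and returns 0 on success, as declared in gmx_cpuid.h, and report_simd_support() is a hypothetical caller.

/* Sketch only, not part of the patch: drive the renamed CPU SIMD
 * detection API. gmx_cpuid_init() is assumed to return 0 on success. */
#include <stdio.h>
#include "gmx_cpuid.h"

static void report_simd_support(FILE *log)
{
    gmx_cpuid_t         cpuid;
    enum gmx_cpuid_simd simd;

    if (gmx_cpuid_init(&cpuid) != 0)
    {
        return;
    }
    /* Suggest the best SIMD level for this hardware */
    simd = gmx_cpuid_simd_suggest(cpuid);
    fprintf(log, "Suggested SIMD: %s\n", gmx_cpuid_simd_string[simd]);
    /* Compare against the SIMD level compiled in and warn on mismatch */
    gmx_cpuid_simd_check(cpuid, log, 1);
    gmx_cpuid_done(cpuid);
}
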
@@ -35,39 +35,39 @@ # Sources that should always be built file(GLOB NONBONDED_SOURCES *.c nb_kernel_c/*.c) -if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND NOT GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "SSE2" AND NOT GMX_DOUBLE) file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE) file(GLOB NONBONDED_SSE4_1_SINGLE_SOURCES nb_kernel_sse4_1_single/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE) file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE) file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "SSE2" AND GMX_DOUBLE) file(GLOB NONBONDED_SSE2_DOUBLE_SOURCES nb_kernel_sse2_double/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "SSE4.1" AND GMX_DOUBLE) file(GLOB NONBONDED_SSE4_1_DOUBLE_SOURCES nb_kernel_sse4_1_double/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE) file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "AVX_256" AND GMX_DOUBLE) file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c) endif() -if("${GMX_CPU_ACCELERATION}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE) +if("${GMX_SIMD}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE) file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c) endif() diff --git a/src/gromacs/gmxlib/nonbonded/nonbonded.c b/src/gromacs/gmxlib/nonbonded/nonbonded.c index fdd3b0111e..42209eef7d 100644 --- a/src/gromacs/gmxlib/nonbonded/nonbonded.c +++ b/src/gromacs/gmxlib/nonbonded/nonbonded.c @@ -71,31 +71,31 @@ /* Different default (c) and accelerated interaction-specific kernels */ #include "nb_kernel_c/nb_kernel_c.h" -#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE) # include "nb_kernel_sse2_single/nb_kernel_sse2_single.h" #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE) # include "nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.h" #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE) # include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h" #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE) # include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h" #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE) # include "nb_kernel_sse2_double/nb_kernel_sse2_double.h" #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE) # include "nb_kernel_sse4_1_double/nb_kernel_sse4_1_double.h" #endif -#if (defined 
GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE) # include "nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.h" #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE) # include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h" #endif -#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE) +#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE) # include "nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h" #endif @@ -117,36 +117,36 @@ gmx_nonbonded_setup(t_forcerec * fr, /* Add the generic kernels to the structure stored statically in nb_kernel.c */ nb_kernel_list_add_kernels(kernellist_c, kernellist_c_size); - if (!(fr != NULL && fr->use_cpu_acceleration == FALSE)) + if (!(fr != NULL && fr->use_simd_kernels == FALSE)) { /* Add interaction-specific kernels for different architectures */ /* Single precision */ -#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_sse2_single, kernellist_sse2_single_size); #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_sse4_1_single, kernellist_sse4_1_single_size); #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_avx_128_fma_single, kernellist_avx_128_fma_single_size); #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_avx_256_single, kernellist_avx_256_single_size); #endif /* Double precision */ -#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_sse2_double, kernellist_sse2_double_size); #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_sse4_1_double, kernellist_sse4_1_double_size); #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_avx_128_fma_double, kernellist_avx_128_fma_double_size); #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size); #endif -#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE) +#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_sparc64_hpc_ace_double, kernellist_sparc64_hpc_ace_double_size); #endif ; /* empty statement to avoid a completely empty block */ @@ -181,38 +181,38 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl) arch_and_padding[] = { /* Single precision */ -#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE) { "avx_256_single", 8 }, #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE) +#if (defined 
GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE) { "avx_128_fma_single", 4 }, #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE) { "sse4_1_single", 4 }, #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE) { "sse2_single", 4 }, #endif /* Double precision */ -#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE) { "avx_256_double", 4 }, #endif -#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE) /* Sic. Double precision 2-way SIMD does not require neighbor list padding, * since the kernels execute a loop unrolled a factor 2, followed by * a possible single odd-element epilogue. */ { "avx_128_fma_double", 1 }, #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE) /* No padding - see comment above */ { "sse2_double", 1 }, #endif -#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE) +#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE) /* No padding - see comment above */ { "sse4_1_double", 1 }, #endif -#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE) +#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE) /* No padding - see comment above */ { "sparc64_hpc_ace_double", 1 }, #endif diff --git a/src/gromacs/gmxpreprocess/calc_verletbuf.c b/src/gromacs/gmxpreprocess/calc_verletbuf.c index 7c28034583..13baaf0ce5 100644 --- a/src/gromacs/gmxpreprocess/calc_verletbuf.c +++ b/src/gromacs/gmxpreprocess/calc_verletbuf.c @@ -131,7 +131,7 @@ void verletbuf_get_list_setup(gmx_bool bGPU, #ifndef GMX_NBNXN_SIMD list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE; #else - list_setup->cluster_size_j = GMX_SIMD_WIDTH_HERE; + list_setup->cluster_size_j = GMX_SIMD_REAL_WIDTH; #ifdef GMX_NBNXN_SIMD_2XNN /* We assume the smallest cluster size to be on the safe side */ list_setup->cluster_size_j /= 2; diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h index 15c2b3bbf3..a0e1e0a8bb 100644 --- a/src/gromacs/legacyheaders/gmx_cpuid.h +++ b/src/gromacs/legacyheaders/gmx_cpuid.h @@ -115,23 +115,23 @@ enum gmx_cpuid_feature }; -/* Currently supported acceleration instruction sets, intrinsics or other similar combinations +/* Currently supported SIMD instruction sets, intrinsics or other similar combinations * in Gromacs. There is not always a 1-to-1 correspondence with feature flags; on some AMD * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed), * and we still haven't written the AVX2 kernels. 
*/ -enum gmx_cpuid_acceleration +enum gmx_cpuid_simd { - GMX_CPUID_ACCELERATION_CANNOTDETECT, /* Should only be used if something fails */ - GMX_CPUID_ACCELERATION_NONE, - GMX_CPUID_ACCELERATION_X86_SSE2, - GMX_CPUID_ACCELERATION_X86_SSE4_1, - GMX_CPUID_ACCELERATION_X86_AVX_128_FMA, - GMX_CPUID_ACCELERATION_X86_AVX_256, - GMX_CPUID_ACCELERATION_X86_AVX2_256, - GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE, - GMX_CPUID_ACCELERATION_IBM_QPX, - GMX_CPUID_NACCELERATIONS + GMX_CPUID_SIMD_CANNOTDETECT, /* Should only be used if something fails */ + GMX_CPUID_SIMD_NONE, + GMX_CPUID_SIMD_X86_SSE2, + GMX_CPUID_SIMD_X86_SSE4_1, + GMX_CPUID_SIMD_X86_AVX_128_FMA, + GMX_CPUID_SIMD_X86_AVX_256, + GMX_CPUID_SIMD_X86_AVX2_256, + GMX_CPUID_SIMD_SPARC64_HPC_ACE, + GMX_CPUID_SIMD_IBM_QPX, + GMX_CPUID_NSIMD }; /* Text strings corresponding to CPU vendors */ @@ -142,9 +142,9 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS]; extern const char * gmx_cpuid_feature_string[GMX_CPUID_NFEATURES]; -/* Text strings for Gromacs acceleration/instruction sets */ +/* Text strings for Gromacs SIMD instruction sets */ extern const char * -gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS]; +gmx_cpuid_simd_string[GMX_CPUID_NSIMD]; /* Abstract data type with CPU detection information. Set by gmx_cpuid_init(). */ @@ -281,22 +281,22 @@ gmx_cpuid_formatstring (gmx_cpuid_t cpuid, int n); -/* Suggests a suitable gromacs acceleration based on the support in the +/* Suggests a suitable gromacs SIMD based on the support in the * hardware. */ -enum gmx_cpuid_acceleration -gmx_cpuid_acceleration_suggest (gmx_cpuid_t cpuid); +enum gmx_cpuid_simd +gmx_cpuid_simd_suggest (gmx_cpuid_t cpuid); -/* Check if this binary was compiled with the same acceleration as we +/* Check if this binary was compiled with the same SIMD instructions as we * would suggest for the current hardware. Always print stats to the log file * if it is non-NULL, and if we don't have a match, print a warning in log * (if non-NULL) and if print_to_stderr!=0 also to stderr. */ int -gmx_cpuid_acceleration_check (gmx_cpuid_t cpuid, - FILE * log, - int print_to_stderr); +gmx_cpuid_simd_check (gmx_cpuid_t cpuid, + FILE * log, + int print_to_stderr); /* Release resources used by data structure. Note that the pointer to the diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h index c936ccd0e4..e1cf22ff95 100644 --- a/src/gromacs/legacyheaders/types/forcerec.h +++ b/src/gromacs/legacyheaders/types/forcerec.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -202,7 +202,7 @@ typedef struct { const gmx_hw_info_t *hwinfo; const gmx_gpu_opt_t *gpu_opt; - gmx_bool use_cpu_acceleration; + gmx_bool use_simd_kernels; /* Interaction for calculated in kernels. 
In many cases this is similar to * the electrostatics settings in the inputrecord, but the difference is that diff --git a/src/gromacs/legacyheaders/types/nb_verlet.h b/src/gromacs/legacyheaders/types/nb_verlet.h index 7099912d66..8c6228e276 100644 --- a/src/gromacs/legacyheaders/types/nb_verlet.h +++ b/src/gromacs/legacyheaders/types/nb_verlet.h @@ -43,11 +43,11 @@ extern "C" { #endif -#ifdef GMX_SIMD_REFERENCE_PLAIN_C +#ifdef GMX_SIMD_REFERENCE #define GMX_NBNXN_SIMD #endif -#if (defined GMX_X86_SSE2) || (defined GMX_CPU_ACCELERATION_IBM_QPX) +#if (defined GMX_SIMD_X86_SSE2_OR_HIGHER) || (defined GMX_SIMD_IBM_QPX) /* Use SIMD accelerated nbnxn search and kernels */ #define GMX_NBNXN_SIMD @@ -60,7 +60,7 @@ extern "C" { * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there */ #define GMX_NBNXN_SIMD_4XN -#if defined GMX_X86_AVX_256 && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD) +#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD) #define GMX_NBNXN_SIMD_2XNN #endif diff --git a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h index 3422968b38..2de1dde4ea 100644 --- a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h +++ b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h @@ -79,7 +79,7 @@ typedef void nbnxn_free_t (void *ptr); typedef struct { int cj; /* The j-cluster */ unsigned excl; /* The exclusion (interaction) bits */ -#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#ifdef GMX_SIMD_IBM_QPX /* Indices into the arrays of SIMD interaction masks. */ char interaction_mask_indices[4]; #endif @@ -264,7 +264,7 @@ typedef struct { */ unsigned *simd_exclusion_filter1; unsigned *simd_exclusion_filter2; -#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#ifdef GMX_SIMD_IBM_QPX real *simd_interaction_array; /* Array of masks needed for exclusions on QPX */ #endif int nout; /* The number of force arrays */ diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c index decbeffd52..8db994eb5f 100644 --- a/src/gromacs/mdlib/forcerec.c +++ b/src/gromacs/mdlib/forcerec.c @@ -1495,7 +1495,7 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec * if (bAllvsAll && fp && MASTER(cr)) { - fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n"); + fprintf(fp, "\nUsing SIMD all-vs-all kernels.\n\n"); } return bAllvsAll; @@ -1545,7 +1545,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, *kernel_type = nbnxnk4xN_SIMD_2xNN; #endif -#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256 +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_SIMD_X86_AVX_256_OR_HIGHER if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT) { /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4), @@ -1578,7 +1578,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, * of precision. In single precision, this is faster on * Bulldozer, and slightly faster on Sandy Bridge. 
*/ -#if ((defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 || defined __MIC__) && !defined GMX_DOUBLE) || (defined GMX_CPU_ACCELERATION_IBM_QPX) +#if ((defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__) && !defined GMX_DOUBLE) || (defined GMX_SIMD_IBM_QPX) *ewald_excl = ewaldexclAnalytical; #endif if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL) @@ -1609,12 +1609,12 @@ const char *lookup_nbnxn_kernel_name(int kernel_type) case nbnxnk4xN_SIMD_4xN: case nbnxnk4xN_SIMD_2xNN: #ifdef GMX_NBNXN_SIMD -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER /* We have x86 SSE2 compatible SIMD */ -#ifdef GMX_X86_AVX_128_FMA +#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER returnvalue = "AVX-128-FMA"; #else -#if defined GMX_X86_AVX_256 || defined __AVX__ +#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __AVX__ /* x86 SIMD intrinsics can be converted to SSE or AVX depending * on compiler flags. As we use nearly identical intrinsics, * compiling for AVX without an AVX macros effectively results @@ -1622,23 +1622,23 @@ const char *lookup_nbnxn_kernel_name(int kernel_type) * For gcc we check for __AVX__ * At least a check for icc should be added (if there is a macro) */ -#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD +#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_NBNXN_HALF_WIDTH_SIMD returnvalue = "AVX-256"; #else returnvalue = "AVX-128"; #endif #else -#ifdef GMX_X86_SSE4_1 +#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER returnvalue = "SSE4.1"; #else returnvalue = "SSE2"; #endif #endif #endif -#else /* GMX_X86_SSE2 */ - /* not GMX_X86_SSE2, but other SIMD */ +#else /* GMX_SIMD_X86_SSE2_OR_HIGHER */ + /* not GMX_SIMD_X86_SSE2_OR_HIGHER, but other SIMD */ returnvalue = "SIMD"; -#endif /* GMX_X86_SSE2 */ +#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */ #else /* GMX_NBNXN_SIMD */ returnvalue = "not available"; #endif /* GMX_NBNXN_SIMD */ @@ -1657,7 +1657,7 @@ const char *lookup_nbnxn_kernel_name(int kernel_type) static void pick_nbnxn_kernel(FILE *fp, const t_commrec *cr, - gmx_bool use_cpu_acceleration, + gmx_bool use_simd_kernels, gmx_bool bUseGPU, gmx_bool bEmulateGPU, const t_inputrec *ir, @@ -1686,7 +1686,7 @@ static void pick_nbnxn_kernel(FILE *fp, if (*kernel_type == nbnxnkNotSet) { - if (use_cpu_acceleration) + if (use_simd_kernels) { pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl); } @@ -1985,7 +1985,7 @@ static void init_nb_verlet(FILE *fp, if (i == 0) /* local */ { - pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration, + pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels, nbv->bUseGPU, bEmulateGPU, ir, &nbv->grp[i].kernel_type, &nbv->grp[i].ewald_excl, @@ -1996,7 +1996,7 @@ static void init_nb_verlet(FILE *fp, if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0) { /* Use GPU for local, select a CPU kernel for non-local */ - pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration, + pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels, FALSE, FALSE, ir, &nbv->grp[i].kernel_type, &nbv->grp[i].ewald_excl, @@ -2133,8 +2133,8 @@ void init_forcerec(FILE *fp, fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE); } - /* By default we turn acceleration on, but it might be turned off further down... */ - fr->use_cpu_acceleration = TRUE; + /* By default we turn SIMD kernels on, but it might be turned off further down... 
*/ + fr->use_simd_kernels = TRUE; fr->bDomDec = DOMAINDECOMP(cr); @@ -2274,12 +2274,12 @@ void init_forcerec(FILE *fp, if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) ) { - fr->use_cpu_acceleration = FALSE; + fr->use_simd_kernels = FALSE; if (fp != NULL) { fprintf(fp, "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n" - "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n"); + "Disabling the usage of most SIMD-specific kernel (e.g. SSE2/SSE4/AVX) routines.\n\n"); } } @@ -2291,12 +2291,12 @@ void init_forcerec(FILE *fp, fr->AllvsAll_workgb = NULL; /* All-vs-all kernels have not been implemented in 4.6, and - * the SIMD group kernels are also buggy in this case. Non-accelerated + * the SIMD group kernels are also buggy in this case. Non-SIMD * group kernels are OK. See Redmine #1249. */ if (fr->bAllvsAll) { fr->bAllvsAll = FALSE; - fr->use_cpu_acceleration = FALSE; + fr->use_simd_kernels = FALSE; if (fp != NULL) { fprintf(fp, diff --git a/src/gromacs/mdlib/genborn.c b/src/gromacs/mdlib/genborn.c index 582d432780..d320af6bf8 100644 --- a/src/gromacs/mdlib/genborn.c +++ b/src/gromacs/mdlib/genborn.c @@ -60,7 +60,7 @@ #include "gromacs/utility/gmxmpi.h" -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER # ifdef GMX_DOUBLE # include "genborn_sse2_double.h" # include "genborn_allvsall_sse2_double.h" @@ -1090,8 +1090,8 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t if (ir->gb_algorithm == egbSTILL) { -#if 0 && defined (GMX_X86_SSE2) - if (fr->use_acceleration) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) + if (fr->use_simd_kernels) { # ifdef GMX_DOUBLE genborn_allvsall_calc_still_radii_sse2_double(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb); @@ -1111,8 +1111,8 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t } else if (ir->gb_algorithm == egbHCT || ir->gb_algorithm == egbOBC) { -#if 0 && defined (GMX_X86_SSE2) - if (fr->use_acceleration) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) + if (fr->use_simd_kernels) { # ifdef GMX_DOUBLE genborn_allvsall_calc_hct_obc_radii_sse2_double(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb); @@ -1140,12 +1140,12 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t /* Switch for determining which algorithm to use for Born radii calculation */ #ifdef GMX_DOUBLE -#if 0 && defined (GMX_X86_SSE2) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */ switch (ir->gb_algorithm) { case egbSTILL: - if (fr->use_acceleration) + if (fr->use_simd_kernels) { calc_gb_rad_still_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born); } @@ -1155,7 +1155,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t } break; case egbHCT: - if (fr->use_acceleration) + if (fr->use_simd_kernels) { calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm); } @@ -1165,7 +1165,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t } break; case egbOBC: - if (fr->use_acceleration) + if (fr->use_simd_kernels) { calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm); } @@ -1199,12 +1199,12 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t #else -#if 0 && defined (GMX_X86_SSE2) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) /* x86 
or x86-64 with GCC inline assembly and/or SSE intrinsics */ switch (ir->gb_algorithm) { case egbSTILL: - if (fr->use_acceleration) + if (fr->use_simd_kernels) { calc_gb_rad_still_sse2_single(cr, fr, born->nr, top, x[0], nl, born); } @@ -1214,7 +1214,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t } break; case egbHCT: - if (fr->use_acceleration) + if (fr->use_simd_kernels) { calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm); } @@ -1225,7 +1225,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t break; case egbOBC: - if (fr->use_acceleration) + if (fr->use_simd_kernels) { calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm); } @@ -1654,8 +1654,8 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t if (fr->bAllvsAll) { -#if 0 && defined (GMX_X86_SSE2) - if (fr->use_acceleration) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) + if (fr->use_simd_kernels) { # ifdef GMX_DOUBLE genborn_allvsall_calc_chainrule_sse2_double(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb); @@ -1676,8 +1676,8 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t return; } -#if 0 && defined (GMX_X86_SSE2) - if (fr->use_acceleration) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) + if (fr->use_simd_kernels) { # ifdef GMX_DOUBLE calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0], diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_double.c b/src/gromacs/mdlib/genborn_allvsall_sse2_double.c index bcfea3c338..b0e6cd3d82 100644 --- a/src/gromacs/mdlib/genborn_allvsall_sse2_double.c +++ b/src/gromacs/mdlib/genborn_allvsall_sse2_double.c @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2009, The GROMACS Development Team. - * Copyright (c) 2012, by the GROMACS development team, led by + * Copyright (c) 2012,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -51,7 +51,7 @@ #include "genborn_allvsall.h" -#if 0 && defined (GMX_X86_SSE2) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) #include diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c index 2a96cc1422..27d5c910c3 100644 --- a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c +++ b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2009, The GROMACS Development Team. - * Copyright (c) 2012, by the GROMACS development team, led by + * Copyright (c) 2012,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
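
The genborn.c hunks above all follow the same two-level gate: a cumulative compile-time define (e.g. GMX_SIMD_X86_SSE2_OR_HIGHER) decides whether a SIMD path is built at all, and the run-time flag fr->use_simd_kernels decides whether it is used. A condensed sketch of that pattern, not part of the patch, with run_simd_kernel()/run_generic_kernel() as hypothetical stand-ins:

/* Sketch of the compile-time + run-time SIMD gating used above.
 * run_simd_kernel() and run_generic_kernel() are hypothetical stand-ins. */
static void do_work(t_forcerec *fr)
{
#if defined GMX_SIMD_X86_SSE2_OR_HIGHER && !defined GMX_DOUBLE
    if (fr->use_simd_kernels)
    {
        run_simd_kernel(fr);    /* built only when SSE2 or higher is enabled */
    }
    else
#endif
    {
        run_generic_kernel(fr); /* plain-C fallback, always available */
    }
}
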
@@ -50,7 +50,7 @@ #include "genborn.h" #include "genborn_allvsall.h" -#if 0 && defined (GMX_X86_SSE2) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) #include diff --git a/src/gromacs/mdlib/genborn_sse2_double.c b/src/gromacs/mdlib/genborn_sse2_double.c index cdb7ecaf15..40407177e8 100644 --- a/src/gromacs/mdlib/genborn_sse2_double.c +++ b/src/gromacs/mdlib/genborn_sse2_double.c @@ -58,7 +58,7 @@ #include "gromacs/utility/gmxmpi.h" /* Only compile this file if SSE2 intrinsics are available */ -#if 0 && defined (GMX_X86_SSE2) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) #include #include diff --git a/src/gromacs/mdlib/genborn_sse2_single.c b/src/gromacs/mdlib/genborn_sse2_single.c index 1f62188b7e..74f700ba60 100644 --- a/src/gromacs/mdlib/genborn_sse2_single.c +++ b/src/gromacs/mdlib/genborn_sse2_single.c @@ -59,7 +59,7 @@ /* Only compile this file if SSE intrinsics are available */ -#if 0 && defined (GMX_X86_SSE2) +#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) #include #include diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c index 6ab0da6d50..1abfe492e8 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.c +++ b/src/gromacs/mdlib/nbnxn_atomdata.c @@ -432,7 +432,7 @@ static void nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat) { int i, j; - const int simd_width = GMX_SIMD_WIDTH_HERE; + const int simd_width = GMX_SIMD_REAL_WIDTH; int simd_excl_size; /* Set the diagonal cluster pair exclusion mask setup data. * In the kernel we check 0 < j - i to generate the masks. @@ -482,7 +482,7 @@ nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat) nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j); } -#if (defined GMX_CPU_ACCELERATION_IBM_QPX) +#if (defined GMX_SIMD_IBM_QPX) /* The QPX kernels shouldn't do the bit masking that is done on * x86, because the SIMD units lack bit-wise operations. Instead, * we generate a vector of all 2^4 possible ways an i atom @@ -497,7 +497,7 @@ nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat) * indices are used in the kernels. */ simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE; - const int qpx_simd_width = GMX_SIMD_WIDTH_HERE; + const int qpx_simd_width = GMX_SIMD_REAL_WIDTH; snew_aligned(simd_interaction_array, simd_excl_size * qpx_simd_width, NBNXN_MEM_ALIGN); for (j = 0; j < simd_excl_size; j++) { @@ -1158,33 +1158,33 @@ nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest, /* The SIMD width here is actually independent of that in the kernels, * but we use the same width for simplicity (usually optimal anyhow). 
*/ - int i, s; - gmx_mm_pr dest_SSE, src_SSE; + int i, s; + gmx_simd_real_t dest_SSE, src_SSE; if (bDestSet) { - for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE) + for (i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH) { - dest_SSE = gmx_load_pr(dest+i); + dest_SSE = gmx_simd_load_r(dest+i); for (s = 0; s < nsrc; s++) { - src_SSE = gmx_load_pr(src[s]+i); - dest_SSE = gmx_add_pr(dest_SSE, src_SSE); + src_SSE = gmx_simd_load_r(src[s]+i); + dest_SSE = gmx_simd_add_r(dest_SSE, src_SSE); } - gmx_store_pr(dest+i, dest_SSE); + gmx_simd_store_r(dest+i, dest_SSE); } } else { - for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE) + for (i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH) { - dest_SSE = gmx_load_pr(src[0]+i); + dest_SSE = gmx_simd_load_r(src[0]+i); for (s = 1; s < nsrc; s++) { - src_SSE = gmx_load_pr(src[s]+i); - dest_SSE = gmx_add_pr(dest_SSE, src_SSE); + src_SSE = gmx_simd_load_r(src[s]+i); + dest_SSE = gmx_simd_add_r(dest_SSE, src_SSE); } - gmx_store_pr(dest+i, dest_SSE); + gmx_simd_store_r(dest+i, dest_SSE); } } #endif diff --git a/src/gromacs/mdlib/nbnxn_internal.h b/src/gromacs/mdlib/nbnxn_internal.h index 1e8891c129..e71921699f 100644 --- a/src/gromacs/mdlib/nbnxn_internal.h +++ b/src/gromacs/mdlib/nbnxn_internal.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -70,7 +70,7 @@ extern "C" { #ifdef GMX_NBNXN_SIMD /* Memory alignment in bytes as required by SIMD aligned loads/stores */ -#define NBNXN_MEM_ALIGN (GMX_SIMD_WIDTH_HERE*sizeof(real)) +#define NBNXN_MEM_ALIGN (GMX_SIMD_REAL_WIDTH*sizeof(real)) #else /* No alignment required, but set it so we can call the same routines */ #define NBNXN_MEM_ALIGN 32 @@ -153,16 +153,16 @@ typedef struct { typedef struct nbnxn_x_ci_simd_4xn { /* The i-cluster coordinates for simple search */ - gmx_mm_pr ix_S0, iy_S0, iz_S0; - gmx_mm_pr ix_S1, iy_S1, iz_S1; - gmx_mm_pr ix_S2, iy_S2, iz_S2; - gmx_mm_pr ix_S3, iy_S3, iz_S3; + gmx_simd_real_t ix_S0, iy_S0, iz_S0; + gmx_simd_real_t ix_S1, iy_S1, iz_S1; + gmx_simd_real_t ix_S2, iy_S2, iz_S2; + gmx_simd_real_t ix_S3, iy_S3, iz_S3; } nbnxn_x_ci_simd_4xn_t; typedef struct nbnxn_x_ci_simd_2xnn { /* The i-cluster coordinates for simple search */ - gmx_mm_pr ix_S0, iy_S0, iz_S0; - gmx_mm_pr ix_S2, iy_S2, iz_S2; + gmx_simd_real_t ix_S0, iy_S0, iz_S0; + gmx_simd_real_t ix_S2, iy_S2, iz_S2; } nbnxn_x_ci_simd_2xnn_t; #endif diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py index cc109228c5..2d6c91824a 100755 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/make_verlet_simd_kernel_files.py @@ -2,7 +2,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2013, by the GROMACS development team, led by +# Copyright (c) 2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. 
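
The reduction loop in nbnxn_atomdata_reduce_reals_simd() above shows the core renamed load/add/store idiom over GMX_SIMD_REAL_WIDTH-strided arrays. A self-contained sketch of the same idiom, not part of the patch, assuming dest and src are SIMD-aligned and n is a multiple of GMX_SIMD_REAL_WIDTH:

/* Sketch only: accumulate src into dest with the renamed SIMD layer.
 * Both arrays are assumed aligned and n a multiple of GMX_SIMD_REAL_WIDTH. */
static void accumulate_reals_simd(real *dest, const real *src, int n)
{
    int             i;
    gmx_simd_real_t d_S, s_S;

    for (i = 0; i < n; i += GMX_SIMD_REAL_WIDTH)
    {
        d_S = gmx_simd_load_r(dest + i);
        s_S = gmx_simd_load_r(src + i);
        gmx_simd_store_r(dest + i, gmx_simd_add_r(d_S, s_S));
    }
}
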
@@ -132,7 +132,7 @@ VerletKernelTypeDict = { '2xnn' : { 'Define' : 'GMX_NBNXN_SIMD_2XNN', 'WidthSetup' : '/* Include the full-width SIMD macros */\n', - 'WidthCheck' : ('#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16)\n' \ + 'WidthCheck' : ('#if !(GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16)\n' \ '#error "unsupported SIMD width"\n' \ '#endif\n'), 'UnrollSize' : 2, @@ -142,7 +142,7 @@ VerletKernelTypeDict = { 'WidthSetup' : ('#ifdef GMX_NBNXN_HALF_WIDTH_SIMD\n' \ '#define GMX_USE_HALF_WIDTH_SIMD_HERE\n' \ '#endif\n'), - 'WidthCheck' : ('#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8)\n' \ + 'WidthCheck' : ('#if !(GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8)\n' \ '#error "unsupported SIMD width"\n' \ '#endif\n'), 'UnrollSize' : 1, diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre index f2478956fc..2b5419414d 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -69,7 +69,7 @@ reduce_group_energies(int ng, int ng_2log, const real *VSvdw, const real *VSc, real *Vvdw, real *Vc) {{ - const int unrollj = GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE; + const int unrollj = GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE; const int unrollj_half = unrollj/2; int ng_p2, i, j, j0, j1, c, s; diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h index 75c1313f4f..59be0fbd75 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h @@ -54,7 +54,7 @@ #error "Must define an NBNxN kernel flavour before including NBNxN kernel utility functions" #endif -#ifdef GMX_SIMD_REFERENCE_PLAIN_C +#ifdef GMX_SIMD_REFERENCE /* Align a stack-based thread-local working array. 
*/ static gmx_inline int * @@ -65,9 +65,9 @@ prepare_table_load_buffer(const int gmx_unused *array) #include "nbnxn_kernel_simd_utils_ref.h" -#else /* GMX_SIMD_REFERENCE_PLAIN_C */ +#else /* GMX_SIMD_REFERENCE */ -#if defined GMX_X86_SSE2 && !defined __MIC__ +#if defined GMX_SIMD_X86_SSE2_OR_HIGHER && !defined __MIC__ /* Include x86 SSE2 compatible SIMD functions */ /* Set the stride for the lookup of the two LJ parameters from their @@ -85,30 +85,29 @@ static const int nbfp_stride = 4; static gmx_inline int * prepare_table_load_buffer(const int gmx_unused *array) { -#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE - return gmx_simd_align_int(array); +#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE + return gmx_simd_align_i(array); #else return NULL; #endif } -#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE +#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE /* With full AVX-256 SIMD, half SIMD-width table loads are optimal */ -#if GMX_SIMD_WIDTH_HERE == 8 +#if GMX_SIMD_REAL_WIDTH == 8 #define TAB_FDV0 #endif - #ifdef GMX_DOUBLE #include "nbnxn_kernel_simd_utils_x86_256d.h" #else /* GMX_DOUBLE */ #include "nbnxn_kernel_simd_utils_x86_256s.h" #endif /* GMX_DOUBLE */ -#else /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */ +#else /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */ /* We use the FDV0 table layout when we can use aligned table loads */ -#if GMX_SIMD_WIDTH_HERE == 4 +#if GMX_SIMD_REAL_WIDTH == 4 #define TAB_FDV0 #endif @@ -118,48 +117,48 @@ prepare_table_load_buffer(const int gmx_unused *array) #include "nbnxn_kernel_simd_utils_x86_128s.h" #endif /* GMX_DOUBLE */ -#endif /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */ +#endif /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */ -#else /* GMX_X86_SSE2 */ +#else /* GMX_SIMD_X86_SSE2_OR_HIGHER */ -#if GMX_SIMD_WIDTH_HERE > 4 +#if GMX_SIMD_REAL_WIDTH > 4 static const int nbfp_stride = 4; #else -static const int nbfp_stride = GMX_SIMD_WIDTH_HERE; +static const int nbfp_stride = GMX_SIMD_REAL_WIDTH; #endif /* We use the FDV0 table layout when we can use aligned table loads */ -#if GMX_SIMD_WIDTH_HERE == 4 +#if GMX_SIMD_REAL_WIDTH == 4 #define TAB_FDV0 #endif -#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#ifdef GMX_SIMD_IBM_QPX #include "nbnxn_kernel_simd_utils_ibm_qpx.h" -#endif /* GMX_CPU_ACCELERATION_IBM_QPX */ +#endif /* GMX_SIMD_IBM_QPX */ #ifdef __MIC__ #include "nbnxn_kernel_simd_utils_x86_mic.h" #endif -#endif /* GMX_X86_SSE2 */ -#endif /* GMX_SIMD_REFERENCE_PLAIN_C */ +#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */ +#endif /* GMX_SIMD_REFERENCE */ -#if GMX_SIMD_WIDTH_HERE == 4 && !defined GMX_SIMD_REFERENCE_PLAIN_C -#define gmx_mm_pr4 gmx_mm_pr -#define gmx_load_pr4 gmx_load_pr -#define gmx_store_pr4 gmx_store_pr -#define gmx_add_pr4 gmx_add_pr +#if GMX_SIMD_REAL_WIDTH == 4 && !defined GMX_SIMD_REFERENCE +#define gmx_mm_pr4 gmx_simd_real_t +#define gmx_load_pr4 gmx_simd_load_r +#define gmx_store_pr4 gmx_simd_store_r +#define gmx_add_pr4 gmx_simd_add_r #endif #ifndef HAVE_GMX_SUM_SIMD /* should be defined for arch with hardware reduce */ static gmx_inline real -gmx_sum_simd2(gmx_mm_pr x, real* b) +gmx_sum_simd2(gmx_simd_real_t x, real* b) { - gmx_store_pr(b, x); + gmx_simd_store_r(b, x); return b[0]+b[1]; } -#if GMX_SIMD_WIDTH_HERE >= 4 +#if GMX_SIMD_REAL_WIDTH >= 4 static gmx_inline real gmx_sum_simd4(gmx_mm_pr4 x, real* 
b) { @@ -168,31 +167,31 @@ gmx_sum_simd4(gmx_mm_pr4 x, real* b) } #endif -#if GMX_SIMD_WIDTH_HERE == 2 -static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b) +#if GMX_SIMD_REAL_WIDTH == 2 +static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b) { - gmx_store_pr(b, x); + gmx_simd_store_r(b, x); return b[0]+b[1]; } -#elif GMX_SIMD_WIDTH_HERE == 4 -static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b) +#elif GMX_SIMD_REAL_WIDTH == 4 +static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b) { - gmx_store_pr(b, x); + gmx_simd_store_r(b, x); return b[0]+b[1]+b[2]+b[3]; } -#elif GMX_SIMD_WIDTH_HERE == 8 -static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b) +#elif GMX_SIMD_REAL_WIDTH == 8 +static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b) { - gmx_store_pr(b, x); + gmx_simd_store_r(b, x); return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7]; } -#elif GMX_SIMD_WIDTH_HERE == 16 +#elif GMX_SIMD_REAL_WIDTH == 16 /* This is getting ridiculous, SIMD horizontal adds would help, * but this is not performance critical (only used to reduce energies) */ -static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b) +static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b) { - gmx_store_pr(b, x); + gmx_simd_store_r(b, x); return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7]+b[8]+b[9]+b[10]+b[11]+b[12]+b[13]+b[14]+b[15]; } #else @@ -202,7 +201,7 @@ static gmx_inline real gmx_sum_simd(gmx_mm_pr x, real* b) #ifdef UNROLLJ /* Add energy register to possibly multiple terms in the energy array */ -static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj) +static inline void add_ener_grp(gmx_simd_real_t e_S, real *v, const int *offset_jj) { int jj; @@ -212,10 +211,10 @@ static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj) */ for (jj = 0; jj < (UNROLLJ/2); jj++) { - gmx_mm_pr v_S; + gmx_simd_real_t v_S; - v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE); - gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S)); + v_S = gmx_simd_load_r(v+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH); + gmx_simd_store_r(v+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH, gmx_simd_add_r(v_S, e_S)); } } #endif @@ -225,7 +224,7 @@ static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj) * a single SIMD register. 
*/ static inline void -add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj) +add_ener_grp_halves(gmx_simd_real_t e_S, real *v0, real *v1, const int *offset_jj) { gmx_mm_hpr e_S0, e_S1; int jj; @@ -236,15 +235,15 @@ add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj) { gmx_mm_hpr v_S; - gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2); - gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0)); + gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2); + gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2, gmx_add_hpr(v_S, e_S0)); } for (jj = 0; jj < (UNROLLJ/2); jj++) { gmx_mm_hpr v_S; - gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2); - gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1)); + gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2); + gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_REAL_WIDTH/2, gmx_add_hpr(v_S, e_S1)); } } #endif diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h index bfbf9e24ee..fd857475b4 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -35,11 +35,11 @@ #ifndef _nbnxn_kernel_simd_utils_ibm_qpx_h_ #define _nbnxn_kernel_simd_utils_ibm_qpx_h_ -typedef gmx_mm_pr gmx_exclfilter; +typedef gmx_simd_real_t gmx_exclfilter; static const int filter_stride = 1; /* The 4xn kernel operates on 4-wide i-force registers */ -typedef gmx_mm_pr gmx_mm_pr4; +typedef gmx_simd_real_t gmx_mm_pr4; /* This files contains all functions/macros for the SIMD kernels * which have explicit dependencies on the j-cluster size and/or SIMD-width. @@ -51,9 +51,10 @@ typedef gmx_mm_pr gmx_mm_pr4; /* Collect all [0123] elements of the 4 inputs to out[0123], respectively */ static gmx_inline void -gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d, - gmx_mm_pr *out0, gmx_mm_pr *out1, - gmx_mm_pr *out2, gmx_mm_pr *out3) +gmx_transpose_4_ps(gmx_simd_real_t a, gmx_simd_real_t b, + gmx_simd_real_t c, gmx_simd_real_t d, + gmx_simd_real_t *out0, gmx_simd_real_t *out1, + gmx_simd_real_t *out2, gmx_simd_real_t *out3) { /* Prepare control vectors for swizzling. In its third input, vec_perm accepts indices into the effective 8-wide SIMD vector @@ -63,14 +64,14 @@ gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d, vec_gpci() converts an octal literal of the indices into the correct form for vec_perm() to use. That form is an octal digit in bits 0-2 of the mantissa of each double. */ - gmx_mm_pr p6420 = vec_gpci(06420); - gmx_mm_pr p7531 = vec_gpci(07531); + gmx_simd_real_t p6420 = vec_gpci(06420); + gmx_simd_real_t p7531 = vec_gpci(07531); /* Four-way swizzle (i.e. transpose) of vectors a = a0a1a2a3, etc. 
*/ - gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420); - gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531); - gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420); - gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531); + gmx_simd_real_t b2b0a2a0 = vec_perm(a, b, p6420); + gmx_simd_real_t b3b1a3a1 = vec_perm(a, b, p7531); + gmx_simd_real_t d2d0c2c0 = vec_perm(c, d, p6420); + gmx_simd_real_t d3d1c3c1 = vec_perm(c, d, p7531); *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531); *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531); *out2 = vec_perm(d2d0c2c0, b2b0a2a0, p6420); @@ -79,30 +80,32 @@ gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d, /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */ static gmx_inline void -gmx_shuffle_4_ps_fil01_to_2_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d, - gmx_mm_pr *out0, gmx_mm_pr *out1) +gmx_shuffle_4_ps_fil01_to_2_ps(gmx_simd_real_t a, gmx_simd_real_t b, + gmx_simd_real_t c, gmx_simd_real_t d, + gmx_simd_real_t *out0, gmx_simd_real_t *out1) { - gmx_mm_pr p6420 = vec_gpci(06420); - gmx_mm_pr p7531 = vec_gpci(07531); + gmx_simd_real_t p6420 = vec_gpci(06420); + gmx_simd_real_t p7531 = vec_gpci(07531); /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. */ - gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420); - gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531); - gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420); - gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531); + gmx_simd_real_t b2b0a2a0 = vec_perm(a, b, p6420); + gmx_simd_real_t b3b1a3a1 = vec_perm(a, b, p7531); + gmx_simd_real_t d2d0c2c0 = vec_perm(c, d, p6420); + gmx_simd_real_t d3d1c3c1 = vec_perm(c, d, p7531); *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531); *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531); } /* Collect element 2 of the 4 inputs to out */ -static gmx_inline gmx_mm_pr -gmx_shuffle_4_ps_fil2_to_1_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d) +static gmx_inline gmx_simd_real_t +gmx_shuffle_4_ps_fil2_to_1_ps(gmx_simd_real_t a, gmx_simd_real_t b, + gmx_simd_real_t c, gmx_simd_real_t d) { - gmx_mm_pr p6420 = vec_gpci(06420); + gmx_simd_real_t p6420 = vec_gpci(06420); /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. 
*/ - gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420); - gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420); + gmx_simd_real_t b2b0a2a0 = vec_perm(a, b, p6420); + gmx_simd_real_t d2d0c2c0 = vec_perm(c, d, p6420); return vec_perm(d2d0c2c0, b2b0a2a0, p6420); } @@ -112,12 +115,12 @@ gmx_shuffle_4_ps_fil2_to_1_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d static gmx_inline int * prepare_table_load_buffer(const int *array) { - return gmx_simd_align_int(array); + return gmx_simd_align_i(array); } static gmx_inline void -load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti, - gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S) +load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti, + gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S) { #ifdef NDEBUG /* Just like 256-bit AVX, we need to use memory to get indices @@ -128,19 +131,19 @@ load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti, #endif /* Here we load 4 aligned reals, but we need just 2 elements of each */ - gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride); - gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride); - gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride); - gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride); + gmx_simd_real_t a = gmx_simd_load_r(tab_coul_FDV0 + ti[0] * nbfp_stride); + gmx_simd_real_t b = gmx_simd_load_r(tab_coul_FDV0 + ti[1] * nbfp_stride); + gmx_simd_real_t c = gmx_simd_load_r(tab_coul_FDV0 + ti[2] * nbfp_stride); + gmx_simd_real_t d = gmx_simd_load_r(tab_coul_FDV0 + ti[3] * nbfp_stride); gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S); } static gmx_inline void load_table_f_v(const real *tab_coul_FDV0, - gmx_epi32 ti_S, int *ti, - gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S, - gmx_mm_pr *ctabv_S) + gmx_simd_int32_t ti_S, int *ti, + gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S, + gmx_simd_real_t *ctabv_S) { #ifdef NDEBUG /* Just like 256-bit AVX, we need to use memory to get indices @@ -151,10 +154,10 @@ load_table_f_v(const real *tab_coul_FDV0, #endif /* Here we load 4 aligned reals, but we need just 3 elements of each. */ - gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride); - gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride); - gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride); - gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride); + gmx_simd_real_t a = gmx_simd_load_r(tab_coul_FDV0 + ti[0] * nbfp_stride); + gmx_simd_real_t b = gmx_simd_load_r(tab_coul_FDV0 + ti[1] * nbfp_stride); + gmx_simd_real_t c = gmx_simd_load_r(tab_coul_FDV0 + ti[2] * nbfp_stride); + gmx_simd_real_t d = gmx_simd_load_r(tab_coul_FDV0 + ti[3] * nbfp_stride); gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S); *ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(a, b, c, d); @@ -167,20 +170,20 @@ load_table_f_v(const real *tab_coul_FDV0, /* Sum the elements within each input register and store the sums in out. 
*/ -static gmx_inline gmx_mm_pr -gmx_mm_transpose_sum4_pr(gmx_mm_pr a, gmx_mm_pr b, - gmx_mm_pr c, gmx_mm_pr d) +static gmx_inline gmx_simd_real_t +gmx_mm_transpose_sum4_pr(gmx_simd_real_t a, gmx_simd_real_t b, + gmx_simd_real_t c, gmx_simd_real_t d) { - gmx_mm_pr a0b0c0d0, a1b1c1d1, a2b2c2d2, a3b3c3d3; + gmx_simd_real_t a0b0c0d0, a1b1c1d1, a2b2c2d2, a3b3c3d3; gmx_transpose_4_ps(a, b, c, d, &a0b0c0d0, &a1b1c1d1, &a2b2c2d2, &a3b3c3d3); /* Now reduce the transposed vectors */ - gmx_mm_pr sum01 = gmx_add_pr(a0b0c0d0, a1b1c1d1); - gmx_mm_pr sim23 = gmx_add_pr(a2b2c2d2, a3b3c3d3); - return gmx_add_pr(sum01, sim23); + gmx_simd_real_t sum01 = gmx_simd_add_r(a0b0c0d0, a1b1c1d1); + gmx_simd_real_t sim23 = gmx_simd_add_r(a2b2c2d2, a3b3c3d3); + return gmx_simd_add_r(sum01, sim23); } #ifdef GMX_DOUBLE @@ -191,23 +194,23 @@ gmx_mm_transpose_sum4_pr(gmx_mm_pr a, gmx_mm_pr b, * reciprocal square roots. */ static gmx_inline void -gmx_mm_invsqrt2_pd(gmx_mm_pr in0, gmx_mm_pr in1, - gmx_mm_pr *out0, gmx_mm_pr *out1) +gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1, + gmx_simd_real_t *out0, gmx_simd_real_t *out1) { - *out0 = gmx_invsqrt_pr(in0); - *out1 = gmx_invsqrt_pr(in1); + *out0 = gmx_simd_invsqrt_r(in0); + *out1 = gmx_simd_invsqrt_r(in1); } #endif static gmx_inline void load_lj_pair_params(const real *nbfp, const int *type, int aj, - gmx_mm_pr *c6_S, gmx_mm_pr *c12_S) + gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S) { /* Here we load 4 aligned reals, but we need just 2 elemnts of each. */ - gmx_mm_pr a = gmx_load_pr(nbfp + type[aj+0] * nbfp_stride); - gmx_mm_pr b = gmx_load_pr(nbfp + type[aj+1] * nbfp_stride); - gmx_mm_pr c = gmx_load_pr(nbfp + type[aj+2] * nbfp_stride); - gmx_mm_pr d = gmx_load_pr(nbfp + type[aj+3] * nbfp_stride); + gmx_simd_real_t a = gmx_simd_load_r(nbfp + type[aj+0] * nbfp_stride); + gmx_simd_real_t b = gmx_simd_load_r(nbfp + type[aj+1] * nbfp_stride); + gmx_simd_real_t c = gmx_simd_load_r(nbfp + type[aj+2] * nbfp_stride); + gmx_simd_real_t d = gmx_simd_load_r(nbfp + type[aj+3] * nbfp_stride); gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, c6_S, c12_S); } @@ -234,7 +237,7 @@ static gmx_inline gmx_exclfilter gmx_load_exclusion_filter(const unsigned *a) /* Code for handling loading and applying exclusion masks. Note that parameter a is not treated like an array index; it is naively added to b, so should be in bytes. */ -static gmx_inline gmx_mm_pb gmx_load_interaction_mask_pb(long a, const real *b) +static gmx_inline gmx_simd_bool_t gmx_load_interaction_mask_pb(long a, const real *b) { #ifdef NDEBUG return vec_ld(a, (real *) b); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h index e757a8d561..90f52c7a2f 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h @@ -37,7 +37,7 @@ typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter; typedef gmx_simd_ref_exclfilter gmx_exclfilter; -static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE; +static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH; /* Set the stride for the lookup of the two LJ parameters from their (padded) array. Only strides of 2 and 4 are currently supported. 
*/ @@ -49,7 +49,7 @@ static const int nbfp_stride = 2; static const int nbfp_stride = 4; #endif -#if GMX_SIMD_WIDTH_HERE > 4 +#if GMX_SIMD_REAL_WIDTH > 4 /* The 4xn kernel operates on 4-wide i-force registers */ /* float/double SIMD register type */ @@ -114,7 +114,7 @@ typedef gmx_simd_ref_pr gmx_mm_pr4; /* Half-width SIMD real type */ /* float/double SIMD register type */ typedef struct { - real r[GMX_SIMD_WIDTH_HERE/2]; + real r[GMX_SIMD_REAL_WIDTH/2]; } gmx_mm_hpr; /* Half-width SIMD operations */ @@ -125,7 +125,7 @@ gmx_load_hpr(gmx_mm_hpr *a, const real *b) { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { a->r[i] = b[i]; } @@ -137,7 +137,7 @@ gmx_set1_hpr(gmx_mm_hpr *a, real b) { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { a->r[i] = b; } @@ -149,10 +149,10 @@ gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b) { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { a->r[ i] = b[0]; - a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[1]; + a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[1]; } } @@ -162,10 +162,10 @@ gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b) { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { a->r[i] = b[i]; - a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[i]; + a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[i]; } } @@ -175,7 +175,7 @@ gmx_store_hpr(real *a, gmx_mm_hpr b) { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { a[i] = b.r[i]; } @@ -187,7 +187,7 @@ gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b) gmx_mm_hpr c; int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { c.r[i] = a.r[i] + b.r[i]; } @@ -201,7 +201,7 @@ gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b) gmx_mm_hpr c; int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { c.r[i] = a.r[i] - b.r[i]; } @@ -216,13 +216,13 @@ gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b) gmx_mm_hpr c; int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { c.r[i] = a.r[i] + - a.r[GMX_SIMD_WIDTH_HERE/2+i] + + a.r[GMX_SIMD_REAL_WIDTH/2+i] + b.r[i] + - b.r[GMX_SIMD_WIDTH_HERE/2+i]; + b.r[GMX_SIMD_REAL_WIDTH/2+i]; } return c; @@ -241,12 +241,12 @@ gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b) sum.r[2] = 0; sum.r[3] = 0; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { sum.r[0] += a.r[i]; - sum.r[1] += a.r[GMX_SIMD_WIDTH_HERE/2+i]; + sum.r[1] += a.r[GMX_SIMD_REAL_WIDTH/2+i]; sum.r[2] += b.r[i]; - sum.r[3] += b.r[GMX_SIMD_WIDTH_HERE/2+i]; + sum.r[3] += b.r[GMX_SIMD_REAL_WIDTH/2+i]; } return sum; @@ -258,10 +258,10 @@ gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c) { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { b->r[i] = a.r[i]; - c->r[i] = a.r[GMX_SIMD_WIDTH_HERE/2 + i]; + c->r[i] = a.r[GMX_SIMD_REAL_WIDTH/2 + i]; } } static gmx_inline void @@ -269,10 +269,10 @@ gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c) { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { c->r[i] = a.r[i]; - c->r[GMX_SIMD_WIDTH_HERE/2 + i] = b.r[i]; + c->r[GMX_SIMD_REAL_WIDTH/2 + i] = b.r[i]; } } @@ -287,13 +287,13 @@ load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S, { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE; 
i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++) { ctab0_S->r[i] = tab_coul_F[ti_S.r[i]]; ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1]; } - *ctab1_S = gmx_sub_pr(*ctab1_S, *ctab0_S); + *ctab1_S = gmx_simd_sub_r(*ctab1_S, *ctab0_S); } static gmx_inline void @@ -306,7 +306,7 @@ load_table_f_v(const real *tab_coul_F, const real *tab_coul_V, load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S); - for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++) { ctabv_S->r[i] = tab_coul_V[ti_S.r[i]]; } @@ -320,7 +320,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti, { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++) { ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4]; ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1]; @@ -337,7 +337,7 @@ load_table_f_v(const real *tab_coul_FDV0, load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S); - for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++) { ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2]; } @@ -347,7 +347,7 @@ load_table_f_v(const real *tab_coul_FDV0, /* Sum the elements within each input register and store the sums in out. * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead. */ -#if GMX_SIMD_WIDTH_HERE == 2 +#if GMX_SIMD_REAL_WIDTH == 2 static gmx_inline gmx_simd_ref_pr gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1) { @@ -360,8 +360,8 @@ gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1) } #endif -#if GMX_SIMD_WIDTH_HERE >= 4 -#if GMX_SIMD_WIDTH_HERE == 4 +#if GMX_SIMD_REAL_WIDTH >= 4 +#if GMX_SIMD_REAL_WIDTH == 4 static gmx_inline gmx_simd_ref_pr #else static gmx_inline gmx_mm_pr4 @@ -369,7 +369,7 @@ static gmx_inline gmx_mm_pr4 gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1, gmx_simd_ref_pr in2, gmx_simd_ref_pr in3) { -#if GMX_SIMD_WIDTH_HERE == 4 +#if GMX_SIMD_REAL_WIDTH == 4 gmx_simd_ref_pr sum; #else gmx_mm_pr4 sum; @@ -381,7 +381,7 @@ gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1, sum.r[2] = 0; sum.r[3] = 0; - for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++) { sum.r[0] += in0.r[i]; sum.r[1] += in1.r[i]; @@ -403,8 +403,8 @@ static gmx_inline void gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1, gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1) { - *out0 = gmx_invsqrt_pr(in0); - *out1 = gmx_invsqrt_pr(in1); + *out0 = gmx_simd_invsqrt_r(in0); + *out1 = gmx_simd_invsqrt_r(in1); } #endif @@ -414,7 +414,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj, { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++) { c6_S->r[i] = nbfp[type[aj+i]*nbfp_stride]; c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1]; @@ -429,12 +429,12 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1, { int i; - for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++) + for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++) { c6_S->r[i] = nbfp0[type[aj+i]*nbfp_stride]; - c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride]; + c6_S->r[GMX_SIMD_REAL_WIDTH/2 + i] = nbfp1[type[aj+i]*nbfp_stride]; c12_S->r[i] = nbfp0[type[aj+i]*nbfp_stride+1]; - c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1]; + c12_S->r[GMX_SIMD_REAL_WIDTH/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1]; } } #endif diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h index f866758218..92be81d99f 100644 --- 
a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -45,8 +45,8 @@ * energy group pair energy storage */ -typedef gmx_epi32 gmx_exclfilter; -static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE; +typedef gmx_simd_int32_t gmx_exclfilter; +static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH; /* Transpose 2 double precision registers */ static gmx_inline void @@ -130,7 +130,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj, * AVX_256. */ static gmx_inline void -load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int gmx_unused *ti, +load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S, int gmx_unused *ti, __m128d *ctab0_S, __m128d *ctab1_S) { int idx[2]; @@ -150,7 +150,7 @@ load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int gmx_unused *ti, static gmx_inline void load_table_f_v(const real *tab_coul_F, const real *tab_coul_V, - gmx_epi32 ti_S, int gmx_unused *ti, + gmx_simd_int32_t ti_S, int gmx_unused *ti, __m128d *ctab0_S, __m128d *ctab1_S, __m128d *ctabv_S) { int idx[2]; @@ -186,7 +186,7 @@ gmx_load_exclusion_filter(const unsigned *i) return _mm_load_si128((__m128i *) i); } -static gmx_inline gmx_mm_pb +static gmx_inline gmx_simd_bool_t gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1) { return gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h index 02c6ca1a5b..0571a6cd1f 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -45,8 +45,8 @@ * energy group pair energy storage */ -typedef gmx_epi32 gmx_exclfilter; -static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE; +typedef gmx_simd_int32_t gmx_exclfilter; +static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH; /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */ static gmx_inline void @@ -95,7 +95,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj, for (p = 0; p < UNROLLJ; p++) { /* Here we load 4 aligned floats, but we need just 2 */ - clj_S[p] = gmx_load_pr(nbfp+type[aj+p]*nbfp_stride); + clj_S[p] = gmx_simd_load_r(nbfp+type[aj+p]*nbfp_stride); } gmx_shuffle_4_ps_fil01_to_2_ps(clj_S[0], clj_S[1], clj_S[2], clj_S[3], c6_S, c12_S); } @@ -116,7 +116,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj, * AVX_256. 
*/ static gmx_inline void -load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int gmx_unused *ti, +load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti, __m128 *ctab0_S, __m128 *ctab1_S) { int idx[4]; @@ -139,7 +139,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int gmx_unused *ti, } static gmx_inline void -load_table_f_v(const real *tab_coul_FDV0, gmx_epi32 ti_S, int gmx_unused *ti, +load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti, __m128 *ctab0_S, __m128 *ctab1_S, __m128 *ctabv_S) { int idx[4]; @@ -175,7 +175,7 @@ gmx_load_exclusion_filter(const unsigned *i) return _mm_load_si128((__m128i *) i); } -static gmx_inline gmx_mm_pb +static gmx_inline gmx_simd_bool_t gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1) { return gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h index 97f25fa3d4..d5a013d412 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -43,7 +43,7 @@ * energy group pair energy storage */ -typedef gmx_mm_pr gmx_exclfilter; +typedef gmx_simd_real_t gmx_exclfilter; static const int filter_stride = 2; /* Transpose 2 double precision registers */ @@ -193,10 +193,10 @@ gmx_load1_exclfilter(int e) static gmx_inline gmx_exclfilter gmx_load_exclusion_filter(const unsigned *i) { - return gmx_load_pr((real *) (i)); + return gmx_simd_load_r((real *) (i)); } -static gmx_inline gmx_mm_pb +static gmx_inline gmx_simd_bool_t gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1) { /* With <= 16 bits used the cast and conversion should not be diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h index a8be068608..2c6fac5ba9 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
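All of the load_table_f/load_table_f_v variants renamed above implement the same contract with different intrinsics: per SIMD lane, gather the tabulated force F and the force difference D for table index ti, so the kernel can later interpolate as F + frac*D (and, for energies, also fetch V). The name of the FDV0 layout suggests F, D, V and a zero padding element stored contiguously with stride 4, which is what allows the aligned 4-wide loads in the SSE/AVX versions. A minimal scalar sketch of that contract, with illustrative names that are not part of the patch:

    #include <stdio.h>

    #define SIMD_WIDTH 4                 /* stands in for GMX_SIMD_REAL_WIDTH */

    /* Scalar stand-in for load_table_f with the FDV0 table layout:
     * tab[4*i+0] = F(i), tab[4*i+1] = D(i) (difference to the next point),
     * tab[4*i+2] = V(i), tab[4*i+3] = 0 (padding for aligned loads). */
    static void ref_load_table_f(const float *tab_fdv0, const int *ti,
                                 float *ctab0, float *ctab1)
    {
        int i;

        for (i = 0; i < SIMD_WIDTH; i++)
        {
            ctab0[i] = tab_fdv0[ti[i]*4 + 0];    /* F */
            ctab1[i] = tab_fdv0[ti[i]*4 + 1];    /* D */
        }
    }

    int main(void)
    {
        /* Two table points, stored as (F, D, V, 0) tuples */
        const float tab[8]         = { 1.00f, -0.25f, 2.0f, 0.0f,
                                       0.75f, -0.20f, 1.5f, 0.0f };
        const int   ti[SIMD_WIDTH] = { 0, 1, 1, 0 };
        const float frac           = 0.5f;   /* fractional table coordinate */
        float       f[SIMD_WIDTH], d[SIMD_WIDTH];
        int         i;

        ref_load_table_f(tab, ti, f, d);
        for (i = 0; i < SIMD_WIDTH; i++)
        {
            /* The kernel later forms the subtracted force as F + frac*D */
            printf("lane %d: F=%g D=%g F+frac*D=%g\n",
                   i, f[i], d[i], f[i] + frac*d[i]);
        }
        return 0;
    }
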
@@ -43,7 +43,7 @@ * energy group pair energy storage */ -typedef gmx_mm_pr gmx_exclfilter; +typedef gmx_simd_real_t gmx_exclfilter; static const int filter_stride = 1; /* The 4xn kernel operates on 4-wide i-force registers */ @@ -76,7 +76,7 @@ static const int filter_stride = 1; #define gmx_sum4_hpr gmx_mm256_sum4h_m128 static gmx_inline void -gmx_pr_to_2hpr(gmx_mm_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c) +gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c) { *b = _mm256_extractf128_ps(a, 0); *c = _mm256_extractf128_ps(a, 1); @@ -84,7 +84,7 @@ gmx_pr_to_2hpr(gmx_mm_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c) /* Store half width SIMD registers a and b in full width register *c */ static gmx_inline void -gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_pr *c) +gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c) { *c = _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 0x1); } @@ -217,7 +217,7 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1, * AVX_256. */ static gmx_inline void -load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti, +load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti, __m256 *ctab0_S, __m256 *ctab1_S) { __m128 ctab_S[8], ctabt_S[4]; @@ -239,7 +239,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti, } static gmx_inline void -load_table_f_v(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti, +load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti, __m256 *ctab0_S, __m256 *ctab1_S, __m256 *ctabv_S) { __m128 ctab_S[8], ctabt_S[4], ctabvt_S[2]; @@ -276,10 +276,10 @@ gmx_load1_exclfilter(int e) static gmx_inline gmx_exclfilter gmx_load_exclusion_filter(const unsigned *i) { - return gmx_load_pr((real *) (i)); + return gmx_simd_load_r((real *) (i)); } -static gmx_inline gmx_mm_pb +static gmx_inline gmx_simd_bool_t gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1) { return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h index c54c6ae79a..e50d5a9d61 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
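The gmx_pr_to_2hpr/gmx_2hpr_to_pr pair, which keeps its old name here while its argument types change, has the same meaning on every target: split a full-width real register into its low and high halves, or join two halves back into one register, with the low half always coming from the first argument. The plain-C reference version earlier in this patch spells the convention out; a self-contained sketch of it (illustrative names, width fixed to 8):

    #include <stdio.h>

    #define W 8                            /* stands in for GMX_SIMD_REAL_WIDTH */

    typedef struct { float r[W];   } ref_pr;   /* full-width "register" */
    typedef struct { float r[W/2]; } ref_hpr;  /* half-width "register" */

    /* Join two half-width registers into one full-width register:
     * a supplies the low half, b the high half. */
    static void ref_2hpr_to_pr(ref_hpr a, ref_hpr b, ref_pr *c)
    {
        int i;

        for (i = 0; i < W/2; i++)
        {
            c->r[i]       = a.r[i];
            c->r[W/2 + i] = b.r[i];
        }
    }

    /* Split a full-width register into its low (b) and high (c) halves. */
    static void ref_pr_to_2hpr(ref_pr a, ref_hpr *b, ref_hpr *c)
    {
        int i;

        for (i = 0; i < W/2; i++)
        {
            b->r[i] = a.r[i];
            c->r[i] = a.r[W/2 + i];
        }
    }

    int main(void)
    {
        ref_hpr lo = {{ 0, 1, 2, 3 }}, hi = {{ 4, 5, 6, 7 }};
        ref_pr  full;
        ref_hpr lo2, hi2;
        int     i;

        ref_2hpr_to_pr(lo, hi, &full);      /* 0 1 2 3 4 5 6 7 */
        ref_pr_to_2hpr(full, &lo2, &hi2);   /* round-trips back */
        for (i = 0; i < W; i++)
        {
            printf("%g ", full.r[i]);
        }
        printf("\nhalves: %g..%g | %g..%g\n",
               lo2.r[0], lo2.r[W/2-1], hi2.r[0], hi2.r[W/2-1]);
        return 0;
    }
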
@@ -35,8 +35,8 @@ #ifndef _nbnxn_kernel_simd_utils_x86_mic_h_ #define _nbnxn_kernel_simd_utils_x86_mic_h_ -typedef gmx_epi32 gmx_exclfilter; -static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE; +typedef gmx_simd_int32_t gmx_exclfilter; +static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH; #define nbfp_stride 2 @@ -152,14 +152,14 @@ gmx_2hpr_high_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c) } static gmx_inline void -gmx_2hepi_to_epi(gmx_epi32 a, gmx_epi32 b, gmx_epi32 *c) +gmx_2hepi_to_epi(gmx_simd_int32_t a, gmx_simd_int32_t b, gmx_simd_int32_t *c) { *c = _mm512_mask_permute4f128_epi32(a, mask_hih, b, PERM_LOW2HIGH); } /* recombine the 2 high half into c */ static gmx_inline void -gmx_2hepi_high_to_epi(gmx_epi32 a, gmx_epi32 b, gmx_epi32 *c) +gmx_2hepi_high_to_epi(gmx_simd_int32_t a, gmx_simd_int32_t b, gmx_simd_int32_t *c) { *c = _mm512_mask_permute4f128_epi32(b, mask_loh, a, PERM_HIGH2LOW); } @@ -178,7 +178,7 @@ prepare_table_load_buffer(const int *array) instead of low/high. */ static gmx_inline void -load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti, +load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S, int *ti, gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S) { __m512i idx; @@ -190,12 +190,12 @@ load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti, gmx_2hpr_to_pr(tmp1, tmp2, ctab0_S); gmx_2hpr_high_to_pr(tmp1, tmp2, ctab1_S); - *ctab1_S = gmx_sub_pr(*ctab1_S, *ctab0_S); + *ctab1_S = gmx_simd_sub_r(*ctab1_S, *ctab0_S); } static gmx_inline void load_table_f_v(const real *tab_coul_F, const real *tab_coul_V, - gmx_epi32 ti_S, int *ti, + gmx_simd_int32_t ti_S, int *ti, gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S, gmx_mm_ps *ctabv_S) { @@ -241,12 +241,12 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1, #define HAVE_GMX_SUM_SIMD static gmx_inline real -gmx_sum_simd(gmx_mm_pr x, real* b) +gmx_sum_simd(gmx_simd_real_t x, real* b) { return _mm512_reduce_add_ps(x); } static gmx_inline real -gmx_sum_simd4(gmx_mm_pr x, real* b) +gmx_sum_simd4(gmx_simd_real_t x, real* b) { return _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x); } diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c index 0f5190a50b..660c12cfc2 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
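HAVE_GMX_SUM_SIMD, defined just above for the MIC target, signals that the horizontal energy reduction can use a hardware reduce (_mm512_reduce_add_ps) instead of the generic fallback used elsewhere in this patch, which stores the register to a scratch buffer and adds the elements in scalar code. A compact sketch of that fallback contract (illustrative names; width fixed to 8):

    #include <stdio.h>

    #define W 8                            /* stands in for GMX_SIMD_REAL_WIDTH */

    typedef struct { float r[W]; } ref_pr;

    /* Generic fallback for gmx_sum_simd: spill the register to a buffer,
     * then reduce in scalar code.  Only used for energy accumulation, so
     * it is not performance critical. */
    static float ref_sum_simd(ref_pr x, float *b)
    {
        float sum = 0.0f;
        int   i;

        for (i = 0; i < W; i++)
        {
            b[i]  = x.r[i];   /* the SIMD version does this with one aligned store */
            sum  += b[i];
        }
        return sum;
    }

    int main(void)
    {
        ref_pr v = {{ 1, 2, 3, 4, 5, 6, 7, 8 }};
        float  buf[W];

        printf("sum = %g\n", ref_sum_simd(v, buf));   /* prints 36 */
        return 0;
    }
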
@@ -49,7 +49,7 @@ #include "gromacs/simd/macros.h" #include "gromacs/simd/vector_operations.h" -#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16) +#if !(GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16) #error "unsupported SIMD width" #endif @@ -159,7 +159,7 @@ reduce_group_energies(int ng, int ng_2log, const real *VSvdw, const real *VSc, real *Vvdw, real *Vc) { - const int unrollj = GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE; + const int unrollj = GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE; const int unrollj_half = unrollj/2; int ng_p2, i, j, j0, j1, c, s; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h index 9d4c7c60b0..6dbfb97502 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h @@ -44,19 +44,19 @@ #endif #define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE -#define UNROLLJ (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE) +#define UNROLLJ (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE) /* The stride of all the atom data arrays is equal to half the SIMD width */ -#define STRIDE (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE) +#define STRIDE (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE) #include "../nbnxn_kernel_simd_utils.h" static gmx_inline void -gmx_load_simd_2xnn_interactions(int excl, - gmx_exclfilter filter_S0, - gmx_exclfilter filter_S2, - gmx_mm_pb *interact_S0, - gmx_mm_pb *interact_S2) +gmx_load_simd_2xnn_interactions(int excl, + gmx_exclfilter filter_S0, + gmx_exclfilter filter_S2, + gmx_simd_bool_t *interact_S0, + gmx_simd_bool_t *interact_S2) { /* Load integer interaction mask */ gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl); diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h index bbff9ed368..8e8eae7966 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -70,14 +70,14 @@ * With gcc this is slower, except for RF on Sandy Bridge. * Tested with gcc 4.6.2, 4.6.3 and 4.7.1. */ -#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256)) +#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER)) #define CUTOFF_BLENDV #endif /* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv. * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7). * Tested with icc 13. 
*/ -#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256 +#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER #define CUTOFF_BLENDV #endif #endif @@ -92,99 +92,99 @@ #ifdef CHECK_EXCLS /* Interaction (non-exclusion) mask of all 1's or 0's */ - gmx_mm_pb interact_S0; - gmx_mm_pb interact_S2; + gmx_simd_bool_t interact_S0; + gmx_simd_bool_t interact_S2; #endif - gmx_mm_pr jx_S, jy_S, jz_S; - gmx_mm_pr dx_S0, dy_S0, dz_S0; - gmx_mm_pr dx_S2, dy_S2, dz_S2; - gmx_mm_pr tx_S0, ty_S0, tz_S0; - gmx_mm_pr tx_S2, ty_S2, tz_S2; - gmx_mm_pr rsq_S0, rinv_S0, rinvsq_S0; - gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2; + gmx_simd_real_t jx_S, jy_S, jz_S; + gmx_simd_real_t dx_S0, dy_S0, dz_S0; + gmx_simd_real_t dx_S2, dy_S2, dz_S2; + gmx_simd_real_t tx_S0, ty_S0, tz_S0; + gmx_simd_real_t tx_S2, ty_S2, tz_S2; + gmx_simd_real_t rsq_S0, rinv_S0, rinvsq_S0; + gmx_simd_real_t rsq_S2, rinv_S2, rinvsq_S2; #ifndef CUTOFF_BLENDV /* wco: within cut-off, mask of all 1's or 0's */ - gmx_mm_pb wco_S0; - gmx_mm_pb wco_S2; + gmx_simd_bool_t wco_S0; + gmx_simd_bool_t wco_S2; #endif #ifdef VDW_CUTOFF_CHECK - gmx_mm_pb wco_vdw_S0; + gmx_simd_bool_t wco_vdw_S0; #ifndef HALF_LJ - gmx_mm_pb wco_vdw_S2; + gmx_simd_bool_t wco_vdw_S2; #endif #endif #ifdef CALC_COULOMB #ifdef CHECK_EXCLS /* 1/r masked with the interaction mask */ - gmx_mm_pr rinv_ex_S0; - gmx_mm_pr rinv_ex_S2; + gmx_simd_real_t rinv_ex_S0; + gmx_simd_real_t rinv_ex_S2; #endif - gmx_mm_pr jq_S; - gmx_mm_pr qq_S0; - gmx_mm_pr qq_S2; + gmx_simd_real_t jq_S; + gmx_simd_real_t qq_S0; + gmx_simd_real_t qq_S2; #ifdef CALC_COUL_TAB /* The force (PME mesh force) we need to subtract from 1/r^2 */ - gmx_mm_pr fsub_S0; - gmx_mm_pr fsub_S2; + gmx_simd_real_t fsub_S0; + gmx_simd_real_t fsub_S2; #endif #ifdef CALC_COUL_EWALD - gmx_mm_pr brsq_S0, brsq_S2; - gmx_mm_pr ewcorr_S0, ewcorr_S2; + gmx_simd_real_t brsq_S0, brsq_S2; + gmx_simd_real_t ewcorr_S0, ewcorr_S2; #endif /* frcoul = (1/r - fsub)*r */ - gmx_mm_pr frcoul_S0; - gmx_mm_pr frcoul_S2; + gmx_simd_real_t frcoul_S0; + gmx_simd_real_t frcoul_S2; #ifdef CALC_COUL_TAB /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */ - gmx_mm_pr r_S0, rs_S0, rf_S0, frac_S0; - gmx_mm_pr r_S2, rs_S2, rf_S2, frac_S2; + gmx_simd_real_t r_S0, rs_S0, rf_S0, frac_S0; + gmx_simd_real_t r_S2, rs_S2, rf_S2, frac_S2; /* Table index: rs truncated to an int */ - gmx_epi32 ti_S0, ti_S2; + gmx_simd_int32_t ti_S0, ti_S2; /* Linear force table values */ - gmx_mm_pr ctab0_S0, ctab1_S0; - gmx_mm_pr ctab0_S2, ctab1_S2; + gmx_simd_real_t ctab0_S0, ctab1_S0; + gmx_simd_real_t ctab0_S2, ctab1_S2; #ifdef CALC_ENERGIES /* Quadratic energy table value */ - gmx_mm_pr ctabv_S0; - gmx_mm_pr ctabv_S2; + gmx_simd_real_t ctabv_S0; + gmx_simd_real_t ctabv_S2; #endif #endif #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB) /* The potential (PME mesh) we need to subtract from 1/r */ - gmx_mm_pr vc_sub_S0; - gmx_mm_pr vc_sub_S2; + gmx_simd_real_t vc_sub_S0; + gmx_simd_real_t vc_sub_S2; #endif #ifdef CALC_ENERGIES /* Electrostatic potential */ - gmx_mm_pr vcoul_S0; - gmx_mm_pr vcoul_S2; + gmx_simd_real_t vcoul_S0; + gmx_simd_real_t vcoul_S2; #endif #endif /* The force times 1/r */ - gmx_mm_pr fscal_S0; - gmx_mm_pr fscal_S2; + gmx_simd_real_t fscal_S0; + gmx_simd_real_t fscal_S2; #ifdef CALC_LJ #ifdef LJ_COMB_LB /* LJ sigma_j/2 and sqrt(epsilon_j) */ - gmx_mm_pr hsig_j_S, seps_j_S; + gmx_simd_real_t hsig_j_S, seps_j_S; /* LJ sigma_ij and epsilon_ij */ - gmx_mm_pr sig_S0, eps_S0; 
+ gmx_simd_real_t sig_S0, eps_S0; #ifndef HALF_LJ - gmx_mm_pr sig_S2, eps_S2; + gmx_simd_real_t sig_S2, eps_S2; #endif #ifdef CALC_ENERGIES - gmx_mm_pr sig2_S0, sig6_S0; + gmx_simd_real_t sig2_S0, sig6_S0; #ifndef HALF_LJ - gmx_mm_pr sig2_S2, sig6_S2; + gmx_simd_real_t sig2_S2, sig6_S2; #endif #endif /* LJ_COMB_LB */ #endif /* CALC_LJ */ #ifdef LJ_COMB_GEOM - gmx_mm_pr c6s_j_S, c12s_j_S; + gmx_simd_real_t c6s_j_S, c12s_j_S; #endif #if defined LJ_COMB_GEOM || defined LJ_COMB_LB @@ -194,34 +194,34 @@ #ifndef FIX_LJ_C /* LJ C6 and C12 parameters, used with geometric comb. rule */ - gmx_mm_pr c6_S0, c12_S0; + gmx_simd_real_t c6_S0, c12_S0; #ifndef HALF_LJ - gmx_mm_pr c6_S2, c12_S2; + gmx_simd_real_t c6_S2, c12_S2; #endif #endif /* Intermediate variables for LJ calculation */ #ifndef LJ_COMB_LB - gmx_mm_pr rinvsix_S0; + gmx_simd_real_t rinvsix_S0; #ifndef HALF_LJ - gmx_mm_pr rinvsix_S2; + gmx_simd_real_t rinvsix_S2; #endif #endif #ifdef LJ_COMB_LB - gmx_mm_pr sir_S0, sir2_S0, sir6_S0; + gmx_simd_real_t sir_S0, sir2_S0, sir6_S0; #ifndef HALF_LJ - gmx_mm_pr sir_S2, sir2_S2, sir6_S2; + gmx_simd_real_t sir_S2, sir2_S2, sir6_S2; #endif #endif - gmx_mm_pr FrLJ6_S0, FrLJ12_S0; + gmx_simd_real_t FrLJ6_S0, FrLJ12_S0; #ifndef HALF_LJ - gmx_mm_pr FrLJ6_S2, FrLJ12_S2; + gmx_simd_real_t FrLJ6_S2, FrLJ12_S2; #endif #ifdef CALC_ENERGIES - gmx_mm_pr VLJ6_S0, VLJ12_S0, VLJ_S0; + gmx_simd_real_t VLJ6_S0, VLJ12_S0, VLJ_S0; #ifndef HALF_LJ - gmx_mm_pr VLJ6_S2, VLJ12_S2, VLJ_S2; + gmx_simd_real_t VLJ6_S2, VLJ12_S2, VLJ_S2; #endif #endif #endif /* CALC_LJ */ @@ -260,20 +260,20 @@ gmx_loaddh_pr(&jz_S, x+ajz); /* Calculate distance */ - dx_S0 = gmx_sub_pr(ix_S0, jx_S); - dy_S0 = gmx_sub_pr(iy_S0, jy_S); - dz_S0 = gmx_sub_pr(iz_S0, jz_S); - dx_S2 = gmx_sub_pr(ix_S2, jx_S); - dy_S2 = gmx_sub_pr(iy_S2, jy_S); - dz_S2 = gmx_sub_pr(iz_S2, jz_S); + dx_S0 = gmx_simd_sub_r(ix_S0, jx_S); + dy_S0 = gmx_simd_sub_r(iy_S0, jy_S); + dz_S0 = gmx_simd_sub_r(iz_S0, jz_S); + dx_S2 = gmx_simd_sub_r(ix_S2, jx_S); + dy_S2 = gmx_simd_sub_r(iy_S2, jy_S); + dz_S2 = gmx_simd_sub_r(iz_S2, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); - rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0); + rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2); #ifndef CUTOFF_BLENDV - wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); - wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); + wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S); + wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S); #endif #ifdef CHECK_EXCLS @@ -282,20 +282,20 @@ #if UNROLLJ == UNROLLI if (cj == ci_sh) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask_S0); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask_S2); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask_S0); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask_S2); } #else #if UNROLLJ == 2*UNROLLI if (cj*2 == ci_sh) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask0_S0); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask0_S2); } else if (cj*2 + 1 == ci_sh) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask1_S0); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask1_S2); } #else #error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels" @@ -303,19 +303,19 @@ #endif #else /* EXCL_FORCES */ /* No exclusion forces: remove all excluded atom pairs from the list */ - wco_S0 = gmx_and_pb(wco_S0, interact_S0); - 
wco_S2 = gmx_and_pb(wco_S2, interact_S2); + wco_S0 = gmx_simd_and_b(wco_S0, interact_S0); + wco_S2 = gmx_simd_and_b(wco_S2, interact_S2); #endif #endif #ifdef COUNT_PAIRS { int i, j; - real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp; - tmp = gmx_simd_align_real(tmpa); + real tmpa[2*GMX_SIMD_REAL_WIDTH], *tmp; + tmp = gmx_simd_align_r(tmpa); for (i = 0; i < UNROLLI; i += 2) { - gmx_store_pr(tmp, i == 0 ? wco_S0 : wco_S2); + gmx_simd_store_r(tmp, i == 0 ? wco_S0 : wco_S2); for (j = 0; j < 2*UNROLLJ; j++) { if (!(tmp[j] == 0)) @@ -334,14 +334,14 @@ #endif /* Calculate 1/r */ - rinv_S0 = gmx_invsqrt_pr(rsq_S0); - rinv_S2 = gmx_invsqrt_pr(rsq_S2); + rinv_S0 = gmx_simd_invsqrt_r(rsq_S0); + rinv_S2 = gmx_simd_invsqrt_r(rsq_S2); #ifdef CALC_COULOMB /* Load parameters for j atom */ gmx_loaddh_pr(&jq_S, q+aj); - qq_S0 = gmx_mul_pr(iq_S0, jq_S); - qq_S2 = gmx_mul_pr(iq_S2, jq_S); + qq_S0 = gmx_simd_mul_r(iq_S0, jq_S); + qq_S2 = gmx_simd_mul_r(iq_S2, jq_S); #endif #ifdef CALC_LJ @@ -356,13 +356,13 @@ #ifdef LJ_COMB_GEOM gmx_loaddh_pr(&c6s_j_S, ljc+aj2+0); gmx_loaddh_pr(&c12s_j_S, ljc+aj2+STRIDE); - c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S ); + c6_S0 = gmx_simd_mul_r(c6s_S0, c6s_j_S ); #ifndef HALF_LJ - c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S ); + c6_S2 = gmx_simd_mul_r(c6s_S2, c6s_j_S ); #endif - c12_S0 = gmx_mul_pr(c12s_S0, c12s_j_S); + c12_S0 = gmx_simd_mul_r(c12s_S0, c12s_j_S); #ifndef HALF_LJ - c12_S2 = gmx_mul_pr(c12s_S2, c12s_j_S); + c12_S2 = gmx_simd_mul_r(c12s_S2, c12s_j_S); #endif #endif /* LJ_COMB_GEOM */ @@ -370,27 +370,27 @@ gmx_loaddh_pr(&hsig_j_S, ljc+aj2+0); gmx_loaddh_pr(&seps_j_S, ljc+aj2+STRIDE); - sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S); - eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S); + sig_S0 = gmx_simd_add_r(hsig_i_S0, hsig_j_S); + eps_S0 = gmx_simd_mul_r(seps_i_S0, seps_j_S); #ifndef HALF_LJ - sig_S2 = gmx_add_pr(hsig_i_S2, hsig_j_S); - eps_S2 = gmx_mul_pr(seps_i_S2, seps_j_S); + sig_S2 = gmx_simd_add_r(hsig_i_S2, hsig_j_S); + eps_S2 = gmx_simd_mul_r(seps_i_S2, seps_j_S); #endif #endif /* LJ_COMB_LB */ #endif /* CALC_LJ */ #ifndef CUTOFF_BLENDV - rinv_S0 = gmx_blendzero_pr(rinv_S0, wco_S0); - rinv_S2 = gmx_blendzero_pr(rinv_S2, wco_S2); + rinv_S0 = gmx_simd_blendzero_r(rinv_S0, wco_S0); + rinv_S2 = gmx_simd_blendzero_r(rinv_S2, wco_S2); #else /* We only need to mask for the cut-off: blendv is faster */ - rinv_S0 = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)); - rinv_S2 = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)); + rinv_S0 = gmx_simd_blendv_r(rinv_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0)); + rinv_S2 = gmx_simd_blendv_r(rinv_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2)); #endif - rinvsq_S0 = gmx_mul_pr(rinv_S0, rinv_S0); - rinvsq_S2 = gmx_mul_pr(rinv_S2, rinv_S2); + rinvsq_S0 = gmx_simd_mul_r(rinv_S0, rinv_S0); + rinvsq_S2 = gmx_simd_mul_r(rinv_S2, rinv_S2); #ifdef CALC_COULOMB /* Note that here we calculate force*r, not the usual force/r. 
@@ -401,8 +401,8 @@ #ifdef EXCL_FORCES /* Only add 1/r for non-excluded atom pairs */ - rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, interact_S0); - rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, interact_S2); + rinv_ex_S0 = gmx_simd_blendzero_r(rinv_S0, interact_S0); + rinv_ex_S2 = gmx_simd_blendzero_r(rinv_S2, interact_S2); #else /* No exclusion forces, we always need 1/r */ #define rinv_ex_S0 rinv_S0 @@ -411,12 +411,12 @@ #ifdef CALC_COUL_RF /* Electrostatic interactions */ - frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0)); - frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2)); + frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(rsq_S0, mrc_3_S, rinv_ex_S0)); + frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(rsq_S2, mrc_3_S, rinv_ex_S2)); #ifdef CALC_ENERGIES - vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S))); - vcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S))); + vcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_add_r(rinv_ex_S0, gmx_simd_add_r(gmx_simd_mul_r(rsq_S0, hrc_3_S), moh_rc_S))); + vcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_add_r(rinv_ex_S2, gmx_simd_add_r(gmx_simd_mul_r(rsq_S2, hrc_3_S), moh_rc_S))); #endif #endif @@ -425,44 +425,44 @@ * as large distances can cause an overflow in gmx_pmecorrF/V. */ #ifndef CUTOFF_BLENDV - brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0)); - brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2)); + brsq_S0 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S0, wco_S0)); + brsq_S2 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S2, wco_S2)); #else /* Strangely, putting mul on a separate line is slower (icc 13) */ - brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0))); - brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2))); + brsq_S0 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0))); + brsq_S2 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2))); #endif - ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S); - ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S); - frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0)); - frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2)); + ewcorr_S0 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S0), beta_S); + ewcorr_S2 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S2), beta_S); + frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(ewcorr_S0, brsq_S0, rinv_ex_S0)); + frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(ewcorr_S2, brsq_S2, rinv_ex_S2)); #ifdef CALC_ENERGIES - vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S); - vc_sub_S2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S); + vc_sub_S0 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S0), beta_S); + vc_sub_S2 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S2), beta_S); #endif #endif /* CALC_COUL_EWALD */ #ifdef CALC_COUL_TAB /* Electrostatic interactions */ - r_S0 = gmx_mul_pr(rsq_S0, rinv_S0); - r_S2 = gmx_mul_pr(rsq_S2, rinv_S2); + r_S0 = gmx_simd_mul_r(rsq_S0, rinv_S0); + r_S2 = gmx_simd_mul_r(rsq_S2, rinv_S2); /* Convert r to scaled table units */ - rs_S0 = gmx_mul_pr(r_S0, invtsp_S); - rs_S2 = gmx_mul_pr(r_S2, invtsp_S); + rs_S0 = gmx_simd_mul_r(r_S0, invtsp_S); + rs_S2 = gmx_simd_mul_r(r_S2, invtsp_S); /* Truncate scaled r to an int */ - ti_S0 = gmx_cvttpr_epi32(rs_S0); - ti_S2 = 
gmx_cvttpr_epi32(rs_S2); + ti_S0 = gmx_simd_cvtt_r2i(rs_S0); + ti_S2 = gmx_simd_cvtt_r2i(rs_S2); #ifdef GMX_SIMD_HAVE_FLOOR - rf_S0 = gmx_floor_pr(rs_S0); - rf_S2 = gmx_floor_pr(rs_S2); + rf_S0 = gmx_simd_floor_r(rs_S0); + rf_S2 = gmx_simd_floor_r(rs_S2); #else - rf_S0 = gmx_cvtepi32_pr(ti_S0); - rf_S2 = gmx_cvtepi32_pr(ti_S2); + rf_S0 = gmx_simd_cvt_i2r(ti_S0); + rf_S2 = gmx_simd_cvt_i2r(ti_S2); #endif - frac_S0 = gmx_sub_pr(rs_S0, rf_S0); - frac_S2 = gmx_sub_pr(rs_S2, rf_S2); + frac_S0 = gmx_simd_sub_r(rs_S0, rf_S0); + frac_S2 = gmx_simd_sub_r(rs_S2, rf_S2); /* Load and interpolate table forces and possibly energies. * Force and energy can be combined in one table, stride 4: FDV0 @@ -481,14 +481,14 @@ load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2); #endif #endif - fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0)); - fsub_S2 = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2)); - frcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0))); - frcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2))); + fsub_S0 = gmx_simd_add_r(ctab0_S0, gmx_simd_mul_r(frac_S0, ctab1_S0)); + fsub_S2 = gmx_simd_add_r(ctab0_S2, gmx_simd_mul_r(frac_S2, ctab1_S2)); + frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, gmx_simd_mul_r(fsub_S0, r_S0))); + frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, gmx_simd_mul_r(fsub_S2, r_S2))); #ifdef CALC_ENERGIES - vc_sub_S0 = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0))); - vc_sub_S2 = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2))); + vc_sub_S0 = gmx_simd_add_r(ctabv_S0, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S0), gmx_simd_add_r(ctab0_S0, fsub_S0))); + vc_sub_S2 = gmx_simd_add_r(ctabv_S2, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S2), gmx_simd_add_r(ctab0_S2, fsub_S2))); #endif #endif /* CALC_COUL_TAB */ @@ -496,22 +496,22 @@ #ifndef NO_SHIFT_EWALD /* Add Ewald potential shift to vc_sub for convenience */ #ifdef CHECK_EXCLS - vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0)); - vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2)); + vc_sub_S0 = gmx_simd_add_r(vc_sub_S0, gmx_simd_blendzero_r(sh_ewald_S, interact_S0)); + vc_sub_S2 = gmx_simd_add_r(vc_sub_S2, gmx_simd_blendzero_r(sh_ewald_S, interact_S2)); #else - vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S); - vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S); + vc_sub_S0 = gmx_simd_add_r(vc_sub_S0, sh_ewald_S); + vc_sub_S2 = gmx_simd_add_r(vc_sub_S2, sh_ewald_S); #endif #endif - vcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0)); - vcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2)); + vcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, vc_sub_S0)); + vcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, vc_sub_S2)); #endif #ifdef CALC_ENERGIES /* Mask energy for cut-off and diagonal */ - vcoul_S0 = gmx_blendzero_pr(vcoul_S0, wco_S0); - vcoul_S2 = gmx_blendzero_pr(vcoul_S2, wco_S2); + vcoul_S0 = gmx_simd_blendzero_r(vcoul_S0, wco_S0); + vcoul_S2 = gmx_simd_blendzero_r(vcoul_S2, wco_S2); #endif #endif /* CALC_COULOMB */ @@ -520,9 +520,9 @@ /* Lennard-Jones interaction */ #ifdef VDW_CUTOFF_CHECK - wco_vdw_S0 = gmx_cmplt_pr(rsq_S0, rcvdw2_S); + wco_vdw_S0 = gmx_simd_cmplt_r(rsq_S0, rcvdw2_S); #ifndef HALF_LJ - wco_vdw_S2 = gmx_cmplt_pr(rsq_S2, rcvdw2_S); + wco_vdw_S2 = gmx_simd_cmplt_r(rsq_S2, rcvdw2_S); #endif #else 
/* Same cut-off for Coulomb and VdW, reuse the registers */ @@ -531,82 +531,82 @@ #endif #ifndef LJ_COMB_LB - rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0)); + rinvsix_S0 = gmx_simd_mul_r(rinvsq_S0, gmx_simd_mul_r(rinvsq_S0, rinvsq_S0)); #ifdef EXCL_FORCES - rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, interact_S0); + rinvsix_S0 = gmx_simd_blendzero_r(rinvsix_S0, interact_S0); #endif #ifndef HALF_LJ - rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2)); + rinvsix_S2 = gmx_simd_mul_r(rinvsq_S2, gmx_simd_mul_r(rinvsq_S2, rinvsq_S2)); #ifdef EXCL_FORCES - rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, interact_S2); + rinvsix_S2 = gmx_simd_blendzero_r(rinvsix_S2, interact_S2); #endif #endif #ifdef VDW_CUTOFF_CHECK - rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0); + rinvsix_S0 = gmx_simd_blendzero_r(rinvsix_S0, wco_vdw_S0); #ifndef HALF_LJ - rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2); + rinvsix_S2 = gmx_simd_blendzero_r(rinvsix_S2, wco_vdw_S2); #endif #endif - FrLJ6_S0 = gmx_mul_pr(c6_S0, rinvsix_S0); + FrLJ6_S0 = gmx_simd_mul_r(c6_S0, rinvsix_S0); #ifndef HALF_LJ - FrLJ6_S2 = gmx_mul_pr(c6_S2, rinvsix_S2); + FrLJ6_S2 = gmx_simd_mul_r(c6_S2, rinvsix_S2); #endif - FrLJ12_S0 = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0)); + FrLJ12_S0 = gmx_simd_mul_r(c12_S0, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0)); #ifndef HALF_LJ - FrLJ12_S2 = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2)); + FrLJ12_S2 = gmx_simd_mul_r(c12_S2, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2)); #endif #endif /* not LJ_COMB_LB */ #ifdef LJ_COMB_LB - sir_S0 = gmx_mul_pr(sig_S0, rinv_S0); + sir_S0 = gmx_simd_mul_r(sig_S0, rinv_S0); #ifndef HALF_LJ - sir_S2 = gmx_mul_pr(sig_S2, rinv_S2); + sir_S2 = gmx_simd_mul_r(sig_S2, rinv_S2); #endif - sir2_S0 = gmx_mul_pr(sir_S0, sir_S0); + sir2_S0 = gmx_simd_mul_r(sir_S0, sir_S0); #ifndef HALF_LJ - sir2_S2 = gmx_mul_pr(sir_S2, sir_S2); + sir2_S2 = gmx_simd_mul_r(sir_S2, sir_S2); #endif - sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0)); + sir6_S0 = gmx_simd_mul_r(sir2_S0, gmx_simd_mul_r(sir2_S0, sir2_S0)); #ifdef EXCL_FORCES - sir6_S0 = gmx_blendzero_pr(sir6_S0, interact_S0); + sir6_S0 = gmx_simd_blendzero_r(sir6_S0, interact_S0); #endif #ifndef HALF_LJ - sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2)); + sir6_S2 = gmx_simd_mul_r(sir2_S2, gmx_simd_mul_r(sir2_S2, sir2_S2)); #ifdef EXCL_FORCES - sir6_S2 = gmx_blendzero_pr(sir6_S2, interact_S2); + sir6_S2 = gmx_simd_blendzero_r(sir6_S2, interact_S2); #endif #endif #ifdef VDW_CUTOFF_CHECK - sir6_S0 = gmx_blendzero_pr(sir6_S0, wco_vdw_S0); + sir6_S0 = gmx_simd_blendzero_r(sir6_S0, wco_vdw_S0); #ifndef HALF_LJ - sir6_S2 = gmx_blendzero_pr(sir6_S2, wco_vdw_S2); + sir6_S2 = gmx_simd_blendzero_r(sir6_S2, wco_vdw_S2); #endif #endif - FrLJ6_S0 = gmx_mul_pr(eps_S0, sir6_S0); + FrLJ6_S0 = gmx_simd_mul_r(eps_S0, sir6_S0); #ifndef HALF_LJ - FrLJ6_S2 = gmx_mul_pr(eps_S2, sir6_S2); + FrLJ6_S2 = gmx_simd_mul_r(eps_S2, sir6_S2); #endif - FrLJ12_S0 = gmx_mul_pr(FrLJ6_S0, sir6_S0); + FrLJ12_S0 = gmx_simd_mul_r(FrLJ6_S0, sir6_S0); #ifndef HALF_LJ - FrLJ12_S2 = gmx_mul_pr(FrLJ6_S2, sir6_S2); + FrLJ12_S2 = gmx_simd_mul_r(FrLJ6_S2, sir6_S2); #endif #if defined CALC_ENERGIES /* We need C6 and C12 to calculate the LJ potential shift */ - sig2_S0 = gmx_mul_pr(sig_S0, sig_S0); + sig2_S0 = gmx_simd_mul_r(sig_S0, sig_S0); #ifndef HALF_LJ - sig2_S2 = gmx_mul_pr(sig_S2, sig_S2); + sig2_S2 = gmx_simd_mul_r(sig_S2, sig_S2); #endif - sig6_S0 = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0)); + 
sig6_S0 = gmx_simd_mul_r(sig2_S0, gmx_simd_mul_r(sig2_S0, sig2_S0)); #ifndef HALF_LJ - sig6_S2 = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2)); + sig6_S2 = gmx_simd_mul_r(sig2_S2, gmx_simd_mul_r(sig2_S2, sig2_S2)); #endif - c6_S0 = gmx_mul_pr(eps_S0, sig6_S0); + c6_S0 = gmx_simd_mul_r(eps_S0, sig6_S0); #ifndef HALF_LJ - c6_S2 = gmx_mul_pr(eps_S2, sig6_S2); + c6_S2 = gmx_simd_mul_r(eps_S2, sig6_S2); #endif - c12_S0 = gmx_mul_pr(c6_S0, sig6_S0); + c12_S0 = gmx_simd_mul_r(c6_S0, sig6_S0); #ifndef HALF_LJ - c12_S2 = gmx_mul_pr(c6_S2, sig6_S2); + c12_S2 = gmx_simd_mul_r(c6_S2, sig6_S2); #endif #endif #endif /* LJ_COMB_LB */ @@ -642,7 +642,7 @@ #ifdef CALC_COULOMB #ifndef ENERGY_GROUPS - vctot_S = gmx_add_pr(vctot_S, gmx_add_pr(vcoul_S0, vcoul_S2)); + vctot_S = gmx_simd_add_r(vctot_S, gmx_simd_add_r(vcoul_S0, vcoul_S2)); #else add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj); add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj); @@ -651,39 +651,39 @@ #ifdef CALC_LJ /* Calculate the LJ energies */ - VLJ6_S0 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S))); + VLJ6_S0 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S0, gmx_simd_mul_r(c6_S0, sh_invrc6_S))); #ifndef HALF_LJ - VLJ6_S2 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S))); + VLJ6_S2 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S2, gmx_simd_mul_r(c6_S2, sh_invrc6_S))); #endif - VLJ12_S0 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S))); + VLJ12_S0 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S0, gmx_simd_mul_r(c12_S0, sh_invrc12_S))); #ifndef HALF_LJ - VLJ12_S2 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S))); + VLJ12_S2 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S2, gmx_simd_mul_r(c12_S2, sh_invrc12_S))); #endif - VLJ_S0 = gmx_sub_pr(VLJ12_S0, VLJ6_S0); + VLJ_S0 = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0); #ifndef HALF_LJ - VLJ_S2 = gmx_sub_pr(VLJ12_S2, VLJ6_S2); + VLJ_S2 = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2); #endif /* The potential shift should be removed for pairs beyond cut-off */ - VLJ_S0 = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0); + VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0); #ifndef HALF_LJ - VLJ_S2 = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2); + VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2); #endif #ifdef CHECK_EXCLS /* The potential shift should be removed for excluded pairs */ - VLJ_S0 = gmx_blendzero_pr(VLJ_S0, interact_S0); + VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, interact_S0); #ifndef HALF_LJ - VLJ_S2 = gmx_blendzero_pr(VLJ_S2, interact_S2); + VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, interact_S2); #endif #endif #ifndef ENERGY_GROUPS - Vvdwtot_S = gmx_add_pr(Vvdwtot_S, + Vvdwtot_S = gmx_simd_add_r(Vvdwtot_S, #ifndef HALF_LJ - gmx_add_pr(VLJ_S0, VLJ_S2) + gmx_simd_add_r(VLJ_S0, VLJ_S2) #else - VLJ_S0 + VLJ_S0 #endif - ); + ); #else add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj); #ifndef HALF_LJ @@ -695,47 +695,47 @@ #ifdef CALC_LJ #ifdef CALC_COULOMB - fscal_S0 = gmx_mul_pr(rinvsq_S0, - gmx_add_pr(frcoul_S0, - gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); + fscal_S0 = gmx_simd_mul_r(rinvsq_S0, + gmx_simd_add_r(frcoul_S0, + gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0))); #else - fscal_S0 = gmx_mul_pr(rinvsq_S0, - ( - gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); + fscal_S0 = gmx_simd_mul_r(rinvsq_S0, + ( + gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0))); #endif #else - fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0); + fscal_S0 = gmx_simd_mul_r(rinvsq_S0, frcoul_S0); #endif /* CALC_LJ */ #if defined CALC_LJ && !defined HALF_LJ 
#ifdef CALC_COULOMB - fscal_S2 = gmx_mul_pr(rinvsq_S2, - gmx_add_pr(frcoul_S2, - gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); + fscal_S2 = gmx_simd_mul_r(rinvsq_S2, + gmx_simd_add_r(frcoul_S2, + gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2))); #else - fscal_S2 = gmx_mul_pr(rinvsq_S2, - ( - gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); + fscal_S2 = gmx_simd_mul_r(rinvsq_S2, + ( + gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2))); #endif #else /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ - fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2); + fscal_S2 = gmx_simd_mul_r(rinvsq_S2, frcoul_S2); #endif /* Calculate temporary vectorial force */ - tx_S0 = gmx_mul_pr(fscal_S0, dx_S0); - tx_S2 = gmx_mul_pr(fscal_S2, dx_S2); - ty_S0 = gmx_mul_pr(fscal_S0, dy_S0); - ty_S2 = gmx_mul_pr(fscal_S2, dy_S2); - tz_S0 = gmx_mul_pr(fscal_S0, dz_S0); - tz_S2 = gmx_mul_pr(fscal_S2, dz_S2); + tx_S0 = gmx_simd_mul_r(fscal_S0, dx_S0); + tx_S2 = gmx_simd_mul_r(fscal_S2, dx_S2); + ty_S0 = gmx_simd_mul_r(fscal_S0, dy_S0); + ty_S2 = gmx_simd_mul_r(fscal_S2, dy_S2); + tz_S0 = gmx_simd_mul_r(fscal_S0, dz_S0); + tz_S2 = gmx_simd_mul_r(fscal_S2, dz_S2); /* Increment i atom force */ - fix_S0 = gmx_add_pr(fix_S0, tx_S0); - fix_S2 = gmx_add_pr(fix_S2, tx_S2); - fiy_S0 = gmx_add_pr(fiy_S0, ty_S0); - fiy_S2 = gmx_add_pr(fiy_S2, ty_S2); - fiz_S0 = gmx_add_pr(fiz_S0, tz_S0); - fiz_S2 = gmx_add_pr(fiz_S2, tz_S2); + fix_S0 = gmx_simd_add_r(fix_S0, tx_S0); + fix_S2 = gmx_simd_add_r(fix_S2, tx_S2); + fiy_S0 = gmx_simd_add_r(fiy_S0, ty_S0); + fiy_S2 = gmx_simd_add_r(fiy_S2, ty_S2); + fiz_S0 = gmx_simd_add_r(fiz_S0, tz_S0); + fiz_S2 = gmx_simd_add_r(fiz_S2, tz_S2); /* Decrement j atom force */ gmx_load_hpr(&fjx_S, f+ajx); diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h index eb7409906c..0c3c7a9cad 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
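For reference while reading the renamed calls, the inner-loop hunks above leave the per-pair arithmetic untouched; only the wrapper names change. Below is a minimal scalar sketch (illustration only, not part of the patch) of what those gmx_simd_mul_r/gmx_simd_add_r chains compute for one pair, with double standing in for GROMACS' real and frcoul/FrLJ6/FrLJ12 already being force times r, as the kernel comments state.

/* Scalar sketch, illustration only (not part of the patch): the per-pair
 * force that the SIMD calls above evaluate in groups of GMX_SIMD_REAL_WIDTH.
 * Inputs follow the kernel's conventions: rinvsq = 1/r^2, frcoul = F_coul*r,
 * c6/c12 as stored by the kernel, dx/dy/dz the i-j distance components. */
static void
pair_force_sketch(double rinvsq, double c6, double c12, double frcoul,
                  double dx, double dy, double dz, double f_i[3])
{
    double rinvsix = rinvsq*rinvsq*rinvsq;              /* 1/r^6              */
    double FrLJ6   = c6*rinvsix;                        /* LJ r^-6 term, *r   */
    double FrLJ12  = c12*rinvsix*rinvsix;               /* LJ r^-12 term, *r  */
    double fscal   = rinvsq*(frcoul + FrLJ12 - FrLJ6);  /* force divided by r */

    f_i[0] += fscal*dx;   /* i-atom force; the j force is decremented with the
                           * same products, as in the hunk above              */
    f_i[1] += fscal*dy;
    f_i[2] += fscal*dz;
}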
@@ -89,101 +89,103 @@ real *vctp[UNROLLI]; #endif - gmx_mm_pr shX_S; - gmx_mm_pr shY_S; - gmx_mm_pr shZ_S; - gmx_mm_pr ix_S0, iy_S0, iz_S0; - gmx_mm_pr ix_S2, iy_S2, iz_S2; - gmx_mm_pr fix_S0, fiy_S0, fiz_S0; - gmx_mm_pr fix_S2, fiy_S2, fiz_S2; + gmx_simd_real_t shX_S; + gmx_simd_real_t shY_S; + gmx_simd_real_t shZ_S; + gmx_simd_real_t ix_S0, iy_S0, iz_S0; + gmx_simd_real_t ix_S2, iy_S2, iz_S2; + gmx_simd_real_t fix_S0, fiy_S0, fiz_S0; + gmx_simd_real_t fix_S2, fiy_S2, fiz_S2; /* We use an i-force SIMD register width of 4 */ /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */ - gmx_mm_pr4 fix_S, fiy_S, fiz_S; + gmx_mm_pr4 fix_S, fiy_S, fiz_S; - gmx_mm_pr diagonal_jmi_S; + gmx_simd_real_t diagonal_jmi_S; #if UNROLLI == UNROLLJ - gmx_mm_pb diagonal_mask_S0, diagonal_mask_S2; + gmx_simd_bool_t diagonal_mask_S0, diagonal_mask_S2; #else - gmx_mm_pb diagonal_mask0_S0, diagonal_mask0_S2; - gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S2; + gmx_simd_bool_t diagonal_mask0_S0, diagonal_mask0_S2; + gmx_simd_bool_t diagonal_mask1_S0, diagonal_mask1_S2; #endif - unsigned *exclusion_filter; - gmx_exclfilter filter_S0, filter_S2; + unsigned *exclusion_filter; + gmx_exclfilter filter_S0, filter_S2; - gmx_mm_pr zero_S = gmx_set1_pr(0); + gmx_simd_real_t zero_S = gmx_simd_set1_r(0); - gmx_mm_pr one_S = gmx_set1_pr(1.0); - gmx_mm_pr iq_S0 = gmx_setzero_pr(); - gmx_mm_pr iq_S2 = gmx_setzero_pr(); - gmx_mm_pr mrc_3_S; + gmx_simd_real_t one_S = gmx_simd_set1_r(1.0); + gmx_simd_real_t iq_S0 = gmx_simd_setzero_r(); + gmx_simd_real_t iq_S2 = gmx_simd_setzero_r(); + gmx_simd_real_t mrc_3_S; #ifdef CALC_ENERGIES - gmx_mm_pr hrc_3_S, moh_rc_S; + gmx_simd_real_t hrc_3_S, moh_rc_S; #endif #ifdef CALC_COUL_TAB /* Coulomb table variables */ - gmx_mm_pr invtsp_S; - const real *tab_coul_F; + gmx_simd_real_t invtsp_S; + const real *tab_coul_F; #ifndef TAB_FDV0 - const real *tab_coul_V; + const real *tab_coul_V; #endif - int ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0; - int ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2; + int ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0; + int ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2; #ifdef CALC_ENERGIES - gmx_mm_pr mhalfsp_S; + gmx_simd_real_t mhalfsp_S; #endif #endif #ifdef CALC_COUL_EWALD - gmx_mm_pr beta2_S, beta_S; + gmx_simd_real_t beta2_S, beta_S; #endif #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB) - gmx_mm_pr sh_ewald_S; + gmx_simd_real_t sh_ewald_S; #endif #ifdef LJ_COMB_LB - const real *ljc; + const real *ljc; - gmx_mm_pr hsig_i_S0, seps_i_S0; - gmx_mm_pr hsig_i_S2, seps_i_S2; + gmx_simd_real_t hsig_i_S0, seps_i_S0; + gmx_simd_real_t hsig_i_S2, seps_i_S2; #else #ifdef FIX_LJ_C - real pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE]; - real *pvdw_c6, *pvdw_c12; - gmx_mm_pr c6_S0, c12_S0; - gmx_mm_pr c6_S2, c12_S2; + real pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_REAL_WIDTH]; + real *pvdw_c6, *pvdw_c12; + gmx_simd_real_t c6_S0, c12_S0; + gmx_simd_real_t c6_S2, c12_S2; #endif #ifdef LJ_COMB_GEOM - const real *ljc; + const real *ljc; - gmx_mm_pr c6s_S0, c12s_S0; - gmx_mm_pr c6s_S1, c12s_S1; - gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr(); - gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr(); + gmx_simd_real_t c6s_S0, c12s_S0; + gmx_simd_real_t c6s_S1, c12s_S1; + gmx_simd_real_t c6s_S2 = gmx_simd_setzero_r(); + gmx_simd_real_t c12s_S2 = gmx_simd_setzero_r(); + gmx_simd_real_t c6s_S3 = gmx_simd_setzero_r(); + gmx_simd_real_t c12s_S3 = gmx_simd_setzero_r(); #endif #endif /* LJ_COMB_LB */ - gmx_mm_pr vctot_S, Vvdwtot_S; - gmx_mm_pr 
sixth_S, twelveth_S; + gmx_simd_real_t vctot_S, Vvdwtot_S; + gmx_simd_real_t sixth_S, twelveth_S; - gmx_mm_pr avoid_sing_S; - gmx_mm_pr rc2_S; + gmx_simd_real_t avoid_sing_S; + gmx_simd_real_t rc2_S; #ifdef VDW_CUTOFF_CHECK - gmx_mm_pr rcvdw2_S; + gmx_simd_real_t rcvdw2_S; #endif #ifdef CALC_ENERGIES - gmx_mm_pr sh_invrc6_S, sh_invrc12_S; + gmx_simd_real_t sh_invrc6_S, sh_invrc12_S; /* cppcheck-suppress unassignedVariable */ - real tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum; + real tmpsum_array[2*GMX_SIMD_REAL_WIDTH], *tmpsum; #endif #ifdef CALC_SHIFTFORCES /* cppcheck-suppress unassignedVariable */ - real shf_array[2*GMX_SIMD_WIDTH_HERE], *shf; + real shf_array[2*GMX_SIMD_REAL_WIDTH], *shf; #endif int ninner; @@ -200,25 +202,25 @@ #endif /* Load j-i for the first i */ - diagonal_jmi_S = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i); + diagonal_jmi_S = gmx_simd_load_r(nbat->simd_2xnn_diagonal_j_minus_i); /* Generate all the diagonal masks as comparison results */ #if UNROLLI == UNROLLJ - diagonal_mask_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); + diagonal_mask_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); #else #if 2*UNROLLI == UNROLLJ - diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); + diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); #endif #endif @@ -246,9 +248,9 @@ ti0 = prepare_table_load_buffer(ti0_array); ti2 = prepare_table_load_buffer(ti2_array); - invtsp_S = gmx_set1_pr(ic->tabq_scale); + invtsp_S = gmx_simd_set1_r(ic->tabq_scale); #ifdef CALC_ENERGIES - mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale); + mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale); #endif #ifdef TAB_FDV0 @@ -260,12 +262,12 @@ #endif /* CALC_COUL_TAB */ #ifdef CALC_COUL_EWALD - beta2_S = gmx_set1_pr(ic->ewaldcoeff_q*ic->ewaldcoeff_q); - beta_S = gmx_set1_pr(ic->ewaldcoeff_q); + beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q); + beta_S = gmx_simd_set1_r(ic->ewaldcoeff_q); #endif #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES - sh_ewald_S = gmx_set1_pr(ic->sh_ewald); + sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald); #endif q = nbat->q; @@ -274,39 +276,39 @@ shiftvec = shift_vec[0]; x 
= nbat->x; - avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC); + avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC); /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */ - rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb); + rc2_S = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb); #ifdef VDW_CUTOFF_CHECK - rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw); + rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw); #endif #ifdef CALC_ENERGIES - sixth_S = gmx_set1_pr(1.0/6.0); - twelveth_S = gmx_set1_pr(1.0/12.0); + sixth_S = gmx_simd_set1_r(1.0/6.0); + twelveth_S = gmx_simd_set1_r(1.0/12.0); - sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6); - sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6); + sh_invrc6_S = gmx_simd_set1_r(ic->sh_invrc6); + sh_invrc12_S = gmx_simd_set1_r(ic->sh_invrc6*ic->sh_invrc6); #endif - mrc_3_S = gmx_set1_pr(-2*ic->k_rf); + mrc_3_S = gmx_simd_set1_r(-2*ic->k_rf); #ifdef CALC_ENERGIES - hrc_3_S = gmx_set1_pr(ic->k_rf); + hrc_3_S = gmx_simd_set1_r(ic->k_rf); - moh_rc_S = gmx_set1_pr(-ic->c_rf); + moh_rc_S = gmx_simd_set1_r(-ic->c_rf); #endif #ifdef CALC_ENERGIES - tmpsum = gmx_simd_align_real(tmpsum_array); + tmpsum = gmx_simd_align_r(tmpsum_array); #endif #ifdef CALC_SHIFTFORCES - shf = gmx_simd_align_real(shf_array); + shf = gmx_simd_align_r(shf_array); #endif #ifdef FIX_LJ_C - pvdw_c6 = gmx_simd_align_real(pvdw_array); + pvdw_c6 = gmx_simd_align_r(pvdw_array); pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ; for (jp = 0; jp < UNROLLJ; jp++) @@ -321,15 +323,15 @@ pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1]; pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1]; } - c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ); - c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ); - c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ); - c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ); - - c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ); - c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ); - c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ); - c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ); + c6_S0 = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ); + c6_S1 = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ); + c6_S2 = gmx_simd_load_r(pvdw_c6 +2*UNROLLJ); + c6_S3 = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ); + + c12_S0 = gmx_simd_load_r(pvdw_c12+0*UNROLLJ); + c12_S1 = gmx_simd_load_r(pvdw_c12+1*UNROLLJ); + c12_S2 = gmx_simd_load_r(pvdw_c12+2*UNROLLJ); + c12_S3 = gmx_simd_load_r(pvdw_c12+3*UNROLLJ); #endif /* FIX_LJ_C */ #ifdef ENERGY_GROUPS @@ -356,9 +358,9 @@ ci = nbln->ci; ci_sh = (ish == CENTRAL ? 
ci : -1); - shX_S = gmx_load1_pr(shiftvec+ish3); - shY_S = gmx_load1_pr(shiftvec+ish3+1); - shZ_S = gmx_load1_pr(shiftvec+ish3+2); + shX_S = gmx_simd_load1_r(shiftvec+ish3); + shY_S = gmx_simd_load1_r(shiftvec+ish3+1); + shZ_S = gmx_simd_load1_r(shiftvec+ish3+2); #if UNROLLJ <= 4 sci = ci*STRIDE; @@ -444,23 +446,23 @@ gmx_load1p1_pr(&iy_S2, x+sciy+2); gmx_load1p1_pr(&iz_S0, x+sciz); gmx_load1p1_pr(&iz_S2, x+sciz+2); - ix_S0 = gmx_add_pr(ix_S0, shX_S); - ix_S2 = gmx_add_pr(ix_S2, shX_S); - iy_S0 = gmx_add_pr(iy_S0, shY_S); - iy_S2 = gmx_add_pr(iy_S2, shY_S); - iz_S0 = gmx_add_pr(iz_S0, shZ_S); - iz_S2 = gmx_add_pr(iz_S2, shZ_S); + ix_S0 = gmx_simd_add_r(ix_S0, shX_S); + ix_S2 = gmx_simd_add_r(ix_S2, shX_S); + iy_S0 = gmx_simd_add_r(iy_S0, shY_S); + iy_S2 = gmx_simd_add_r(iy_S2, shY_S); + iz_S0 = gmx_simd_add_r(iz_S0, shZ_S); + iz_S2 = gmx_simd_add_r(iz_S2, shZ_S); if (do_coul) { - gmx_mm_pr facel_S; + gmx_simd_real_t facel_S; - facel_S = gmx_set1_pr(facel); + facel_S = gmx_simd_set1_r(facel); gmx_load1p1_pr(&iq_S0, q+sci); gmx_load1p1_pr(&iq_S2, q+sci+2); - iq_S0 = gmx_mul_pr(facel_S, iq_S0); - iq_S2 = gmx_mul_pr(facel_S, iq_S2); + iq_S0 = gmx_simd_mul_r(facel_S, iq_S0); + iq_S2 = gmx_simd_mul_r(facel_S, iq_S2); } #ifdef LJ_COMB_LB @@ -492,16 +494,16 @@ #endif /* Zero the potential energy for this list */ - Vvdwtot_S = gmx_setzero_pr(); - vctot_S = gmx_setzero_pr(); + Vvdwtot_S = gmx_simd_setzero_r(); + vctot_S = gmx_simd_setzero_r(); /* Clear i atom forces */ - fix_S0 = gmx_setzero_pr(); - fix_S2 = gmx_setzero_pr(); - fiy_S0 = gmx_setzero_pr(); - fiy_S2 = gmx_setzero_pr(); - fiz_S0 = gmx_setzero_pr(); - fiz_S2 = gmx_setzero_pr(); + fix_S0 = gmx_simd_setzero_r(); + fix_S2 = gmx_simd_setzero_r(); + fiy_S0 = gmx_simd_setzero_r(); + fiy_S2 = gmx_simd_setzero_r(); + fiz_S0 = gmx_simd_setzero_r(); + fiz_S2 = gmx_simd_setzero_r(); cjind = cjind0; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c index 3e9245a426..13810fa880 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
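The declaration and setup hunks above are representative of the whole patch: every change is a one-to-one rename with unchanged semantics. A hypothetical compatibility shim (illustration only; the patch itself adds no such header) makes the mapping explicit for the names that appear in these kernels.

/* Hypothetical old-to-new name mapping, as visible in this patch
 * (partial list, illustration only, not added by the patch): */
#define gmx_mm_pr            gmx_simd_real_t
#define gmx_mm_pb            gmx_simd_bool_t
#define gmx_epi32            gmx_simd_int32_t
#define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REAL_WIDTH
#define gmx_load_pr          gmx_simd_load_r
#define gmx_load1_pr         gmx_simd_load1_r
#define gmx_set1_pr          gmx_simd_set1_r
#define gmx_setzero_pr       gmx_simd_setzero_r
#define gmx_add_pr           gmx_simd_add_r
#define gmx_sub_pr           gmx_simd_sub_r
#define gmx_mul_pr           gmx_simd_mul_r
#define gmx_madd_pr          gmx_simd_fmadd_r
#define gmx_cmplt_pr         gmx_simd_cmplt_r
#define gmx_and_pb           gmx_simd_and_b
#define gmx_blendzero_pr     gmx_simd_blendzero_r
#define gmx_blendv_pr        gmx_simd_blendv_r
#define gmx_invsqrt_pr       gmx_simd_invsqrt_r
#define gmx_floor_pr         gmx_simd_floor_r
#define gmx_cvttpr_epi32     gmx_simd_cvtt_r2i
#define gmx_cvtepi32_pr      gmx_simd_cvt_i2r
#define gmx_simd_align_real  gmx_simd_align_r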
@@ -51,7 +51,7 @@ #include "gromacs/simd/macros.h" #include "gromacs/simd/vector_operations.h" -#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8) +#if !(GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8) #error "unsupported SIMD width" #endif @@ -161,7 +161,7 @@ reduce_group_energies(int ng, int ng_2log, const real *VSvdw, const real *VSc, real *Vvdw, real *Vc) { - const int unrollj = GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE; + const int unrollj = GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE; const int unrollj_half = unrollj/2; int ng_p2, i, j, j0, j1, c, s; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h index 57b0e46099..e3b3380bc5 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h @@ -44,11 +44,11 @@ #endif #define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE -#define UNROLLJ (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE) +#define UNROLLJ (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE) /* The stride of all the atom data arrays is max(UNROLLI,unrollj) */ -#if GMX_SIMD_WIDTH_HERE >= UNROLLI -#define STRIDE (GMX_SIMD_WIDTH_HERE/GMX_SIMD_J_UNROLL_SIZE) +#if GMX_SIMD_REAL_WIDTH >= UNROLLI +#define STRIDE (GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE) #else #define STRIDE (UNROLLI) #endif @@ -56,19 +56,19 @@ #include "../nbnxn_kernel_simd_utils.h" static inline void -gmx_load_simd_4xn_interactions(int excl, - gmx_exclfilter filter_S0, - gmx_exclfilter filter_S1, - gmx_exclfilter filter_S2, - gmx_exclfilter filter_S3, - const char gmx_unused *interaction_mask_indices, - real gmx_unused *simd_interaction_array, - gmx_mm_pb *interact_S0, - gmx_mm_pb *interact_S1, - gmx_mm_pb *interact_S2, - gmx_mm_pb *interact_S3) +gmx_load_simd_4xn_interactions(int excl, + gmx_exclfilter filter_S0, + gmx_exclfilter filter_S1, + gmx_exclfilter filter_S2, + gmx_exclfilter filter_S3, + const char gmx_unused *interaction_mask_indices, + real gmx_unused *simd_interaction_array, + gmx_simd_bool_t *interact_S0, + gmx_simd_bool_t *interact_S1, + gmx_simd_bool_t *interact_S2, + gmx_simd_bool_t *interact_S3) { -#if defined GMX_X86_SSE2 || defined GMX_SIMD_REFERENCE_PLAIN_C +#if defined GMX_SIMD_X86_SSE2_OR_HIGHER || defined GMX_SIMD_REFERENCE /* Load integer interaction mask */ gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl); *interact_S0 = gmx_checkbitmask_pb(mask_pr_S, filter_S0); @@ -76,8 +76,8 @@ gmx_load_simd_4xn_interactions(int excl, *interact_S2 = gmx_checkbitmask_pb(mask_pr_S, filter_S2); *interact_S3 = gmx_checkbitmask_pb(mask_pr_S, filter_S3); #endif -#ifdef GMX_CPU_ACCELERATION_IBM_QPX - const int size = GMX_SIMD_WIDTH_HERE * sizeof(real); +#ifdef GMX_SIMD_IBM_QPX + const int size = GMX_SIMD_REAL_WIDTH * sizeof(real); *interact_S0 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[0], simd_interaction_array); *interact_S1 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[1], simd_interaction_array); *interact_S2 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[2], simd_interaction_array); diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h index 0817d76793..703b31740f 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h +++ 
b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -50,7 +50,7 @@ #endif /* Without exclusions and energies we only need to mask the cut-off, - * this can be faster when we have defined gmx_blendv_pr, i.e. an instruction + * this can be faster when we have defined gmx_simd_blendv_r, i.e. an instruction * that selects from two SIMD registers based on the contents of a third. */ #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV @@ -58,14 +58,14 @@ * With gcc this is slower, except for RF on Sandy Bridge. * Tested with gcc 4.6.2, 4.6.3 and 4.7.1. */ -#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256)) +#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER)) #define NBNXN_CUTOFF_USE_BLENDV #endif /* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv. * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7). * Tested with icc 13. */ -#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256 +#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER #define NBNXN_CUTOFF_USE_BLENDV #endif #endif @@ -80,135 +80,135 @@ #ifdef CHECK_EXCLS /* Interaction (non-exclusion) mask of all 1's or 0's */ - gmx_mm_pb interact_S0; - gmx_mm_pb interact_S1; - gmx_mm_pb interact_S2; - gmx_mm_pb interact_S3; -#endif - - gmx_mm_pr jx_S, jy_S, jz_S; - gmx_mm_pr dx_S0, dy_S0, dz_S0; - gmx_mm_pr dx_S1, dy_S1, dz_S1; - gmx_mm_pr dx_S2, dy_S2, dz_S2; - gmx_mm_pr dx_S3, dy_S3, dz_S3; - gmx_mm_pr tx_S0, ty_S0, tz_S0; - gmx_mm_pr tx_S1, ty_S1, tz_S1; - gmx_mm_pr tx_S2, ty_S2, tz_S2; - gmx_mm_pr tx_S3, ty_S3, tz_S3; - gmx_mm_pr rsq_S0, rinv_S0, rinvsq_S0; - gmx_mm_pr rsq_S1, rinv_S1, rinvsq_S1; - gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2; - gmx_mm_pr rsq_S3, rinv_S3, rinvsq_S3; + gmx_simd_bool_t interact_S0; + gmx_simd_bool_t interact_S1; + gmx_simd_bool_t interact_S2; + gmx_simd_bool_t interact_S3; +#endif + + gmx_simd_real_t jx_S, jy_S, jz_S; + gmx_simd_real_t dx_S0, dy_S0, dz_S0; + gmx_simd_real_t dx_S1, dy_S1, dz_S1; + gmx_simd_real_t dx_S2, dy_S2, dz_S2; + gmx_simd_real_t dx_S3, dy_S3, dz_S3; + gmx_simd_real_t tx_S0, ty_S0, tz_S0; + gmx_simd_real_t tx_S1, ty_S1, tz_S1; + gmx_simd_real_t tx_S2, ty_S2, tz_S2; + gmx_simd_real_t tx_S3, ty_S3, tz_S3; + gmx_simd_real_t rsq_S0, rinv_S0, rinvsq_S0; + gmx_simd_real_t rsq_S1, rinv_S1, rinvsq_S1; + gmx_simd_real_t rsq_S2, rinv_S2, rinvsq_S2; + gmx_simd_real_t rsq_S3, rinv_S3, rinvsq_S3; #ifndef NBNXN_CUTOFF_USE_BLENDV /* wco: within cut-off, mask of all 1's or 0's */ - gmx_mm_pb wco_S0; - gmx_mm_pb wco_S1; - gmx_mm_pb wco_S2; - gmx_mm_pb wco_S3; + gmx_simd_bool_t wco_S0; + gmx_simd_bool_t wco_S1; + gmx_simd_bool_t wco_S2; + gmx_simd_bool_t wco_S3; #endif #ifdef VDW_CUTOFF_CHECK - gmx_mm_pb wco_vdw_S0; - gmx_mm_pb wco_vdw_S1; + gmx_simd_bool_t wco_vdw_S0; + gmx_simd_bool_t wco_vdw_S1; #ifndef HALF_LJ - gmx_mm_pb wco_vdw_S2; - gmx_mm_pb wco_vdw_S3; + 
gmx_simd_bool_t wco_vdw_S2; + gmx_simd_bool_t wco_vdw_S3; #endif #endif #ifdef CALC_COULOMB #ifdef CHECK_EXCLS /* 1/r masked with the interaction mask */ - gmx_mm_pr rinv_ex_S0; - gmx_mm_pr rinv_ex_S1; - gmx_mm_pr rinv_ex_S2; - gmx_mm_pr rinv_ex_S3; -#endif - gmx_mm_pr jq_S; - gmx_mm_pr qq_S0; - gmx_mm_pr qq_S1; - gmx_mm_pr qq_S2; - gmx_mm_pr qq_S3; + gmx_simd_real_t rinv_ex_S0; + gmx_simd_real_t rinv_ex_S1; + gmx_simd_real_t rinv_ex_S2; + gmx_simd_real_t rinv_ex_S3; +#endif + gmx_simd_real_t jq_S; + gmx_simd_real_t qq_S0; + gmx_simd_real_t qq_S1; + gmx_simd_real_t qq_S2; + gmx_simd_real_t qq_S3; #ifdef CALC_COUL_TAB /* The force (PME mesh force) we need to subtract from 1/r^2 */ - gmx_mm_pr fsub_S0; - gmx_mm_pr fsub_S1; - gmx_mm_pr fsub_S2; - gmx_mm_pr fsub_S3; + gmx_simd_real_t fsub_S0; + gmx_simd_real_t fsub_S1; + gmx_simd_real_t fsub_S2; + gmx_simd_real_t fsub_S3; #endif #ifdef CALC_COUL_EWALD - gmx_mm_pr brsq_S0, brsq_S1, brsq_S2, brsq_S3; - gmx_mm_pr ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3; + gmx_simd_real_t brsq_S0, brsq_S1, brsq_S2, brsq_S3; + gmx_simd_real_t ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3; #endif /* frcoul = (1/r - fsub)*r */ - gmx_mm_pr frcoul_S0; - gmx_mm_pr frcoul_S1; - gmx_mm_pr frcoul_S2; - gmx_mm_pr frcoul_S3; + gmx_simd_real_t frcoul_S0; + gmx_simd_real_t frcoul_S1; + gmx_simd_real_t frcoul_S2; + gmx_simd_real_t frcoul_S3; #ifdef CALC_COUL_TAB /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */ - gmx_mm_pr r_S0, rs_S0, rf_S0, frac_S0; - gmx_mm_pr r_S1, rs_S1, rf_S1, frac_S1; - gmx_mm_pr r_S2, rs_S2, rf_S2, frac_S2; - gmx_mm_pr r_S3, rs_S3, rf_S3, frac_S3; + gmx_simd_real_t r_S0, rs_S0, rf_S0, frac_S0; + gmx_simd_real_t r_S1, rs_S1, rf_S1, frac_S1; + gmx_simd_real_t r_S2, rs_S2, rf_S2, frac_S2; + gmx_simd_real_t r_S3, rs_S3, rf_S3, frac_S3; /* Table index: rs truncated to an int */ - gmx_epi32 ti_S0, ti_S1, ti_S2, ti_S3; + gmx_simd_int32_t ti_S0, ti_S1, ti_S2, ti_S3; /* Linear force table values */ - gmx_mm_pr ctab0_S0, ctab1_S0; - gmx_mm_pr ctab0_S1, ctab1_S1; - gmx_mm_pr ctab0_S2, ctab1_S2; - gmx_mm_pr ctab0_S3, ctab1_S3; + gmx_simd_real_t ctab0_S0, ctab1_S0; + gmx_simd_real_t ctab0_S1, ctab1_S1; + gmx_simd_real_t ctab0_S2, ctab1_S2; + gmx_simd_real_t ctab0_S3, ctab1_S3; #ifdef CALC_ENERGIES /* Quadratic energy table value */ - gmx_mm_pr ctabv_S0; - gmx_mm_pr ctabv_S1; - gmx_mm_pr ctabv_S2; - gmx_mm_pr ctabv_S3; + gmx_simd_real_t ctabv_S0; + gmx_simd_real_t ctabv_S1; + gmx_simd_real_t ctabv_S2; + gmx_simd_real_t ctabv_S3; #endif #endif #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB) /* The potential (PME mesh) we need to subtract from 1/r */ - gmx_mm_pr vc_sub_S0; - gmx_mm_pr vc_sub_S1; - gmx_mm_pr vc_sub_S2; - gmx_mm_pr vc_sub_S3; + gmx_simd_real_t vc_sub_S0; + gmx_simd_real_t vc_sub_S1; + gmx_simd_real_t vc_sub_S2; + gmx_simd_real_t vc_sub_S3; #endif #ifdef CALC_ENERGIES /* Electrostatic potential */ - gmx_mm_pr vcoul_S0; - gmx_mm_pr vcoul_S1; - gmx_mm_pr vcoul_S2; - gmx_mm_pr vcoul_S3; + gmx_simd_real_t vcoul_S0; + gmx_simd_real_t vcoul_S1; + gmx_simd_real_t vcoul_S2; + gmx_simd_real_t vcoul_S3; #endif #endif /* The force times 1/r */ - gmx_mm_pr fscal_S0; - gmx_mm_pr fscal_S1; - gmx_mm_pr fscal_S2; - gmx_mm_pr fscal_S3; + gmx_simd_real_t fscal_S0; + gmx_simd_real_t fscal_S1; + gmx_simd_real_t fscal_S2; + gmx_simd_real_t fscal_S3; #ifdef CALC_LJ #ifdef LJ_COMB_LB /* LJ sigma_j/2 and sqrt(epsilon_j) */ - gmx_mm_pr hsig_j_S, seps_j_S; + gmx_simd_real_t hsig_j_S, seps_j_S; /* LJ sigma_ij and epsilon_ij */ - gmx_mm_pr 
sig_S0, eps_S0; - gmx_mm_pr sig_S1, eps_S1; + gmx_simd_real_t sig_S0, eps_S0; + gmx_simd_real_t sig_S1, eps_S1; #ifndef HALF_LJ - gmx_mm_pr sig_S2, eps_S2; - gmx_mm_pr sig_S3, eps_S3; + gmx_simd_real_t sig_S2, eps_S2; + gmx_simd_real_t sig_S3, eps_S3; #endif #ifdef CALC_ENERGIES - gmx_mm_pr sig2_S0, sig6_S0; - gmx_mm_pr sig2_S1, sig6_S1; + gmx_simd_real_t sig2_S0, sig6_S0; + gmx_simd_real_t sig2_S1, sig6_S1; #ifndef HALF_LJ - gmx_mm_pr sig2_S2, sig6_S2; - gmx_mm_pr sig2_S3, sig6_S3; + gmx_simd_real_t sig2_S2, sig6_S2; + gmx_simd_real_t sig2_S3, sig6_S3; #endif #endif /* LJ_COMB_LB */ #endif /* CALC_LJ */ #ifdef LJ_COMB_GEOM - gmx_mm_pr c6s_j_S, c12s_j_S; + gmx_simd_real_t c6s_j_S, c12s_j_S; #endif #if defined LJ_COMB_GEOM || defined LJ_COMB_LB @@ -218,44 +218,44 @@ #ifndef FIX_LJ_C /* LJ C6 and C12 parameters, used with geometric comb. rule */ - gmx_mm_pr c6_S0, c12_S0; - gmx_mm_pr c6_S1, c12_S1; + gmx_simd_real_t c6_S0, c12_S0; + gmx_simd_real_t c6_S1, c12_S1; #ifndef HALF_LJ - gmx_mm_pr c6_S2, c12_S2; - gmx_mm_pr c6_S3, c12_S3; + gmx_simd_real_t c6_S2, c12_S2; + gmx_simd_real_t c6_S3, c12_S3; #endif #endif /* Intermediate variables for LJ calculation */ #ifndef LJ_COMB_LB - gmx_mm_pr rinvsix_S0; - gmx_mm_pr rinvsix_S1; + gmx_simd_real_t rinvsix_S0; + gmx_simd_real_t rinvsix_S1; #ifndef HALF_LJ - gmx_mm_pr rinvsix_S2; - gmx_mm_pr rinvsix_S3; + gmx_simd_real_t rinvsix_S2; + gmx_simd_real_t rinvsix_S3; #endif #endif #ifdef LJ_COMB_LB - gmx_mm_pr sir_S0, sir2_S0, sir6_S0; - gmx_mm_pr sir_S1, sir2_S1, sir6_S1; + gmx_simd_real_t sir_S0, sir2_S0, sir6_S0; + gmx_simd_real_t sir_S1, sir2_S1, sir6_S1; #ifndef HALF_LJ - gmx_mm_pr sir_S2, sir2_S2, sir6_S2; - gmx_mm_pr sir_S3, sir2_S3, sir6_S3; + gmx_simd_real_t sir_S2, sir2_S2, sir6_S2; + gmx_simd_real_t sir_S3, sir2_S3, sir6_S3; #endif #endif - gmx_mm_pr FrLJ6_S0, FrLJ12_S0; - gmx_mm_pr FrLJ6_S1, FrLJ12_S1; + gmx_simd_real_t FrLJ6_S0, FrLJ12_S0; + gmx_simd_real_t FrLJ6_S1, FrLJ12_S1; #ifndef HALF_LJ - gmx_mm_pr FrLJ6_S2, FrLJ12_S2; - gmx_mm_pr FrLJ6_S3, FrLJ12_S3; + gmx_simd_real_t FrLJ6_S2, FrLJ12_S2; + gmx_simd_real_t FrLJ6_S3, FrLJ12_S3; #endif #ifdef CALC_ENERGIES - gmx_mm_pr VLJ6_S0, VLJ12_S0, VLJ_S0; - gmx_mm_pr VLJ6_S1, VLJ12_S1, VLJ_S1; + gmx_simd_real_t VLJ6_S0, VLJ12_S0, VLJ_S0; + gmx_simd_real_t VLJ6_S1, VLJ12_S1, VLJ_S1; #ifndef HALF_LJ - gmx_mm_pr VLJ6_S2, VLJ12_S2, VLJ_S2; - gmx_mm_pr VLJ6_S3, VLJ12_S3, VLJ_S3; + gmx_simd_real_t VLJ6_S2, VLJ12_S2, VLJ_S2; + gmx_simd_real_t VLJ6_S3, VLJ12_S3, VLJ_S3; #endif #endif #endif /* CALC_LJ */ @@ -284,7 +284,7 @@ gmx_load_simd_4xn_interactions(l_cj[cjind].excl, filter_S0, filter_S1, filter_S2, filter_S3, -#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#ifdef GMX_SIMD_IBM_QPX l_cj[cjind].interaction_mask_indices, nbat->simd_interaction_array, #else @@ -298,35 +298,35 @@ #endif /* CHECK_EXCLS */ /* load j atom coordinates */ - jx_S = gmx_load_pr(x+ajx); - jy_S = gmx_load_pr(x+ajy); - jz_S = gmx_load_pr(x+ajz); + jx_S = gmx_simd_load_r(x+ajx); + jy_S = gmx_simd_load_r(x+ajy); + jz_S = gmx_simd_load_r(x+ajz); /* Calculate distance */ - dx_S0 = gmx_sub_pr(ix_S0, jx_S); - dy_S0 = gmx_sub_pr(iy_S0, jy_S); - dz_S0 = gmx_sub_pr(iz_S0, jz_S); - dx_S1 = gmx_sub_pr(ix_S1, jx_S); - dy_S1 = gmx_sub_pr(iy_S1, jy_S); - dz_S1 = gmx_sub_pr(iz_S1, jz_S); - dx_S2 = gmx_sub_pr(ix_S2, jx_S); - dy_S2 = gmx_sub_pr(iy_S2, jy_S); - dz_S2 = gmx_sub_pr(iz_S2, jz_S); - dx_S3 = gmx_sub_pr(ix_S3, jx_S); - dy_S3 = gmx_sub_pr(iy_S3, jy_S); - dz_S3 = gmx_sub_pr(iz_S3, jz_S); + dx_S0 = gmx_simd_sub_r(ix_S0, jx_S); + dy_S0 = 
gmx_simd_sub_r(iy_S0, jy_S); + dz_S0 = gmx_simd_sub_r(iz_S0, jz_S); + dx_S1 = gmx_simd_sub_r(ix_S1, jx_S); + dy_S1 = gmx_simd_sub_r(iy_S1, jy_S); + dz_S1 = gmx_simd_sub_r(iz_S1, jz_S); + dx_S2 = gmx_simd_sub_r(ix_S2, jx_S); + dy_S2 = gmx_simd_sub_r(iy_S2, jy_S); + dz_S2 = gmx_simd_sub_r(iz_S2, jz_S); + dx_S3 = gmx_simd_sub_r(ix_S3, jx_S); + dy_S3 = gmx_simd_sub_r(iy_S3, jy_S); + dz_S3 = gmx_simd_sub_r(iz_S3, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); - rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1); - rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_simd_calc_rsq_r(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_simd_calc_rsq_r(dx_S3, dy_S3, dz_S3); #ifndef NBNXN_CUTOFF_USE_BLENDV - wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); - wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S); - wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); - wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S); + wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S); + wco_S1 = gmx_simd_cmplt_r(rsq_S1, rc2_S); + wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S); + wco_S3 = gmx_simd_cmplt_r(rsq_S3, rc2_S); #endif #ifdef CHECK_EXCLS @@ -335,61 +335,61 @@ #if UNROLLJ == UNROLLI if (cj == ci_sh) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask_S0); - wco_S1 = gmx_and_pb(wco_S1, diagonal_mask_S1); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask_S2); - wco_S3 = gmx_and_pb(wco_S3, diagonal_mask_S3); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask_S0); + wco_S1 = gmx_simd_and_b(wco_S1, diagonal_mask_S1); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask_S2); + wco_S3 = gmx_simd_and_b(wco_S3, diagonal_mask_S3); } #else #if UNROLLJ < UNROLLI if (cj == ci_sh*2) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0); - wco_S1 = gmx_and_pb(wco_S1, diagonal_mask0_S1); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2); - wco_S3 = gmx_and_pb(wco_S3, diagonal_mask0_S3); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask0_S0); + wco_S1 = gmx_simd_and_b(wco_S1, diagonal_mask0_S1); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask0_S2); + wco_S3 = gmx_simd_and_b(wco_S3, diagonal_mask0_S3); } if (cj == ci_sh*2 + 1) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0); - wco_S1 = gmx_and_pb(wco_S1, diagonal_mask1_S1); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2); - wco_S3 = gmx_and_pb(wco_S3, diagonal_mask1_S3); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask1_S0); + wco_S1 = gmx_simd_and_b(wco_S1, diagonal_mask1_S1); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask1_S2); + wco_S3 = gmx_simd_and_b(wco_S3, diagonal_mask1_S3); } #else if (cj*2 == ci_sh) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0); - wco_S1 = gmx_and_pb(wco_S1, diagonal_mask0_S1); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2); - wco_S3 = gmx_and_pb(wco_S3, diagonal_mask0_S3); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask0_S0); + wco_S1 = gmx_simd_and_b(wco_S1, diagonal_mask0_S1); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask0_S2); + wco_S3 = gmx_simd_and_b(wco_S3, diagonal_mask0_S3); } else if (cj*2 + 1 == ci_sh) { - wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0); - wco_S1 = gmx_and_pb(wco_S1, diagonal_mask1_S1); - wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2); - wco_S3 = gmx_and_pb(wco_S3, diagonal_mask1_S3); + wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask1_S0); + wco_S1 = gmx_simd_and_b(wco_S1, diagonal_mask1_S1); + wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask1_S2); + wco_S3 = gmx_simd_and_b(wco_S3, diagonal_mask1_S3); } #endif #endif #else /* 
EXCL_FORCES */ /* No exclusion forces: remove all excluded atom pairs from the list */ - wco_S0 = gmx_and_pb(wco_S0, interact_S0); - wco_S1 = gmx_and_pb(wco_S1, interact_S1); - wco_S2 = gmx_and_pb(wco_S2, interact_S2); - wco_S3 = gmx_and_pb(wco_S3, interact_S3); + wco_S0 = gmx_simd_and_b(wco_S0, interact_S0); + wco_S1 = gmx_simd_and_b(wco_S1, interact_S1); + wco_S2 = gmx_simd_and_b(wco_S2, interact_S2); + wco_S3 = gmx_simd_and_b(wco_S3, interact_S3); #endif #endif #ifdef COUNT_PAIRS { int i, j; - real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp; - tmp = gmx_simd_align_real(tmpa); + real tmpa[2*GMX_SIMD_REAL_WIDTH], *tmp; + tmp = gmx_simd_align_r(tmpa); for (i = 0; i < UNROLLI; i++) { - gmx_store_pr(tmp, gmx_sub_pr(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3)))); + gmx_simd_store_r(tmp, gmx_simd_sub_r(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3)))); for (j = 0; j < UNROLLJ; j++) { if (tmp[j] >= 0) @@ -411,10 +411,10 @@ /* Calculate 1/r */ #ifndef GMX_DOUBLE - rinv_S0 = gmx_invsqrt_pr(rsq_S0); - rinv_S1 = gmx_invsqrt_pr(rsq_S1); - rinv_S2 = gmx_invsqrt_pr(rsq_S2); - rinv_S3 = gmx_invsqrt_pr(rsq_S3); + rinv_S0 = gmx_simd_invsqrt_r(rsq_S0); + rinv_S1 = gmx_simd_invsqrt_r(rsq_S1); + rinv_S2 = gmx_simd_invsqrt_r(rsq_S2); + rinv_S3 = gmx_simd_invsqrt_r(rsq_S3); #else gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1); gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3); @@ -422,11 +422,11 @@ #ifdef CALC_COULOMB /* Load parameters for j atom */ - jq_S = gmx_load_pr(q+aj); - qq_S0 = gmx_mul_pr(iq_S0, jq_S); - qq_S1 = gmx_mul_pr(iq_S1, jq_S); - qq_S2 = gmx_mul_pr(iq_S2, jq_S); - qq_S3 = gmx_mul_pr(iq_S3, jq_S); + jq_S = gmx_simd_load_r(q+aj); + qq_S0 = gmx_simd_mul_r(iq_S0, jq_S); + qq_S1 = gmx_simd_mul_r(iq_S1, jq_S); + qq_S2 = gmx_simd_mul_r(iq_S2, jq_S); + qq_S3 = gmx_simd_mul_r(iq_S3, jq_S); #endif #ifdef CALC_LJ @@ -441,57 +441,57 @@ #endif /* not defined any LJ rule */ #ifdef LJ_COMB_GEOM - c6s_j_S = gmx_load_pr(ljc+aj2+0); - c12s_j_S = gmx_load_pr(ljc+aj2+STRIDE); - c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S ); - c6_S1 = gmx_mul_pr(c6s_S1, c6s_j_S ); + c6s_j_S = gmx_simd_load_r(ljc+aj2+0); + c12s_j_S = gmx_simd_load_r(ljc+aj2+STRIDE); + c6_S0 = gmx_simd_mul_r(c6s_S0, c6s_j_S ); + c6_S1 = gmx_simd_mul_r(c6s_S1, c6s_j_S ); #ifndef HALF_LJ - c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S ); - c6_S3 = gmx_mul_pr(c6s_S3, c6s_j_S ); + c6_S2 = gmx_simd_mul_r(c6s_S2, c6s_j_S ); + c6_S3 = gmx_simd_mul_r(c6s_S3, c6s_j_S ); #endif - c12_S0 = gmx_mul_pr(c12s_S0, c12s_j_S); - c12_S1 = gmx_mul_pr(c12s_S1, c12s_j_S); + c12_S0 = gmx_simd_mul_r(c12s_S0, c12s_j_S); + c12_S1 = gmx_simd_mul_r(c12s_S1, c12s_j_S); #ifndef HALF_LJ - c12_S2 = gmx_mul_pr(c12s_S2, c12s_j_S); - c12_S3 = gmx_mul_pr(c12s_S3, c12s_j_S); + c12_S2 = gmx_simd_mul_r(c12s_S2, c12s_j_S); + c12_S3 = gmx_simd_mul_r(c12s_S3, c12s_j_S); #endif #endif /* LJ_COMB_GEOM */ #ifdef LJ_COMB_LB - hsig_j_S = gmx_load_pr(ljc+aj2+0); - seps_j_S = gmx_load_pr(ljc+aj2+STRIDE); + hsig_j_S = gmx_simd_load_r(ljc+aj2+0); + seps_j_S = gmx_simd_load_r(ljc+aj2+STRIDE); - sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S); - sig_S1 = gmx_add_pr(hsig_i_S1, hsig_j_S); - eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S); - eps_S1 = gmx_mul_pr(seps_i_S1, seps_j_S); + sig_S0 = gmx_simd_add_r(hsig_i_S0, hsig_j_S); + sig_S1 = gmx_simd_add_r(hsig_i_S1, hsig_j_S); + eps_S0 = gmx_simd_mul_r(seps_i_S0, seps_j_S); + eps_S1 = gmx_simd_mul_r(seps_i_S1, seps_j_S); #ifndef HALF_LJ - sig_S2 = gmx_add_pr(hsig_i_S2, hsig_j_S); - sig_S3 = gmx_add_pr(hsig_i_S3, hsig_j_S); - 
eps_S2 = gmx_mul_pr(seps_i_S2, seps_j_S); - eps_S3 = gmx_mul_pr(seps_i_S3, seps_j_S); + sig_S2 = gmx_simd_add_r(hsig_i_S2, hsig_j_S); + sig_S3 = gmx_simd_add_r(hsig_i_S3, hsig_j_S); + eps_S2 = gmx_simd_mul_r(seps_i_S2, seps_j_S); + eps_S3 = gmx_simd_mul_r(seps_i_S3, seps_j_S); #endif #endif /* LJ_COMB_LB */ #endif /* CALC_LJ */ #ifndef NBNXN_CUTOFF_USE_BLENDV - rinv_S0 = gmx_blendzero_pr(rinv_S0, wco_S0); - rinv_S1 = gmx_blendzero_pr(rinv_S1, wco_S1); - rinv_S2 = gmx_blendzero_pr(rinv_S2, wco_S2); - rinv_S3 = gmx_blendzero_pr(rinv_S3, wco_S3); + rinv_S0 = gmx_simd_blendzero_r(rinv_S0, wco_S0); + rinv_S1 = gmx_simd_blendzero_r(rinv_S1, wco_S1); + rinv_S2 = gmx_simd_blendzero_r(rinv_S2, wco_S2); + rinv_S3 = gmx_simd_blendzero_r(rinv_S3, wco_S3); #else /* We only need to mask for the cut-off: blendv is faster */ - rinv_S0 = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)); - rinv_S1 = gmx_blendv_pr(rinv_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1)); - rinv_S2 = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)); - rinv_S3 = gmx_blendv_pr(rinv_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3)); + rinv_S0 = gmx_simd_blendv_r(rinv_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0)); + rinv_S1 = gmx_simd_blendv_r(rinv_S1, zero_S, gmx_simd_sub_r(rc2_S, rsq_S1)); + rinv_S2 = gmx_simd_blendv_r(rinv_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2)); + rinv_S3 = gmx_simd_blendv_r(rinv_S3, zero_S, gmx_simd_sub_r(rc2_S, rsq_S3)); #endif - rinvsq_S0 = gmx_mul_pr(rinv_S0, rinv_S0); - rinvsq_S1 = gmx_mul_pr(rinv_S1, rinv_S1); - rinvsq_S2 = gmx_mul_pr(rinv_S2, rinv_S2); - rinvsq_S3 = gmx_mul_pr(rinv_S3, rinv_S3); + rinvsq_S0 = gmx_simd_mul_r(rinv_S0, rinv_S0); + rinvsq_S1 = gmx_simd_mul_r(rinv_S1, rinv_S1); + rinvsq_S2 = gmx_simd_mul_r(rinv_S2, rinv_S2); + rinvsq_S3 = gmx_simd_mul_r(rinv_S3, rinv_S3); #ifdef CALC_COULOMB /* Note that here we calculate force*r, not the usual force/r. 
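The LJ_COMB_GEOM and LJ_COMB_LB branches above differ only in how the pair parameters are assembled from per-atom data; the renames do not touch that logic. A scalar sketch follows (plain C, illustration only), using the conventions given by the kernel comments (hsig = sigma/2, seps = sqrt(epsilon)); treating c6s/c12s as sqrt(C6)/sqrt(C12) up to a constant factor is an assumption here, since the hunks only show the products.

/* Scalar sketch of the two LJ combination rules used above
 * (illustration only, not part of the patch). */
static void
lj_comb_lorentz_berthelot(double hsig_i, double seps_i,
                          double hsig_j, double seps_j,
                          double *sig_ij, double *eps_ij)
{
    *sig_ij = hsig_i + hsig_j;  /* (sigma_i + sigma_j)/2, since hsig = sigma/2 */
    *eps_ij = seps_i*seps_j;    /* sqrt(eps_i*eps_j), since seps = sqrt(eps)   */
}

static void
lj_comb_geometric(double c6s_i, double c12s_i,
                  double c6s_j, double c12s_j,
                  double *c6_ij, double *c12_ij)
{
    *c6_ij  = c6s_i*c6s_j;      /* pairwise product, as the kernel computes    */
    *c12_ij = c12s_i*c12s_j;    /* assumed ~ geometric mean of per-atom C12    */
}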
@@ -502,10 +502,10 @@ #ifdef EXCL_FORCES /* Only add 1/r for non-excluded atom pairs */ - rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, interact_S0); - rinv_ex_S1 = gmx_blendzero_pr(rinv_S1, interact_S1); - rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, interact_S2); - rinv_ex_S3 = gmx_blendzero_pr(rinv_S3, interact_S3); + rinv_ex_S0 = gmx_simd_blendzero_r(rinv_S0, interact_S0); + rinv_ex_S1 = gmx_simd_blendzero_r(rinv_S1, interact_S1); + rinv_ex_S2 = gmx_simd_blendzero_r(rinv_S2, interact_S2); + rinv_ex_S3 = gmx_simd_blendzero_r(rinv_S3, interact_S3); #else /* No exclusion forces, we always need 1/r */ #define rinv_ex_S0 rinv_S0 @@ -516,16 +516,16 @@ #ifdef CALC_COUL_RF /* Electrostatic interactions */ - frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0)); - frcoul_S1 = gmx_mul_pr(qq_S1, gmx_madd_pr(rsq_S1, mrc_3_S, rinv_ex_S1)); - frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2)); - frcoul_S3 = gmx_mul_pr(qq_S3, gmx_madd_pr(rsq_S3, mrc_3_S, rinv_ex_S3)); + frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(rsq_S0, mrc_3_S, rinv_ex_S0)); + frcoul_S1 = gmx_simd_mul_r(qq_S1, gmx_simd_fmadd_r(rsq_S1, mrc_3_S, rinv_ex_S1)); + frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(rsq_S2, mrc_3_S, rinv_ex_S2)); + frcoul_S3 = gmx_simd_mul_r(qq_S3, gmx_simd_fmadd_r(rsq_S3, mrc_3_S, rinv_ex_S3)); #ifdef CALC_ENERGIES - vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S))); - vcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_add_pr(gmx_mul_pr(rsq_S1, hrc_3_S), moh_rc_S))); - vcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S))); - vcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_add_pr(gmx_mul_pr(rsq_S3, hrc_3_S), moh_rc_S))); + vcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_add_r(rinv_ex_S0, gmx_simd_add_r(gmx_simd_mul_r(rsq_S0, hrc_3_S), moh_rc_S))); + vcoul_S1 = gmx_simd_mul_r(qq_S1, gmx_simd_add_r(rinv_ex_S1, gmx_simd_add_r(gmx_simd_mul_r(rsq_S1, hrc_3_S), moh_rc_S))); + vcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_add_r(rinv_ex_S2, gmx_simd_add_r(gmx_simd_mul_r(rsq_S2, hrc_3_S), moh_rc_S))); + vcoul_S3 = gmx_simd_mul_r(qq_S3, gmx_simd_add_r(rinv_ex_S3, gmx_simd_add_r(gmx_simd_mul_r(rsq_S3, hrc_3_S), moh_rc_S))); #endif #endif @@ -534,67 +534,67 @@ * as large distances can cause an overflow in gmx_pmecorrF/V. 
*/ #ifndef NBNXN_CUTOFF_USE_BLENDV - brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0)); - brsq_S1 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S1, wco_S1)); - brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2)); - brsq_S3 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S3, wco_S3)); + brsq_S0 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S0, wco_S0)); + brsq_S1 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S1, wco_S1)); + brsq_S2 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S2, wco_S2)); + brsq_S3 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S3, wco_S3)); #else /* Strangely, putting mul on a separate line is slower (icc 13) */ - brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0))); - brsq_S1 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1))); - brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2))); - brsq_S3 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3))); -#endif - ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S); - ewcorr_S1 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S1), beta_S); - ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S); - ewcorr_S3 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S3), beta_S); - frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0)); - frcoul_S1 = gmx_mul_pr(qq_S1, gmx_madd_pr(ewcorr_S1, brsq_S1, rinv_ex_S1)); - frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2)); - frcoul_S3 = gmx_mul_pr(qq_S3, gmx_madd_pr(ewcorr_S3, brsq_S3, rinv_ex_S3)); + brsq_S0 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0))); + brsq_S1 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S1, zero_S, gmx_simd_sub_r(rc2_S, rsq_S1))); + brsq_S2 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2))); + brsq_S3 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S3, zero_S, gmx_simd_sub_r(rc2_S, rsq_S3))); +#endif + ewcorr_S0 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S0), beta_S); + ewcorr_S1 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S1), beta_S); + ewcorr_S2 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S2), beta_S); + ewcorr_S3 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S3), beta_S); + frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(ewcorr_S0, brsq_S0, rinv_ex_S0)); + frcoul_S1 = gmx_simd_mul_r(qq_S1, gmx_simd_fmadd_r(ewcorr_S1, brsq_S1, rinv_ex_S1)); + frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(ewcorr_S2, brsq_S2, rinv_ex_S2)); + frcoul_S3 = gmx_simd_mul_r(qq_S3, gmx_simd_fmadd_r(ewcorr_S3, brsq_S3, rinv_ex_S3)); #ifdef CALC_ENERGIES - vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S); - vc_sub_S1 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S1), beta_S); - vc_sub_S2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S); - vc_sub_S3 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S3), beta_S); + vc_sub_S0 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S0), beta_S); + vc_sub_S1 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S1), beta_S); + vc_sub_S2 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S2), beta_S); + vc_sub_S3 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S3), beta_S); #endif #endif /* CALC_COUL_EWALD */ #ifdef CALC_COUL_TAB /* Electrostatic interactions */ - r_S0 = gmx_mul_pr(rsq_S0, rinv_S0); - r_S1 = gmx_mul_pr(rsq_S1, rinv_S1); - r_S2 = gmx_mul_pr(rsq_S2, rinv_S2); - r_S3 = gmx_mul_pr(rsq_S3, rinv_S3); + r_S0 = gmx_simd_mul_r(rsq_S0, rinv_S0); + r_S1 = gmx_simd_mul_r(rsq_S1, rinv_S1); + r_S2 = gmx_simd_mul_r(rsq_S2, rinv_S2); + r_S3 = 
gmx_simd_mul_r(rsq_S3, rinv_S3); /* Convert r to scaled table units */ - rs_S0 = gmx_mul_pr(r_S0, invtsp_S); - rs_S1 = gmx_mul_pr(r_S1, invtsp_S); - rs_S2 = gmx_mul_pr(r_S2, invtsp_S); - rs_S3 = gmx_mul_pr(r_S3, invtsp_S); + rs_S0 = gmx_simd_mul_r(r_S0, invtsp_S); + rs_S1 = gmx_simd_mul_r(r_S1, invtsp_S); + rs_S2 = gmx_simd_mul_r(r_S2, invtsp_S); + rs_S3 = gmx_simd_mul_r(r_S3, invtsp_S); /* Truncate scaled r to an int */ - ti_S0 = gmx_cvttpr_epi32(rs_S0); - ti_S1 = gmx_cvttpr_epi32(rs_S1); - ti_S2 = gmx_cvttpr_epi32(rs_S2); - ti_S3 = gmx_cvttpr_epi32(rs_S3); + ti_S0 = gmx_simd_cvtt_r2i(rs_S0); + ti_S1 = gmx_simd_cvtt_r2i(rs_S1); + ti_S2 = gmx_simd_cvtt_r2i(rs_S2); + ti_S3 = gmx_simd_cvtt_r2i(rs_S3); #ifdef GMX_SIMD_HAVE_FLOOR /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */ - rf_S0 = gmx_floor_pr(rs_S0); - rf_S1 = gmx_floor_pr(rs_S1); - rf_S2 = gmx_floor_pr(rs_S2); - rf_S3 = gmx_floor_pr(rs_S3); + rf_S0 = gmx_simd_floor_r(rs_S0); + rf_S1 = gmx_simd_floor_r(rs_S1); + rf_S2 = gmx_simd_floor_r(rs_S2); + rf_S3 = gmx_simd_floor_r(rs_S3); #else - rf_S0 = gmx_cvtepi32_pr(ti_S0); - rf_S1 = gmx_cvtepi32_pr(ti_S1); - rf_S2 = gmx_cvtepi32_pr(ti_S2); - rf_S3 = gmx_cvtepi32_pr(ti_S3); + rf_S0 = gmx_simd_cvt_i2r(ti_S0); + rf_S1 = gmx_simd_cvt_i2r(ti_S1); + rf_S2 = gmx_simd_cvt_i2r(ti_S2); + rf_S3 = gmx_simd_cvt_i2r(ti_S3); #endif - frac_S0 = gmx_sub_pr(rs_S0, rf_S0); - frac_S1 = gmx_sub_pr(rs_S1, rf_S1); - frac_S2 = gmx_sub_pr(rs_S2, rf_S2); - frac_S3 = gmx_sub_pr(rs_S3, rf_S3); + frac_S0 = gmx_simd_sub_r(rs_S0, rf_S0); + frac_S1 = gmx_simd_sub_r(rs_S1, rf_S1); + frac_S2 = gmx_simd_sub_r(rs_S2, rf_S2); + frac_S3 = gmx_simd_sub_r(rs_S3, rf_S3); /* Load and interpolate table forces and possibly energies. * Force and energy can be combined in one table, stride 4: FDV0 @@ -619,20 +619,20 @@ load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3); #endif #endif - fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0)); - fsub_S1 = gmx_add_pr(ctab0_S1, gmx_mul_pr(frac_S1, ctab1_S1)); - fsub_S2 = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2)); - fsub_S3 = gmx_add_pr(ctab0_S3, gmx_mul_pr(frac_S3, ctab1_S3)); - frcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0))); - frcoul_S1 = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, gmx_mul_pr(fsub_S1, r_S1))); - frcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2))); - frcoul_S3 = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, gmx_mul_pr(fsub_S3, r_S3))); + fsub_S0 = gmx_simd_add_r(ctab0_S0, gmx_simd_mul_r(frac_S0, ctab1_S0)); + fsub_S1 = gmx_simd_add_r(ctab0_S1, gmx_simd_mul_r(frac_S1, ctab1_S1)); + fsub_S2 = gmx_simd_add_r(ctab0_S2, gmx_simd_mul_r(frac_S2, ctab1_S2)); + fsub_S3 = gmx_simd_add_r(ctab0_S3, gmx_simd_mul_r(frac_S3, ctab1_S3)); + frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, gmx_simd_mul_r(fsub_S0, r_S0))); + frcoul_S1 = gmx_simd_mul_r(qq_S1, gmx_simd_sub_r(rinv_ex_S1, gmx_simd_mul_r(fsub_S1, r_S1))); + frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, gmx_simd_mul_r(fsub_S2, r_S2))); + frcoul_S3 = gmx_simd_mul_r(qq_S3, gmx_simd_sub_r(rinv_ex_S3, gmx_simd_mul_r(fsub_S3, r_S3))); #ifdef CALC_ENERGIES - vc_sub_S0 = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0))); - vc_sub_S1 = gmx_add_pr(ctabv_S1, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S1), gmx_add_pr(ctab0_S1, fsub_S1))); - vc_sub_S2 = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2))); - 
vc_sub_S3 = gmx_add_pr(ctabv_S3, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S3), gmx_add_pr(ctab0_S3, fsub_S3))); + vc_sub_S0 = gmx_simd_add_r(ctabv_S0, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S0), gmx_simd_add_r(ctab0_S0, fsub_S0))); + vc_sub_S1 = gmx_simd_add_r(ctabv_S1, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S1), gmx_simd_add_r(ctab0_S1, fsub_S1))); + vc_sub_S2 = gmx_simd_add_r(ctabv_S2, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S2), gmx_simd_add_r(ctab0_S2, fsub_S2))); + vc_sub_S3 = gmx_simd_add_r(ctabv_S3, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S3), gmx_simd_add_r(ctab0_S3, fsub_S3))); #endif #endif /* CALC_COUL_TAB */ @@ -640,31 +640,31 @@ #ifndef NO_SHIFT_EWALD /* Add Ewald potential shift to vc_sub for convenience */ #ifdef CHECK_EXCLS - vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0)); - vc_sub_S1 = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, interact_S1)); - vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2)); - vc_sub_S3 = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, interact_S3)); + vc_sub_S0 = gmx_simd_add_r(vc_sub_S0, gmx_simd_blendzero_r(sh_ewald_S, interact_S0)); + vc_sub_S1 = gmx_simd_add_r(vc_sub_S1, gmx_simd_blendzero_r(sh_ewald_S, interact_S1)); + vc_sub_S2 = gmx_simd_add_r(vc_sub_S2, gmx_simd_blendzero_r(sh_ewald_S, interact_S2)); + vc_sub_S3 = gmx_simd_add_r(vc_sub_S3, gmx_simd_blendzero_r(sh_ewald_S, interact_S3)); #else - vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S); - vc_sub_S1 = gmx_add_pr(vc_sub_S1, sh_ewald_S); - vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S); - vc_sub_S3 = gmx_add_pr(vc_sub_S3, sh_ewald_S); + vc_sub_S0 = gmx_simd_add_r(vc_sub_S0, sh_ewald_S); + vc_sub_S1 = gmx_simd_add_r(vc_sub_S1, sh_ewald_S); + vc_sub_S2 = gmx_simd_add_r(vc_sub_S2, sh_ewald_S); + vc_sub_S3 = gmx_simd_add_r(vc_sub_S3, sh_ewald_S); #endif #endif - vcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0)); - vcoul_S1 = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, vc_sub_S1)); - vcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2)); - vcoul_S3 = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, vc_sub_S3)); + vcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, vc_sub_S0)); + vcoul_S1 = gmx_simd_mul_r(qq_S1, gmx_simd_sub_r(rinv_ex_S1, vc_sub_S1)); + vcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, vc_sub_S2)); + vcoul_S3 = gmx_simd_mul_r(qq_S3, gmx_simd_sub_r(rinv_ex_S3, vc_sub_S3)); #endif #ifdef CALC_ENERGIES /* Mask energy for cut-off and diagonal */ - vcoul_S0 = gmx_blendzero_pr(vcoul_S0, wco_S0); - vcoul_S1 = gmx_blendzero_pr(vcoul_S1, wco_S1); - vcoul_S2 = gmx_blendzero_pr(vcoul_S2, wco_S2); - vcoul_S3 = gmx_blendzero_pr(vcoul_S3, wco_S3); + vcoul_S0 = gmx_simd_blendzero_r(vcoul_S0, wco_S0); + vcoul_S1 = gmx_simd_blendzero_r(vcoul_S1, wco_S1); + vcoul_S2 = gmx_simd_blendzero_r(vcoul_S2, wco_S2); + vcoul_S3 = gmx_simd_blendzero_r(vcoul_S3, wco_S3); #endif #endif /* CALC_COULOMB */ @@ -673,11 +673,11 @@ /* Lennard-Jones interaction */ #ifdef VDW_CUTOFF_CHECK - wco_vdw_S0 = gmx_cmplt_pr(rsq_S0, rcvdw2_S); - wco_vdw_S1 = gmx_cmplt_pr(rsq_S1, rcvdw2_S); + wco_vdw_S0 = gmx_simd_cmplt_r(rsq_S0, rcvdw2_S); + wco_vdw_S1 = gmx_simd_cmplt_r(rsq_S1, rcvdw2_S); #ifndef HALF_LJ - wco_vdw_S2 = gmx_cmplt_pr(rsq_S2, rcvdw2_S); - wco_vdw_S3 = gmx_cmplt_pr(rsq_S3, rcvdw2_S); + wco_vdw_S2 = gmx_simd_cmplt_r(rsq_S2, rcvdw2_S); + wco_vdw_S3 = gmx_simd_cmplt_r(rsq_S3, rcvdw2_S); #endif #else /* Same cut-off for Coulomb and VdW, reuse the registers */ @@ -688,114 +688,114 @@ #endif #ifndef 
LJ_COMB_LB - rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0)); - rinvsix_S1 = gmx_mul_pr(rinvsq_S1, gmx_mul_pr(rinvsq_S1, rinvsq_S1)); + rinvsix_S0 = gmx_simd_mul_r(rinvsq_S0, gmx_simd_mul_r(rinvsq_S0, rinvsq_S0)); + rinvsix_S1 = gmx_simd_mul_r(rinvsq_S1, gmx_simd_mul_r(rinvsq_S1, rinvsq_S1)); #ifdef EXCL_FORCES - rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, interact_S0); - rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, interact_S1); + rinvsix_S0 = gmx_simd_blendzero_r(rinvsix_S0, interact_S0); + rinvsix_S1 = gmx_simd_blendzero_r(rinvsix_S1, interact_S1); #endif #ifndef HALF_LJ - rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2)); - rinvsix_S3 = gmx_mul_pr(rinvsq_S3, gmx_mul_pr(rinvsq_S3, rinvsq_S3)); + rinvsix_S2 = gmx_simd_mul_r(rinvsq_S2, gmx_simd_mul_r(rinvsq_S2, rinvsq_S2)); + rinvsix_S3 = gmx_simd_mul_r(rinvsq_S3, gmx_simd_mul_r(rinvsq_S3, rinvsq_S3)); #ifdef EXCL_FORCES - rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, interact_S2); - rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, interact_S3); + rinvsix_S2 = gmx_simd_blendzero_r(rinvsix_S2, interact_S2); + rinvsix_S3 = gmx_simd_blendzero_r(rinvsix_S3, interact_S3); #endif #endif #ifdef VDW_CUTOFF_CHECK - rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0); - rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, wco_vdw_S1); + rinvsix_S0 = gmx_simd_blendzero_r(rinvsix_S0, wco_vdw_S0); + rinvsix_S1 = gmx_simd_blendzero_r(rinvsix_S1, wco_vdw_S1); #ifndef HALF_LJ - rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2); - rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, wco_vdw_S3); + rinvsix_S2 = gmx_simd_blendzero_r(rinvsix_S2, wco_vdw_S2); + rinvsix_S3 = gmx_simd_blendzero_r(rinvsix_S3, wco_vdw_S3); #endif #endif - FrLJ6_S0 = gmx_mul_pr(c6_S0, rinvsix_S0); - FrLJ6_S1 = gmx_mul_pr(c6_S1, rinvsix_S1); + FrLJ6_S0 = gmx_simd_mul_r(c6_S0, rinvsix_S0); + FrLJ6_S1 = gmx_simd_mul_r(c6_S1, rinvsix_S1); #ifndef HALF_LJ - FrLJ6_S2 = gmx_mul_pr(c6_S2, rinvsix_S2); - FrLJ6_S3 = gmx_mul_pr(c6_S3, rinvsix_S3); + FrLJ6_S2 = gmx_simd_mul_r(c6_S2, rinvsix_S2); + FrLJ6_S3 = gmx_simd_mul_r(c6_S3, rinvsix_S3); #endif - FrLJ12_S0 = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0)); - FrLJ12_S1 = gmx_mul_pr(c12_S1, gmx_mul_pr(rinvsix_S1, rinvsix_S1)); + FrLJ12_S0 = gmx_simd_mul_r(c12_S0, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0)); + FrLJ12_S1 = gmx_simd_mul_r(c12_S1, gmx_simd_mul_r(rinvsix_S1, rinvsix_S1)); #ifndef HALF_LJ - FrLJ12_S2 = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2)); - FrLJ12_S3 = gmx_mul_pr(c12_S3, gmx_mul_pr(rinvsix_S3, rinvsix_S3)); + FrLJ12_S2 = gmx_simd_mul_r(c12_S2, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2)); + FrLJ12_S3 = gmx_simd_mul_r(c12_S3, gmx_simd_mul_r(rinvsix_S3, rinvsix_S3)); #endif #endif /* not LJ_COMB_LB */ #ifdef LJ_COMB_LB - sir_S0 = gmx_mul_pr(sig_S0, rinv_S0); - sir_S1 = gmx_mul_pr(sig_S1, rinv_S1); + sir_S0 = gmx_simd_mul_r(sig_S0, rinv_S0); + sir_S1 = gmx_simd_mul_r(sig_S1, rinv_S1); #ifndef HALF_LJ - sir_S2 = gmx_mul_pr(sig_S2, rinv_S2); - sir_S3 = gmx_mul_pr(sig_S3, rinv_S3); + sir_S2 = gmx_simd_mul_r(sig_S2, rinv_S2); + sir_S3 = gmx_simd_mul_r(sig_S3, rinv_S3); #endif - sir2_S0 = gmx_mul_pr(sir_S0, sir_S0); - sir2_S1 = gmx_mul_pr(sir_S1, sir_S1); + sir2_S0 = gmx_simd_mul_r(sir_S0, sir_S0); + sir2_S1 = gmx_simd_mul_r(sir_S1, sir_S1); #ifndef HALF_LJ - sir2_S2 = gmx_mul_pr(sir_S2, sir_S2); - sir2_S3 = gmx_mul_pr(sir_S3, sir_S3); + sir2_S2 = gmx_simd_mul_r(sir_S2, sir_S2); + sir2_S3 = gmx_simd_mul_r(sir_S3, sir_S3); #endif - sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0)); - sir6_S1 = 
gmx_mul_pr(sir2_S1, gmx_mul_pr(sir2_S1, sir2_S1)); + sir6_S0 = gmx_simd_mul_r(sir2_S0, gmx_simd_mul_r(sir2_S0, sir2_S0)); + sir6_S1 = gmx_simd_mul_r(sir2_S1, gmx_simd_mul_r(sir2_S1, sir2_S1)); #ifdef EXCL_FORCES - sir6_S0 = gmx_blendzero_pr(sir6_S0, interact_S0); - sir6_S1 = gmx_blendzero_pr(sir6_S1, interact_S1); + sir6_S0 = gmx_simd_blendzero_r(sir6_S0, interact_S0); + sir6_S1 = gmx_simd_blendzero_r(sir6_S1, interact_S1); #endif #ifndef HALF_LJ - sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2)); - sir6_S3 = gmx_mul_pr(sir2_S3, gmx_mul_pr(sir2_S3, sir2_S3)); + sir6_S2 = gmx_simd_mul_r(sir2_S2, gmx_simd_mul_r(sir2_S2, sir2_S2)); + sir6_S3 = gmx_simd_mul_r(sir2_S3, gmx_simd_mul_r(sir2_S3, sir2_S3)); #ifdef EXCL_FORCES - sir6_S2 = gmx_blendzero_pr(sir6_S2, interact_S2); - sir6_S3 = gmx_blendzero_pr(sir6_S3, interact_S3); + sir6_S2 = gmx_simd_blendzero_r(sir6_S2, interact_S2); + sir6_S3 = gmx_simd_blendzero_r(sir6_S3, interact_S3); #endif #endif #ifdef VDW_CUTOFF_CHECK - sir6_S0 = gmx_blendzero_pr(sir6_S0, wco_vdw_S0); - sir6_S1 = gmx_blendzero_pr(sir6_S1, wco_vdw_S1); + sir6_S0 = gmx_simd_blendzero_r(sir6_S0, wco_vdw_S0); + sir6_S1 = gmx_simd_blendzero_r(sir6_S1, wco_vdw_S1); #ifndef HALF_LJ - sir6_S2 = gmx_blendzero_pr(sir6_S2, wco_vdw_S2); - sir6_S3 = gmx_blendzero_pr(sir6_S3, wco_vdw_S3); + sir6_S2 = gmx_simd_blendzero_r(sir6_S2, wco_vdw_S2); + sir6_S3 = gmx_simd_blendzero_r(sir6_S3, wco_vdw_S3); #endif #endif - FrLJ6_S0 = gmx_mul_pr(eps_S0, sir6_S0); - FrLJ6_S1 = gmx_mul_pr(eps_S1, sir6_S1); + FrLJ6_S0 = gmx_simd_mul_r(eps_S0, sir6_S0); + FrLJ6_S1 = gmx_simd_mul_r(eps_S1, sir6_S1); #ifndef HALF_LJ - FrLJ6_S2 = gmx_mul_pr(eps_S2, sir6_S2); - FrLJ6_S3 = gmx_mul_pr(eps_S3, sir6_S3); + FrLJ6_S2 = gmx_simd_mul_r(eps_S2, sir6_S2); + FrLJ6_S3 = gmx_simd_mul_r(eps_S3, sir6_S3); #endif - FrLJ12_S0 = gmx_mul_pr(FrLJ6_S0, sir6_S0); - FrLJ12_S1 = gmx_mul_pr(FrLJ6_S1, sir6_S1); + FrLJ12_S0 = gmx_simd_mul_r(FrLJ6_S0, sir6_S0); + FrLJ12_S1 = gmx_simd_mul_r(FrLJ6_S1, sir6_S1); #ifndef HALF_LJ - FrLJ12_S2 = gmx_mul_pr(FrLJ6_S2, sir6_S2); - FrLJ12_S3 = gmx_mul_pr(FrLJ6_S3, sir6_S3); + FrLJ12_S2 = gmx_simd_mul_r(FrLJ6_S2, sir6_S2); + FrLJ12_S3 = gmx_simd_mul_r(FrLJ6_S3, sir6_S3); #endif #if defined CALC_ENERGIES /* We need C6 and C12 to calculate the LJ potential shift */ - sig2_S0 = gmx_mul_pr(sig_S0, sig_S0); - sig2_S1 = gmx_mul_pr(sig_S1, sig_S1); + sig2_S0 = gmx_simd_mul_r(sig_S0, sig_S0); + sig2_S1 = gmx_simd_mul_r(sig_S1, sig_S1); #ifndef HALF_LJ - sig2_S2 = gmx_mul_pr(sig_S2, sig_S2); - sig2_S3 = gmx_mul_pr(sig_S3, sig_S3); + sig2_S2 = gmx_simd_mul_r(sig_S2, sig_S2); + sig2_S3 = gmx_simd_mul_r(sig_S3, sig_S3); #endif - sig6_S0 = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0)); - sig6_S1 = gmx_mul_pr(sig2_S1, gmx_mul_pr(sig2_S1, sig2_S1)); + sig6_S0 = gmx_simd_mul_r(sig2_S0, gmx_simd_mul_r(sig2_S0, sig2_S0)); + sig6_S1 = gmx_simd_mul_r(sig2_S1, gmx_simd_mul_r(sig2_S1, sig2_S1)); #ifndef HALF_LJ - sig6_S2 = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2)); - sig6_S3 = gmx_mul_pr(sig2_S3, gmx_mul_pr(sig2_S3, sig2_S3)); + sig6_S2 = gmx_simd_mul_r(sig2_S2, gmx_simd_mul_r(sig2_S2, sig2_S2)); + sig6_S3 = gmx_simd_mul_r(sig2_S3, gmx_simd_mul_r(sig2_S3, sig2_S3)); #endif - c6_S0 = gmx_mul_pr(eps_S0, sig6_S0); - c6_S1 = gmx_mul_pr(eps_S1, sig6_S1); + c6_S0 = gmx_simd_mul_r(eps_S0, sig6_S0); + c6_S1 = gmx_simd_mul_r(eps_S1, sig6_S1); #ifndef HALF_LJ - c6_S2 = gmx_mul_pr(eps_S2, sig6_S2); - c6_S3 = gmx_mul_pr(eps_S3, sig6_S3); + c6_S2 = gmx_simd_mul_r(eps_S2, sig6_S2); + c6_S3 = 
gmx_simd_mul_r(eps_S3, sig6_S3); #endif - c12_S0 = gmx_mul_pr(c6_S0, sig6_S0); - c12_S1 = gmx_mul_pr(c6_S1, sig6_S1); + c12_S0 = gmx_simd_mul_r(c6_S0, sig6_S0); + c12_S1 = gmx_simd_mul_r(c6_S1, sig6_S1); #ifndef HALF_LJ - c12_S2 = gmx_mul_pr(c6_S2, sig6_S2); - c12_S3 = gmx_mul_pr(c6_S3, sig6_S3); + c12_S2 = gmx_simd_mul_r(c6_S2, sig6_S2); + c12_S3 = gmx_simd_mul_r(c6_S3, sig6_S3); #endif #endif #endif /* LJ_COMB_LB */ @@ -831,7 +831,7 @@ #ifdef CALC_COULOMB #ifndef ENERGY_GROUPS - vctot_S = gmx_add_pr(vctot_S, gmx_sum4_pr(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3)); + vctot_S = gmx_simd_add_r(vctot_S, gmx_simd_sum4_r(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3)); #else add_ener_grp(vcoul_S0, vctp[0], egp_jj); add_ener_grp(vcoul_S1, vctp[1], egp_jj); @@ -842,50 +842,50 @@ #ifdef CALC_LJ /* Calculate the LJ energies */ - VLJ6_S0 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S))); - VLJ6_S1 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S1, gmx_mul_pr(c6_S1, sh_invrc6_S))); + VLJ6_S0 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S0, gmx_simd_mul_r(c6_S0, sh_invrc6_S))); + VLJ6_S1 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S1, gmx_simd_mul_r(c6_S1, sh_invrc6_S))); #ifndef HALF_LJ - VLJ6_S2 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S))); - VLJ6_S3 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S3, gmx_mul_pr(c6_S3, sh_invrc6_S))); + VLJ6_S2 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S2, gmx_simd_mul_r(c6_S2, sh_invrc6_S))); + VLJ6_S3 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S3, gmx_simd_mul_r(c6_S3, sh_invrc6_S))); #endif - VLJ12_S0 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S))); - VLJ12_S1 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S1, gmx_mul_pr(c12_S1, sh_invrc12_S))); + VLJ12_S0 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S0, gmx_simd_mul_r(c12_S0, sh_invrc12_S))); + VLJ12_S1 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S1, gmx_simd_mul_r(c12_S1, sh_invrc12_S))); #ifndef HALF_LJ - VLJ12_S2 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S))); - VLJ12_S3 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S3, gmx_mul_pr(c12_S3, sh_invrc12_S))); + VLJ12_S2 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S2, gmx_simd_mul_r(c12_S2, sh_invrc12_S))); + VLJ12_S3 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S3, gmx_simd_mul_r(c12_S3, sh_invrc12_S))); #endif - VLJ_S0 = gmx_sub_pr(VLJ12_S0, VLJ6_S0); - VLJ_S1 = gmx_sub_pr(VLJ12_S1, VLJ6_S1); + VLJ_S0 = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0); + VLJ_S1 = gmx_simd_sub_r(VLJ12_S1, VLJ6_S1); #ifndef HALF_LJ - VLJ_S2 = gmx_sub_pr(VLJ12_S2, VLJ6_S2); - VLJ_S3 = gmx_sub_pr(VLJ12_S3, VLJ6_S3); + VLJ_S2 = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2); + VLJ_S3 = gmx_simd_sub_r(VLJ12_S3, VLJ6_S3); #endif /* The potential shift should be removed for pairs beyond cut-off */ - VLJ_S0 = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0); - VLJ_S1 = gmx_blendzero_pr(VLJ_S1, wco_vdw_S1); + VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0); + VLJ_S1 = gmx_simd_blendzero_r(VLJ_S1, wco_vdw_S1); #ifndef HALF_LJ - VLJ_S2 = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2); - VLJ_S3 = gmx_blendzero_pr(VLJ_S3, wco_vdw_S3); + VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2); + VLJ_S3 = gmx_simd_blendzero_r(VLJ_S3, wco_vdw_S3); #endif #ifdef CHECK_EXCLS /* The potential shift should be removed for excluded pairs */ - VLJ_S0 = gmx_blendzero_pr(VLJ_S0, interact_S0); - VLJ_S1 = gmx_blendzero_pr(VLJ_S1, interact_S1); + VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, interact_S0); + VLJ_S1 = 
gmx_simd_blendzero_r(VLJ_S1, interact_S1); #ifndef HALF_LJ - VLJ_S2 = gmx_blendzero_pr(VLJ_S2, interact_S2); - VLJ_S3 = gmx_blendzero_pr(VLJ_S3, interact_S3); + VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, interact_S2); + VLJ_S3 = gmx_simd_blendzero_r(VLJ_S3, interact_S3); #endif #endif #ifndef ENERGY_GROUPS #ifndef HALF_LJ - Vvdwtot_S = gmx_add_pr(Vvdwtot_S, - gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3) - ); + Vvdwtot_S = gmx_simd_add_r(Vvdwtot_S, + gmx_simd_sum4_r(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3) + ); #else - Vvdwtot_S = gmx_add_pr(Vvdwtot_S, - gmx_add_pr(VLJ_S0, VLJ_S1) - ); + Vvdwtot_S = gmx_simd_add_r(Vvdwtot_S, + gmx_simd_add_r(VLJ_S0, VLJ_S1) + ); #endif #else add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj); @@ -900,87 +900,87 @@ #ifdef CALC_LJ #ifdef CALC_COULOMB - fscal_S0 = gmx_mul_pr(rinvsq_S0, - gmx_add_pr(frcoul_S0, - gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); + fscal_S0 = gmx_simd_mul_r(rinvsq_S0, + gmx_simd_add_r(frcoul_S0, + gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0))); #else - fscal_S0 = gmx_mul_pr(rinvsq_S0, - ( - gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); + fscal_S0 = gmx_simd_mul_r(rinvsq_S0, + ( + gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0))); #endif #ifdef CALC_COULOMB - fscal_S1 = gmx_mul_pr(rinvsq_S1, - gmx_add_pr(frcoul_S1, - gmx_sub_pr(FrLJ12_S1, FrLJ6_S1))); + fscal_S1 = gmx_simd_mul_r(rinvsq_S1, + gmx_simd_add_r(frcoul_S1, + gmx_simd_sub_r(FrLJ12_S1, FrLJ6_S1))); #else - fscal_S1 = gmx_mul_pr(rinvsq_S1, - ( - gmx_sub_pr(FrLJ12_S1, FrLJ6_S1))); + fscal_S1 = gmx_simd_mul_r(rinvsq_S1, + ( + gmx_simd_sub_r(FrLJ12_S1, FrLJ6_S1))); #endif #else - fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0); - fscal_S1 = gmx_mul_pr(rinvsq_S1, frcoul_S1); + fscal_S0 = gmx_simd_mul_r(rinvsq_S0, frcoul_S0); + fscal_S1 = gmx_simd_mul_r(rinvsq_S1, frcoul_S1); #endif /* CALC_LJ */ #if defined CALC_LJ && !defined HALF_LJ #ifdef CALC_COULOMB - fscal_S2 = gmx_mul_pr(rinvsq_S2, - gmx_add_pr(frcoul_S2, - gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); + fscal_S2 = gmx_simd_mul_r(rinvsq_S2, + gmx_simd_add_r(frcoul_S2, + gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2))); #else - fscal_S2 = gmx_mul_pr(rinvsq_S2, - ( - gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); + fscal_S2 = gmx_simd_mul_r(rinvsq_S2, + ( + gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2))); #endif #ifdef CALC_COULOMB - fscal_S3 = gmx_mul_pr(rinvsq_S3, - gmx_add_pr(frcoul_S3, - gmx_sub_pr(FrLJ12_S3, FrLJ6_S3))); + fscal_S3 = gmx_simd_mul_r(rinvsq_S3, + gmx_simd_add_r(frcoul_S3, + gmx_simd_sub_r(FrLJ12_S3, FrLJ6_S3))); #else - fscal_S3 = gmx_mul_pr(rinvsq_S3, - ( - gmx_sub_pr(FrLJ12_S3, FrLJ6_S3))); + fscal_S3 = gmx_simd_mul_r(rinvsq_S3, + ( + gmx_simd_sub_r(FrLJ12_S3, FrLJ6_S3))); #endif #else /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ - fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2); - fscal_S3 = gmx_mul_pr(rinvsq_S3, frcoul_S3); + fscal_S2 = gmx_simd_mul_r(rinvsq_S2, frcoul_S2); + fscal_S3 = gmx_simd_mul_r(rinvsq_S3, frcoul_S3); #endif /* Calculate temporary vectorial force */ - tx_S0 = gmx_mul_pr(fscal_S0, dx_S0); - tx_S1 = gmx_mul_pr(fscal_S1, dx_S1); - tx_S2 = gmx_mul_pr(fscal_S2, dx_S2); - tx_S3 = gmx_mul_pr(fscal_S3, dx_S3); - ty_S0 = gmx_mul_pr(fscal_S0, dy_S0); - ty_S1 = gmx_mul_pr(fscal_S1, dy_S1); - ty_S2 = gmx_mul_pr(fscal_S2, dy_S2); - ty_S3 = gmx_mul_pr(fscal_S3, dy_S3); - tz_S0 = gmx_mul_pr(fscal_S0, dz_S0); - tz_S1 = gmx_mul_pr(fscal_S1, dz_S1); - tz_S2 = gmx_mul_pr(fscal_S2, dz_S2); - tz_S3 = gmx_mul_pr(fscal_S3, dz_S3); + tx_S0 = gmx_simd_mul_r(fscal_S0, dx_S0); + tx_S1 = gmx_simd_mul_r(fscal_S1, dx_S1); + tx_S2 = gmx_simd_mul_r(fscal_S2, dx_S2); + tx_S3 = gmx_simd_mul_r(fscal_S3, dx_S3); + 
ty_S0 = gmx_simd_mul_r(fscal_S0, dy_S0); + ty_S1 = gmx_simd_mul_r(fscal_S1, dy_S1); + ty_S2 = gmx_simd_mul_r(fscal_S2, dy_S2); + ty_S3 = gmx_simd_mul_r(fscal_S3, dy_S3); + tz_S0 = gmx_simd_mul_r(fscal_S0, dz_S0); + tz_S1 = gmx_simd_mul_r(fscal_S1, dz_S1); + tz_S2 = gmx_simd_mul_r(fscal_S2, dz_S2); + tz_S3 = gmx_simd_mul_r(fscal_S3, dz_S3); /* Increment i atom force */ - fix_S0 = gmx_add_pr(fix_S0, tx_S0); - fix_S1 = gmx_add_pr(fix_S1, tx_S1); - fix_S2 = gmx_add_pr(fix_S2, tx_S2); - fix_S3 = gmx_add_pr(fix_S3, tx_S3); - fiy_S0 = gmx_add_pr(fiy_S0, ty_S0); - fiy_S1 = gmx_add_pr(fiy_S1, ty_S1); - fiy_S2 = gmx_add_pr(fiy_S2, ty_S2); - fiy_S3 = gmx_add_pr(fiy_S3, ty_S3); - fiz_S0 = gmx_add_pr(fiz_S0, tz_S0); - fiz_S1 = gmx_add_pr(fiz_S1, tz_S1); - fiz_S2 = gmx_add_pr(fiz_S2, tz_S2); - fiz_S3 = gmx_add_pr(fiz_S3, tz_S3); + fix_S0 = gmx_simd_add_r(fix_S0, tx_S0); + fix_S1 = gmx_simd_add_r(fix_S1, tx_S1); + fix_S2 = gmx_simd_add_r(fix_S2, tx_S2); + fix_S3 = gmx_simd_add_r(fix_S3, tx_S3); + fiy_S0 = gmx_simd_add_r(fiy_S0, ty_S0); + fiy_S1 = gmx_simd_add_r(fiy_S1, ty_S1); + fiy_S2 = gmx_simd_add_r(fiy_S2, ty_S2); + fiy_S3 = gmx_simd_add_r(fiy_S3, ty_S3); + fiz_S0 = gmx_simd_add_r(fiz_S0, tz_S0); + fiz_S1 = gmx_simd_add_r(fiz_S1, tz_S1); + fiz_S2 = gmx_simd_add_r(fiz_S2, tz_S2); + fiz_S3 = gmx_simd_add_r(fiz_S3, tz_S3); /* Decrement j atom force */ - gmx_store_pr(f+ajx, - gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_S0, tx_S1, tx_S2, tx_S3) )); - gmx_store_pr(f+ajy, - gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_S0, ty_S1, ty_S2, ty_S3) )); - gmx_store_pr(f+ajz, - gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_S0, tz_S1, tz_S2, tz_S3) )); + gmx_simd_store_r(f+ajx, + gmx_simd_sub_r( gmx_simd_load_r(f+ajx), gmx_simd_sum4_r(tx_S0, tx_S1, tx_S2, tx_S3) )); + gmx_simd_store_r(f+ajy, + gmx_simd_sub_r( gmx_simd_load_r(f+ajy), gmx_simd_sum4_r(ty_S0, ty_S1, ty_S2, ty_S3) )); + gmx_simd_store_r(f+ajz, + gmx_simd_sub_r( gmx_simd_load_r(f+ajz), gmx_simd_sum4_r(tz_S0, tz_S1, tz_S2, tz_S3) )); } #undef rinv_ex_S0 diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h index 3ccc2daef1..2879213662 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h @@ -59,119 +59,121 @@ real *vctp[UNROLLI]; #endif - gmx_mm_pr shX_S; - gmx_mm_pr shY_S; - gmx_mm_pr shZ_S; - gmx_mm_pr ix_S0, iy_S0, iz_S0; - gmx_mm_pr ix_S1, iy_S1, iz_S1; - gmx_mm_pr ix_S2, iy_S2, iz_S2; - gmx_mm_pr ix_S3, iy_S3, iz_S3; - gmx_mm_pr fix_S0, fiy_S0, fiz_S0; - gmx_mm_pr fix_S1, fiy_S1, fiz_S1; - gmx_mm_pr fix_S2, fiy_S2, fiz_S2; - gmx_mm_pr fix_S3, fiy_S3, fiz_S3; + gmx_simd_real_t shX_S; + gmx_simd_real_t shY_S; + gmx_simd_real_t shZ_S; + gmx_simd_real_t ix_S0, iy_S0, iz_S0; + gmx_simd_real_t ix_S1, iy_S1, iz_S1; + gmx_simd_real_t ix_S2, iy_S2, iz_S2; + gmx_simd_real_t ix_S3, iy_S3, iz_S3; + gmx_simd_real_t fix_S0, fiy_S0, fiz_S0; + gmx_simd_real_t fix_S1, fiy_S1, fiz_S1; + gmx_simd_real_t fix_S2, fiy_S2, fiz_S2; + gmx_simd_real_t fix_S3, fiy_S3, fiz_S3; #if UNROLLJ >= 4 /* We use an i-force SIMD register width of 4 */ gmx_mm_pr4 fix_S, fiy_S, fiz_S; #else /* We use an i-force SIMD register width of 2 */ - gmx_mm_pr fix0_S, fiy0_S, fiz0_S; - gmx_mm_pr fix2_S, fiy2_S, fiz2_S; + gmx_simd_real_t fix0_S, fiy0_S, fiz0_S; + gmx_simd_real_t fix2_S, fiy2_S, fiz2_S; #endif - gmx_mm_pr diagonal_jmi_S; + gmx_simd_real_t diagonal_jmi_S; #if UNROLLI == 
UNROLLJ - gmx_mm_pb diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3; + gmx_simd_bool_t diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3; #else - gmx_mm_pb diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3; - gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3; + gmx_simd_bool_t diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3; + gmx_simd_bool_t diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3; #endif - unsigned *exclusion_filter; - gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3; + unsigned *exclusion_filter; + gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3; - gmx_mm_pr zero_S = gmx_set1_pr(0.0); + gmx_simd_real_t zero_S = gmx_simd_set1_r(0.0); - gmx_mm_pr one_S = gmx_set1_pr(1.0); - gmx_mm_pr iq_S0 = gmx_setzero_pr(); - gmx_mm_pr iq_S1 = gmx_setzero_pr(); - gmx_mm_pr iq_S2 = gmx_setzero_pr(); - gmx_mm_pr iq_S3 = gmx_setzero_pr(); - gmx_mm_pr mrc_3_S; + gmx_simd_real_t one_S = gmx_simd_set1_r(1.0); + gmx_simd_real_t iq_S0 = gmx_simd_setzero_r(); + gmx_simd_real_t iq_S1 = gmx_simd_setzero_r(); + gmx_simd_real_t iq_S2 = gmx_simd_setzero_r(); + gmx_simd_real_t iq_S3 = gmx_simd_setzero_r(); + gmx_simd_real_t mrc_3_S; #ifdef CALC_ENERGIES - gmx_mm_pr hrc_3_S, moh_rc_S; + gmx_simd_real_t hrc_3_S, moh_rc_S; #endif #ifdef CALC_COUL_TAB /* Coulomb table variables */ - gmx_mm_pr invtsp_S; - const real *tab_coul_F; + gmx_simd_real_t invtsp_S; + const real *tab_coul_F; #ifndef TAB_FDV0 - const real *tab_coul_V; + const real *tab_coul_V; #endif /* Thread-local working buffers for force and potential lookups */ - int ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0 = NULL; - int ti1_array[2*GMX_SIMD_WIDTH_HERE], *ti1 = NULL; - int ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2 = NULL; - int ti3_array[2*GMX_SIMD_WIDTH_HERE], *ti3 = NULL; + int ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL; + int ti1_array[2*GMX_SIMD_REAL_WIDTH], *ti1 = NULL; + int ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2 = NULL; + int ti3_array[2*GMX_SIMD_REAL_WIDTH], *ti3 = NULL; #ifdef CALC_ENERGIES - gmx_mm_pr mhalfsp_S; + gmx_simd_real_t mhalfsp_S; #endif #endif #ifdef CALC_COUL_EWALD - gmx_mm_pr beta2_S, beta_S; + gmx_simd_real_t beta2_S, beta_S; #endif #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB) - gmx_mm_pr sh_ewald_S; + gmx_simd_real_t sh_ewald_S; #endif #ifdef LJ_COMB_LB - const real *ljc; + const real *ljc; - gmx_mm_pr hsig_i_S0, seps_i_S0; - gmx_mm_pr hsig_i_S1, seps_i_S1; - gmx_mm_pr hsig_i_S2, seps_i_S2; - gmx_mm_pr hsig_i_S3, seps_i_S3; + gmx_simd_real_t hsig_i_S0, seps_i_S0; + gmx_simd_real_t hsig_i_S1, seps_i_S1; + gmx_simd_real_t hsig_i_S2, seps_i_S2; + gmx_simd_real_t hsig_i_S3, seps_i_S3; #else #ifdef FIX_LJ_C - real pvdw_array[2*UNROLLI*UNROLLJ+3]; - real *pvdw_c6, *pvdw_c12; - gmx_mm_pr c6_S0, c12_S0; - gmx_mm_pr c6_S1, c12_S1; - gmx_mm_pr c6_S2, c12_S2; - gmx_mm_pr c6_S3, c12_S3; + real pvdw_array[2*UNROLLI*UNROLLJ+3]; + real *pvdw_c6, *pvdw_c12; + gmx_simd_real_t c6_S0, c12_S0; + gmx_simd_real_t c6_S1, c12_S1; + gmx_simd_real_t c6_S2, c12_S2; + gmx_simd_real_t c6_S3, c12_S3; #endif #ifdef LJ_COMB_GEOM - const real *ljc; + const real *ljc; - gmx_mm_pr c6s_S0, c12s_S0; - gmx_mm_pr c6s_S1, c12s_S1; - gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr(); - gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr(); + gmx_simd_real_t c6s_S0, c12s_S0; + gmx_simd_real_t c6s_S1, c12s_S1; + gmx_simd_real_t c6s_S2 = 
gmx_simd_setzero_r(); + gmx_simd_real_t c12s_S2 = gmx_simd_setzero_r(); + gmx_simd_real_t c6s_S3 = gmx_simd_setzero_r(); + gmx_simd_real_t c12s_S3 = gmx_simd_setzero_r(); #endif #endif /* LJ_COMB_LB */ - gmx_mm_pr vctot_S, Vvdwtot_S; - gmx_mm_pr sixth_S, twelveth_S; + gmx_simd_real_t vctot_S, Vvdwtot_S; + gmx_simd_real_t sixth_S, twelveth_S; - gmx_mm_pr avoid_sing_S; - gmx_mm_pr rc2_S; + gmx_simd_real_t avoid_sing_S; + gmx_simd_real_t rc2_S; #ifdef VDW_CUTOFF_CHECK - gmx_mm_pr rcvdw2_S; + gmx_simd_real_t rcvdw2_S; #endif #ifdef CALC_ENERGIES - gmx_mm_pr sh_invrc6_S, sh_invrc12_S; + gmx_simd_real_t sh_invrc6_S, sh_invrc12_S; /* cppcheck-suppress unassignedVariable */ - real tmpsum_array[GMX_SIMD_WIDTH_HERE*2], *tmpsum; + real tmpsum_array[GMX_SIMD_REAL_WIDTH*2], *tmpsum; #endif #ifdef CALC_SHIFTFORCES /* cppcheck-suppress unassignedVariable */ - real shf_array[GMX_SIMD_WIDTH_HERE*2], *shf; + real shf_array[GMX_SIMD_REAL_WIDTH*2], *shf; #endif int ninner; @@ -188,39 +190,39 @@ #endif /* Load j-i for the first i */ - diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i); + diagonal_jmi_S = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i); /* Generate all the diagonal masks as comparison results */ #if UNROLLI == UNROLLJ - diagonal_mask_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); + diagonal_mask_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); #else #if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ - diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); + diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask0_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask0_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); #if UNROLLI == 2*UNROLLJ /* Load j-i for the second half of the j-cluster */ - diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ); + diagonal_jmi_S = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ); #endif - diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - 
diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); - diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S); - diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S); + diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask1_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); + diagonal_jmi_S = gmx_simd_sub_r(diagonal_jmi_S, one_S); + diagonal_mask1_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S); #endif #endif @@ -251,9 +253,9 @@ ti2 = prepare_table_load_buffer(ti2_array); ti3 = prepare_table_load_buffer(ti3_array); - invtsp_S = gmx_set1_pr(ic->tabq_scale); + invtsp_S = gmx_simd_set1_r(ic->tabq_scale); #ifdef CALC_ENERGIES - mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale); + mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale); #endif #ifdef TAB_FDV0 @@ -265,12 +267,12 @@ #endif /* CALC_COUL_TAB */ #ifdef CALC_COUL_EWALD - beta2_S = gmx_set1_pr(ic->ewaldcoeff_q*ic->ewaldcoeff_q); - beta_S = gmx_set1_pr(ic->ewaldcoeff_q); + beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q); + beta_S = gmx_simd_set1_r(ic->ewaldcoeff_q); #endif #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES - sh_ewald_S = gmx_set1_pr(ic->sh_ewald); + sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald); #endif q = nbat->q; @@ -279,39 +281,39 @@ shiftvec = shift_vec[0]; x = nbat->x; - avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC); + avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC); /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */ - rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb); + rc2_S = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb); #ifdef VDW_CUTOFF_CHECK - rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw); + rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw); #endif #ifdef CALC_ENERGIES - sixth_S = gmx_set1_pr(1.0/6.0); - twelveth_S = gmx_set1_pr(1.0/12.0); + sixth_S = gmx_simd_set1_r(1.0/6.0); + twelveth_S = gmx_simd_set1_r(1.0/12.0); - sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6); - sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6); + sh_invrc6_S = gmx_simd_set1_r(ic->sh_invrc6); + sh_invrc12_S = gmx_simd_set1_r(ic->sh_invrc6*ic->sh_invrc6); #endif - mrc_3_S = gmx_set1_pr(-2*ic->k_rf); + mrc_3_S = gmx_simd_set1_r(-2*ic->k_rf); #ifdef CALC_ENERGIES - hrc_3_S = gmx_set1_pr(ic->k_rf); + hrc_3_S = gmx_simd_set1_r(ic->k_rf); - moh_rc_S = gmx_set1_pr(-ic->c_rf); + moh_rc_S = gmx_simd_set1_r(-ic->c_rf); #endif #ifdef CALC_ENERGIES - tmpsum = gmx_simd_align_real(tmpsum_array); + tmpsum = gmx_simd_align_r(tmpsum_array); #endif #ifdef CALC_SHIFTFORCES - shf = gmx_simd_align_real(shf_array); + shf = gmx_simd_align_r(shf_array); #endif #ifdef FIX_LJ_C - pvdw_c6 = gmx_simd_align_real(pvdw_array+3); + pvdw_c6 = gmx_simd_align_r(pvdw_array+3); pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ; for (jp = 0; jp < UNROLLJ; jp++) @@ -326,15 +328,15 @@ pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1]; pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1]; } - c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ); - c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ); - c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ); - c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ); - - c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ); - c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ); - c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ); - c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ); + c6_S0 = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ); + c6_S1 = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ); + c6_S2 = gmx_simd_load_r(pvdw_c6 
+2*UNROLLJ); + c6_S3 = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ); + + c12_S0 = gmx_simd_load_r(pvdw_c12+0*UNROLLJ); + c12_S1 = gmx_simd_load_r(pvdw_c12+1*UNROLLJ); + c12_S2 = gmx_simd_load_r(pvdw_c12+2*UNROLLJ); + c12_S3 = gmx_simd_load_r(pvdw_c12+3*UNROLLJ); #endif /* FIX_LJ_C */ #ifdef ENERGY_GROUPS @@ -361,9 +363,9 @@ ci = nbln->ci; ci_sh = (ish == CENTRAL ? ci : -1); - shX_S = gmx_load1_pr(shiftvec+ish3); - shY_S = gmx_load1_pr(shiftvec+ish3+1); - shZ_S = gmx_load1_pr(shiftvec+ish3+2); + shX_S = gmx_simd_load1_r(shiftvec+ish3); + shY_S = gmx_simd_load1_r(shiftvec+ish3+1); + shZ_S = gmx_simd_load1_r(shiftvec+ish3+2); #if UNROLLJ <= 4 sci = ci*STRIDE; @@ -446,51 +448,51 @@ /* Load i atom data */ sciy = scix + STRIDE; sciz = sciy + STRIDE; - ix_S0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_S); - ix_S1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S); - ix_S2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S); - ix_S3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S); - iy_S0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S); - iy_S1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S); - iy_S2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S); - iy_S3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S); - iz_S0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S); - iz_S1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S); - iz_S2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S); - iz_S3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S); + ix_S0 = gmx_simd_add_r(gmx_simd_load1_r(x+scix), shX_S); + ix_S1 = gmx_simd_add_r(gmx_simd_load1_r(x+scix+1), shX_S); + ix_S2 = gmx_simd_add_r(gmx_simd_load1_r(x+scix+2), shX_S); + ix_S3 = gmx_simd_add_r(gmx_simd_load1_r(x+scix+3), shX_S); + iy_S0 = gmx_simd_add_r(gmx_simd_load1_r(x+sciy), shY_S); + iy_S1 = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+1), shY_S); + iy_S2 = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+2), shY_S); + iy_S3 = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+3), shY_S); + iz_S0 = gmx_simd_add_r(gmx_simd_load1_r(x+sciz), shZ_S); + iz_S1 = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+1), shZ_S); + iz_S2 = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+2), shZ_S); + iz_S3 = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+3), shZ_S); if (do_coul) { - iq_S0 = gmx_set1_pr(facel*q[sci]); - iq_S1 = gmx_set1_pr(facel*q[sci+1]); - iq_S2 = gmx_set1_pr(facel*q[sci+2]); - iq_S3 = gmx_set1_pr(facel*q[sci+3]); + iq_S0 = gmx_simd_set1_r(facel*q[sci]); + iq_S1 = gmx_simd_set1_r(facel*q[sci+1]); + iq_S2 = gmx_simd_set1_r(facel*q[sci+2]); + iq_S3 = gmx_simd_set1_r(facel*q[sci+3]); } #ifdef LJ_COMB_LB - hsig_i_S0 = gmx_load1_pr(ljc+sci2+0); - hsig_i_S1 = gmx_load1_pr(ljc+sci2+1); - hsig_i_S2 = gmx_load1_pr(ljc+sci2+2); - hsig_i_S3 = gmx_load1_pr(ljc+sci2+3); - seps_i_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0); - seps_i_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1); - seps_i_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2); - seps_i_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3); + hsig_i_S0 = gmx_simd_load1_r(ljc+sci2+0); + hsig_i_S1 = gmx_simd_load1_r(ljc+sci2+1); + hsig_i_S2 = gmx_simd_load1_r(ljc+sci2+2); + hsig_i_S3 = gmx_simd_load1_r(ljc+sci2+3); + seps_i_S0 = gmx_simd_load1_r(ljc+sci2+STRIDE+0); + seps_i_S1 = gmx_simd_load1_r(ljc+sci2+STRIDE+1); + seps_i_S2 = gmx_simd_load1_r(ljc+sci2+STRIDE+2); + seps_i_S3 = gmx_simd_load1_r(ljc+sci2+STRIDE+3); #else #ifdef LJ_COMB_GEOM - c6s_S0 = gmx_load1_pr(ljc+sci2+0); - c6s_S1 = gmx_load1_pr(ljc+sci2+1); + c6s_S0 = gmx_simd_load1_r(ljc+sci2+0); + c6s_S1 = gmx_simd_load1_r(ljc+sci2+1); if (!half_LJ) { - c6s_S2 = gmx_load1_pr(ljc+sci2+2); - c6s_S3 = gmx_load1_pr(ljc+sci2+3); + c6s_S2 = gmx_simd_load1_r(ljc+sci2+2); + c6s_S3 = gmx_simd_load1_r(ljc+sci2+3); } - c12s_S0 
= gmx_load1_pr(ljc+sci2+STRIDE+0); - c12s_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1); + c12s_S0 = gmx_simd_load1_r(ljc+sci2+STRIDE+0); + c12s_S1 = gmx_simd_load1_r(ljc+sci2+STRIDE+1); if (!half_LJ) { - c12s_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2); - c12s_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3); + c12s_S2 = gmx_simd_load1_r(ljc+sci2+STRIDE+2); + c12s_S3 = gmx_simd_load1_r(ljc+sci2+STRIDE+3); } #else nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride; @@ -504,22 +506,22 @@ #endif /* Zero the potential energy for this list */ - Vvdwtot_S = gmx_setzero_pr(); - vctot_S = gmx_setzero_pr(); + Vvdwtot_S = gmx_simd_setzero_r(); + vctot_S = gmx_simd_setzero_r(); /* Clear i atom forces */ - fix_S0 = gmx_setzero_pr(); - fix_S1 = gmx_setzero_pr(); - fix_S2 = gmx_setzero_pr(); - fix_S3 = gmx_setzero_pr(); - fiy_S0 = gmx_setzero_pr(); - fiy_S1 = gmx_setzero_pr(); - fiy_S2 = gmx_setzero_pr(); - fiy_S3 = gmx_setzero_pr(); - fiz_S0 = gmx_setzero_pr(); - fiz_S1 = gmx_setzero_pr(); - fiz_S2 = gmx_setzero_pr(); - fiz_S3 = gmx_setzero_pr(); + fix_S0 = gmx_simd_setzero_r(); + fix_S1 = gmx_simd_setzero_r(); + fix_S2 = gmx_simd_setzero_r(); + fix_S3 = gmx_simd_setzero_r(); + fiy_S0 = gmx_simd_setzero_r(); + fiy_S1 = gmx_simd_setzero_r(); + fiy_S2 = gmx_simd_setzero_r(); + fiy_S3 = gmx_simd_setzero_r(); + fiz_S0 = gmx_simd_setzero_r(); + fiz_S1 = gmx_simd_setzero_r(); + fiz_S2 = gmx_simd_setzero_r(); + fiz_S3 = gmx_simd_setzero_r(); cjind = cjind0; @@ -594,24 +596,24 @@ #endif #else fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1); - gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix))); + gmx_simd_store_r(f+scix, gmx_simd_add_r(fix0_S, gmx_simd_load_r(f+scix))); fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3); - gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2))); + gmx_simd_store_r(f+scix+2, gmx_simd_add_r(fix2_S, gmx_simd_load_r(f+scix+2))); fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1); - gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy))); + gmx_simd_store_r(f+sciy, gmx_simd_add_r(fiy0_S, gmx_simd_load_r(f+sciy))); fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3); - gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2))); + gmx_simd_store_r(f+sciy+2, gmx_simd_add_r(fiy2_S, gmx_simd_load_r(f+sciy+2))); fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1); - gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz))); + gmx_simd_store_r(f+sciz, gmx_simd_add_r(fiz0_S, gmx_simd_load_r(f+sciz))); fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3); - gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2))); + gmx_simd_store_r(f+sciz+2, gmx_simd_add_r(fiz2_S, gmx_simd_load_r(f+sciz+2))); #ifdef CALC_SHIFTFORCES - fshift[ish3+0] += gmx_sum_simd2(gmx_add_pr(fix0_S, fix2_S), shf); - fshift[ish3+1] += gmx_sum_simd2(gmx_add_pr(fiy0_S, fiy2_S), shf); - fshift[ish3+2] += gmx_sum_simd2(gmx_add_pr(fiz0_S, fiz2_S), shf); + fshift[ish3+0] += gmx_sum_simd2(gmx_simd_add_r(fix0_S, fix2_S), shf); + fshift[ish3+1] += gmx_sum_simd2(gmx_simd_add_r(fiy0_S, fiy2_S), shf); + fshift[ish3+2] += gmx_sum_simd2(gmx_simd_add_r(fiz0_S, fiz2_S), shf); #endif #endif diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c index 1813a9c43c..457db91393 100644 --- a/src/gromacs/mdlib/nbnxn_search.c +++ b/src/gromacs/mdlib/nbnxn_search.c @@ -105,17 +105,17 @@ #define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8) /* The j-cluster size is matched to the SIMD width */ -#if GMX_SIMD_WIDTH_HERE == 2 +#if GMX_SIMD_REAL_WIDTH == 2 #define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci) #define 
X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci) #define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj) #else -#if GMX_SIMD_WIDTH_HERE == 4 +#if GMX_SIMD_REAL_WIDTH == 4 #define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci) #define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci) #define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj) #else -#if GMX_SIMD_WIDTH_HERE == 8 +#if GMX_SIMD_REAL_WIDTH == 8 #define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci) #define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci) #define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj) @@ -124,7 +124,7 @@ #define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci) #define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj) #else -#if GMX_SIMD_WIDTH_HERE == 16 +#if GMX_SIMD_REAL_WIDTH == 16 #define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci) #define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci) #define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj) @@ -258,7 +258,7 @@ int nbnxn_kernel_to_cj_size(int nb_kernel_type) int cj_size = 0; #ifdef GMX_NBNXN_SIMD - nbnxn_simd_width = GMX_SIMD_WIDTH_HERE; + nbnxn_simd_width = GMX_SIMD_REAL_WIDTH; #endif switch (nb_kernel_type) @@ -808,20 +808,20 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x, * so we don't need to treat special cases in the rest of the code. */ #ifdef NBNXN_SEARCH_BB_SIMD4 - gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0])); - gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0])); + gmx_simd4_store_r(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0])); + gmx_simd4_store_r(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0])); #else bbj[1] = bbj[0]; #endif } #ifdef NBNXN_SEARCH_BB_SIMD4 - gmx_simd4_store_pr(&bb->lower[0], - gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bbj[0].lower[0]), - gmx_simd4_load_bb_pr(&bbj[1].lower[0]))); - gmx_simd4_store_pr(&bb->upper[0], - gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bbj[0].upper[0]), - gmx_simd4_load_bb_pr(&bbj[1].upper[0]))); + gmx_simd4_store_r(&bb->lower[0], + gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bbj[0].lower[0]), + gmx_simd4_load_bb_pr(&bbj[1].lower[0]))); + gmx_simd4_store_r(&bb->upper[0], + gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bbj[0].upper[0]), + gmx_simd4_load_bb_pr(&bbj[1].upper[0]))); #else { int i; @@ -877,10 +877,10 @@ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb) /* Coordinate order xyz?, bb order xyz0 */ static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb) { - gmx_simd4_pr bb_0_S, bb_1_S; - gmx_simd4_pr x_S; + gmx_simd4_real_t bb_0_S, bb_1_S; + gmx_simd4_real_t x_S; - int i; + int i; bb_0_S = gmx_simd4_load_bb_pr(x); bb_1_S = bb_0_S; @@ -888,12 +888,12 @@ static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb) for (i = 1; i < na; i++) { x_S = gmx_simd4_load_bb_pr(x+i*NNBSBB_C); - bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S); - bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S); + bb_0_S = gmx_simd4_min_r(bb_0_S, x_S); + bb_1_S = gmx_simd4_max_r(bb_1_S, x_S); } - gmx_simd4_store_pr(&bb->lower[0], bb_0_S); - gmx_simd4_store_pr(&bb->upper[0], bb_1_S); + gmx_simd4_store_r(&bb->lower[0], bb_0_S); + gmx_simd4_store_r(&bb->upper[0], bb_1_S); } /* Coordinate order xyz?, bb order xxxxyyyyzzzz */ @@ -928,14 +928,14 @@ static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb) for (c2 = sc2; c2 < sc2+nc2; c2++) { #ifdef NBNXN_SEARCH_BB_SIMD4 - gmx_simd4_pr min_S, max_S; - - min_S = gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]), - gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0])); - max_S = gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]), - 
gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0])); - gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S); - gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S); + gmx_simd4_real_t min_S, max_S; + + min_S = gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]), + gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0])); + max_S = gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]), + gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0])); + gmx_simd4_store_r(&grid->bbj[c2].lower[0], min_S); + gmx_simd4_store_r(&grid->bbj[c2].upper[0], max_S); #else for (j = 0; j < NNBSBB_C; j++) { @@ -1156,7 +1156,7 @@ void fill_cell(const nbnxn_search_t nbs, offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log; bb_ptr = grid->bb + offset; -#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2 +#if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2 if (2*grid->na_cj == grid->na_c) { calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr, @@ -2075,40 +2075,40 @@ static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci, static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci, int csj, const nbnxn_bb_t *bb_j_all) { - gmx_simd4_pr bb_i_S0, bb_i_S1; - gmx_simd4_pr bb_j_S0, bb_j_S1; - gmx_simd4_pr dl_S; - gmx_simd4_pr dh_S; - gmx_simd4_pr dm_S; - gmx_simd4_pr dm0_S; + gmx_simd4_real_t bb_i_S0, bb_i_S1; + gmx_simd4_real_t bb_j_S0, bb_j_S1; + gmx_simd4_real_t dl_S; + gmx_simd4_real_t dh_S; + gmx_simd4_real_t dm_S; + gmx_simd4_real_t dm0_S; bb_i_S0 = gmx_simd4_load_bb_pr(&bb_i_ci[si].lower[0]); bb_i_S1 = gmx_simd4_load_bb_pr(&bb_i_ci[si].upper[0]); bb_j_S0 = gmx_simd4_load_bb_pr(&bb_j_all[csj].lower[0]); bb_j_S1 = gmx_simd4_load_bb_pr(&bb_j_all[csj].upper[0]); - dl_S = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1); - dh_S = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1); + dl_S = gmx_simd4_sub_r(bb_i_S0, bb_j_S1); + dh_S = gmx_simd4_sub_r(bb_j_S0, bb_i_S1); - dm_S = gmx_simd4_max_pr(dl_S, dh_S); - dm0_S = gmx_simd4_max_pr(dm_S, gmx_simd4_setzero_pr()); + dm_S = gmx_simd4_max_r(dl_S, dh_S); + dm0_S = gmx_simd4_max_r(dm_S, gmx_simd4_setzero_r()); - return gmx_simd4_dotproduct3(dm0_S, dm0_S); + return gmx_simd4_dotproduct3_r(dm0_S, dm0_S); } /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */ #define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \ { \ - int shi; \ + int shi; \ \ - gmx_simd4_pr dx_0, dy_0, dz_0; \ - gmx_simd4_pr dx_1, dy_1, dz_1; \ + gmx_simd4_real_t dx_0, dy_0, dz_0; \ + gmx_simd4_real_t dx_1, dy_1, dz_1; \ \ - gmx_simd4_pr mx, my, mz; \ - gmx_simd4_pr m0x, m0y, m0z; \ + gmx_simd4_real_t mx, my, mz; \ + gmx_simd4_real_t m0x, m0y, m0z; \ \ - gmx_simd4_pr d2x, d2y, d2z; \ - gmx_simd4_pr d2s, d2t; \ + gmx_simd4_real_t d2x, d2y, d2z; \ + gmx_simd4_real_t d2s, d2t; \ \ shi = si*NNBSBB_D*DIM; \ \ @@ -2119,30 +2119,30 @@ static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci, yi_h = gmx_simd4_load_bb_pr(bb_i+shi+4*STRIDE_PBB); \ zi_h = gmx_simd4_load_bb_pr(bb_i+shi+5*STRIDE_PBB); \ \ - dx_0 = gmx_simd4_sub_pr(xi_l, xj_h); \ - dy_0 = gmx_simd4_sub_pr(yi_l, yj_h); \ - dz_0 = gmx_simd4_sub_pr(zi_l, zj_h); \ + dx_0 = gmx_simd4_sub_r(xi_l, xj_h); \ + dy_0 = gmx_simd4_sub_r(yi_l, yj_h); \ + dz_0 = gmx_simd4_sub_r(zi_l, zj_h); \ \ - dx_1 = gmx_simd4_sub_pr(xj_l, xi_h); \ - dy_1 = gmx_simd4_sub_pr(yj_l, yi_h); \ - dz_1 = gmx_simd4_sub_pr(zj_l, zi_h); \ + dx_1 = gmx_simd4_sub_r(xj_l, xi_h); \ + dy_1 = gmx_simd4_sub_r(yj_l, yi_h); \ + dz_1 = gmx_simd4_sub_r(zj_l, zi_h); \ \ - mx = gmx_simd4_max_pr(dx_0, dx_1); \ - my = gmx_simd4_max_pr(dy_0, dy_1); \ - mz = gmx_simd4_max_pr(dz_0, dz_1); \ + mx 
= gmx_simd4_max_r(dx_0, dx_1); \ + my = gmx_simd4_max_r(dy_0, dy_1); \ + mz = gmx_simd4_max_r(dz_0, dz_1); \ \ - m0x = gmx_simd4_max_pr(mx, zero); \ - m0y = gmx_simd4_max_pr(my, zero); \ - m0z = gmx_simd4_max_pr(mz, zero); \ + m0x = gmx_simd4_max_r(mx, zero); \ + m0y = gmx_simd4_max_r(my, zero); \ + m0z = gmx_simd4_max_r(mz, zero); \ \ - d2x = gmx_simd4_mul_pr(m0x, m0x); \ - d2y = gmx_simd4_mul_pr(m0y, m0y); \ - d2z = gmx_simd4_mul_pr(m0z, m0z); \ + d2x = gmx_simd4_mul_r(m0x, m0x); \ + d2y = gmx_simd4_mul_r(m0y, m0y); \ + d2z = gmx_simd4_mul_r(m0z, m0z); \ \ - d2s = gmx_simd4_add_pr(d2x, d2y); \ - d2t = gmx_simd4_add_pr(d2s, d2z); \ + d2s = gmx_simd4_add_r(d2x, d2y); \ + d2t = gmx_simd4_add_r(d2s, d2z); \ \ - gmx_simd4_store_pr(d2+si, d2t); \ + gmx_simd4_store_r(d2+si, d2t); \ } /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */ @@ -2150,21 +2150,21 @@ static void subc_bb_dist2_simd4_xxxx(const float *bb_j, int nsi, const float *bb_i, float *d2) { - gmx_simd4_pr xj_l, yj_l, zj_l; - gmx_simd4_pr xj_h, yj_h, zj_h; - gmx_simd4_pr xi_l, yi_l, zi_l; - gmx_simd4_pr xi_h, yi_h, zi_h; + gmx_simd4_real_t xj_l, yj_l, zj_l; + gmx_simd4_real_t xj_h, yj_h, zj_h; + gmx_simd4_real_t xi_l, yi_l, zi_l; + gmx_simd4_real_t xi_h, yi_h, zi_h; - gmx_simd4_pr zero; + gmx_simd4_real_t zero; - zero = gmx_simd4_setzero_pr(); + zero = gmx_simd4_setzero_r(); - xj_l = gmx_simd4_set1_pr(bb_j[0*STRIDE_PBB]); - yj_l = gmx_simd4_set1_pr(bb_j[1*STRIDE_PBB]); - zj_l = gmx_simd4_set1_pr(bb_j[2*STRIDE_PBB]); - xj_h = gmx_simd4_set1_pr(bb_j[3*STRIDE_PBB]); - yj_h = gmx_simd4_set1_pr(bb_j[4*STRIDE_PBB]); - zj_h = gmx_simd4_set1_pr(bb_j[5*STRIDE_PBB]); + xj_l = gmx_simd4_set1_r(bb_j[0*STRIDE_PBB]); + yj_l = gmx_simd4_set1_r(bb_j[1*STRIDE_PBB]); + zj_l = gmx_simd4_set1_r(bb_j[2*STRIDE_PBB]); + xj_h = gmx_simd4_set1_r(bb_j[3*STRIDE_PBB]); + yj_h = gmx_simd4_set1_r(bb_j[4*STRIDE_PBB]); + zj_h = gmx_simd4_set1_r(bb_j[5*STRIDE_PBB]); /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB. * But as we know the number of iterations is 1 or 2, we unroll manually. @@ -2214,10 +2214,10 @@ static gmx_bool subc_in_range_x(int na_c, /* When we make seperate single/double precision SIMD vector operation * include files, this function should be moved there (also using FMA). 
*/ -static inline gmx_simd4_pr -gmx_simd4_calc_rsq_pr(gmx_simd4_pr x, gmx_simd4_pr y, gmx_simd4_pr z) +static inline gmx_simd4_real_t +gmx_simd4_calc_rsq_r(gmx_simd4_real_t x, gmx_simd4_real_t y, gmx_simd4_real_t z) { - return gmx_simd4_add_pr( gmx_simd4_add_pr( gmx_simd4_mul_pr(x, x), gmx_simd4_mul_pr(y, y) ), gmx_simd4_mul_pr(z, z) ); + return gmx_simd4_add_r( gmx_simd4_add_r( gmx_simd4_mul_r(x, x), gmx_simd4_mul_r(y, y) ), gmx_simd4_mul_r(z, z) ); } /* 4-wide SIMD function which determines if any atom pair between two cells, @@ -2229,15 +2229,15 @@ static gmx_bool subc_in_range_simd4(int na_c, int csj, int stride, const real *x_j, real rl2) { - gmx_simd4_pr ix_S0, iy_S0, iz_S0; - gmx_simd4_pr ix_S1, iy_S1, iz_S1; + gmx_simd4_real_t ix_S0, iy_S0, iz_S0; + gmx_simd4_real_t ix_S1, iy_S1, iz_S1; - gmx_simd4_pr rc2_S; + gmx_simd4_real_t rc2_S; - int dim_stride; - int j0, j1; + int dim_stride; + int j0, j1; - rc2_S = gmx_simd4_set1_pr(rl2); + rc2_S = gmx_simd4_set1_r(rl2); dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM; ix_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+0)*STRIDE_PBB); @@ -2254,63 +2254,63 @@ static gmx_bool subc_in_range_simd4(int na_c, j1 = j0 + na_c - 1; while (j0 < j1) { - gmx_simd4_pr jx0_S, jy0_S, jz0_S; - gmx_simd4_pr jx1_S, jy1_S, jz1_S; + gmx_simd4_real_t jx0_S, jy0_S, jz0_S; + gmx_simd4_real_t jx1_S, jy1_S, jz1_S; - gmx_simd4_pr dx_S0, dy_S0, dz_S0; - gmx_simd4_pr dx_S1, dy_S1, dz_S1; - gmx_simd4_pr dx_S2, dy_S2, dz_S2; - gmx_simd4_pr dx_S3, dy_S3, dz_S3; + gmx_simd4_real_t dx_S0, dy_S0, dz_S0; + gmx_simd4_real_t dx_S1, dy_S1, dz_S1; + gmx_simd4_real_t dx_S2, dy_S2, dz_S2; + gmx_simd4_real_t dx_S3, dy_S3, dz_S3; - gmx_simd4_pr rsq_S0; - gmx_simd4_pr rsq_S1; - gmx_simd4_pr rsq_S2; - gmx_simd4_pr rsq_S3; + gmx_simd4_real_t rsq_S0; + gmx_simd4_real_t rsq_S1; + gmx_simd4_real_t rsq_S2; + gmx_simd4_real_t rsq_S3; - gmx_simd4_pb wco_S0; - gmx_simd4_pb wco_S1; - gmx_simd4_pb wco_S2; - gmx_simd4_pb wco_S3; - gmx_simd4_pb wco_any_S01, wco_any_S23, wco_any_S; + gmx_simd4_bool_t wco_S0; + gmx_simd4_bool_t wco_S1; + gmx_simd4_bool_t wco_S2; + gmx_simd4_bool_t wco_S3; + gmx_simd4_bool_t wco_any_S01, wco_any_S23, wco_any_S; - jx0_S = gmx_simd4_set1_pr(x_j[j0*stride+0]); - jy0_S = gmx_simd4_set1_pr(x_j[j0*stride+1]); - jz0_S = gmx_simd4_set1_pr(x_j[j0*stride+2]); + jx0_S = gmx_simd4_set1_r(x_j[j0*stride+0]); + jy0_S = gmx_simd4_set1_r(x_j[j0*stride+1]); + jz0_S = gmx_simd4_set1_r(x_j[j0*stride+2]); - jx1_S = gmx_simd4_set1_pr(x_j[j1*stride+0]); - jy1_S = gmx_simd4_set1_pr(x_j[j1*stride+1]); - jz1_S = gmx_simd4_set1_pr(x_j[j1*stride+2]); + jx1_S = gmx_simd4_set1_r(x_j[j1*stride+0]); + jy1_S = gmx_simd4_set1_r(x_j[j1*stride+1]); + jz1_S = gmx_simd4_set1_r(x_j[j1*stride+2]); /* Calculate distance */ - dx_S0 = gmx_simd4_sub_pr(ix_S0, jx0_S); - dy_S0 = gmx_simd4_sub_pr(iy_S0, jy0_S); - dz_S0 = gmx_simd4_sub_pr(iz_S0, jz0_S); - dx_S1 = gmx_simd4_sub_pr(ix_S1, jx0_S); - dy_S1 = gmx_simd4_sub_pr(iy_S1, jy0_S); - dz_S1 = gmx_simd4_sub_pr(iz_S1, jz0_S); - dx_S2 = gmx_simd4_sub_pr(ix_S0, jx1_S); - dy_S2 = gmx_simd4_sub_pr(iy_S0, jy1_S); - dz_S2 = gmx_simd4_sub_pr(iz_S0, jz1_S); - dx_S3 = gmx_simd4_sub_pr(ix_S1, jx1_S); - dy_S3 = gmx_simd4_sub_pr(iy_S1, jy1_S); - dz_S3 = gmx_simd4_sub_pr(iz_S1, jz1_S); + dx_S0 = gmx_simd4_sub_r(ix_S0, jx0_S); + dy_S0 = gmx_simd4_sub_r(iy_S0, jy0_S); + dz_S0 = gmx_simd4_sub_r(iz_S0, jz0_S); + dx_S1 = gmx_simd4_sub_r(ix_S1, jx0_S); + dy_S1 = gmx_simd4_sub_r(iy_S1, jy0_S); + dz_S1 = gmx_simd4_sub_r(iz_S1, jz0_S); + dx_S2 = gmx_simd4_sub_r(ix_S0, jx1_S); + dy_S2 
= gmx_simd4_sub_r(iy_S0, jy1_S); + dz_S2 = gmx_simd4_sub_r(iz_S0, jz1_S); + dx_S3 = gmx_simd4_sub_r(ix_S1, jx1_S); + dy_S3 = gmx_simd4_sub_r(iy_S1, jy1_S); + dz_S3 = gmx_simd4_sub_r(iz_S1, jz1_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_S0 = gmx_simd4_calc_rsq_pr(dx_S0, dy_S0, dz_S0); - rsq_S1 = gmx_simd4_calc_rsq_pr(dx_S1, dy_S1, dz_S1); - rsq_S2 = gmx_simd4_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - rsq_S3 = gmx_simd4_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + rsq_S0 = gmx_simd4_calc_rsq_r(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_simd4_calc_rsq_r(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_simd4_calc_rsq_r(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_simd4_calc_rsq_r(dx_S3, dy_S3, dz_S3); - wco_S0 = gmx_simd4_cmplt_pr(rsq_S0, rc2_S); - wco_S1 = gmx_simd4_cmplt_pr(rsq_S1, rc2_S); - wco_S2 = gmx_simd4_cmplt_pr(rsq_S2, rc2_S); - wco_S3 = gmx_simd4_cmplt_pr(rsq_S3, rc2_S); + wco_S0 = gmx_simd4_cmplt_r(rsq_S0, rc2_S); + wco_S1 = gmx_simd4_cmplt_r(rsq_S1, rc2_S); + wco_S2 = gmx_simd4_cmplt_r(rsq_S2, rc2_S); + wco_S3 = gmx_simd4_cmplt_r(rsq_S3, rc2_S); - wco_any_S01 = gmx_simd4_or_pb(wco_S0, wco_S1); - wco_any_S23 = gmx_simd4_or_pb(wco_S2, wco_S3); - wco_any_S = gmx_simd4_or_pb(wco_any_S01, wco_any_S23); + wco_any_S01 = gmx_simd4_or_b(wco_S0, wco_S1); + wco_any_S23 = gmx_simd4_or_b(wco_S2, wco_S3); + wco_any_S = gmx_simd4_or_b(wco_any_S01, wco_any_S23); - if (gmx_simd4_anytrue_pb(wco_any_S)) + if (gmx_simd4_anytrue_b(wco_any_S)) { return TRUE; } @@ -2738,17 +2738,17 @@ static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj) } #ifdef GMX_NBNXN_SIMD -#if GMX_SIMD_WIDTH_HERE == 2 +#if GMX_SIMD_REAL_WIDTH == 2 #define get_imask_simd_4xn get_imask_simd_j2 #endif -#if GMX_SIMD_WIDTH_HERE == 4 +#if GMX_SIMD_REAL_WIDTH == 4 #define get_imask_simd_4xn get_imask_simd_j4 #endif -#if GMX_SIMD_WIDTH_HERE == 8 +#if GMX_SIMD_REAL_WIDTH == 8 #define get_imask_simd_4xn get_imask_simd_j8 #define get_imask_simd_2xnn get_imask_simd_j4 #endif -#if GMX_SIMD_WIDTH_HERE == 16 +#if GMX_SIMD_REAL_WIDTH == 16 #define get_imask_simd_2xnn get_imask_simd_j8 #endif #endif @@ -3191,7 +3191,7 @@ static void set_ci_top_excls(const nbnxn_search_t nbs, /* The next code line is usually not needed. We do not want to version * away the above line, because there is logic that relies on being * able to detect easily whether any exclusions exist. */ -#if (defined GMX_CPU_ACCELERATION_IBM_QPX) +#if (defined GMX_SIMD_IBM_QPX) nbl->cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e); #endif } diff --git a/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h b/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h index b383daf0b7..59760f4b94 100644 --- a/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h +++ b/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
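For reference, the renamed 4-wide (gmx_simd4_*) helpers used in nbnxn_search.c above compose exactly as the old gmx_simd4_*_pr calls did. A minimal sketch of the bounding-box clearance pattern with the new names follows; every gmx_simd4_* call appears in the diff above, while the wrapper name bb_dist2_sketch and its pointer arguments are illustrative only.

    /* Sketch (illustration, not part of the patch): squared clearance
     * between two cluster bounding boxes, written with the renamed
     * 4-wide SIMD API. */
    static float bb_dist2_sketch(const nbnxn_bb_t *bb_i, const nbnxn_bb_t *bb_j)
    {
        gmx_simd4_real_t low_i, high_i, low_j, high_j;
        gmx_simd4_real_t dl, dh, dm, dm0;

        low_i  = gmx_simd4_load_bb_pr(&bb_i->lower[0]);
        high_i = gmx_simd4_load_bb_pr(&bb_i->upper[0]);
        low_j  = gmx_simd4_load_bb_pr(&bb_j->lower[0]);
        high_j = gmx_simd4_load_bb_pr(&bb_j->upper[0]);

        /* Per-dimension gap; positive only where the boxes do not overlap */
        dl  = gmx_simd4_sub_r(low_i, high_j);
        dh  = gmx_simd4_sub_r(low_j, high_i);
        dm  = gmx_simd4_max_r(dl, dh);
        dm0 = gmx_simd4_max_r(dm, gmx_simd4_setzero_r());

        /* Sum of squares of the x, y, z components */
        return gmx_simd4_dotproduct3_r(dm0, dm0);
    }

This is the same computation as subc_bb_dist2_simd4 above, shown standalone to make the old-to-new name mapping explicit.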
@@ -37,16 +37,16 @@ #include "nbnxn_kernels/nbnxn_kernel_simd_utils.h" -#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE -#define STRIDE_S (GMX_SIMD_WIDTH_HERE/2) +#if GMX_SIMD_REAL_WIDTH >= 2*NBNXN_CPU_CLUSTER_I_SIZE +#define STRIDE_S (GMX_SIMD_REAL_WIDTH/2) #else #define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE #endif -static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a) +static gmx_inline gmx_simd_real_t gmx_load_hpr_hilo_pr(const real *a) { - gmx_mm_hpr a_S; - gmx_mm_pr a_a_S; + gmx_mm_hpr a_S; + gmx_simd_real_t a_a_S; gmx_load_hpr(&a_S, a); @@ -55,10 +55,10 @@ static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a) return a_a_S; } -static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a, real shift) +static gmx_inline gmx_simd_real_t gmx_set_2real_shift_pr(const real *a, real shift) { - gmx_mm_hpr a0_S, a1_S; - gmx_mm_pr a0_a1_S; + gmx_mm_hpr a0_S, a1_S; + gmx_simd_real_t a0_a1_S; gmx_set1_hpr(&a0_S, a[0] + shift); gmx_set1_hpr(&a1_S, a[1] + shift); @@ -105,26 +105,26 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, real rl2, float rbb2, int *ndistc) { - const nbnxn_x_ci_simd_2xnn_t *work; - const nbnxn_bb_t *bb_ci; + const nbnxn_x_ci_simd_2xnn_t *work; + const nbnxn_bb_t *bb_ci; - gmx_mm_pr jx_S, jy_S, jz_S; + gmx_simd_real_t jx_S, jy_S, jz_S; - gmx_mm_pr dx_S0, dy_S0, dz_S0; - gmx_mm_pr dx_S2, dy_S2, dz_S2; + gmx_simd_real_t dx_S0, dy_S0, dz_S0; + gmx_simd_real_t dx_S2, dy_S2, dz_S2; - gmx_mm_pr rsq_S0; - gmx_mm_pr rsq_S2; + gmx_simd_real_t rsq_S0; + gmx_simd_real_t rsq_S2; - gmx_mm_pb wco_S0; - gmx_mm_pb wco_S2; - gmx_mm_pb wco_any_S; + gmx_simd_bool_t wco_S0; + gmx_simd_bool_t wco_S2; + gmx_simd_bool_t wco_any_S; - gmx_mm_pr rc2_S; + gmx_simd_real_t rc2_S; - gmx_bool InRange; - float d2; - int xind_f, xind_l, cj; + gmx_bool InRange; + float d2; + int xind_f, xind_l, cj; cjf = CI_TO_CJ_SIMD_2XNN(cjf); cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1; @@ -133,7 +133,7 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, bb_ci = nbl->work->bb_ci; - rc2_S = gmx_set1_pr(rl2); + rc2_S = gmx_simd_set1_r(rl2); InRange = FALSE; while (!InRange && cjf <= cjl) @@ -163,25 +163,25 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, jz_S = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S); /* Calculate distance */ - dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); - dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); - dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); - dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); - dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); - dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); + dx_S0 = gmx_simd_sub_r(work->ix_S0, jx_S); + dy_S0 = gmx_simd_sub_r(work->iy_S0, jy_S); + dz_S0 = gmx_simd_sub_r(work->iz_S0, jz_S); + dx_S2 = gmx_simd_sub_r(work->ix_S2, jx_S); + dy_S2 = gmx_simd_sub_r(work->iy_S2, jy_S); + dz_S2 = gmx_simd_sub_r(work->iz_S2, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); - rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0); + rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2); - wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); - wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); + wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S); + wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S); - wco_any_S = gmx_or_pb(wco_S0, wco_S2); + wco_any_S = gmx_simd_or_b(wco_S0, wco_S2); - InRange = gmx_anytrue_pb(wco_any_S); + InRange = gmx_simd_anytrue_b(wco_any_S); - *ndistc += 2*GMX_SIMD_WIDTH_HERE; + *ndistc += 2*GMX_SIMD_REAL_WIDTH; } if (!InRange) { @@ -221,25 +221,25 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, jz_S = 
gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S); /* Calculate distance */ - dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); - dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); - dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); - dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); - dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); - dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); + dx_S0 = gmx_simd_sub_r(work->ix_S0, jx_S); + dy_S0 = gmx_simd_sub_r(work->iy_S0, jy_S); + dz_S0 = gmx_simd_sub_r(work->iz_S0, jz_S); + dx_S2 = gmx_simd_sub_r(work->ix_S2, jx_S); + dy_S2 = gmx_simd_sub_r(work->iy_S2, jy_S); + dz_S2 = gmx_simd_sub_r(work->iz_S2, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); - rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0); + rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2); - wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); - wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); + wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S); + wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S); - wco_any_S = gmx_or_pb(wco_S0, wco_S2); + wco_any_S = gmx_simd_or_b(wco_S0, wco_S2); - InRange = gmx_anytrue_pb(wco_any_S); + InRange = gmx_simd_anytrue_b(wco_any_S); - *ndistc += 2*GMX_SIMD_WIDTH_HERE; + *ndistc += 2*GMX_SIMD_REAL_WIDTH; } if (!InRange) { diff --git a/src/gromacs/mdlib/nbnxn_search_simd_4xn.h b/src/gromacs/mdlib/nbnxn_search_simd_4xn.h index 12dd77fdb4..4931a1a4eb 100644 --- a/src/gromacs/mdlib/nbnxn_search_simd_4xn.h +++ b/src/gromacs/mdlib/nbnxn_search_simd_4xn.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
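The full-width pair-search distance checks follow the same renaming pattern. As a hedged sketch, the in-range test for one i atom against a loaded j cluster looks as below; all gmx_simd_* calls and the STRIDE_S layout are the ones shown in this patch, whereas the wrapper name any_pair_within_sketch is hypothetical.

    /* Sketch: is any j atom of the cluster at x_j within sqrt(rl2) of the
     * i atom whose broadcast coordinates are (ix, iy, iz)?  Uses only the
     * renamed full-width SIMD calls from this patch. */
    static gmx_bool any_pair_within_sketch(gmx_simd_real_t ix,
                                           gmx_simd_real_t iy,
                                           gmx_simd_real_t iz,
                                           const real *x_j, real rl2)
    {
        gmx_simd_real_t jx, jy, jz, dx, dy, dz, rsq, rc2;
        gmx_simd_bool_t wco;

        rc2 = gmx_simd_set1_r(rl2);

        /* Load GMX_SIMD_REAL_WIDTH j coordinates per dimension */
        jx = gmx_simd_load_r(x_j + 0*STRIDE_S);
        jy = gmx_simd_load_r(x_j + 1*STRIDE_S);
        jz = gmx_simd_load_r(x_j + 2*STRIDE_S);

        dx = gmx_simd_sub_r(ix, jx);
        dy = gmx_simd_sub_r(iy, jy);
        dz = gmx_simd_sub_r(iz, jz);

        /* rsq = dx*dx + dy*dy + dz*dz */
        rsq = gmx_simd_calc_rsq_r(dx, dy, dz);
        wco = gmx_simd_cmplt_r(rsq, rc2);

        return gmx_simd_anytrue_b(wco);
    }

The 4xn kernel below performs this test four times per j cluster (once per i atom) and ORs the comparison results with gmx_simd_or_b before the gmx_simd_anytrue_b reduction.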
@@ -34,8 +34,8 @@ */ -#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE -#define STRIDE_S (GMX_SIMD_WIDTH_HERE) +#if GMX_SIMD_REAL_WIDTH >= NBNXN_CPU_CLUSTER_I_SIZE +#define STRIDE_S (GMX_SIMD_REAL_WIDTH) #else #define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE #endif @@ -55,18 +55,18 @@ icell_set_x_simd_4xn(int ci, ia = X_IND_CI_SIMD_4XN(ci); - x_ci->ix_S0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); - x_ci->iy_S0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); - x_ci->iz_S0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz); - x_ci->ix_S1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx); - x_ci->iy_S1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy); - x_ci->iz_S1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz); - x_ci->ix_S2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx); - x_ci->iy_S2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy); - x_ci->iz_S2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz); - x_ci->ix_S3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx); - x_ci->iy_S3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy); - x_ci->iz_S3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); + x_ci->ix_S0 = gmx_simd_set1_r(x[ia + 0*STRIDE_S ] + shx); + x_ci->iy_S0 = gmx_simd_set1_r(x[ia + 1*STRIDE_S ] + shy); + x_ci->iz_S0 = gmx_simd_set1_r(x[ia + 2*STRIDE_S ] + shz); + x_ci->ix_S1 = gmx_simd_set1_r(x[ia + 0*STRIDE_S + 1] + shx); + x_ci->iy_S1 = gmx_simd_set1_r(x[ia + 1*STRIDE_S + 1] + shy); + x_ci->iz_S1 = gmx_simd_set1_r(x[ia + 2*STRIDE_S + 1] + shz); + x_ci->ix_S2 = gmx_simd_set1_r(x[ia + 0*STRIDE_S + 2] + shx); + x_ci->iy_S2 = gmx_simd_set1_r(x[ia + 1*STRIDE_S + 2] + shy); + x_ci->iz_S2 = gmx_simd_set1_r(x[ia + 2*STRIDE_S + 2] + shz); + x_ci->ix_S3 = gmx_simd_set1_r(x[ia + 0*STRIDE_S + 3] + shx); + x_ci->iy_S3 = gmx_simd_set1_r(x[ia + 1*STRIDE_S + 3] + shy); + x_ci->iz_S3 = gmx_simd_set1_r(x[ia + 2*STRIDE_S + 3] + shz); } /* SIMD code for making a pair list of cell ci vs cell cjf-cjl @@ -83,32 +83,32 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, real rl2, float rbb2, int *ndistc) { - const nbnxn_x_ci_simd_4xn_t *work; - const nbnxn_bb_t *bb_ci; + const nbnxn_x_ci_simd_4xn_t *work; + const nbnxn_bb_t *bb_ci; - gmx_mm_pr jx_S, jy_S, jz_S; + gmx_simd_real_t jx_S, jy_S, jz_S; - gmx_mm_pr dx_S0, dy_S0, dz_S0; - gmx_mm_pr dx_S1, dy_S1, dz_S1; - gmx_mm_pr dx_S2, dy_S2, dz_S2; - gmx_mm_pr dx_S3, dy_S3, dz_S3; + gmx_simd_real_t dx_S0, dy_S0, dz_S0; + gmx_simd_real_t dx_S1, dy_S1, dz_S1; + gmx_simd_real_t dx_S2, dy_S2, dz_S2; + gmx_simd_real_t dx_S3, dy_S3, dz_S3; - gmx_mm_pr rsq_S0; - gmx_mm_pr rsq_S1; - gmx_mm_pr rsq_S2; - gmx_mm_pr rsq_S3; + gmx_simd_real_t rsq_S0; + gmx_simd_real_t rsq_S1; + gmx_simd_real_t rsq_S2; + gmx_simd_real_t rsq_S3; - gmx_mm_pb wco_S0; - gmx_mm_pb wco_S1; - gmx_mm_pb wco_S2; - gmx_mm_pb wco_S3; - gmx_mm_pb wco_any_S01, wco_any_S23, wco_any_S; + gmx_simd_bool_t wco_S0; + gmx_simd_bool_t wco_S1; + gmx_simd_bool_t wco_S2; + gmx_simd_bool_t wco_S3; + gmx_simd_bool_t wco_any_S01, wco_any_S23, wco_any_S; - gmx_mm_pr rc2_S; + gmx_simd_real_t rc2_S; - gmx_bool InRange; - float d2; - int xind_f, xind_l, cj; + gmx_bool InRange; + float d2; + int xind_f, xind_l, cj; cjf = CI_TO_CJ_SIMD_4XN(cjf); cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1; @@ -117,7 +117,7 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, bb_ci = nbl->work->bb_ci; - rc2_S = gmx_set1_pr(rl2); + rc2_S = gmx_simd_set1_r(rl2); InRange = FALSE; while (!InRange && cjf <= cjl) @@ -142,43 +142,43 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, { xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf); - jx_S = gmx_load_pr(x_j+xind_f+0*STRIDE_S); - jy_S = 
gmx_load_pr(x_j+xind_f+1*STRIDE_S); - jz_S = gmx_load_pr(x_j+xind_f+2*STRIDE_S); + jx_S = gmx_simd_load_r(x_j+xind_f+0*STRIDE_S); + jy_S = gmx_simd_load_r(x_j+xind_f+1*STRIDE_S); + jz_S = gmx_simd_load_r(x_j+xind_f+2*STRIDE_S); /* Calculate distance */ - dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); - dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); - dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); - dx_S1 = gmx_sub_pr(work->ix_S1, jx_S); - dy_S1 = gmx_sub_pr(work->iy_S1, jy_S); - dz_S1 = gmx_sub_pr(work->iz_S1, jz_S); - dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); - dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); - dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); - dx_S3 = gmx_sub_pr(work->ix_S3, jx_S); - dy_S3 = gmx_sub_pr(work->iy_S3, jy_S); - dz_S3 = gmx_sub_pr(work->iz_S3, jz_S); + dx_S0 = gmx_simd_sub_r(work->ix_S0, jx_S); + dy_S0 = gmx_simd_sub_r(work->iy_S0, jy_S); + dz_S0 = gmx_simd_sub_r(work->iz_S0, jz_S); + dx_S1 = gmx_simd_sub_r(work->ix_S1, jx_S); + dy_S1 = gmx_simd_sub_r(work->iy_S1, jy_S); + dz_S1 = gmx_simd_sub_r(work->iz_S1, jz_S); + dx_S2 = gmx_simd_sub_r(work->ix_S2, jx_S); + dy_S2 = gmx_simd_sub_r(work->iy_S2, jy_S); + dz_S2 = gmx_simd_sub_r(work->iz_S2, jz_S); + dx_S3 = gmx_simd_sub_r(work->ix_S3, jx_S); + dy_S3 = gmx_simd_sub_r(work->iy_S3, jy_S); + dz_S3 = gmx_simd_sub_r(work->iz_S3, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); - rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1); - rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_simd_calc_rsq_r(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_simd_calc_rsq_r(dx_S3, dy_S3, dz_S3); - wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); - wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S); - wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); - wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S); + wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S); + wco_S1 = gmx_simd_cmplt_r(rsq_S1, rc2_S); + wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S); + wco_S3 = gmx_simd_cmplt_r(rsq_S3, rc2_S); - wco_any_S01 = gmx_or_pb(wco_S0, wco_S1); - wco_any_S23 = gmx_or_pb(wco_S2, wco_S3); - wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23); + wco_any_S01 = gmx_simd_or_b(wco_S0, wco_S1); + wco_any_S23 = gmx_simd_or_b(wco_S2, wco_S3); + wco_any_S = gmx_simd_or_b(wco_any_S01, wco_any_S23); - InRange = gmx_anytrue_pb(wco_any_S); + InRange = gmx_simd_anytrue_b(wco_any_S); - *ndistc += 4*GMX_SIMD_WIDTH_HERE; + *ndistc += 4*GMX_SIMD_REAL_WIDTH; } if (!InRange) { @@ -213,42 +213,42 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, { xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl); - jx_S = gmx_load_pr(x_j+xind_l+0*STRIDE_S); - jy_S = gmx_load_pr(x_j+xind_l+1*STRIDE_S); - jz_S = gmx_load_pr(x_j+xind_l+2*STRIDE_S); + jx_S = gmx_simd_load_r(x_j+xind_l+0*STRIDE_S); + jy_S = gmx_simd_load_r(x_j+xind_l+1*STRIDE_S); + jz_S = gmx_simd_load_r(x_j+xind_l+2*STRIDE_S); /* Calculate distance */ - dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); - dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); - dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); - dx_S1 = gmx_sub_pr(work->ix_S1, jx_S); - dy_S1 = gmx_sub_pr(work->iy_S1, jy_S); - dz_S1 = gmx_sub_pr(work->iz_S1, jz_S); - dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); - dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); - dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); - dx_S3 = gmx_sub_pr(work->ix_S3, jx_S); - dy_S3 = gmx_sub_pr(work->iy_S3, jy_S); - dz_S3 = gmx_sub_pr(work->iz_S3, jz_S); + dx_S0 = gmx_simd_sub_r(work->ix_S0, jx_S); + dy_S0 = gmx_simd_sub_r(work->iy_S0, jy_S); + 
dz_S0 = gmx_simd_sub_r(work->iz_S0, jz_S); + dx_S1 = gmx_simd_sub_r(work->ix_S1, jx_S); + dy_S1 = gmx_simd_sub_r(work->iy_S1, jy_S); + dz_S1 = gmx_simd_sub_r(work->iz_S1, jz_S); + dx_S2 = gmx_simd_sub_r(work->ix_S2, jx_S); + dy_S2 = gmx_simd_sub_r(work->iy_S2, jy_S); + dz_S2 = gmx_simd_sub_r(work->iz_S2, jz_S); + dx_S3 = gmx_simd_sub_r(work->ix_S3, jx_S); + dy_S3 = gmx_simd_sub_r(work->iy_S3, jy_S); + dz_S3 = gmx_simd_sub_r(work->iz_S3, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); - rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1); - rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_simd_calc_rsq_r(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_simd_calc_rsq_r(dx_S3, dy_S3, dz_S3); - wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); - wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S); - wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); - wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S); + wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S); + wco_S1 = gmx_simd_cmplt_r(rsq_S1, rc2_S); + wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S); + wco_S3 = gmx_simd_cmplt_r(rsq_S3, rc2_S); - wco_any_S01 = gmx_or_pb(wco_S0, wco_S1); - wco_any_S23 = gmx_or_pb(wco_S2, wco_S3); - wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23); + wco_any_S01 = gmx_simd_or_b(wco_S0, wco_S1); + wco_any_S23 = gmx_simd_or_b(wco_S2, wco_S3); + wco_any_S = gmx_simd_or_b(wco_any_S01, wco_any_S23); - InRange = gmx_anytrue_pb(wco_any_S); + InRange = gmx_simd_anytrue_b(wco_any_S); - *ndistc += 4*GMX_SIMD_WIDTH_HERE; + *ndistc += 4*GMX_SIMD_REAL_WIDTH; } if (!InRange) { @@ -263,7 +263,7 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, /* Store cj and the interaction mask */ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj; nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj); -#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#ifdef GMX_SIMD_IBM_QPX nbl->cj[nbl->ncj].interaction_mask_indices[0] = (nbl->cj[nbl->ncj].excl & 0x000F) >> (0 * 4); nbl->cj[nbl->ncj].interaction_mask_indices[1] = (nbl->cj[nbl->ncj].excl & 0x00F0) >> (1 * 4); nbl->cj[nbl->ncj].interaction_mask_indices[2] = (nbl->cj[nbl->ncj].excl & 0x0F00) >> (2 * 4); diff --git a/src/gromacs/mdlib/pme.c b/src/gromacs/mdlib/pme.c index ff52b5b9fe..34ebe16bbe 100644 --- a/src/gromacs/mdlib/pme.c +++ b/src/gromacs/mdlib/pme.c @@ -260,9 +260,9 @@ typedef struct { typedef struct { #ifdef PME_SIMD4_SPREAD_GATHER /* Masks for 4-wide SIMD aligned spreading and gathering */ - gmx_simd4_pb mask_S0[6], mask_S1[6]; + gmx_simd4_bool_t mask_S0[6], mask_S1[6]; #else - int dummy; /* C89 requires that struct has at least one member */ + int dummy; /* C89 requires that struct has at least one member */ #endif } pme_spline_work_t; @@ -1877,7 +1877,7 @@ static void realloc_work(pme_work_t *work, int nkx) * elements at the end for padding. 
*/ #ifdef PME_SIMD_SOLVE - simd_width = GMX_SIMD_WIDTH_HERE; + simd_width = GMX_SIMD_REAL_WIDTH; #else /* We can use any alignment, apart from 0, so we use 4 */ simd_width = 4; @@ -1914,24 +1914,24 @@ static void free_work(pme_work_t *work) inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned) { { - const gmx_mm_pr two = gmx_set1_pr(2.0); - gmx_mm_pr f_simd; - gmx_mm_pr lu; - gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e; + const gmx_simd_real_t two = gmx_simd_set1_r(2.0); + gmx_simd_real_t f_simd; + gmx_simd_real_t lu; + gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e; int kx; - f_simd = gmx_set1_pr(f); + f_simd = gmx_simd_set1_r(f); /* We only need to calculate from start. But since start is 0 or 1 * and we want to use aligned loads/stores, we always start from 0. */ - for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE) + for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH) { - tmp_d1 = gmx_load_pr(d_aligned+kx); - d_inv = gmx_inv_pr(tmp_d1); - tmp_r = gmx_load_pr(r_aligned+kx); - tmp_r = gmx_exp_pr(tmp_r); - tmp_e = gmx_mul_pr(f_simd, d_inv); - tmp_e = gmx_mul_pr(tmp_e, tmp_r); - gmx_store_pr(e_aligned+kx, tmp_e); + tmp_d1 = gmx_simd_load_r(d_aligned+kx); + d_inv = gmx_simd_inv_r(tmp_d1); + tmp_r = gmx_simd_load_r(r_aligned+kx); + tmp_r = gmx_simd_exp_r(tmp_r); + tmp_e = gmx_simd_mul_r(f_simd, d_inv); + tmp_e = gmx_simd_mul_r(tmp_e, tmp_r); + gmx_simd_store_r(e_aligned+kx, tmp_e); } } } @@ -1958,23 +1958,23 @@ inline static void calc_exponentials_q(int start, int end, real f, real *d, real /* Calculate exponentials through SIMD */ inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned) { - gmx_mm_pr tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk; - const gmx_mm_pr sqr_PI = gmx_sqrt_pr(gmx_set1_pr(M_PI)); + gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk; + const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI)); int kx; - for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE) + for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH) { /* We only need to calculate from start. But since start is 0 or 1 * and we want to use aligned loads/stores, we always start from 0. 
*/ - tmp_d = gmx_load_pr(d_aligned+kx); - d_inv = gmx_inv_pr(tmp_d); - gmx_store_pr(d_aligned+kx, d_inv); - tmp_r = gmx_load_pr(r_aligned+kx); - tmp_r = gmx_exp_pr(tmp_r); - gmx_store_pr(r_aligned+kx, tmp_r); - tmp_mk = gmx_load_pr(factor_aligned+kx); - tmp_fac = gmx_mul_pr(sqr_PI, gmx_mul_pr(tmp_mk, gmx_erfc_pr(tmp_mk))); - gmx_store_pr(factor_aligned+kx, tmp_fac); + tmp_d = gmx_simd_load_r(d_aligned+kx); + d_inv = gmx_simd_inv_r(tmp_d); + gmx_simd_store_r(d_aligned+kx, d_inv); + tmp_r = gmx_simd_load_r(r_aligned+kx); + tmp_r = gmx_simd_exp_r(tmp_r); + gmx_simd_store_r(r_aligned+kx, tmp_r); + tmp_mk = gmx_simd_load_r(factor_aligned+kx); + tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk))); + gmx_simd_store_r(factor_aligned+kx, tmp_fac); } } #else @@ -3400,15 +3400,15 @@ static pme_spline_work_t *make_pme_spline_work(int gmx_unused order) #ifdef PME_SIMD4_SPREAD_GATHER real tmp[12], *tmp_aligned; - gmx_simd4_pr zero_S; - gmx_simd4_pr real_mask_S0, real_mask_S1; + gmx_simd4_real_t zero_S; + gmx_simd4_real_t real_mask_S0, real_mask_S1; int of, i; snew_aligned(work, 1, SIMD4_ALIGNMENT); tmp_aligned = gmx_simd4_align_real(tmp); - zero_S = gmx_simd4_setzero_pr(); + zero_S = gmx_simd4_setzero_r(); /* Generate bit masks to mask out the unused grid entries, * as we only operate on order of the 8 grid entries that are @@ -3420,10 +3420,10 @@ static pme_spline_work_t *make_pme_spline_work(int gmx_unused order) { tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0); } - real_mask_S0 = gmx_simd4_load_pr(tmp_aligned); - real_mask_S1 = gmx_simd4_load_pr(tmp_aligned+4); - work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S); - work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S); + real_mask_S0 = gmx_simd4_load_r(tmp_aligned); + real_mask_S1 = gmx_simd4_load_r(tmp_aligned+4); + work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S); + work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S); } #else work = NULL; diff --git a/src/gromacs/mdlib/pme_simd4.h b/src/gromacs/mdlib/pme_simd4.h index 1b6f0b0314..4cd2213c5d 100644 --- a/src/gromacs/mdlib/pme_simd4.h +++ b/src/gromacs/mdlib/pme_simd4.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -45,44 +45,44 @@ * This code does not assume any memory alignment for the grid. 
*/ { - gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3; - gmx_simd4_pr tz_S; - gmx_simd4_pr vx_S; - gmx_simd4_pr vx_tz_S; - gmx_simd4_pr sum_S0, sum_S1, sum_S2, sum_S3; - gmx_simd4_pr gri_S0, gri_S1, gri_S2, gri_S3; - - ty_S0 = gmx_simd4_set1_pr(thy[0]); - ty_S1 = gmx_simd4_set1_pr(thy[1]); - ty_S2 = gmx_simd4_set1_pr(thy[2]); - ty_S3 = gmx_simd4_set1_pr(thy[3]); + gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3; + gmx_simd4_real_t tz_S; + gmx_simd4_real_t vx_S; + gmx_simd4_real_t vx_tz_S; + gmx_simd4_real_t sum_S0, sum_S1, sum_S2, sum_S3; + gmx_simd4_real_t gri_S0, gri_S1, gri_S2, gri_S3; + + ty_S0 = gmx_simd4_set1_r(thy[0]); + ty_S1 = gmx_simd4_set1_r(thy[1]); + ty_S2 = gmx_simd4_set1_r(thy[2]); + ty_S3 = gmx_simd4_set1_r(thy[3]); /* With order 4 the z-spline is actually aligned */ - tz_S = gmx_simd4_load_pr(thz); + tz_S = gmx_simd4_load_r(thz); for (ithx = 0; (ithx < 4); ithx++) { index_x = (i0+ithx)*pny*pnz; valx = qn*thx[ithx]; - vx_S = gmx_simd4_set1_pr(valx); + vx_S = gmx_simd4_set1_r(valx); - vx_tz_S = gmx_simd4_mul_pr(vx_S, tz_S); + vx_tz_S = gmx_simd4_mul_r(vx_S, tz_S); - gri_S0 = gmx_simd4_loadu_pr(grid+index_x+(j0+0)*pnz+k0); - gri_S1 = gmx_simd4_loadu_pr(grid+index_x+(j0+1)*pnz+k0); - gri_S2 = gmx_simd4_loadu_pr(grid+index_x+(j0+2)*pnz+k0); - gri_S3 = gmx_simd4_loadu_pr(grid+index_x+(j0+3)*pnz+k0); + gri_S0 = gmx_simd4_loadu_r(grid+index_x+(j0+0)*pnz+k0); + gri_S1 = gmx_simd4_loadu_r(grid+index_x+(j0+1)*pnz+k0); + gri_S2 = gmx_simd4_loadu_r(grid+index_x+(j0+2)*pnz+k0); + gri_S3 = gmx_simd4_loadu_r(grid+index_x+(j0+3)*pnz+k0); - sum_S0 = gmx_simd4_madd_pr(vx_tz_S, ty_S0, gri_S0); - sum_S1 = gmx_simd4_madd_pr(vx_tz_S, ty_S1, gri_S1); - sum_S2 = gmx_simd4_madd_pr(vx_tz_S, ty_S2, gri_S2); - sum_S3 = gmx_simd4_madd_pr(vx_tz_S, ty_S3, gri_S3); + sum_S0 = gmx_simd4_fmadd_r(vx_tz_S, ty_S0, gri_S0); + sum_S1 = gmx_simd4_fmadd_r(vx_tz_S, ty_S1, gri_S1); + sum_S2 = gmx_simd4_fmadd_r(vx_tz_S, ty_S2, gri_S2); + sum_S3 = gmx_simd4_fmadd_r(vx_tz_S, ty_S3, gri_S3); - gmx_simd4_storeu_pr(grid+index_x+(j0+0)*pnz+k0, sum_S0); - gmx_simd4_storeu_pr(grid+index_x+(j0+1)*pnz+k0, sum_S1); - gmx_simd4_storeu_pr(grid+index_x+(j0+2)*pnz+k0, sum_S2); - gmx_simd4_storeu_pr(grid+index_x+(j0+3)*pnz+k0, sum_S3); + gmx_simd4_storeu_r(grid+index_x+(j0+0)*pnz+k0, sum_S0); + gmx_simd4_storeu_r(grid+index_x+(j0+1)*pnz+k0, sum_S1); + gmx_simd4_storeu_r(grid+index_x+(j0+2)*pnz+k0, sum_S2); + gmx_simd4_storeu_r(grid+index_x+(j0+3)*pnz+k0, sum_S3); } } #undef PME_SPREAD_SIMD4_ORDER4 @@ -94,52 +94,52 @@ * This code does not assume any memory alignment for the grid. 
*/ { - real fx_tmp[4], fy_tmp[4], fz_tmp[4]; + real fx_tmp[4], fy_tmp[4], fz_tmp[4]; - gmx_simd4_pr fx_S, fy_S, fz_S; + gmx_simd4_real_t fx_S, fy_S, fz_S; - gmx_simd4_pr tx_S, ty_S, tz_S; - gmx_simd4_pr dx_S, dy_S, dz_S; + gmx_simd4_real_t tx_S, ty_S, tz_S; + gmx_simd4_real_t dx_S, dy_S, dz_S; - gmx_simd4_pr gval_S; + gmx_simd4_real_t gval_S; - gmx_simd4_pr fxy1_S; - gmx_simd4_pr fz1_S; + gmx_simd4_real_t fxy1_S; + gmx_simd4_real_t fz1_S; - fx_S = gmx_simd4_setzero_pr(); - fy_S = gmx_simd4_setzero_pr(); - fz_S = gmx_simd4_setzero_pr(); + fx_S = gmx_simd4_setzero_r(); + fy_S = gmx_simd4_setzero_r(); + fz_S = gmx_simd4_setzero_r(); /* With order 4 the z-spline is actually aligned */ - tz_S = gmx_simd4_load_pr(thz); - dz_S = gmx_simd4_load_pr(dthz); + tz_S = gmx_simd4_load_r(thz); + dz_S = gmx_simd4_load_r(dthz); for (ithx = 0; (ithx < 4); ithx++) { index_x = (i0+ithx)*pny*pnz; - tx_S = gmx_simd4_set1_pr(thx[ithx]); - dx_S = gmx_simd4_set1_pr(dthx[ithx]); + tx_S = gmx_simd4_set1_r(thx[ithx]); + dx_S = gmx_simd4_set1_r(dthx[ithx]); for (ithy = 0; (ithy < 4); ithy++) { index_xy = index_x+(j0+ithy)*pnz; - ty_S = gmx_simd4_set1_pr(thy[ithy]); - dy_S = gmx_simd4_set1_pr(dthy[ithy]); + ty_S = gmx_simd4_set1_r(thy[ithy]); + dy_S = gmx_simd4_set1_r(dthy[ithy]); - gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0); + gval_S = gmx_simd4_loadu_r(grid+index_xy+k0); - fxy1_S = gmx_simd4_mul_pr(tz_S, gval_S); - fz1_S = gmx_simd4_mul_pr(dz_S, gval_S); + fxy1_S = gmx_simd4_mul_r(tz_S, gval_S); + fz1_S = gmx_simd4_mul_r(dz_S, gval_S); - fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S); - fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S); - fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S); + fx_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S); + fy_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S); + fz_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S); } } - gmx_simd4_storeu_pr(fx_tmp, fx_S); - gmx_simd4_storeu_pr(fy_tmp, fy_S); - gmx_simd4_storeu_pr(fz_tmp, fz_S); + gmx_simd4_storeu_r(fx_tmp, fx_S); + gmx_simd4_storeu_r(fy_tmp, fy_S); + gmx_simd4_storeu_r(fz_tmp, fz_S); fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; @@ -155,32 +155,32 @@ * This code supports pme_order <= 5. 
*/ { - int offset; - int index; - gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4; - gmx_simd4_pr tz_S0; - gmx_simd4_pr tz_S1; - gmx_simd4_pr vx_S; - gmx_simd4_pr vx_tz_S0; - gmx_simd4_pr vx_tz_S1; - gmx_simd4_pr sum_S00, sum_S01, sum_S02, sum_S03, sum_S04; - gmx_simd4_pr sum_S10, sum_S11, sum_S12, sum_S13, sum_S14; - gmx_simd4_pr gri_S00, gri_S01, gri_S02, gri_S03, gri_S04; - gmx_simd4_pr gri_S10, gri_S11, gri_S12, gri_S13, gri_S14; + int offset; + int index; + gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3, ty_S4; + gmx_simd4_real_t tz_S0; + gmx_simd4_real_t tz_S1; + gmx_simd4_real_t vx_S; + gmx_simd4_real_t vx_tz_S0; + gmx_simd4_real_t vx_tz_S1; + gmx_simd4_real_t sum_S00, sum_S01, sum_S02, sum_S03, sum_S04; + gmx_simd4_real_t sum_S10, sum_S11, sum_S12, sum_S13, sum_S14; + gmx_simd4_real_t gri_S00, gri_S01, gri_S02, gri_S03, gri_S04; + gmx_simd4_real_t gri_S10, gri_S11, gri_S12, gri_S13, gri_S14; offset = k0 & 3; - ty_S0 = gmx_simd4_set1_pr(thy[0]); - ty_S1 = gmx_simd4_set1_pr(thy[1]); - ty_S2 = gmx_simd4_set1_pr(thy[2]); - ty_S3 = gmx_simd4_set1_pr(thy[3]); + ty_S0 = gmx_simd4_set1_r(thy[0]); + ty_S1 = gmx_simd4_set1_r(thy[1]); + ty_S2 = gmx_simd4_set1_r(thy[2]); + ty_S3 = gmx_simd4_set1_r(thy[3]); #if PME_ORDER == 5 - ty_S4 = gmx_simd4_set1_pr(thy[4]); + ty_S4 = gmx_simd4_set1_r(thy[4]); #endif #ifdef GMX_SIMD4_HAVE_UNALIGNED - tz_S0 = gmx_simd4_loadu_pr(thz-offset); - tz_S1 = gmx_simd4_loadu_pr(thz-offset+4); + tz_S0 = gmx_simd4_loadu_r(thz-offset); + tz_S1 = gmx_simd4_loadu_r(thz-offset+4); #else { int i; @@ -189,66 +189,66 @@ { thz_aligned[offset+i] = thz[i]; } - tz_S0 = gmx_simd4_load_pr(thz_aligned); - tz_S1 = gmx_simd4_load_pr(thz_aligned+4); + tz_S0 = gmx_simd4_load_r(thz_aligned); + tz_S1 = gmx_simd4_load_r(thz_aligned+4); } #endif - tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]); - tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]); + tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]); + tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]); for (ithx = 0; (ithx < PME_ORDER); ithx++) { index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset; valx = qn*thx[ithx]; - vx_S = gmx_simd4_set1_pr(valx); + vx_S = gmx_simd4_set1_r(valx); - vx_tz_S0 = gmx_simd4_mul_pr(vx_S, tz_S0); - vx_tz_S1 = gmx_simd4_mul_pr(vx_S, tz_S1); + vx_tz_S0 = gmx_simd4_mul_r(vx_S, tz_S0); + vx_tz_S1 = gmx_simd4_mul_r(vx_S, tz_S1); - gri_S00 = gmx_simd4_load_pr(grid+index+0*pnz); - gri_S01 = gmx_simd4_load_pr(grid+index+1*pnz); - gri_S02 = gmx_simd4_load_pr(grid+index+2*pnz); - gri_S03 = gmx_simd4_load_pr(grid+index+3*pnz); + gri_S00 = gmx_simd4_load_r(grid+index+0*pnz); + gri_S01 = gmx_simd4_load_r(grid+index+1*pnz); + gri_S02 = gmx_simd4_load_r(grid+index+2*pnz); + gri_S03 = gmx_simd4_load_r(grid+index+3*pnz); #if PME_ORDER == 5 - gri_S04 = gmx_simd4_load_pr(grid+index+4*pnz); + gri_S04 = gmx_simd4_load_r(grid+index+4*pnz); #endif - gri_S10 = gmx_simd4_load_pr(grid+index+0*pnz+4); - gri_S11 = gmx_simd4_load_pr(grid+index+1*pnz+4); - gri_S12 = gmx_simd4_load_pr(grid+index+2*pnz+4); - gri_S13 = gmx_simd4_load_pr(grid+index+3*pnz+4); + gri_S10 = gmx_simd4_load_r(grid+index+0*pnz+4); + gri_S11 = gmx_simd4_load_r(grid+index+1*pnz+4); + gri_S12 = gmx_simd4_load_r(grid+index+2*pnz+4); + gri_S13 = gmx_simd4_load_r(grid+index+3*pnz+4); #if PME_ORDER == 5 - gri_S14 = gmx_simd4_load_pr(grid+index+4*pnz+4); + gri_S14 = gmx_simd4_load_r(grid+index+4*pnz+4); #endif - sum_S00 = gmx_simd4_madd_pr(vx_tz_S0, ty_S0, gri_S00); - sum_S01 = gmx_simd4_madd_pr(vx_tz_S0, ty_S1, gri_S01); - sum_S02 = 
gmx_simd4_madd_pr(vx_tz_S0, ty_S2, gri_S02); - sum_S03 = gmx_simd4_madd_pr(vx_tz_S0, ty_S3, gri_S03); + sum_S00 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S0, gri_S00); + sum_S01 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S1, gri_S01); + sum_S02 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S2, gri_S02); + sum_S03 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S3, gri_S03); #if PME_ORDER == 5 - sum_S04 = gmx_simd4_madd_pr(vx_tz_S0, ty_S4, gri_S04); + sum_S04 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S4, gri_S04); #endif - sum_S10 = gmx_simd4_madd_pr(vx_tz_S1, ty_S0, gri_S10); - sum_S11 = gmx_simd4_madd_pr(vx_tz_S1, ty_S1, gri_S11); - sum_S12 = gmx_simd4_madd_pr(vx_tz_S1, ty_S2, gri_S12); - sum_S13 = gmx_simd4_madd_pr(vx_tz_S1, ty_S3, gri_S13); + sum_S10 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S0, gri_S10); + sum_S11 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S1, gri_S11); + sum_S12 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S2, gri_S12); + sum_S13 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S3, gri_S13); #if PME_ORDER == 5 - sum_S14 = gmx_simd4_madd_pr(vx_tz_S1, ty_S4, gri_S14); + sum_S14 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S4, gri_S14); #endif - gmx_simd4_store_pr(grid+index+0*pnz, sum_S00); - gmx_simd4_store_pr(grid+index+1*pnz, sum_S01); - gmx_simd4_store_pr(grid+index+2*pnz, sum_S02); - gmx_simd4_store_pr(grid+index+3*pnz, sum_S03); + gmx_simd4_store_r(grid+index+0*pnz, sum_S00); + gmx_simd4_store_r(grid+index+1*pnz, sum_S01); + gmx_simd4_store_r(grid+index+2*pnz, sum_S02); + gmx_simd4_store_r(grid+index+3*pnz, sum_S03); #if PME_ORDER == 5 - gmx_simd4_store_pr(grid+index+4*pnz, sum_S04); + gmx_simd4_store_r(grid+index+4*pnz, sum_S04); #endif - gmx_simd4_store_pr(grid+index+0*pnz+4, sum_S10); - gmx_simd4_store_pr(grid+index+1*pnz+4, sum_S11); - gmx_simd4_store_pr(grid+index+2*pnz+4, sum_S12); - gmx_simd4_store_pr(grid+index+3*pnz+4, sum_S13); + gmx_simd4_store_r(grid+index+0*pnz+4, sum_S10); + gmx_simd4_store_r(grid+index+1*pnz+4, sum_S11); + gmx_simd4_store_r(grid+index+2*pnz+4, sum_S12); + gmx_simd4_store_r(grid+index+3*pnz+4, sum_S13); #if PME_ORDER == 5 - gmx_simd4_store_pr(grid+index+4*pnz+4, sum_S14); + gmx_simd4_store_r(grid+index+4*pnz+4, sum_S14); #endif } } @@ -263,36 +263,36 @@ * This code supports pme_order <= 5. 
*/ { - int offset; + int offset; - real fx_tmp[4], fy_tmp[4], fz_tmp[4]; + real fx_tmp[4], fy_tmp[4], fz_tmp[4]; - gmx_simd4_pr fx_S, fy_S, fz_S; + gmx_simd4_real_t fx_S, fy_S, fz_S; - gmx_simd4_pr tx_S, ty_S, tz_S0, tz_S1; - gmx_simd4_pr dx_S, dy_S, dz_S0, dz_S1; + gmx_simd4_real_t tx_S, ty_S, tz_S0, tz_S1; + gmx_simd4_real_t dx_S, dy_S, dz_S0, dz_S1; - gmx_simd4_pr gval_S0; - gmx_simd4_pr gval_S1; + gmx_simd4_real_t gval_S0; + gmx_simd4_real_t gval_S1; - gmx_simd4_pr fxy1_S0; - gmx_simd4_pr fz1_S0; - gmx_simd4_pr fxy1_S1; - gmx_simd4_pr fz1_S1; - gmx_simd4_pr fxy1_S; - gmx_simd4_pr fz1_S; + gmx_simd4_real_t fxy1_S0; + gmx_simd4_real_t fz1_S0; + gmx_simd4_real_t fxy1_S1; + gmx_simd4_real_t fz1_S1; + gmx_simd4_real_t fxy1_S; + gmx_simd4_real_t fz1_S; offset = k0 & 3; - fx_S = gmx_simd4_setzero_pr(); - fy_S = gmx_simd4_setzero_pr(); - fz_S = gmx_simd4_setzero_pr(); + fx_S = gmx_simd4_setzero_r(); + fy_S = gmx_simd4_setzero_r(); + fz_S = gmx_simd4_setzero_r(); #ifdef GMX_SIMD4_HAVE_UNALIGNED - tz_S0 = gmx_simd4_loadu_pr(thz-offset); - tz_S1 = gmx_simd4_loadu_pr(thz-offset+4); - dz_S0 = gmx_simd4_loadu_pr(dthz-offset); - dz_S1 = gmx_simd4_loadu_pr(dthz-offset+4); + tz_S0 = gmx_simd4_loadu_r(thz-offset); + tz_S1 = gmx_simd4_loadu_r(thz-offset+4); + dz_S0 = gmx_simd4_loadu_r(dthz-offset); + dz_S1 = gmx_simd4_loadu_r(dthz-offset+4); #else { int i; @@ -302,49 +302,49 @@ thz_aligned[offset+i] = thz[i]; dthz_aligned[offset+i] = dthz[i]; } - tz_S0 = gmx_simd4_load_pr(thz_aligned); - tz_S1 = gmx_simd4_load_pr(thz_aligned+4); - dz_S0 = gmx_simd4_load_pr(dthz_aligned); - dz_S1 = gmx_simd4_load_pr(dthz_aligned+4); + tz_S0 = gmx_simd4_load_r(thz_aligned); + tz_S1 = gmx_simd4_load_r(thz_aligned+4); + dz_S0 = gmx_simd4_load_r(dthz_aligned); + dz_S1 = gmx_simd4_load_r(dthz_aligned+4); } #endif - tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]); - dz_S0 = gmx_simd4_blendzero_pr(dz_S0, work->mask_S0[offset]); - tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]); - dz_S1 = gmx_simd4_blendzero_pr(dz_S1, work->mask_S1[offset]); + tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]); + dz_S0 = gmx_simd4_blendzero_r(dz_S0, work->mask_S0[offset]); + tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]); + dz_S1 = gmx_simd4_blendzero_r(dz_S1, work->mask_S1[offset]); for (ithx = 0; (ithx < PME_ORDER); ithx++) { index_x = (i0+ithx)*pny*pnz; - tx_S = gmx_simd4_set1_pr(thx[ithx]); - dx_S = gmx_simd4_set1_pr(dthx[ithx]); + tx_S = gmx_simd4_set1_r(thx[ithx]); + dx_S = gmx_simd4_set1_r(dthx[ithx]); for (ithy = 0; (ithy < PME_ORDER); ithy++) { index_xy = index_x+(j0+ithy)*pnz; - ty_S = gmx_simd4_set1_pr(thy[ithy]); - dy_S = gmx_simd4_set1_pr(dthy[ithy]); + ty_S = gmx_simd4_set1_r(thy[ithy]); + dy_S = gmx_simd4_set1_r(dthy[ithy]); - gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset); - gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4); + gval_S0 = gmx_simd4_load_r(grid+index_xy+k0-offset); + gval_S1 = gmx_simd4_load_r(grid+index_xy+k0-offset+4); - fxy1_S0 = gmx_simd4_mul_pr(tz_S0, gval_S0); - fz1_S0 = gmx_simd4_mul_pr(dz_S0, gval_S0); - fxy1_S1 = gmx_simd4_mul_pr(tz_S1, gval_S1); - fz1_S1 = gmx_simd4_mul_pr(dz_S1, gval_S1); + fxy1_S0 = gmx_simd4_mul_r(tz_S0, gval_S0); + fz1_S0 = gmx_simd4_mul_r(dz_S0, gval_S0); + fxy1_S1 = gmx_simd4_mul_r(tz_S1, gval_S1); + fz1_S1 = gmx_simd4_mul_r(dz_S1, gval_S1); - fxy1_S = gmx_simd4_add_pr(fxy1_S0, fxy1_S1); - fz1_S = gmx_simd4_add_pr(fz1_S0, fz1_S1); + fxy1_S = gmx_simd4_add_r(fxy1_S0, fxy1_S1); + fz1_S = gmx_simd4_add_r(fz1_S0, fz1_S1); - fx_S = 
gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S); - fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S); - fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S); + fx_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S); + fy_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S); + fz_S = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S); } } - gmx_simd4_store_pr(fx_tmp, fx_S); - gmx_simd4_store_pr(fy_tmp, fy_S); - gmx_simd4_store_pr(fz_tmp, fz_S); + gmx_simd4_store_r(fx_tmp, fx_S); + gmx_simd4_store_r(fy_tmp, fy_S); + gmx_simd4_store_r(fz_tmp, fz_S); fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; diff --git a/src/gromacs/mdlib/tpi.c b/src/gromacs/mdlib/tpi.c index 2726173dd6..057b73c54f 100644 --- a/src/gromacs/mdlib/tpi.c +++ b/src/gromacs/mdlib/tpi.c @@ -79,7 +79,7 @@ #include "gromacs/timing/wallcycle.h" #include "gromacs/timing/walltime_accounting.h" -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER #include "gromacs/simd/general_x86_sse2.h" #endif @@ -439,7 +439,7 @@ double do_tpi(FILE *fplog, t_commrec *cr, refvolshift = log(det(rerun_fr.box)); -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER /* Make sure we don't detect SSE overflow generated before this point */ gmx_mm_check_and_reset_overflow(); #endif @@ -631,7 +631,7 @@ double do_tpi(FILE *fplog, t_commrec *cr, epot = enerd->term[F_EPOT]; bEnergyOutOfBounds = FALSE; -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER /* With SSE the energy can overflow, check for this */ if (gmx_mm_check_and_reset_overflow()) { diff --git a/src/gromacs/simd/four_wide_macros.h b/src/gromacs/simd/four_wide_macros.h index 8f6f08cc49..8ed1d34932 100644 --- a/src/gromacs/simd/four_wide_macros.h +++ b/src/gromacs/simd/four_wide_macros.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -73,8 +73,8 @@ typedef float gmx_simd4_real; #endif /* Uncomment the next line, without other SIMD active, for testing plain-C */ -/* #define GMX_SIMD4_REFERENCE_PLAIN_C */ -#ifdef GMX_SIMD4_REFERENCE_PLAIN_C +/* #define GMX_SIMD4_REFERENCE */ +#ifdef GMX_SIMD4_REFERENCE /* Plain C SIMD reference implementation, also serves as documentation */ #define GMX_HAVE_SIMD4_MACROS @@ -82,51 +82,51 @@ typedef float gmx_simd4_real; #include "four_wide_macros_ref.h" /* float/double SIMD register type */ -#define gmx_simd4_pr gmx_simd4_ref_pr +#define gmx_simd4_real_t gmx_simd4_ref_pr /* boolean SIMD register type */ -#define gmx_simd4_pb gmx_simd4_ref_pb +#define gmx_simd4_bool_t gmx_simd4_ref_pb -#define gmx_simd4_load_pr gmx_simd4_ref_load_pr +#define gmx_simd4_load_r gmx_simd4_ref_load_pr #define gmx_simd4_load_bb_pr gmx_simd4_ref_load_pr -#define gmx_simd4_set1_pr gmx_simd4_ref_set1_pr -#define gmx_simd4_setzero_pr gmx_simd4_ref_setzero_pr -#define gmx_simd4_store_pr gmx_simd4_ref_store_pr +#define gmx_simd4_set1_r gmx_simd4_ref_set1_pr +#define gmx_simd4_setzero_r gmx_simd4_ref_setzero_pr +#define gmx_simd4_store_r gmx_simd4_ref_store_pr /* Unaligned load+store are not required, * but they can speed up the PME spread+gather operations. 
*/ #define GMX_SIMD4_HAVE_UNALIGNED #ifdef GMX_SIMD4_HAVE_UNALIGNED -#define gmx_simd4_loadu_pr gmx_simd4_ref_load_pr -#define gmx_simd4_storeu_pr gmx_simd4_ref_store_pr +#define gmx_simd4_loadu_r gmx_simd4_ref_load_pr +#define gmx_simd4_storeu_r gmx_simd4_ref_store_pr #endif -#define gmx_simd4_add_pr gmx_simd4_ref_add_pr -#define gmx_simd4_sub_pr gmx_simd4_ref_sub_pr -#define gmx_simd4_mul_pr gmx_simd4_ref_mul_pr +#define gmx_simd4_add_r gmx_simd4_ref_add_pr +#define gmx_simd4_sub_r gmx_simd4_ref_sub_pr +#define gmx_simd4_mul_r gmx_simd4_ref_mul_pr /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */ -#define gmx_simd4_madd_pr gmx_simd4_ref_madd_pr -#define gmx_simd4_nmsub_pr gmx_simd4_ref_nmsub_pr +#define gmx_simd4_fmadd_r gmx_simd4_ref_madd_pr +#define gmx_simd4_fnmadd_r gmx_simd4_ref_nmsub_pr -#define gmx_simd4_dotproduct3 gmx_simd4_ref_dotproduct3 +#define gmx_simd4_dotproduct3_r gmx_simd4_ref_dotproduct3 -#define gmx_simd4_min_pr gmx_simd4_ref_min_pr -#define gmx_simd4_max_pr gmx_simd4_ref_max_pr +#define gmx_simd4_min_r gmx_simd4_ref_min_pr +#define gmx_simd4_max_r gmx_simd4_ref_max_pr -#define gmx_simd4_blendzero_pr gmx_simd4_ref_blendzero_pr +#define gmx_simd4_blendzero_r gmx_simd4_ref_blendzero_pr /* Comparison */ -#define gmx_simd4_cmplt_pr gmx_simd4_ref_cmplt_pr +#define gmx_simd4_cmplt_r gmx_simd4_ref_cmplt_pr /* Logical operations on SIMD booleans */ -#define gmx_simd4_and_pb gmx_simd4_ref_and_pb -#define gmx_simd4_or_pb gmx_simd4_ref_or_pb +#define gmx_simd4_and_b gmx_simd4_ref_and_pb +#define gmx_simd4_or_b gmx_simd4_ref_or_pb /* Returns a single int (0/1) which tells if any of the 4 booleans is True */ -#define gmx_simd4_anytrue_pb gmx_simd4_ref_anytrue_pb +#define gmx_simd4_anytrue_b gmx_simd4_ref_anytrue_pb -#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */ +#endif /* GMX_SIMD4_REFERENCE */ /* The same SIMD macros can be translated to SIMD intrinsics (and compiled @@ -139,7 +139,7 @@ typedef float gmx_simd4_real; */ -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER /* This is for general x86 SIMD instruction sets that also support SSE2 */ #ifdef GMX_SIMD4_SINGLE @@ -147,17 +147,17 @@ typedef float gmx_simd4_real; #endif #ifdef GMX_SIMD4_DOUBLE -/* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA. +/* Note that here we will use 256-bit SIMD with GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER. * This is inconsistent naming wise, but should give the best performance. 
*/ -#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 +#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER #define GMX_HAVE_SIMD4_MACROS #endif #endif #ifdef GMX_HAVE_SIMD4_MACROS -#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 +#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER #include #ifdef HAVE_X86INTRIN_H @@ -168,7 +168,7 @@ typedef float gmx_simd4_real; #endif #else -#ifdef GMX_X86_SSE4_1 +#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER #include #else /* We only have SSE2 */ @@ -178,39 +178,39 @@ typedef float gmx_simd4_real; #ifdef GMX_SIMD4_SINGLE -#define gmx_simd4_pr __m128 +#define gmx_simd4_real_t __m128 -#define gmx_simd4_pb __m128 +#define gmx_simd4_bool_t __m128 -#define gmx_simd4_load_pr _mm_load_ps +#define gmx_simd4_load_r _mm_load_ps #define gmx_simd4_load_bb_pr _mm_load_ps -#define gmx_simd4_set1_pr _mm_set1_ps -#define gmx_simd4_setzero_pr _mm_setzero_ps -#define gmx_simd4_store_pr _mm_store_ps +#define gmx_simd4_set1_r _mm_set1_ps +#define gmx_simd4_setzero_r _mm_setzero_ps +#define gmx_simd4_store_r _mm_store_ps /* Some old AMD processors could have problems with unaligned loads+stores */ #ifndef GMX_FAHCORE #define GMX_SIMD4_HAVE_UNALIGNED #endif #ifdef GMX_SIMD4_HAVE_UNALIGNED -#define gmx_simd4_loadu_pr _mm_loadu_ps -#define gmx_simd4_storeu_pr _mm_storeu_ps +#define gmx_simd4_loadu_r _mm_loadu_ps +#define gmx_simd4_storeu_r _mm_storeu_ps #endif -#define gmx_simd4_add_pr _mm_add_ps -#define gmx_simd4_sub_pr _mm_sub_ps -#define gmx_simd4_mul_pr _mm_mul_ps +#define gmx_simd4_add_r _mm_add_ps +#define gmx_simd4_sub_r _mm_sub_ps +#define gmx_simd4_mul_r _mm_mul_ps -#ifdef GMX_X86_AVX_128_FMA -#define gmx_simd4_madd_pr(a, b, c) _mm_macc_ps(a, b, c) -#define gmx_simd4_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c) +#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER +#define gmx_simd4_fmadd_r(a, b, c) _mm_macc_ps(a, b, c) +#define gmx_simd4_fnmadd_r(a, b, c) _mm_nmacc_ps(a, b, c) #else -#define gmx_simd4_madd_pr(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) -#define gmx_simd4_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) +#define gmx_simd4_fmadd_r(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) +#define gmx_simd4_fnmadd_r(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) #endif -static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b) -#ifdef GMX_X86_SSE4_1 +static inline float gmx_simd4_dotproduct3_r(__m128 a, __m128 b) +#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER { float dp; @@ -232,66 +232,66 @@ static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b) } #endif -#define gmx_simd4_min_pr _mm_min_ps -#define gmx_simd4_max_pr _mm_max_ps +#define gmx_simd4_min_r _mm_min_ps +#define gmx_simd4_max_r _mm_max_ps -#define gmx_simd4_blendzero_pr _mm_and_ps +#define gmx_simd4_blendzero_r _mm_and_ps -#define gmx_simd4_cmplt_pr _mm_cmplt_ps -#define gmx_simd4_and_pb _mm_and_ps -#define gmx_simd4_or_pb _mm_or_ps +#define gmx_simd4_cmplt_r _mm_cmplt_ps +#define gmx_simd4_and_b _mm_and_ps +#define gmx_simd4_or_b _mm_or_ps -#define gmx_simd4_anytrue_pb _mm_movemask_ps +#define gmx_simd4_anytrue_b _mm_movemask_ps #endif /* GMX_SIMD4_SINGLE */ #ifdef GMX_SIMD4_DOUBLE -#define gmx_simd4_pr __m256d +#define gmx_simd4_real_t __m256d -#define gmx_simd4_pb __m256d +#define gmx_simd4_bool_t __m256d -#define gmx_simd4_load_pr _mm256_load_pd +#define gmx_simd4_load_r _mm256_load_pd #define gmx_simd4_load_bb_pr _mm256_load_pd -#define gmx_simd4_set1_pr _mm256_set1_pd -#define gmx_simd4_setzero_pr _mm256_setzero_pd -#define gmx_simd4_store_pr 
_mm256_store_pd +#define gmx_simd4_set1_r _mm256_set1_pd +#define gmx_simd4_setzero_r _mm256_setzero_pd +#define gmx_simd4_store_r _mm256_store_pd #define GMX_SIMD4_HAVE_UNALIGNED -#define gmx_simd4_loadu_pr _mm256_loadu_pd -#define gmx_simd4_storeu_pr _mm256_storeu_pd - -#define gmx_simd4_add_pr _mm256_add_pd -#define gmx_simd4_sub_pr _mm256_sub_pd -#define gmx_simd4_mul_pr _mm256_mul_pd -#ifdef GMX_X86_AVX_128_FMA -#define gmx_simd4_madd_pr(a, b, c) _mm256_macc_pd(a, b, c) -#define gmx_simd4_nmsub_pr(a, b, c) _mm256_nmacc_pd(a, b, c) +#define gmx_simd4_loadu_r _mm256_loadu_pd +#define gmx_simd4_storeu_r _mm256_storeu_pd + +#define gmx_simd4_add_r _mm256_add_pd +#define gmx_simd4_sub_r _mm256_sub_pd +#define gmx_simd4_mul_r _mm256_mul_pd +#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER +#define gmx_simd4_fmadd_r(a, b, c) _mm256_macc_pd(a, b, c) +#define gmx_simd4_fnmadd_r(a, b, c) _mm256_nmacc_pd(a, b, c) #else -#define gmx_simd4_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b)) -#define gmx_simd4_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b)) +#define gmx_simd4_fmadd_r(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b)) +#define gmx_simd4_fnmadd_r(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b)) #endif -#define gmx_simd4_min_pr _mm256_min_pd -#define gmx_simd4_max_pr _mm256_max_pd +#define gmx_simd4_min_r _mm256_min_pd +#define gmx_simd4_max_r _mm256_max_pd -#define gmx_simd4_blendzero_pr _mm256_and_pd +#define gmx_simd4_blendzero_r _mm256_and_pd /* Less-than (we use ordered, non-signaling, but that's not required) */ -#define gmx_simd4_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11) -#define gmx_simd4_and_pb _mm256_and_pd -#define gmx_simd4_or_pb _mm256_or_pd +#define gmx_simd4_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11) +#define gmx_simd4_and_b _mm256_and_pd +#define gmx_simd4_or_b _mm256_or_pd -#define gmx_simd4_anytrue_pb _mm256_movemask_pd +#define gmx_simd4_anytrue_b _mm256_movemask_pd #endif /* GMX_SIMD4_DOUBLE */ #endif /* GMX_HAVE_SIMD4_MACROS */ -#endif /* GMX_X86_SSE2 */ +#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */ -#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#ifdef GMX_SIMD_IBM_QPX /* i.e. BlueGene/Q */ /* This hack works on the compilers that can reach this code. A real @@ -302,8 +302,8 @@ static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b) #define GMX_HAVE_SIMD4_MACROS #endif -typedef vector4double gmx_simd4_pr; -typedef vector4double gmx_simd4_pb; +typedef vector4double gmx_simd4_real_t; +typedef vector4double gmx_simd4_bool_t; /* The declarations of vec_ld* use non-const pointers, and IBM can't/won't fix this any time soon. So GROMACS has to cast away the @@ -316,10 +316,10 @@ typedef vector4double gmx_simd4_pb; always-float variables have to be done with a function that does the correct cast. Since functions cannot be overloaded by type in C, they have to have different names. Thus we have - gmx_simd4_load_pr and gmx_simd4_load_bb_pr. + gmx_simd4_load_r and gmx_simd4_load_bb_pr. 
*/ -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_r(const real *a) { #ifdef NDEBUG return vec_ld(0, (real *) a); @@ -328,7 +328,7 @@ static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a #endif } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const float *a) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_bb_pr(const float *a) { #ifdef NDEBUG return vec_ld(0, (float *) a); @@ -337,12 +337,12 @@ static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const floa #endif } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_set1_pr(const real a) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_set1_r(const real a) { return vec_splats(a); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr() +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_setzero_r() { return vec_splats(0.0); } @@ -350,7 +350,7 @@ static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr() /* TODO this will not yet work, because the function might be passed a pointer to a float when running in double precision. */ -static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_pr b) +static gmx_inline void gmx_always_inline gmx_simd4_store_r(real *a, gmx_simd4_real_t b) { #ifdef NDEBUG vec_st(b, 0, a); @@ -359,64 +359,64 @@ static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_p #endif } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_add_pr(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { return vec_add(a, b); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_sub_pr(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_sub_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { return vec_sub(a, b); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_mul_pr(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_mul_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { return vec_mul(a, b); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_madd_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c) { return vec_madd(a, b, c); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_nmsub_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fnmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c) { return vec_nmsub(a, b, c); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_min_pr(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_min_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { /* Implemented the same way as max, but with the subtraction operands swapped. 
*/ return vec_sel(b, a, vec_sub(b, a)); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_max_pr(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_max_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { return vec_sel(b, a, vec_sub(a, b)); } -static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_blendzero_pr(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_blendzero_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { - return vec_sel(gmx_setzero_pr(), a, b); + return vec_sel(gmx_simd_setzero_r(), a, b); } -static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_cmplt_pr(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_cmplt_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { return vec_cmplt(a, b); } -static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_and_pb(gmx_simd4_pb a, gmx_simd4_pb b) +static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_and_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b) { return vec_and(a, b); } -static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_or_pb(gmx_simd4_pb a, gmx_simd4_pb b) +static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_or_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b) { return vec_or(a, b); } -static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a, gmx_simd4_pr b) +static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3_r(gmx_simd4_real_t a, gmx_simd4_real_t b) { /* The dot product is done solely on the QPX AXU (which is the only available FPU). This is awkward, because pretty much no @@ -432,25 +432,25 @@ static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a, memory at all. */ - gmx_simd4_pr dp_shifted_left_0 = vec_mul(a, b); - gmx_simd4_pr dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1); - gmx_simd4_pr dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2); - gmx_simd4_pr dp = vec_add(dp_shifted_left_2, - vec_add(dp_shifted_left_0, dp_shifted_left_1)); + gmx_simd4_real_t dp_shifted_left_0 = vec_mul(a, b); + gmx_simd4_real_t dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1); + gmx_simd4_real_t dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2); + gmx_simd4_real_t dp = vec_add(dp_shifted_left_2, + vec_add(dp_shifted_left_0, dp_shifted_left_1)); /* See comment in nbnxn_make_pairlist_part() about how this should be able to return a double on PowerPC. */ return (float) vec_extract(dp, 0); } -static gmx_inline int gmx_always_inline gmx_simd4_anytrue_pb(gmx_simd4_pb a) +static gmx_inline int gmx_always_inline gmx_simd4_anytrue_b(gmx_simd4_bool_t a) { - return gmx_anytrue_pb(a); + return gmx_simd_anytrue_b(a); } #undef gmx_always_inline -#endif /* GMX_CPU_ACCELERATION_IBM_QPX */ +#endif /* GMX_SIMD_IBM_QPX */ #ifdef GMX_HAVE_SIMD4_MACROS /* Generic functions to extract a SIMD4 aligned pointer from a pointer x. diff --git a/src/gromacs/simd/four_wide_macros_ref.h b/src/gromacs/simd/four_wide_macros_ref.h index 002f3a96f9..8b47d64d7a 100644 --- a/src/gromacs/simd/four_wide_macros_ref.h +++ b/src/gromacs/simd/four_wide_macros_ref.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -284,7 +284,7 @@ gmx_simd4_ref_or_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b) return c; } -/* gmx_anytrue_pb(x) returns if any of the boolean is x is True */ +/* gmx_simd_anytrue_b(x) returns if any of the boolean is x is True */ static gmx_inline int gmx_simd4_ref_anytrue_pb(gmx_simd4_ref_pb a) { diff --git a/src/gromacs/simd/general_x86_avx_128_fma.h b/src/gromacs/simd/general_x86_avx_128_fma.h index 5314d1c4a2..19ec986dd5 100644 --- a/src/gromacs/simd/general_x86_avx_128_fma.h +++ b/src/gromacs/simd/general_x86_avx_128_fma.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -139,7 +139,7 @@ _mm_msub_pd(__m128d a, __m128d b, __m128d c) #endif /* AMD FMA emulation support */ static void -gmx_mm_printxmm_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm) { float f[4]; @@ -149,7 +149,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm) { float f[4]; @@ -159,7 +159,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmm_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm) { double f[2]; @@ -168,7 +168,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm) } static void -gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm) { double f[2]; @@ -178,7 +178,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) static void -gmx_mm_printxmm_epi32(const char *s, __m128i xmmi) +gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi) { int i[4]; @@ -211,7 +211,7 @@ static int gmx_mm_check_and_reset_overflow(void) } /* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */ -#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG +#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG # define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), _mm_castsi128_ps(mask)) # define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x)) # define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask)) diff --git a/src/gromacs/simd/general_x86_avx_256.h b/src/gromacs/simd/general_x86_avx_256.h index d13bdeec8c..b7b1c236e8 100644 --- a/src/gromacs/simd/general_x86_avx_256.h +++ b/src/gromacs/simd/general_x86_avx_256.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -167,7 +167,7 @@ static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y) static void -gmx_mm_printxmm_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm) { float f[4]; @@ -177,7 +177,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm) { float f[4]; @@ -187,7 +187,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmm_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm) { double f[2]; @@ -196,7 +196,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm) } static void -gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm) { double f[2]; @@ -206,7 +206,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) static void -gmx_mm_printxmm_epi32(const char *s, __m128i xmmi) +gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi) { int i[4]; @@ -287,7 +287,7 @@ static int gmx_mm_check_and_reset_overflow(void) } /* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */ -#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG +#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG # define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), _mm_castsi128_ps(mask)) # define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x)) # define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask)) diff --git a/src/gromacs/simd/general_x86_mic.h b/src/gromacs/simd/general_x86_mic.h index b41b42e09c..9f4c191855 100644 --- a/src/gromacs/simd/general_x86_mic.h +++ b/src/gromacs/simd/general_x86_mic.h @@ -47,49 +47,49 @@ #endif typedef __m512 gmx_mm_ps; -typedef __m512 gmx_mm_pr; +typedef __m512 gmx_simd_real_t; /* boolean SIMD register type */ -typedef __mmask16 gmx_mm_pb; -typedef __m512i gmx_epi32; +typedef __mmask16 gmx_simd_bool_t; +typedef __m512i gmx_simd_int32_t; #define GMX_HAVE_SIMD_MACROS -#define GMX_SIMD_WIDTH_HERE 16 -#define GMX_SIMD_EPI32_WIDTH 16 +#define GMX_SIMD_REAL_WIDTH 16 +#define GMX_SIMD_INT32_WIDTH 16 -#define gmx_load_pr _mm512_load_ps +#define gmx_simd_load_r _mm512_load_ps /* Set all SIMD register elements to *r */ static gmx_inline gmx_mm_ps -gmx_load1_pr(const real *r) +gmx_simd_load1_r(const real *r) { return _mm512_extload_ps(r, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); } -#define gmx_set1_pr _mm512_set1_ps +#define gmx_simd_set1_r _mm512_set1_ps /* Set all SIMD register elements to 0 */ -#define gmx_setzero_pr _mm512_setzero_ps -#define gmx_store_pr _mm512_store_ps +#define gmx_simd_setzero_r _mm512_setzero_ps +#define gmx_simd_store_r _mm512_store_ps -#define gmx_add_pr _mm512_add_ps -#define gmx_sub_pr _mm512_sub_ps -#define gmx_mul_pr _mm512_mul_ps +#define gmx_simd_add_r _mm512_add_ps +#define gmx_simd_sub_r _mm512_sub_ps +#define gmx_simd_mul_r _mm512_mul_ps #define GMX_SIMD_HAVE_FMA -#define gmx_madd_pr _mm512_fmadd_ps -#define gmx_nmsub_pr _mm512_fnmadd_ps +#define gmx_simd_fmadd_r _mm512_fmadd_ps +#define gmx_simd_fnmadd_r _mm512_fnmadd_ps -#define gmx_max_pr _mm512_max_ps +#define gmx_simd_max_r _mm512_max_ps static gmx_inline gmx_mm_ps -gmx_blendzero_pr(gmx_mm_ps a, gmx_mm_pb b) +gmx_simd_blendzero_r(gmx_mm_ps a, gmx_simd_bool_t b) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), b, a); } -#define gmx_round_pr _mm512_rint_ps +#define gmx_simd_round_r _mm512_rint_ps #define GMX_SIMD_HAVE_FLOOR -#define gmx_floor_pr _mm512_floor_ps +#define 
gmx_simd_floor_r _mm512_floor_ps /* Copy the sign of a to b, assumes b >= 0 for efficiency */ static gmx_inline gmx_mm_ps @@ -104,49 +104,49 @@ gmx_cpsgn_nonneg_pr(gmx_mm_ps a, gmx_mm_ps b) /* Very specific operation required in the non-bonded kernels */ static gmx_inline gmx_mm_ps -gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_ps b, gmx_mm_ps c) +gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_mm_ps b, gmx_mm_ps c) { return _mm512_mask_add_ps(b, _mm512_knot(a), b, c); } /* Comparison */ -#define gmx_cmplt_pr _mm512_cmplt_ps_mask +#define gmx_simd_cmplt_r _mm512_cmplt_ps_mask /* Logical AND on SIMD booleans. */ -#define gmx_and_pb _mm512_kand +#define gmx_simd_and_b _mm512_kand /* Logical OR on SIMD booleans. */ -#define gmx_or_pb _mm512_kor +#define gmx_simd_or_b _mm512_kor /* Returns a single int (0/1) which tells if any of the booleans is True It returns the full mask (not 1 for True). But given that any non-zero is True this is OK. */ -#define gmx_anytrue_pb _mm512_mask2int +#define gmx_simd_anytrue_b _mm512_mask2int /* Conversions only used for PME table lookup */ -static gmx_inline gmx_epi32 -gmx_cvttpr_epi32(gmx_mm_ps a) +static gmx_inline gmx_simd_int32_t +gmx_simd_cvtt_r2i(gmx_mm_ps a) { return _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_ROUND_MODE_DOWN, _MM_EXPADJ_NONE); }; /* These two function only need to be approximate, Newton-Raphson iteration - * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr. + * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r. */ -#define gmx_rsqrt_pr _mm512_rsqrt23_ps -#define gmx_rcp_pr _mm512_rcp23_ps +#define gmx_simd_rsqrt_r _mm512_rsqrt23_ps +#define gmx_simd_rcp_r _mm512_rcp23_ps #define GMX_SIMD_HAVE_EXP -#define gmx_exp_pr _mm512_exp_ps +#define gmx_simd_exp_r _mm512_exp_ps #define GMX_SIMD_HAVE_ERFC -#define gmx_erfc_pr _mm512_erfc_ps +#define gmx_simd_erfc_r _mm512_erfc_ps #define GMX_SIMD_HAVE_TRIGONOMETRIC -#define gmx_sqrt_pr _mm512_sqrt_ps +#define gmx_simd_sqrt_r _mm512_sqrt_ps static gmx_inline int -gmx_sincos_pr(gmx_mm_ps a, - gmx_mm_ps *s, gmx_mm_ps *c) +gmx_simd_sincos_r(gmx_mm_ps a, + gmx_mm_ps *s, gmx_mm_ps *c) { /* TODO (only bond): optimize that both are calculated together. Or (if if that isn't fast on MIC) don't call sincos if only one is needed. */ @@ -155,7 +155,7 @@ gmx_sincos_pr(gmx_mm_ps a, return 0; } -#define gmx_acos_pr _mm512_acos_ps -#define gmx_atan2_pr _mm512_atan2_ps +#define gmx_simd_acos_r _mm512_acos_ps +#define gmx_simd_atan2_r _mm512_atan2_ps #endif /* _general_x86_mic_h_ */ diff --git a/src/gromacs/simd/general_x86_sse2.h b/src/gromacs/simd/general_x86_sse2.h index c6c8b4d3ae..8aa70852f7 100644 --- a/src/gromacs/simd/general_x86_sse2.h +++ b/src/gromacs/simd/general_x86_sse2.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
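/*
 * Illustrative sketch (not part of the patch): the comment in general_x86_mic.h
 * above notes that gmx_simd_rsqrt_r and gmx_simd_rcp_r only need to be
 * approximate, because full accuracy is obtained by Newton-Raphson iteration in
 * gmx_simd_invsqrt_r and gmx_simd_inv_r. With the renamed layer, one such
 * refinement step could look like the helper below; the helper name is
 * hypothetical, and the real routines may apply additional iterations,
 * e.g. in double precision.
 */
static gmx_inline gmx_simd_real_t
gmx_simd_invsqrt_nr1_r(gmx_simd_real_t x)
{
    /* Fast approximation of 1/sqrt(x) */
    gmx_simd_real_t lu    = gmx_simd_rsqrt_r(x);
    gmx_simd_real_t halfx = gmx_simd_mul_r(gmx_simd_set1_r(0.5), x);
    gmx_simd_real_t lusq  = gmx_simd_mul_r(lu, lu);
    /* One Newton-Raphson step: lu*(1.5 - 0.5*x*lu*lu);
     * in the new naming gmx_simd_fnmadd_r(a, b, c) computes -a*b + c.
     */
    gmx_simd_real_t corr  = gmx_simd_fnmadd_r(halfx, lusq, gmx_simd_set1_r(1.5));

    return gmx_simd_mul_r(lu, corr);
}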
@@ -94,7 +94,7 @@ static __m128i gmx_mm_castpd_si128(__m128d a) static void -gmx_mm_printxmm_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm) { float f[4]; @@ -104,7 +104,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm) { float f[4]; @@ -114,7 +114,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmm_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm) { double f[2]; @@ -123,7 +123,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm) } static void -gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm) { double f[2]; @@ -133,7 +133,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) static void -gmx_mm_printxmm_epi32(const char *s, __m128i xmmi) +gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi) { int i[4]; diff --git a/src/gromacs/simd/general_x86_sse4_1.h b/src/gromacs/simd/general_x86_sse4_1.h index be0eaa7fa2..43b83ef90b 100644 --- a/src/gromacs/simd/general_x86_sse4_1.h +++ b/src/gromacs/simd/general_x86_sse4_1.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -94,7 +94,7 @@ static __m128i gmx_mm_castpd_si128(__m128d a) static void -gmx_mm_printxmm_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm) { float f[4]; @@ -104,7 +104,7 @@ gmx_mm_printxmm_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) +gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm) { float f[4]; @@ -114,7 +114,7 @@ gmx_mm_printxmmsum_ps(const char *s, __m128 xmm) static void -gmx_mm_printxmm_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm) { double f[2]; @@ -123,7 +123,7 @@ gmx_mm_printxmm_pd(const char *s, __m128d xmm) } static void -gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) +gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm) { double f[2]; @@ -133,7 +133,7 @@ gmx_mm_printxmmsum_pd(const char *s, __m128d xmm) static void -gmx_mm_printxmm_epi32(const char *s, __m128i xmmi) +gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi) { int i[4]; diff --git a/src/gromacs/simd/macros.h b/src/gromacs/simd/macros.h index 4aad78beb3..a24cd56d4a 100644 --- a/src/gromacs/simd/macros.h +++ b/src/gromacs/simd/macros.h @@ -46,55 +46,55 @@ /* NOTE: SSE2 acceleration does not include floor or blendv */ -#ifdef GMX_SIMD_REFERENCE_PLAIN_C +#ifdef GMX_SIMD_REFERENCE /* Plain C SIMD reference implementation, also serves as documentation */ #define GMX_HAVE_SIMD_MACROS /* Include plain-C reference implementation, also serves as documentation */ #include "gromacs/simd/macros_ref.h" -#define GMX_SIMD_WIDTH_HERE GMX_SIMD_REF_WIDTH +#define GMX_SIMD_REAL_WIDTH GMX_SIMD_REF_WIDTH /* float/double SIMD register type */ -#define gmx_mm_pr gmx_simd_ref_pr +#define gmx_simd_real_t gmx_simd_ref_pr /* boolean SIMD register type */ -#define gmx_mm_pb gmx_simd_ref_pb +#define gmx_simd_bool_t gmx_simd_ref_pb /* integer SIMD register type, only for table indexing and exclusion masks */ 
-#define gmx_epi32 gmx_simd_ref_epi32 -#define GMX_SIMD_EPI32_WIDTH GMX_SIMD_REF_EPI32_WIDTH +#define gmx_simd_int32_t gmx_simd_ref_epi32 +#define GMX_SIMD_INT32_WIDTH GMX_SIMD_REF_EPI32_WIDTH -/* Load GMX_SIMD_WIDTH_HERE reals for memory starting at r */ -#define gmx_load_pr gmx_simd_ref_load_pr +/* Load GMX_SIMD_REAL_WIDTH reals for memory starting at r */ +#define gmx_simd_load_r gmx_simd_ref_load_pr /* Set all SIMD register elements to *r */ -#define gmx_load1_pr gmx_simd_ref_load1_pr -#define gmx_set1_pr gmx_simd_ref_set1_pr -#define gmx_setzero_pr gmx_simd_ref_setzero_pr -#define gmx_store_pr gmx_simd_ref_store_pr - -#define gmx_add_pr gmx_simd_ref_add_pr -#define gmx_sub_pr gmx_simd_ref_sub_pr -#define gmx_mul_pr gmx_simd_ref_mul_pr +#define gmx_simd_load1_r gmx_simd_ref_load1_pr +#define gmx_simd_set1_r gmx_simd_ref_set1_pr +#define gmx_simd_setzero_r gmx_simd_ref_setzero_pr +#define gmx_simd_store_r gmx_simd_ref_store_pr + +#define gmx_simd_add_r gmx_simd_ref_add_pr +#define gmx_simd_sub_r gmx_simd_ref_sub_pr +#define gmx_simd_mul_r gmx_simd_ref_mul_pr /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */ -#define gmx_madd_pr gmx_simd_ref_madd_pr -#define gmx_nmsub_pr gmx_simd_ref_nmsub_pr +#define gmx_simd_fmadd_r gmx_simd_ref_madd_pr +#define gmx_simd_fnmadd_r gmx_simd_ref_nmsub_pr -#define gmx_max_pr gmx_simd_ref_max_pr -#define gmx_blendzero_pr gmx_simd_ref_blendzero_pr +#define gmx_simd_max_r gmx_simd_ref_max_pr +#define gmx_simd_blendzero_r gmx_simd_ref_blendzero_pr -#define gmx_round_pr gmx_simd_ref_round_pr +#define gmx_simd_round_r gmx_simd_ref_round_pr /* Not required, only used to speed up the nbnxn tabulated PME kernels */ #define GMX_SIMD_HAVE_FLOOR #ifdef GMX_SIMD_HAVE_FLOOR -#define gmx_floor_pr gmx_simd_ref_floor_pr +#define gmx_simd_floor_r gmx_simd_ref_floor_pr #endif /* Not required, only used when blendv is faster than comparison */ #define GMX_SIMD_HAVE_BLENDV #ifdef GMX_SIMD_HAVE_BLENDV -#define gmx_blendv_pr gmx_simd_ref_blendv_pr +#define gmx_simd_blendv_r gmx_simd_ref_blendv_pr #endif /* Copy the sign of a to b, assumes b >= 0 for efficiency */ @@ -104,39 +104,39 @@ #define gmx_masknot_add_pr gmx_simd_ref_masknot_add_pr /* Comparison */ -#define gmx_cmplt_pr gmx_simd_ref_cmplt_pr +#define gmx_simd_cmplt_r gmx_simd_ref_cmplt_pr /* Logical operations on SIMD booleans */ -#define gmx_and_pb gmx_simd_ref_and_pb -#define gmx_or_pb gmx_simd_ref_or_pb +#define gmx_simd_and_b gmx_simd_ref_and_pb +#define gmx_simd_or_b gmx_simd_ref_or_pb /* Returns a single int (0/1) which tells if any of the 4 booleans is True */ -#define gmx_anytrue_pb gmx_simd_ref_anytrue_pb +#define gmx_simd_anytrue_b gmx_simd_ref_anytrue_pb /* Conversions only used for PME table lookup */ -#define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32 -#define gmx_cvtepi32_pr gmx_simd_ref_cvtepi32_pr +#define gmx_simd_cvtt_r2i gmx_simd_ref_cvttpr_epi32 +#define gmx_simd_cvt_i2r gmx_simd_ref_cvtepi32_pr /* These two function only need to be approximate, Newton-Raphson iteration - * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr. + * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r. 
*/ -#define gmx_rsqrt_pr gmx_simd_ref_rsqrt_pr -#define gmx_rcp_pr gmx_simd_ref_rcp_pr +#define gmx_simd_rsqrt_r gmx_simd_ref_rsqrt_pr +#define gmx_simd_rcp_r gmx_simd_ref_rcp_pr /* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */ #define GMX_SIMD_HAVE_EXP #ifdef GMX_SIMD_HAVE_EXP -#define gmx_exp_pr gmx_simd_ref_exp_pr +#define gmx_simd_exp_r gmx_simd_ref_exp_pr #endif #define GMX_SIMD_HAVE_TRIGONOMETRIC #ifdef GMX_SIMD_HAVE_TRIGONOMETRIC -#define gmx_sqrt_pr gmx_simd_ref_sqrt_pr -#define gmx_sincos_pr gmx_simd_ref_sincos_pr -#define gmx_acos_pr gmx_simd_ref_acos_pr -#define gmx_atan2_pr gmx_simd_ref_atan2_pr +#define gmx_simd_sqrt_r gmx_simd_ref_sqrt_pr +#define gmx_simd_sincos_r gmx_simd_ref_sincos_pr +#define gmx_simd_acos_r gmx_simd_ref_acos_pr +#define gmx_simd_atan2_r gmx_simd_ref_atan2_pr #endif -#endif /* GMX_SIMD_REFERENCE_PLAIN_C */ +#endif /* GMX_SIMD_REFERENCE */ /* The same SIMD macros can be translated to SIMD intrinsics (and compiled @@ -150,7 +150,7 @@ #ifdef GMX_USE_HALF_WIDTH_SIMD_HERE -#if defined GMX_X86_AVX_256 || defined __MIC__ +#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__ /* We have half SIMD width support, continue */ #else #error "half SIMD width intrinsics are not supported" @@ -159,349 +159,349 @@ #if defined GMX_TARGET_X86 && !defined __MIC__ -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER /* This is for general x86 SIMD instruction sets that also support SSE2 */ #define GMX_HAVE_SIMD_MACROS /* Include the highest supported x86 SIMD intrisics + math functions */ -#ifdef GMX_X86_AVX_256 +#ifdef GMX_SIMD_X86_AVX_256_OR_HIGHER #include "general_x86_avx_256.h" #ifdef GMX_DOUBLE #include "math_x86_avx_256_double.h" #else /* GMX_DOUBLE */ #include "math_x86_avx_256_single.h" #endif /* GMX_DOUBLE */ -#else /* GMX_X86_AVX_256 */ -#ifdef GMX_X86_AVX_128_FMA +#else /* GMX_SIMD_X86_AVX_256_OR_HIGHER */ +#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER #include "general_x86_avx_128_fma.h" #ifdef GMX_DOUBLE #include "math_x86_avx_128_fma_double.h" #else /* GMX_DOUBLE */ #include "math_x86_avx_128_fma_single.h" #endif /* GMX_DOUBLE */ -#else /* GMX_X86_AVX_128_FMA */ -#ifdef GMX_X86_SSE4_1 +#else /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */ +#ifdef GMX_SIMD_X86_SSE4_1 #include "general_x86_sse4_1.h" #ifdef GMX_DOUBLE #include "math_x86_sse4_1_double.h" #else /* GMX_DOUBLE */ #include "math_x86_sse4_1_single.h" #endif /* GMX_DOUBLE */ -#else /* GMX_X86_SSE4_1 */ -#ifdef GMX_X86_SSE2 +#else /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */ +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER #include "general_x86_sse2.h" #ifdef GMX_DOUBLE #include "math_x86_sse2_double.h" #else /* GMX_DOUBLE */ #include "math_x86_sse2_single.h" #endif /* GMX_DOUBLE */ -#else /* GMX_X86_SSE2 */ +#else /* GMX_SIMD_X86_SSE2_OR_HIGHER */ #error No x86 acceleration defined -#endif /* GMX_X86_SSE2 */ -#endif /* GMX_X86_SSE4_1 */ -#endif /* GMX_X86_AVX_128_FMA */ -#endif /* GMX_X86_AVX_256 */ +#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */ +#endif /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */ +#endif /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */ +#endif /* GMX_SIMD_X86_AVX_256_OR_HIGHER */ /* exp and trigonometric functions are included above */ #define GMX_SIMD_HAVE_EXP #define GMX_SIMD_HAVE_ERFC #define GMX_SIMD_HAVE_TRIGONOMETRIC -#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE +#if !defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined GMX_USE_HALF_WIDTH_SIMD_HERE #ifndef GMX_DOUBLE -#define GMX_SIMD_WIDTH_HERE 4 +#define GMX_SIMD_REAL_WIDTH 4 -#define gmx_mm_pr __m128 +#define 
gmx_simd_real_t __m128 -#define gmx_mm_pb __m128 +#define gmx_simd_bool_t __m128 -#define gmx_epi32 __m128i -#define GMX_SIMD_EPI32_WIDTH 4 +#define gmx_simd_int32_t __m128i +#define GMX_SIMD_INT32_WIDTH 4 -#define gmx_load_pr _mm_load_ps -#define gmx_load1_pr _mm_load1_ps -#define gmx_set1_pr _mm_set1_ps -#define gmx_setzero_pr _mm_setzero_ps -#define gmx_store_pr _mm_store_ps +#define gmx_simd_load_r _mm_load_ps +#define gmx_simd_load1_r _mm_load1_ps +#define gmx_simd_set1_r _mm_set1_ps +#define gmx_simd_setzero_r _mm_setzero_ps +#define gmx_simd_store_r _mm_store_ps -#define gmx_add_pr _mm_add_ps -#define gmx_sub_pr _mm_sub_ps -#define gmx_mul_pr _mm_mul_ps -#ifdef GMX_X86_AVX_128_FMA +#define gmx_simd_add_r _mm_add_ps +#define gmx_simd_sub_r _mm_sub_ps +#define gmx_simd_mul_r _mm_mul_ps +#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER #define GMX_SIMD_HAVE_FMA -#define gmx_madd_pr(a, b, c) _mm_macc_ps(a, b, c) -#define gmx_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c) +#define gmx_simd_fmadd_r(a, b, c) _mm_macc_ps(a, b, c) +#define gmx_simd_fnmadd_r(a, b, c) _mm_nmacc_ps(a, b, c) #else -#define gmx_madd_pr(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) -#define gmx_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) +#define gmx_simd_fmadd_r(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) +#define gmx_simd_fnmadd_r(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) #endif -#define gmx_max_pr _mm_max_ps -#define gmx_blendzero_pr _mm_and_ps +#define gmx_simd_max_r _mm_max_ps +#define gmx_simd_blendzero_r _mm_and_ps -#define gmx_cmplt_pr _mm_cmplt_ps -#define gmx_and_pb _mm_and_ps -#define gmx_or_pb _mm_or_ps +#define gmx_simd_cmplt_r _mm_cmplt_ps +#define gmx_simd_and_b _mm_and_ps +#define gmx_simd_or_b _mm_or_ps -#ifdef GMX_X86_SSE4_1 -#define gmx_round_pr(x) _mm_round_ps(x, 0x0) +#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER +#define gmx_simd_round_r(x) _mm_round_ps(x, 0x0) #define GMX_SIMD_HAVE_FLOOR -#define gmx_floor_pr _mm_floor_ps +#define gmx_simd_floor_r _mm_floor_ps #else -#define gmx_round_pr(x) _mm_cvtepi32_ps(_mm_cvtps_epi32(x)) +#define gmx_simd_round_r(x) _mm_cvtepi32_ps(_mm_cvtps_epi32(x)) #endif -#ifdef GMX_X86_SSE4_1 +#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER #define GMX_SIMD_HAVE_BLENDV -#define gmx_blendv_pr _mm_blendv_ps +#define gmx_simd_blendv_r _mm_blendv_ps #endif -static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b) { /* The value -0.0 has only the sign-bit set */ - gmx_mm_pr sign_mask = _mm_set1_ps(-0.0); + gmx_simd_real_t sign_mask = _mm_set1_ps(-0.0); return _mm_or_ps(_mm_and_ps(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }; -#define gmx_anytrue_pb _mm_movemask_ps +#define gmx_simd_anytrue_b _mm_movemask_ps -#define gmx_cvttpr_epi32 _mm_cvttps_epi32 -#define gmx_cvtepi32_pr _mm_cvtepi32_ps +#define gmx_simd_cvtt_r2i _mm_cvttps_epi32 +#define gmx_simd_cvt_i2r _mm_cvtepi32_ps -#define gmx_rsqrt_pr _mm_rsqrt_ps -#define gmx_rcp_pr _mm_rcp_ps +#define gmx_simd_rsqrt_r _mm_rsqrt_ps +#define gmx_simd_rcp_r _mm_rcp_ps -#define gmx_exp_pr gmx_mm_exp_ps -#define gmx_sqrt_pr gmx_mm_sqrt_ps -#define gmx_sincos_pr gmx_mm_sincos_ps -#define gmx_acos_pr gmx_mm_acos_ps -#define gmx_atan2_pr gmx_mm_atan2_ps -#define gmx_erfc_pr gmx_mm_erfc_ps +#define gmx_simd_exp_r gmx_mm_exp_ps +#define 
gmx_simd_sqrt_r gmx_mm_sqrt_ps +#define gmx_simd_sincos_r gmx_mm_sincos_ps +#define gmx_simd_acos_r gmx_mm_acos_ps +#define gmx_simd_atan2_r gmx_mm_atan2_ps +#define gmx_simd_erfc_r gmx_mm_erfc_ps #else /* ifndef GMX_DOUBLE */ -#define GMX_SIMD_WIDTH_HERE 2 +#define GMX_SIMD_REAL_WIDTH 2 -#define gmx_mm_pr __m128d +#define gmx_simd_real_t __m128d -#define gmx_mm_pb __m128d +#define gmx_simd_bool_t __m128d -#define gmx_epi32 __m128i -#define GMX_SIMD_EPI32_WIDTH 4 +#define gmx_simd_int32_t __m128i +#define GMX_SIMD_INT32_WIDTH 4 -#define gmx_load_pr _mm_load_pd -#define gmx_load1_pr _mm_load1_pd -#define gmx_set1_pr _mm_set1_pd -#define gmx_setzero_pr _mm_setzero_pd -#define gmx_store_pr _mm_store_pd +#define gmx_simd_load_r _mm_load_pd +#define gmx_simd_load1_r _mm_load1_pd +#define gmx_simd_set1_r _mm_set1_pd +#define gmx_simd_setzero_r _mm_setzero_pd +#define gmx_simd_store_r _mm_store_pd -#define gmx_add_pr _mm_add_pd -#define gmx_sub_pr _mm_sub_pd -#define gmx_mul_pr _mm_mul_pd -#ifdef GMX_X86_AVX_128_FMA +#define gmx_simd_add_r _mm_add_pd +#define gmx_simd_sub_r _mm_sub_pd +#define gmx_simd_mul_r _mm_mul_pd +#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER #define GMX_SIMD_HAVE_FMA -#define gmx_madd_pr(a, b, c) _mm_macc_pd(a, b, c) -#define gmx_nmsub_pr(a, b, c) _mm_nmacc_pd(a, b, c) +#define gmx_simd_fmadd_r(a, b, c) _mm_macc_pd(a, b, c) +#define gmx_simd_fnmadd_r(a, b, c) _mm_nmacc_pd(a, b, c) #else -#define gmx_madd_pr(a, b, c) _mm_add_pd(c, _mm_mul_pd(a, b)) -#define gmx_nmsub_pr(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b)) +#define gmx_simd_fmadd_r(a, b, c) _mm_add_pd(c, _mm_mul_pd(a, b)) +#define gmx_simd_fnmadd_r(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b)) #endif -#define gmx_max_pr _mm_max_pd -#define gmx_blendzero_pr _mm_and_pd +#define gmx_simd_max_r _mm_max_pd +#define gmx_simd_blendzero_r _mm_and_pd -#ifdef GMX_X86_SSE4_1 -#define gmx_round_pr(x) _mm_round_pd(x, 0x0) +#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER +#define gmx_simd_round_r(x) _mm_round_pd(x, 0x0) #define GMX_SIMD_HAVE_FLOOR -#define gmx_floor_pr _mm_floor_pd +#define gmx_simd_floor_r _mm_floor_pd #else -#define gmx_round_pr(x) _mm_cvtepi32_pd(_mm_cvtpd_epi32(x)) -/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */ +#define gmx_simd_round_r(x) _mm_cvtepi32_pd(_mm_cvtpd_epi32(x)) +/* gmx_simd_floor_r is not used in code for pre-SSE4_1 hardware */ #endif -#ifdef GMX_X86_SSE4_1 +#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER #define GMX_SIMD_HAVE_BLENDV -#define gmx_blendv_pr _mm_blendv_pd +#define gmx_simd_blendv_r _mm_blendv_pd #endif -static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b) { - gmx_mm_pr sign_mask = _mm_set1_pd(-0.0); + gmx_simd_real_t sign_mask = _mm_set1_pd(-0.0); return _mm_or_pd(_mm_and_pd(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); }; -#define gmx_cmplt_pr _mm_cmplt_pd +#define gmx_simd_cmplt_r _mm_cmplt_pd -#define gmx_and_pb _mm_and_pd -#define gmx_or_pb _mm_or_pd +#define gmx_simd_and_b _mm_and_pd +#define gmx_simd_or_b _mm_or_pd -#define gmx_anytrue_pb _mm_movemask_pd +#define gmx_simd_anytrue_b _mm_movemask_pd -#define gmx_cvttpr_epi32 _mm_cvttpd_epi32 -#define gmx_cvtepi32_pr _mm_cvtepi32_pd +#define gmx_simd_cvtt_r2i _mm_cvttpd_epi32 +#define gmx_simd_cvt_i2r _mm_cvtepi32_pd 
-#define gmx_rsqrt_pr(r) _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r))) -#define gmx_rcp_pr(r) _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r))) +#define gmx_simd_rsqrt_r(r) _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r))) +#define gmx_simd_rcp_r(r) _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r))) -#define gmx_exp_pr gmx_mm_exp_pd -#define gmx_sqrt_pr gmx_mm_sqrt_pd -#define gmx_sincos_pr gmx_mm_sincos_pd -#define gmx_acos_pr gmx_mm_acos_pd -#define gmx_atan2_pr gmx_mm_atan2_pd -#define gmx_erfc_pr gmx_mm_erfc_pd +#define gmx_simd_exp_r gmx_mm_exp_pd +#define gmx_simd_sqrt_r gmx_mm_sqrt_pd +#define gmx_simd_sincos_r gmx_mm_sincos_pd +#define gmx_simd_acos_r gmx_mm_acos_pd +#define gmx_simd_atan2_r gmx_mm_atan2_pd +#define gmx_simd_erfc_r gmx_mm_erfc_pd #endif /* ifndef GMX_DOUBLE */ #else -/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE, +/* We have GMX_SIMD_X86_AVX_256_OR_HIGHER and not GMX_USE_HALF_WIDTH_SIMD_HERE, * so we use 256-bit SIMD. */ #ifndef GMX_DOUBLE -#define GMX_SIMD_WIDTH_HERE 8 +#define GMX_SIMD_REAL_WIDTH 8 -#define gmx_mm_pr __m256 +#define gmx_simd_real_t __m256 -#define gmx_mm_pb __m256 +#define gmx_simd_bool_t __m256 -#define gmx_epi32 __m256i -#define GMX_SIMD_EPI32_WIDTH 8 +#define gmx_simd_int32_t __m256i +#define GMX_SIMD_INT32_WIDTH 8 -#define gmx_load_pr _mm256_load_ps -#define gmx_load1_pr(x) _mm256_set1_ps((x)[0]) -#define gmx_set1_pr _mm256_set1_ps -#define gmx_setzero_pr _mm256_setzero_ps -#define gmx_store_pr _mm256_store_ps +#define gmx_simd_load_r _mm256_load_ps +#define gmx_simd_load1_r(x) _mm256_set1_ps((x)[0]) +#define gmx_simd_set1_r _mm256_set1_ps +#define gmx_simd_setzero_r _mm256_setzero_ps +#define gmx_simd_store_r _mm256_store_ps -#define gmx_add_pr _mm256_add_ps -#define gmx_sub_pr _mm256_sub_ps -#define gmx_mul_pr _mm256_mul_ps -#define gmx_madd_pr(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) -#define gmx_nmsub_pr(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b)) -#define gmx_max_pr _mm256_max_ps -#define gmx_blendzero_pr _mm256_and_ps +#define gmx_simd_add_r _mm256_add_ps +#define gmx_simd_sub_r _mm256_sub_ps +#define gmx_simd_mul_r _mm256_mul_ps +#define gmx_simd_fmadd_r(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#define gmx_simd_fnmadd_r(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b)) +#define gmx_simd_max_r _mm256_max_ps +#define gmx_simd_blendzero_r _mm256_and_ps -#define gmx_round_pr(x) _mm256_round_ps(x, 0x0) +#define gmx_simd_round_r(x) _mm256_round_ps(x, 0x0) #define GMX_SIMD_HAVE_FLOOR -#define gmx_floor_pr _mm256_floor_ps +#define gmx_simd_floor_r _mm256_floor_ps #define GMX_SIMD_HAVE_BLENDV -#define gmx_blendv_pr _mm256_blendv_ps +#define gmx_simd_blendv_r _mm256_blendv_ps -static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b) { - gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0); + gmx_simd_real_t sign_mask = _mm256_set1_ps(-0.0); return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); }; /* Less-than (we use ordered, non-signaling, but that's not required) */ -#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11) -#define gmx_and_pb _mm256_and_ps -#define gmx_or_pb _mm256_or_ps +#define gmx_simd_cmplt_r(x, y) _mm256_cmp_ps(x, y, 0x11) +#define gmx_simd_and_b _mm256_and_ps +#define 
gmx_simd_or_b _mm256_or_ps -#define gmx_anytrue_pb _mm256_movemask_ps +#define gmx_simd_anytrue_b _mm256_movemask_ps -#define gmx_cvttpr_epi32 _mm256_cvttps_epi32 +#define gmx_simd_cvtt_r2i _mm256_cvttps_epi32 -#define gmx_rsqrt_pr _mm256_rsqrt_ps -#define gmx_rcp_pr _mm256_rcp_ps +#define gmx_simd_rsqrt_r _mm256_rsqrt_ps +#define gmx_simd_rcp_r _mm256_rcp_ps -#define gmx_exp_pr gmx_mm256_exp_ps -#define gmx_sqrt_pr gmx_mm256_sqrt_ps -#define gmx_sincos_pr gmx_mm256_sincos_ps -#define gmx_acos_pr gmx_mm256_acos_ps -#define gmx_atan2_pr gmx_mm256_atan2_ps -#define gmx_erfc_pr gmx_mm256_erfc_ps +#define gmx_simd_exp_r gmx_mm256_exp_ps +#define gmx_simd_sqrt_r gmx_mm256_sqrt_ps +#define gmx_simd_sincos_r gmx_mm256_sincos_ps +#define gmx_simd_acos_r gmx_mm256_acos_ps +#define gmx_simd_atan2_r gmx_mm256_atan2_ps +#define gmx_simd_erfc_r gmx_mm256_erfc_ps #else /* ifndef GMX_DOUBLE */ -#define GMX_SIMD_WIDTH_HERE 4 +#define GMX_SIMD_REAL_WIDTH 4 -#define gmx_mm_pr __m256d +#define gmx_simd_real_t __m256d -#define gmx_mm_pb __m256d +#define gmx_simd_bool_t __m256d /* We use 128-bit integer registers because of missing 256-bit operations */ -#define gmx_epi32 __m128i -#define GMX_SIMD_EPI32_WIDTH 4 - -#define gmx_load_pr _mm256_load_pd -#define gmx_load1_pr(x) _mm256_set1_pd((x)[0]) -#define gmx_set1_pr _mm256_set1_pd -#define gmx_setzero_pr _mm256_setzero_pd -#define gmx_store_pr _mm256_store_pd - -#define gmx_add_pr _mm256_add_pd -#define gmx_sub_pr _mm256_sub_pd -#define gmx_mul_pr _mm256_mul_pd -#define gmx_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b)) -#define gmx_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b)) -#define gmx_max_pr _mm256_max_pd -#define gmx_blendzero_pr _mm256_and_pd - -#define gmx_round_pr(x) _mm256_round_pd(x, 0x0) +#define gmx_simd_int32_t __m128i +#define GMX_SIMD_INT32_WIDTH 4 + +#define gmx_simd_load_r _mm256_load_pd +#define gmx_simd_load1_r(x) _mm256_set1_pd((x)[0]) +#define gmx_simd_set1_r _mm256_set1_pd +#define gmx_simd_setzero_r _mm256_setzero_pd +#define gmx_simd_store_r _mm256_store_pd + +#define gmx_simd_add_r _mm256_add_pd +#define gmx_simd_sub_r _mm256_sub_pd +#define gmx_simd_mul_r _mm256_mul_pd +#define gmx_simd_fmadd_r(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b)) +#define gmx_simd_fnmadd_r(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b)) +#define gmx_simd_max_r _mm256_max_pd +#define gmx_simd_blendzero_r _mm256_and_pd + +#define gmx_simd_round_r(x) _mm256_round_pd(x, 0x0) #define GMX_SIMD_HAVE_FLOOR -#define gmx_floor_pr _mm256_floor_pd +#define gmx_simd_floor_r _mm256_floor_pd #define GMX_SIMD_HAVE_BLENDV -#define gmx_blendv_pr _mm256_blendv_pd +#define gmx_simd_blendv_r _mm256_blendv_pd -static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b) { - gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0); + gmx_simd_real_t sign_mask = _mm256_set1_pd(-0.0); return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); }; /* Less-than (we use ordered, non-signaling, but that's not required) */ -#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11) +#define gmx_simd_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11) -#define gmx_and_pb _mm256_and_pd -#define gmx_or_pb _mm256_or_pd +#define gmx_simd_and_b _mm256_and_pd 
+#define gmx_simd_or_b _mm256_or_pd -#define gmx_anytrue_pb _mm256_movemask_pd +#define gmx_simd_anytrue_b _mm256_movemask_pd -#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32 +#define gmx_simd_cvtt_r2i _mm256_cvttpd_epi32 -#define gmx_rsqrt_pr(r) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r))) -#define gmx_rcp_pr(r) _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r))) +#define gmx_simd_rsqrt_r(r) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r))) +#define gmx_simd_rcp_r(r) _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r))) -#define gmx_exp_pr gmx_mm256_exp_pd -#define gmx_sqrt_pr gmx_mm256_sqrt_pd -#define gmx_sincos_pr gmx_mm256_sincos_pd -#define gmx_acos_pr gmx_mm256_acos_pd -#define gmx_atan2_pr gmx_mm256_atan2_pd -#define gmx_erfc_pr gmx_mm256_erfc_pd +#define gmx_simd_exp_r gmx_mm256_exp_pd +#define gmx_simd_sqrt_r gmx_mm256_sqrt_pd +#define gmx_simd_sincos_r gmx_mm256_sincos_pd +#define gmx_simd_acos_r gmx_mm256_acos_pd +#define gmx_simd_atan2_r gmx_mm256_atan2_pd +#define gmx_simd_erfc_r gmx_mm256_erfc_pd #endif /* ifndef GMX_DOUBLE */ #endif /* 128- or 256-bit x86 SIMD */ -#endif /* GMX_X86_SSE2 */ +#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */ #endif /* GMX_TARGET_X86 */ -#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#ifdef GMX_SIMD_IBM_QPX /* This hack works on the compilers that can reach this code. A real solution with broader scope will be proposed in master branch. */ @@ -518,13 +518,13 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ /* No need to version the code by the precision, because the QPX AXU extends to and truncates from double precision for free. */ -#define GMX_SIMD_WIDTH_HERE 4 -typedef vector4double gmx_mm_pr; -typedef vector4double gmx_mm_pb; -typedef vector4double gmx_epi32; -#define GMX_SIMD_EPI32_WIDTH 4 +#define GMX_SIMD_REAL_WIDTH 4 +typedef vector4double gmx_simd_real_t; +typedef vector4double gmx_simd_bool_t; +typedef vector4double gmx_simd_int32_t; +#define GMX_SIMD_INT32_WIDTH 4 -static gmx_inline gmx_mm_pr gmx_always_inline gmx_load_pr(const real *a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load_r(const real *a) { #ifdef NDEBUG return vec_ld(0, (real *) a); @@ -533,22 +533,22 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_load_pr(const real *a) #endif } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_load1_pr(const real *a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load1_r(const real *a) { return vec_splats(*a); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_set1_pr(real a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_set1_r(real a) { return vec_splats(a); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_setzero_pr() +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_setzero_r() { return vec_splats(0.0); } -static gmx_inline void gmx_always_inline gmx_store_pr(real *a, gmx_mm_pr b) +static gmx_inline void gmx_always_inline gmx_simd_store_r(real *a, gmx_simd_real_t b) { #ifdef NDEBUG vec_st(b, 0, a); @@ -557,81 +557,81 @@ static gmx_inline void gmx_always_inline gmx_store_pr(real *a, gmx_mm_pr b) #endif } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_add_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_add_r(gmx_simd_real_t a, gmx_simd_real_t b) { return vec_add(a, b); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_sub_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sub_r(gmx_simd_real_t a, gmx_simd_real_t b) { return vec_sub(a, b); } -static gmx_inline 
gmx_mm_pr gmx_always_inline gmx_mul_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_mul_r(gmx_simd_real_t a, gmx_simd_real_t b) { return vec_mul(a, b); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_madd_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c) { return vec_madd(a, b, c); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_nmsub_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fnmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c) { return vec_nmsub(a, b, c); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_max_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_max_r(gmx_simd_real_t a, gmx_simd_real_t b) { return vec_sel(b, a, vec_sub(a, b)); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendzero_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendzero_r(gmx_simd_real_t a, gmx_simd_real_t b) { - return vec_sel(gmx_setzero_pr(), a, b); + return vec_sel(gmx_simd_setzero_r(), a, b); } -static gmx_inline gmx_mm_pb gmx_always_inline gmx_cmplt_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_cmplt_r(gmx_simd_real_t a, gmx_simd_real_t b) { return vec_cmplt(a, b); } -static gmx_inline gmx_mm_pb gmx_always_inline gmx_and_pb(gmx_mm_pb a, gmx_mm_pb b) +static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_and_b(gmx_simd_bool_t a, gmx_simd_bool_t b) { return vec_and(a, b); } -static gmx_inline gmx_mm_pb gmx_always_inline gmx_or_pb(gmx_mm_pb a, gmx_mm_pb b) +static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_or_b(gmx_simd_bool_t a, gmx_simd_bool_t b) { return vec_or(a, b); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_round_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_round_r(gmx_simd_real_t a) { return vec_round(a); } #define GMX_SIMD_HAVE_FLOOR -static gmx_inline gmx_mm_pr gmx_always_inline gmx_floor_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_floor_r(gmx_simd_real_t a) { return vec_floor(a); } #define GMX_SIMD_HAVE_BLENDV -static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendv_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendv_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c) { - return vec_sel(b, a, gmx_cmplt_pr(gmx_setzero_pr(), c)); + return vec_sel(b, a, gmx_simd_cmplt_r(gmx_simd_setzero_r(), c)); } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b) { return vec_cpsgn(a, b); }; -static gmx_inline gmx_mm_pr gmx_always_inline gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c) { - return vec_add(b, vec_sel(c, gmx_setzero_pr(), a)); + return vec_add(b, vec_sel(c, gmx_simd_setzero_r(), a)); }; static gmx_inline gmx_bool gmx_always_inline @@ -640,17 +640,17 @@ GMX_SIMD_IS_TRUE(real x) return x >= 0.0; } -static gmx_inline gmx_epi32 gmx_always_inline gmx_cvttpr_epi32(gmx_mm_pr a) +static gmx_inline gmx_simd_int32_t gmx_always_inline gmx_simd_cvtt_r2i(gmx_simd_real_t a) { return 
vec_ctiwuz(a); } /* Don't want this, we have floor */ -/* #define gmx_cvtepi32_pr vec_cvtepi32 */ +/* #define gmx_simd_cvt_i2r vec_cvtepi32 */ /* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA Architecture only promises 2^-8. So probably no need for Newton-Raphson iterates at single or double. */ -static gmx_inline gmx_mm_pr gmx_always_inline gmx_rsqrt_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rsqrt_r(gmx_simd_real_t a) { return vec_rsqrte(a); } @@ -658,7 +658,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_rsqrt_pr(gmx_mm_pr a) /* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA Architecture only promises 2^-5. So probably no need for Newton-Raphson iterates at single or double. */ -static gmx_inline gmx_mm_pr gmx_always_inline gmx_rcp_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rcp_r(gmx_simd_real_t a) { return vec_re(a); } @@ -667,7 +667,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_rcp_pr(gmx_mm_pr a) compiling on BlueGene/Q with clang */ #define GMX_SIMD_HAVE_EXP -static gmx_inline gmx_mm_pr gmx_always_inline gmx_exp_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_exp_r(gmx_simd_real_t a) { #ifdef __clang__ #ifndef GMX_DOUBLE @@ -684,7 +684,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_exp_pr(gmx_mm_pr a) #endif } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_sqrt_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sqrt_r(gmx_simd_real_t a) { #ifdef NDEBUG return vec_swsqrt_nochk(a); @@ -694,7 +694,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_sqrt_pr(gmx_mm_pr a) } #define GMX_SIMD_HAVE_TRIGONOMETRIC -static gmx_inline int gmx_always_inline gmx_sincos_pr(gmx_mm_pr a, gmx_mm_pr *b, gmx_mm_pr *c) +static gmx_inline int gmx_always_inline gmx_simd_sincos_r(gmx_simd_real_t a, gmx_simd_real_t *b, gmx_simd_real_t *c) { #ifdef __clang__ #ifndef GMX_DOUBLE @@ -712,7 +712,7 @@ static gmx_inline int gmx_always_inline gmx_sincos_pr(gmx_mm_pr a, gmx_mm_pr *b, return 1; } -static gmx_inline gmx_mm_pr gmx_always_inline gmx_acos_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_acos_r(gmx_simd_real_t a) { #ifdef __clang__ #ifndef GMX_DOUBLE @@ -731,7 +731,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_acos_pr(gmx_mm_pr a) /* NB The order of parameters here is correct; the documentation of atan2[df]4 in SIMD MASS is wrong. */ -static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_pr b) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_atan2_r(gmx_simd_real_t a, gmx_simd_real_t b) { #ifdef __clang__ #ifndef GMX_DOUBLE @@ -749,7 +749,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_p } #define GMX_SIMD_HAVE_ERFC -static gmx_inline gmx_mm_pr gmx_always_inline gmx_erfc_pr(gmx_mm_pr a) +static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_erfc_r(gmx_simd_real_t a) { /* The BG/Q qpxmath.h vector math library intended for use with bgclang does not have erfc, so we need to use a function from @@ -765,7 +765,7 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_erfc_pr(gmx_mm_pr a) /* TODO: gmx_mm_erfc_p[sd] should be generalized using gmx_*_pr, so that it just works on BlueGene */ static gmx_inline int gmx_always_inline -gmx_anytrue_pb(gmx_mm_pb a) +gmx_simd_anytrue_b(gmx_simd_bool_t a) { /* The "anytrue" is done solely on the QPX AXU (which is the only available FPU). 
This is awkward, because pretty much no @@ -780,19 +780,19 @@ gmx_anytrue_pb(gmx_mm_pb a) comparison on the zeroth vector element, which avoids needing memory at all. */ - gmx_mm_pb vec_shifted_left_0 = a; - gmx_mm_pb vec_shifted_left_1 = vec_sldw(a, a, 1); - gmx_mm_pb vec_shifted_left_2 = vec_sldw(a, a, 2); - gmx_mm_pb vec_shifted_left_3 = vec_sldw(a, a, 3); + gmx_simd_bool_t vec_shifted_left_0 = a; + gmx_simd_bool_t vec_shifted_left_1 = vec_sldw(a, a, 1); + gmx_simd_bool_t vec_shifted_left_2 = vec_sldw(a, a, 2); + gmx_simd_bool_t vec_shifted_left_3 = vec_sldw(a, a, 3); - gmx_mm_pb vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3), - vec_or(vec_shifted_left_0, vec_shifted_left_1)); + gmx_simd_bool_t vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3), + vec_or(vec_shifted_left_0, vec_shifted_left_1)); return (0.0 < vec_extract(vec_return, 0)); }; #undef gmx_always_inline -#endif /* GMX_CPU_ACCELERATION_IBM_QPX */ +#endif /* GMX_SIMD_IBM_QPX */ #ifdef __MIC__ #include "general_x86_mic.h" @@ -800,20 +800,20 @@ gmx_anytrue_pb(gmx_mm_pb a) #ifdef GMX_HAVE_SIMD_MACROS /* Generic functions to extract a SIMD aligned pointer from a pointer x. - * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared + * x should have at least GMX_SIMD_REAL_WIDTH elements extra compared * to how many you want to use, to avoid indexing outside the aligned region. */ static gmx_inline real * -gmx_simd_align_real(const real *x) +gmx_simd_align_r(const real *x) { - return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1)))); + return (real *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(real)-1)))); } static gmx_inline int * -gmx_simd_align_int(const int *x) +gmx_simd_align_i(const int *x) { - return (int *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1)))); + return (int *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(int )-1)))); } diff --git a/src/gromacs/simd/macros_ref.h b/src/gromacs/simd/macros_ref.h index 80021d0bb3..2f11e04d99 100644 --- a/src/gromacs/simd/macros_ref.h +++ b/src/gromacs/simd/macros_ref.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -410,7 +410,7 @@ gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a) }; /* These two function only need to be approximate, Newton-Raphson iteration - * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr. + * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r. */ static gmx_inline gmx_simd_ref_pr gmx_simd_ref_rsqrt_pr(gmx_simd_ref_pr a) diff --git a/src/gromacs/simd/math_double.h b/src/gromacs/simd/math_double.h index 76dcc95a12..8e7d733185 100644 --- a/src/gromacs/simd/math_double.h +++ b/src/gromacs/simd/math_double.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -37,32 +37,32 @@ /* 1.0/sqrt(x) */ -static gmx_inline gmx_mm_pr -gmx_invsqrt_pr(gmx_mm_pr x) +static gmx_inline gmx_simd_real_t +gmx_simd_invsqrt_r(gmx_simd_real_t x) { - const gmx_mm_pr half = gmx_set1_pr(0.5); - const gmx_mm_pr three = gmx_set1_pr(3.0); + const gmx_simd_real_t half = gmx_simd_set1_r(0.5); + const gmx_simd_real_t three = gmx_simd_set1_r(3.0); /* Lookup instruction only exists in single precision, convert back and forth... */ - gmx_mm_pr lu = gmx_rsqrt_pr(x); + gmx_simd_real_t lu = gmx_simd_rsqrt_r(x); - lu = gmx_mul_pr(gmx_mul_pr(half, lu), gmx_nmsub_pr(gmx_mul_pr(lu, lu), x, three)); - return gmx_mul_pr(gmx_mul_pr(half, lu), gmx_nmsub_pr(gmx_mul_pr(lu, lu), x, three)); + lu = gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three)); + return gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three)); } /* 1.0/x */ -static gmx_inline gmx_mm_pr -gmx_inv_pr(gmx_mm_pr x) +static gmx_inline gmx_simd_real_t +gmx_simd_inv_r(gmx_simd_real_t x) { - const gmx_mm_pr two = gmx_set1_pr(2.0); + const gmx_simd_real_t two = gmx_simd_set1_r(2.0); /* Lookup instruction only exists in single precision, convert back and forth... */ - gmx_mm_pr lu = gmx_rcp_pr(x); + gmx_simd_real_t lu = gmx_simd_rcp_r(x); /* Perform two N-R steps for double precision */ - lu = gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two)); - return gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two)); + lu = gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two)); + return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two)); } @@ -134,54 +134,54 @@ gmx_inv_pr(gmx_mm_pr x) * vectorial force to add to the particles. 
* */ -static gmx_mm_pr -gmx_pmecorrF_pr(gmx_mm_pr z2) +static gmx_simd_real_t +gmx_simd_pmecorrF_r(gmx_simd_real_t z2) { - const gmx_mm_pr FN10 = gmx_set1_pr(-8.0072854618360083154e-14); - const gmx_mm_pr FN9 = gmx_set1_pr(1.1859116242260148027e-11); - const gmx_mm_pr FN8 = gmx_set1_pr(-8.1490406329798423616e-10); - const gmx_mm_pr FN7 = gmx_set1_pr(3.4404793543907847655e-8); - const gmx_mm_pr FN6 = gmx_set1_pr(-9.9471420832602741006e-7); - const gmx_mm_pr FN5 = gmx_set1_pr(0.000020740315999115847456); - const gmx_mm_pr FN4 = gmx_set1_pr(-0.00031991745139313364005); - const gmx_mm_pr FN3 = gmx_set1_pr(0.0035074449373659008203); - const gmx_mm_pr FN2 = gmx_set1_pr(-0.031750380176100813405); - const gmx_mm_pr FN1 = gmx_set1_pr(0.13884101728898463426); - const gmx_mm_pr FN0 = gmx_set1_pr(-0.75225277815249618847); + const gmx_simd_real_t FN10 = gmx_simd_set1_r(-8.0072854618360083154e-14); + const gmx_simd_real_t FN9 = gmx_simd_set1_r(1.1859116242260148027e-11); + const gmx_simd_real_t FN8 = gmx_simd_set1_r(-8.1490406329798423616e-10); + const gmx_simd_real_t FN7 = gmx_simd_set1_r(3.4404793543907847655e-8); + const gmx_simd_real_t FN6 = gmx_simd_set1_r(-9.9471420832602741006e-7); + const gmx_simd_real_t FN5 = gmx_simd_set1_r(0.000020740315999115847456); + const gmx_simd_real_t FN4 = gmx_simd_set1_r(-0.00031991745139313364005); + const gmx_simd_real_t FN3 = gmx_simd_set1_r(0.0035074449373659008203); + const gmx_simd_real_t FN2 = gmx_simd_set1_r(-0.031750380176100813405); + const gmx_simd_real_t FN1 = gmx_simd_set1_r(0.13884101728898463426); + const gmx_simd_real_t FN0 = gmx_simd_set1_r(-0.75225277815249618847); - const gmx_mm_pr FD5 = gmx_set1_pr(0.000016009278224355026701); - const gmx_mm_pr FD4 = gmx_set1_pr(0.00051055686934806966046); - const gmx_mm_pr FD3 = gmx_set1_pr(0.0081803507497974289008); - const gmx_mm_pr FD2 = gmx_set1_pr(0.077181146026670287235); - const gmx_mm_pr FD1 = gmx_set1_pr(0.41543303143712535988); - const gmx_mm_pr FD0 = gmx_set1_pr(1.0); + const gmx_simd_real_t FD5 = gmx_simd_set1_r(0.000016009278224355026701); + const gmx_simd_real_t FD4 = gmx_simd_set1_r(0.00051055686934806966046); + const gmx_simd_real_t FD3 = gmx_simd_set1_r(0.0081803507497974289008); + const gmx_simd_real_t FD2 = gmx_simd_set1_r(0.077181146026670287235); + const gmx_simd_real_t FD1 = gmx_simd_set1_r(0.41543303143712535988); + const gmx_simd_real_t FD0 = gmx_simd_set1_r(1.0); - gmx_mm_pr z4; - gmx_mm_pr polyFN0, polyFN1, polyFD0, polyFD1; + gmx_simd_real_t z4; + gmx_simd_real_t polyFN0, polyFN1, polyFD0, polyFD1; - z4 = gmx_mul_pr(z2, z2); + z4 = gmx_simd_mul_r(z2, z2); - polyFD1 = gmx_madd_pr(FD5, z4, FD3); - polyFD1 = gmx_madd_pr(polyFD1, z4, FD1); - polyFD1 = gmx_mul_pr(polyFD1, z2); - polyFD0 = gmx_madd_pr(FD4, z4, FD2); - polyFD0 = gmx_madd_pr(polyFD0, z4, FD0); - polyFD0 = gmx_add_pr(polyFD0, polyFD1); + polyFD1 = gmx_simd_fmadd_r(FD5, z4, FD3); + polyFD1 = gmx_simd_fmadd_r(polyFD1, z4, FD1); + polyFD1 = gmx_simd_mul_r(polyFD1, z2); + polyFD0 = gmx_simd_fmadd_r(FD4, z4, FD2); + polyFD0 = gmx_simd_fmadd_r(polyFD0, z4, FD0); + polyFD0 = gmx_simd_add_r(polyFD0, polyFD1); - polyFD0 = gmx_inv_pr(polyFD0); + polyFD0 = gmx_simd_inv_r(polyFD0); - polyFN0 = gmx_madd_pr(FN10, z4, FN8); - polyFN0 = gmx_madd_pr(polyFN0, z4, FN6); - polyFN0 = gmx_madd_pr(polyFN0, z4, FN4); - polyFN0 = gmx_madd_pr(polyFN0, z4, FN2); - polyFN0 = gmx_madd_pr(polyFN0, z4, FN0); - polyFN1 = gmx_madd_pr(FN9, z4, FN7); - polyFN1 = gmx_madd_pr(polyFN1, z4, FN5); - polyFN1 = gmx_madd_pr(polyFN1, z4, FN3); - polyFN1 = 
gmx_madd_pr(polyFN1, z4, FN1); - polyFN0 = gmx_madd_pr(polyFN1, z2, polyFN0); + polyFN0 = gmx_simd_fmadd_r(FN10, z4, FN8); + polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN6); + polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN4); + polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN2); + polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN0); + polyFN1 = gmx_simd_fmadd_r(FN9, z4, FN7); + polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN5); + polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN3); + polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN1); + polyFN0 = gmx_simd_fmadd_r(polyFN1, z2, polyFN0); - return gmx_mul_pr(polyFN0, polyFD0); + return gmx_simd_mul_r(polyFN0, polyFD0); } @@ -212,51 +212,51 @@ gmx_pmecorrF_pr(gmx_mm_pr z2) * and you have your potential. * */ -static gmx_mm_pr -gmx_pmecorrV_pr(gmx_mm_pr z2) +static gmx_simd_real_t +gmx_simd_pmecorrV_r(gmx_simd_real_t z2) { - const gmx_mm_pr VN9 = gmx_set1_pr(-9.3723776169321855475e-13); - const gmx_mm_pr VN8 = gmx_set1_pr(1.2280156762674215741e-10); - const gmx_mm_pr VN7 = gmx_set1_pr(-7.3562157912251309487e-9); - const gmx_mm_pr VN6 = gmx_set1_pr(2.6215886208032517509e-7); - const gmx_mm_pr VN5 = gmx_set1_pr(-4.9532491651265819499e-6); - const gmx_mm_pr VN4 = gmx_set1_pr(0.00025907400778966060389); - const gmx_mm_pr VN3 = gmx_set1_pr(0.0010585044856156469792); - const gmx_mm_pr VN2 = gmx_set1_pr(0.045247661136833092885); - const gmx_mm_pr VN1 = gmx_set1_pr(0.11643931522926034421); - const gmx_mm_pr VN0 = gmx_set1_pr(1.1283791671726767970); + const gmx_simd_real_t VN9 = gmx_simd_set1_r(-9.3723776169321855475e-13); + const gmx_simd_real_t VN8 = gmx_simd_set1_r(1.2280156762674215741e-10); + const gmx_simd_real_t VN7 = gmx_simd_set1_r(-7.3562157912251309487e-9); + const gmx_simd_real_t VN6 = gmx_simd_set1_r(2.6215886208032517509e-7); + const gmx_simd_real_t VN5 = gmx_simd_set1_r(-4.9532491651265819499e-6); + const gmx_simd_real_t VN4 = gmx_simd_set1_r(0.00025907400778966060389); + const gmx_simd_real_t VN3 = gmx_simd_set1_r(0.0010585044856156469792); + const gmx_simd_real_t VN2 = gmx_simd_set1_r(0.045247661136833092885); + const gmx_simd_real_t VN1 = gmx_simd_set1_r(0.11643931522926034421); + const gmx_simd_real_t VN0 = gmx_simd_set1_r(1.1283791671726767970); - const gmx_mm_pr VD5 = gmx_set1_pr(0.000021784709867336150342); - const gmx_mm_pr VD4 = gmx_set1_pr(0.00064293662010911388448); - const gmx_mm_pr VD3 = gmx_set1_pr(0.0096311444822588683504); - const gmx_mm_pr VD2 = gmx_set1_pr(0.085608012351550627051); - const gmx_mm_pr VD1 = gmx_set1_pr(0.43652499166614811084); - const gmx_mm_pr VD0 = gmx_set1_pr(1.0); + const gmx_simd_real_t VD5 = gmx_simd_set1_r(0.000021784709867336150342); + const gmx_simd_real_t VD4 = gmx_simd_set1_r(0.00064293662010911388448); + const gmx_simd_real_t VD3 = gmx_simd_set1_r(0.0096311444822588683504); + const gmx_simd_real_t VD2 = gmx_simd_set1_r(0.085608012351550627051); + const gmx_simd_real_t VD1 = gmx_simd_set1_r(0.43652499166614811084); + const gmx_simd_real_t VD0 = gmx_simd_set1_r(1.0); - gmx_mm_pr z4; - gmx_mm_pr polyVN0, polyVN1, polyVD0, polyVD1; + gmx_simd_real_t z4; + gmx_simd_real_t polyVN0, polyVN1, polyVD0, polyVD1; - z4 = gmx_mul_pr(z2, z2); + z4 = gmx_simd_mul_r(z2, z2); - polyVD1 = gmx_madd_pr(VD5, z4, VD3); - polyVD0 = gmx_madd_pr(VD4, z4, VD2); - polyVD1 = gmx_madd_pr(polyVD1, z4, VD1); - polyVD0 = gmx_madd_pr(polyVD0, z4, VD0); - polyVD0 = gmx_madd_pr(polyVD1, z2, polyVD0); + polyVD1 = gmx_simd_fmadd_r(VD5, z4, VD3); + polyVD0 = gmx_simd_fmadd_r(VD4, z4, VD2); + polyVD1 = gmx_simd_fmadd_r(polyVD1, z4, VD1); + polyVD0 = 
gmx_simd_fmadd_r(polyVD0, z4, VD0); + polyVD0 = gmx_simd_fmadd_r(polyVD1, z2, polyVD0); - polyVD0 = gmx_inv_pr(polyVD0); + polyVD0 = gmx_simd_inv_r(polyVD0); - polyVN1 = gmx_madd_pr(VN9, z4, VN7); - polyVN0 = gmx_madd_pr(VN8, z4, VN6); - polyVN1 = gmx_madd_pr(polyVN1, z4, VN5); - polyVN0 = gmx_madd_pr(polyVN0, z4, VN4); - polyVN1 = gmx_madd_pr(polyVN1, z4, VN3); - polyVN0 = gmx_madd_pr(polyVN0, z4, VN2); - polyVN1 = gmx_madd_pr(polyVN1, z4, VN1); - polyVN0 = gmx_madd_pr(polyVN0, z4, VN0); - polyVN0 = gmx_madd_pr(polyVN1, z2, polyVN0); + polyVN1 = gmx_simd_fmadd_r(VN9, z4, VN7); + polyVN0 = gmx_simd_fmadd_r(VN8, z4, VN6); + polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN5); + polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN4); + polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN3); + polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN2); + polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN1); + polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN0); + polyVN0 = gmx_simd_fmadd_r(polyVN1, z2, polyVN0); - return gmx_mul_pr(polyVN0, polyVD0); + return gmx_simd_mul_r(polyVN0, polyVD0); } diff --git a/src/gromacs/simd/math_single.h b/src/gromacs/simd/math_single.h index 377855c549..c956b9ad86 100644 --- a/src/gromacs/simd/math_single.h +++ b/src/gromacs/simd/math_single.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -37,40 +37,40 @@ /* 1.0/sqrt(x) */ -static gmx_inline gmx_mm_pr -gmx_invsqrt_pr(gmx_mm_pr x) +static gmx_inline gmx_simd_real_t +gmx_simd_invsqrt_r(gmx_simd_real_t x) { /* This is one of the few cases where FMA adds a FLOP, but ends up with * less instructions in total when FMA is available in hardware. * Usually we would not optimize this far, but invsqrt is used often. */ #ifdef GMX_SIMD_HAVE_FMA - const gmx_mm_pr half = gmx_set1_pr(0.5); - const gmx_mm_pr one = gmx_set1_pr(1.0); + const gmx_simd_real_t half = gmx_simd_set1_r(0.5); + const gmx_simd_real_t one = gmx_simd_set1_r(1.0); - gmx_mm_pr lu = gmx_rsqrt_pr(x); + gmx_simd_real_t lu = gmx_simd_rsqrt_r(x); - return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu); + return gmx_simd_fmadd_r(gmx_simd_fnmadd_r(x, gmx_simd_mul_r(lu, lu), one), gmx_simd_mul_r(lu, half), lu); #else - const gmx_mm_pr half = gmx_set1_pr(0.5); - const gmx_mm_pr three = gmx_set1_pr(3.0); + const gmx_simd_real_t half = gmx_simd_set1_r(0.5); + const gmx_simd_real_t three = gmx_simd_set1_r(3.0); - gmx_mm_pr lu = gmx_rsqrt_pr(x); + gmx_simd_real_t lu = gmx_simd_rsqrt_r(x); - return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu)); + return gmx_simd_mul_r(half, gmx_simd_mul_r(gmx_simd_sub_r(three, gmx_simd_mul_r(gmx_simd_mul_r(lu, lu), x)), lu)); #endif } /* 1.0/x */ -static gmx_inline gmx_mm_pr -gmx_inv_pr(gmx_mm_pr x) +static gmx_inline gmx_simd_real_t +gmx_simd_inv_r(gmx_simd_real_t x) { - const gmx_mm_pr two = gmx_set1_pr(2.0); + const gmx_simd_real_t two = gmx_simd_set1_r(2.0); - gmx_mm_pr lu = gmx_rcp_pr(x); + gmx_simd_real_t lu = gmx_simd_rcp_r(x); - return gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two)); + return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two)); } @@ -142,49 +142,49 @@ gmx_inv_pr(gmx_mm_pr x) * vectorial force to add to the particles. 
* */ -static gmx_mm_pr -gmx_pmecorrF_pr(gmx_mm_pr z2) +static gmx_simd_real_t +gmx_simd_pmecorrF_r(gmx_simd_real_t z2) { - const gmx_mm_pr FN6 = gmx_set1_pr(-1.7357322914161492954e-8f); - const gmx_mm_pr FN5 = gmx_set1_pr(1.4703624142580877519e-6f); - const gmx_mm_pr FN4 = gmx_set1_pr(-0.000053401640219807709149f); - const gmx_mm_pr FN3 = gmx_set1_pr(0.0010054721316683106153f); - const gmx_mm_pr FN2 = gmx_set1_pr(-0.019278317264888380590f); - const gmx_mm_pr FN1 = gmx_set1_pr(0.069670166153766424023f); - const gmx_mm_pr FN0 = gmx_set1_pr(-0.75225204789749321333f); - - const gmx_mm_pr FD4 = gmx_set1_pr(0.0011193462567257629232f); - const gmx_mm_pr FD3 = gmx_set1_pr(0.014866955030185295499f); - const gmx_mm_pr FD2 = gmx_set1_pr(0.11583842382862377919f); - const gmx_mm_pr FD1 = gmx_set1_pr(0.50736591960530292870f); - const gmx_mm_pr FD0 = gmx_set1_pr(1.0f); - - gmx_mm_pr z4; - gmx_mm_pr polyFN0, polyFN1, polyFD0, polyFD1; - - z4 = gmx_mul_pr(z2, z2); - - polyFD0 = gmx_madd_pr(FD4, z4, FD2); - polyFD1 = gmx_madd_pr(FD3, z4, FD1); - polyFD0 = gmx_madd_pr(polyFD0, z4, FD0); - polyFD0 = gmx_madd_pr(polyFD1, z2, polyFD0); - - polyFD0 = gmx_inv_pr(polyFD0); - - polyFN0 = gmx_madd_pr(FN6, z4, FN4); - polyFN1 = gmx_madd_pr(FN5, z4, FN3); - polyFN0 = gmx_madd_pr(polyFN0, z4, FN2); - polyFN1 = gmx_madd_pr(polyFN1, z4, FN1); - polyFN0 = gmx_madd_pr(polyFN0, z4, FN0); - polyFN0 = gmx_madd_pr(polyFN1, z2, polyFN0); - - return gmx_mul_pr(polyFN0, polyFD0); + const gmx_simd_real_t FN6 = gmx_simd_set1_r(-1.7357322914161492954e-8f); + const gmx_simd_real_t FN5 = gmx_simd_set1_r(1.4703624142580877519e-6f); + const gmx_simd_real_t FN4 = gmx_simd_set1_r(-0.000053401640219807709149f); + const gmx_simd_real_t FN3 = gmx_simd_set1_r(0.0010054721316683106153f); + const gmx_simd_real_t FN2 = gmx_simd_set1_r(-0.019278317264888380590f); + const gmx_simd_real_t FN1 = gmx_simd_set1_r(0.069670166153766424023f); + const gmx_simd_real_t FN0 = gmx_simd_set1_r(-0.75225204789749321333f); + + const gmx_simd_real_t FD4 = gmx_simd_set1_r(0.0011193462567257629232f); + const gmx_simd_real_t FD3 = gmx_simd_set1_r(0.014866955030185295499f); + const gmx_simd_real_t FD2 = gmx_simd_set1_r(0.11583842382862377919f); + const gmx_simd_real_t FD1 = gmx_simd_set1_r(0.50736591960530292870f); + const gmx_simd_real_t FD0 = gmx_simd_set1_r(1.0f); + + gmx_simd_real_t z4; + gmx_simd_real_t polyFN0, polyFN1, polyFD0, polyFD1; + + z4 = gmx_simd_mul_r(z2, z2); + + polyFD0 = gmx_simd_fmadd_r(FD4, z4, FD2); + polyFD1 = gmx_simd_fmadd_r(FD3, z4, FD1); + polyFD0 = gmx_simd_fmadd_r(polyFD0, z4, FD0); + polyFD0 = gmx_simd_fmadd_r(polyFD1, z2, polyFD0); + + polyFD0 = gmx_simd_inv_r(polyFD0); + + polyFN0 = gmx_simd_fmadd_r(FN6, z4, FN4); + polyFN1 = gmx_simd_fmadd_r(FN5, z4, FN3); + polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN2); + polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN1); + polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN0); + polyFN0 = gmx_simd_fmadd_r(polyFN1, z2, polyFN0); + + return gmx_simd_mul_r(polyFN0, polyFD0); } /* Calculate the potential correction due to PME analytically. * - * See gmx_pmecorrF_pr() for details about the approximation. + * See gmx_simd_pmecorrF_r() for details about the approximation. * * This routine calculates Erf(z)/z, although you should provide z^2 * as the input argument. @@ -210,41 +210,41 @@ gmx_pmecorrF_pr(gmx_mm_pr z2) * 6. Add the result to 1/r, multiply by the product of the charges, * and you have your potential. 
*/ -static gmx_mm_pr -gmx_pmecorrV_pr(gmx_mm_pr z2) +static gmx_simd_real_t +gmx_simd_pmecorrV_r(gmx_simd_real_t z2) { - const gmx_mm_pr VN6 = gmx_set1_pr(1.9296833005951166339e-8f); - const gmx_mm_pr VN5 = gmx_set1_pr(-1.4213390571557850962e-6f); - const gmx_mm_pr VN4 = gmx_set1_pr(0.000041603292906656984871f); - const gmx_mm_pr VN3 = gmx_set1_pr(-0.00013134036773265025626f); - const gmx_mm_pr VN2 = gmx_set1_pr(0.038657983986041781264f); - const gmx_mm_pr VN1 = gmx_set1_pr(0.11285044772717598220f); - const gmx_mm_pr VN0 = gmx_set1_pr(1.1283802385263030286f); - - const gmx_mm_pr VD3 = gmx_set1_pr(0.0066752224023576045451f); - const gmx_mm_pr VD2 = gmx_set1_pr(0.078647795836373922256f); - const gmx_mm_pr VD1 = gmx_set1_pr(0.43336185284710920150f); - const gmx_mm_pr VD0 = gmx_set1_pr(1.0f); - - gmx_mm_pr z4; - gmx_mm_pr polyVN0, polyVN1, polyVD0, polyVD1; - - z4 = gmx_mul_pr(z2, z2); - - polyVD1 = gmx_madd_pr(VD3, z4, VD1); - polyVD0 = gmx_madd_pr(VD2, z4, VD0); - polyVD0 = gmx_madd_pr(polyVD1, z2, polyVD0); - - polyVD0 = gmx_inv_pr(polyVD0); - - polyVN0 = gmx_madd_pr(VN6, z4, VN4); - polyVN1 = gmx_madd_pr(VN5, z4, VN3); - polyVN0 = gmx_madd_pr(polyVN0, z4, VN2); - polyVN1 = gmx_madd_pr(polyVN1, z4, VN1); - polyVN0 = gmx_madd_pr(polyVN0, z4, VN0); - polyVN0 = gmx_madd_pr(polyVN1, z2, polyVN0); - - return gmx_mul_pr(polyVN0, polyVD0); + const gmx_simd_real_t VN6 = gmx_simd_set1_r(1.9296833005951166339e-8f); + const gmx_simd_real_t VN5 = gmx_simd_set1_r(-1.4213390571557850962e-6f); + const gmx_simd_real_t VN4 = gmx_simd_set1_r(0.000041603292906656984871f); + const gmx_simd_real_t VN3 = gmx_simd_set1_r(-0.00013134036773265025626f); + const gmx_simd_real_t VN2 = gmx_simd_set1_r(0.038657983986041781264f); + const gmx_simd_real_t VN1 = gmx_simd_set1_r(0.11285044772717598220f); + const gmx_simd_real_t VN0 = gmx_simd_set1_r(1.1283802385263030286f); + + const gmx_simd_real_t VD3 = gmx_simd_set1_r(0.0066752224023576045451f); + const gmx_simd_real_t VD2 = gmx_simd_set1_r(0.078647795836373922256f); + const gmx_simd_real_t VD1 = gmx_simd_set1_r(0.43336185284710920150f); + const gmx_simd_real_t VD0 = gmx_simd_set1_r(1.0f); + + gmx_simd_real_t z4; + gmx_simd_real_t polyVN0, polyVN1, polyVD0, polyVD1; + + z4 = gmx_simd_mul_r(z2, z2); + + polyVD1 = gmx_simd_fmadd_r(VD3, z4, VD1); + polyVD0 = gmx_simd_fmadd_r(VD2, z4, VD0); + polyVD0 = gmx_simd_fmadd_r(polyVD1, z2, polyVD0); + + polyVD0 = gmx_simd_inv_r(polyVD0); + + polyVN0 = gmx_simd_fmadd_r(VN6, z4, VN4); + polyVN1 = gmx_simd_fmadd_r(VN5, z4, VN3); + polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN2); + polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN1); + polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN0); + polyVN0 = gmx_simd_fmadd_r(polyVN1, z2, polyVN0); + + return gmx_simd_mul_r(polyVN0, polyVD0); } diff --git a/src/gromacs/simd/vector_operations.h b/src/gromacs/simd/vector_operations.h index 42448ada0d..1fb9a142e1 100644 --- a/src/gromacs/simd/vector_operations.h +++ b/src/gromacs/simd/vector_operations.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -52,60 +52,60 @@ /* x^2 + y^2 + z^2 */ -static gmx_inline gmx_mm_pr -gmx_calc_rsq_pr(gmx_mm_pr x, gmx_mm_pr y, gmx_mm_pr z) +static gmx_inline gmx_simd_real_t +gmx_simd_calc_rsq_r(gmx_simd_real_t x, gmx_simd_real_t y, gmx_simd_real_t z) { - return gmx_madd_pr(z, z, gmx_madd_pr(y, y, gmx_mul_pr(x, x))); + return gmx_simd_fmadd_r(z, z, gmx_simd_fmadd_r(y, y, gmx_simd_mul_r(x, x))); } /* inner-product of multiple vectors */ -static gmx_inline gmx_mm_pr -gmx_iprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az, - gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz) +static gmx_inline gmx_simd_real_t +gmx_simd_iprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az, + gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz) { - gmx_mm_pr ret; + gmx_simd_real_t ret; - ret = gmx_mul_pr(ax, bx); - ret = gmx_madd_pr(ay, by, ret); - ret = gmx_madd_pr(az, bz, ret); + ret = gmx_simd_mul_r(ax, bx); + ret = gmx_simd_fmadd_r(ay, by, ret); + ret = gmx_simd_fmadd_r(az, bz, ret); return ret; } /* norm squared of multiple vectors */ -static gmx_inline gmx_mm_pr -gmx_norm2_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az) +static gmx_inline gmx_simd_real_t +gmx_simd_norm2_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az) { - gmx_mm_pr ret; + gmx_simd_real_t ret; - ret = gmx_mul_pr(ax, ax); - ret = gmx_madd_pr(ay, ay, ret); - ret = gmx_madd_pr(az, az, ret); + ret = gmx_simd_mul_r(ax, ax); + ret = gmx_simd_fmadd_r(ay, ay, ret); + ret = gmx_simd_fmadd_r(az, az, ret); return ret; } /* cross-product of multiple vectors */ static gmx_inline void -gmx_cprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az, - gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz, - gmx_mm_pr *cx, gmx_mm_pr *cy, gmx_mm_pr *cz) +gmx_simd_cprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az, + gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz, + gmx_simd_real_t *cx, gmx_simd_real_t *cy, gmx_simd_real_t *cz) { - *cx = gmx_mul_pr(ay, bz); - *cx = gmx_nmsub_pr(az, by, *cx); + *cx = gmx_simd_mul_r(ay, bz); + *cx = gmx_simd_fnmadd_r(az, by, *cx); - *cy = gmx_mul_pr(az, bx); - *cy = gmx_nmsub_pr(ax, bz, *cy); + *cy = gmx_simd_mul_r(az, bx); + *cy = gmx_simd_fnmadd_r(ax, bz, *cy); - *cz = gmx_mul_pr(ax, by); - *cz = gmx_nmsub_pr(ay, bx, *cz); + *cz = gmx_simd_mul_r(ax, by); + *cz = gmx_simd_fnmadd_r(ay, bx, *cz); } /* a + b + c + d (not really a vector operation, but where else put this?) */ -static gmx_inline gmx_mm_pr -gmx_sum4_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d) +static gmx_inline gmx_simd_real_t +gmx_simd_sum4_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c, gmx_simd_real_t d) { - return gmx_add_pr(gmx_add_pr(a, b), gmx_add_pr(c, d)); + return gmx_simd_add_r(gmx_simd_add_r(a, b), gmx_simd_add_r(c, d)); } diff --git a/src/gromacs/utility/gmxomp.h b/src/gromacs/utility/gmxomp.h index 178eaac4bf..4b4ec6fd59 100644 --- a/src/gromacs/utility/gmxomp.h +++ b/src/gromacs/utility/gmxomp.h @@ -53,7 +53,7 @@ #include "config.h" #endif -#ifdef GMX_X86_SSE2 +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER #include #endif @@ -113,7 +113,7 @@ void gmx_omp_check_thread_affinity(FILE *fplog, const t_commrec *cr, static gmx_inline void gmx_pause() { /* Replace with tbb::internal::atomic_backoff when/if we use TBB */ -#if defined GMX_X86_SSE2 +#if defined GMX_SIMD_X86_SSE2_OR_HIGHER _mm_pause(); #elif defined __MIC__ _mm_delay_32(32);
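
For readers who want to see what the rename means for calling code, here is a minimal sketch (not part of the patch). The helper name invsqrt_array is hypothetical; the sketch assumes "gromacs/simd/macros.h" has been included with GMX_HAVE_SIMD_MACROS defined, that GROMACS' real typedef is in scope, that n is a multiple of GMX_SIMD_REAL_WIDTH, and that x and y point to SIMD-aligned memory (for example obtained through gmx_simd_align_r() as defined above). The old macro names are noted in comments.

#include "gromacs/simd/macros.h"   /* gmx_simd_* macros; on x86 this also pulls in the matching math header */

/* Hypothetical helper, shown only to illustrate the renamed macro layer */
static void
invsqrt_array(const real *x, real *y, int n)
{
    int i;

    for (i = 0; i < n; i += GMX_SIMD_REAL_WIDTH)          /* was GMX_SIMD_WIDTH_HERE */
    {
        gmx_simd_real_t v  = gmx_simd_load_r(x + i);      /* was gmx_load_pr; the type was gmx_mm_pr */
        gmx_simd_real_t lu = gmx_simd_invsqrt_r(v);       /* was gmx_invsqrt_pr; Newton-Raphson refined 1/sqrt(x) */

        gmx_simd_store_r(y + i, lu);                      /* was gmx_store_pr */
    }
}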