Merge "Essential dynamics: fixed restarts when ED group has > 1 molecule" into releas...
authorRoland Schulz <roland@rschulz.eu>
Mon, 7 Jan 2013 01:21:10 +0000 (02:21 +0100)
committerGerrit Code Review <gerrit@gerrit.gromacs.org>
Mon, 7 Jan 2013 01:21:10 +0000 (02:21 +0100)
177 files changed:
CMakeLists.txt
cmake/FindFFTW.cmake
cmake/gmxBuildTypeReference.cmake
cmake/gmxCheckGCCVersion.cmake [deleted file]
include/copyrite.h
include/gmx_arpack.h
include/gmx_cpuid.h
include/gmx_lapack.h
include/gmx_simd_macros.h [moved from include/gmx_x86_simd_macros.h with 89% similarity]
include/gmx_x86_avx_256.h
include/main.h
include/maths.h
include/nbsearch.h
include/network.h
include/thread_mpi/atomic/gcc_x86.h
include/trajana.h
include/types/nb_verlet.h
include/types/nbnxn_pairlist.h
include/vec.h
scripts/GMXRC.cmakein
share/template/CMakeLists.txt
share/template/template.c
src/config.h.cmakein
src/gmxlib/copyrite.c
src/gmxlib/gmx_cpuid.c
src/gmxlib/gmx_detect_hardware.c
src/gmxlib/gpu_utils/CMakeLists.txt
src/gmxlib/libgmx.pc.cmakein
src/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre
src/gmxlib/thread_mpi/collective.c
src/gmxlib/thread_mpi/once.c
src/gmxlib/thread_mpi/tmpi_init.c
src/kernel/CMakeLists.txt
src/kernel/calc_verletbuf.c
src/kernel/grompp.c
src/kernel/md.c
src/kernel/pdb2gmx.c
src/kernel/pme_loadbal.c
src/kernel/readir.h
src/kernel/readpull.c
src/mdlib/CMakeLists.txt
src/mdlib/clincs.c
src/mdlib/coupling.c
src/mdlib/fft5d.h
src/mdlib/forcerec.c
src/mdlib/init.c
src/mdlib/libmd.pc.cmakein
src/mdlib/nbnxn_atomdata.c
src/mdlib/nbnxn_cuda/CMakeLists.txt
src/mdlib/nbnxn_internal.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c with 83% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h with 69% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_includes.h [new file with mode: 0644]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h [new file with mode: 0644]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h [new file with mode: 0644]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c with 84% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h with 71% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_includes.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h with 86% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h with 99% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h with 91% similarity]
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h [moved from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h with 90% similarity]
src/mdlib/nbnxn_search.c
src/mdlib/nbnxn_search_simd_2xnn.h [new file with mode: 0644]
src/mdlib/nbnxn_search_simd_4xn.h [moved from src/mdlib/nbnxn_search_x86_simd.h with 76% similarity]
src/mdlib/sim_util.c
src/mdlib/tables.c
src/tools/gmx_angle.c
src/tools/gmx_tune_pme.c

index ac998d0caeda216cccc7fe201505042ce4e99d64..0251e1048849676bc575a1965d3eee71910c0dee 100644 (file)
@@ -40,10 +40,9 @@ set(CMAKE_LEGACY_CYGWIN_WIN32 0) # Remove when CMake >= 2.8.4 is required
 set(CPACK_COMPONENT_GROUP_TOOLS_DESCRIPTION "All GROMACS executable tools")
 set(CPACK_COMPONENT_GROUP_MDRUN_DESCRIPTION "GROMACS executable for running simulations")
 
-# override bugs on OS X where Cmake picks gcc (GNU) for C instead of system default cc (Clang).
-if(APPLE)
-    set(CMAKE_C_COMPILER_INIT "cc")
-endif(APPLE)
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+# This needs to be set before project() in order to pick up toolchain files
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Platform)
 
 project(Gromacs C)
 include(Dart)
@@ -57,7 +56,7 @@ mark_as_advanced(DART_ROOT)
 # machine with no git. 
 #
 # NOTE: when releasing the "-dev" suffix needs to be stripped off!
-set(PROJECT_VERSION "4.6-beta2-dev")
+set(PROJECT_VERSION "4.6-beta3-dev")
 set(CUSTOM_VERSION_STRING ""
     CACHE STRING "Custom version string (if empty, use hard-coded default)")
 mark_as_advanced(CUSTOM_VERSION_STRING)
@@ -86,9 +85,6 @@ endif()
 # provide backward compatibility of software written against the Gromacs API.
 set(API_VERSION ${NUM_VERSION})
 
-# Cmake modules/macros are in a subdirectory to keep this file cleaner
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
 set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
 endif()
@@ -153,6 +149,8 @@ mark_as_advanced(GMX_OPENMM)
 option(GMX_FORCE_CXX "Enable C++ compilation even if not necessary" OFF)
 mark_as_advanced(GMX_FORCE_CXX)
 
+option(GMX_NO_QUOTES "Disable Gromacs cool quotes" OFF)
+
 if(GMX_GPU OR GMX_OPENMM OR GMX_FORCE_CXX)
     enable_language(CXX)
 endif()
@@ -169,6 +167,7 @@ IF( WIN32 AND NOT CYGWIN)
     SET(SHARED_LIBS_DEFAULT OFF)
   else()
     add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
+    set(PKG_CFLAGS "$PKG_CFLAGS -DUSE_VISIBILITY -DTMPI_USE_VISIBILITY")
   endif()
 
   IF (GMX_PREFER_STATIC_LIBS)
@@ -187,6 +186,10 @@ IF( WIN32 AND NOT CYGWIN)
 
   #Workaround for cmake bug 13174. Replace deprecated options.
   IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
+    if(BUILD_SHARED_LIBS)
+        STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+        SET(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} CACHE STRING "" FORCE)
+    endif()
     STRING(REPLACE /GZ /RTC1 CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
     SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
   ENDIF()
@@ -209,8 +212,6 @@ option(GMX_MPI    "Build a parallel (message-passing) version of GROMACS" OFF)
 option(GMX_THREAD_MPI  "Build a thread-MPI-based multithreaded version of GROMACS (not compatible with MPI)" ON)
 option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
 mark_as_advanced(GMX_SOFTWARE_INVSQRT)
-option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" OFF)
-mark_as_advanced(GMX_POWERPC_INVSQRT)
 option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
 mark_as_advanced(GMX_FAHCORE)
 
@@ -224,7 +225,7 @@ if(NOT DEFINED GMX_CPU_ACCELERATION)
 endif(NOT DEFINED GMX_CPU_ACCELERATION)
 
 set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
-    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene, Power6, Fortran")
+    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
 
 set(GMX_FFT_LIBRARY "fftw3" 
     CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
@@ -371,9 +372,6 @@ endif(GMX_DOUBLE)
 if(GMX_SOFTWARE_INVSQRT)
   set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
 endif(GMX_SOFTWARE_INVSQRT)
-if(GMX_POWERPC_INVSQRT)
-  set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
-endif(GMX_POWERPC_INVSQRT)
 
 ########################################################################
 #Process MPI settings
@@ -852,12 +850,6 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
         endif()
     endif()
 
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "FORTRAN")
-
-#    Fortran is temporarily disabled while we push in nbNxN kernels.
-#    We need to fake it a bit here to avoid jenkins build errors!
-#    add_definitions(-DGMX_FORTRAN)
-
 elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
 # GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
     if (NOT ACCELERATION_QUIETLY)
@@ -869,7 +861,6 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
         set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
     endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
     set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
-    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
     set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
     set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI not compatible with BlueGene, disabled!" FORCE)
     set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
@@ -880,52 +871,11 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
 # The automatic testing for endianness does not work for the BlueGene cross-compiler
     set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
     set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
-    set(GMX_POWER6 1)
-    set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
-    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
 else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
-    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, Fortran, BlueGene, Power6")
+    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
 endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
 set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
 
-if(GMX_FORTRAN OR GMX_POWER6)
-    if (GMX_THREAD_MPI)
-        message(FATAL_ERROR "FORTRAN/POWER6 is incompatible with thread-MPI and only provides a speed-up on certain IBM compilers. Disable FORTRAN (or threads if you really want to use FORTRAN kernels).")
-    endif(GMX_THREAD_MPI)
-    enable_language(Fortran)
-    include(FortranCInterface)
-    discover_fortran_mangling(prefix isupper suffix extra_under_score found)
-    if(extra_under_score)
-        set(extrasuffix "_")
-    endif(extra_under_score)
-    if(prefix)
-      set(prefix "${prefix} ##")
-    endif(prefix)
-    if(suffix)
-      set(suffix "## ${suffix}")
-      if(extrasuffix)
-       set(extrasuffix "${suffix}${extrasuffix}")
-      endif(extrasuffix)
-    else(suffix)
-      if(extrasuffix)
-       # Don't know if this is needed, but it can't hurt
-       set(extrasuffix "## ${extrasuffix}")
-      endif(extrasuffix)
-    endif(suffix)
-
-    if(isupper)
-        set(F77_FUNCDEF   "${prefix} NAME ${suffix}")
-        set(F77_FUNCDEF_  "${prefix} NAME ${extrasuffix}")
-    else(isupper)
-        set(F77_FUNCDEF   "${prefix} name ${suffix}")
-        set(F77_FUNCDEF_  "${prefix} name ${extrasuffix}")
-    endif(isupper)
-else(GMX_FORTRAN OR GMX_POWER6)
-        set(F77_FUNCDEF   "name ## _")
-        set(F77_FUNCDEF_  "name ## _")
-endif(GMX_FORTRAN OR GMX_POWER6)
-
 # Process QM/MM Settings
 string(TOUPPER ${GMX_QMMM_PROGRAM} ${GMX_QMMM_PROGRAM})
 if(${GMX_QMMM_PROGRAM} STREQUAL "GAUSSIAN")
@@ -1122,6 +1072,7 @@ if(NOT GMX_OPENMP)
     unset(OpenMP_LINKER_FLAGS CACHE)
     unset(OpenMP_SHARED_LINKER_FLAGS)
 endif()
+set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
 
 ######################################
 # Output compiler and CFLAGS used
index 6a4d724e626e03e112eefc1c78e79ffac50a1778..f9053cee9b9e081e5b9bf8d721853dd254a6bb0c 100644 (file)
@@ -134,4 +134,4 @@ if (${FFTW}_FOUND)
 endif (${FFTW}_FOUND)
 set(${FFTW}_HAVE_SIMD FALSE CACHE BOOL "If ${${FFTW}_PKG} was built with SIMD support")
 
-mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD)
+mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD ${FFTW}_HAVE_AVX)
index 09401b5bca037d298ef5bfa24d150f182b34b425..35f85d5a75ae45094f9f39624bc832b9b7662366 100644 (file)
@@ -45,10 +45,14 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Reference")
     set(GMX_CPU_ACCELERATION "None" CACHE STRING "Disabled for regressiontests reference builds" FORCE)
     set(GMX_FFT_LIBRARY "fftpack" CACHE STRING "Use fftpack for regressiontests reference builds" FORCE)
     set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
-    set(GMX_THREAD_MPI OFF OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
+    set(GMX_THREAD_MPI OFF CACHE BOOL "Disabled for regressiontests reference builds" FORCE)
 
+    # C_COMPILER_VERSION is not defined automatically for CMake below 2.8.8,
+    # so we call the GROMACS work-around for that
+    include(gmxGetCompilerInfo)
+    get_compiler_version()
     if(NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR NOT "${C_COMPILER_VERSION}" MATCHES "4.7")
         message(WARNING "Reference values for regressiontests should use Gromacs compiled with "
-                        "gcc-4.7, but your configuration is using ${CMAKE_C_COMPILER_ID}-${C_COMPILER_VERSION}.")
+            "gcc 4.7, but your configuration is using ${CMAKE_C_COMPILER_ID}-${C_COMPILER_VERSION}.")
     endif()
 endif()
diff --git a/cmake/gmxCheckGCCVersion.cmake b/cmake/gmxCheckGCCVersion.cmake
deleted file mode 100644 (file)
index f9223ef..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2012, by the GROMACS development team, led by
-# David van der Spoel, Berk Hess, Erik Lindahl, and including many
-# others, as listed in the AUTHORS file in the top-level source
-# directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-#
-# Check GCC version and if any of the 4.1.x family compiler suites is found
-# quit the build system generating process. 
-#
-# The GCC 4.1.x compilers contain an optimization related bug which might 
-# results in code that exhibits incorrect behaviour and often leads to 
-# exploding systems or crashes. 
-#
-# For further details see e.g. 
-# https://bugs.launchpad.net/ubuntu/+source/gcc-4.1/+bug/158799
-#
-# Szilard Pall (pszilard@cbr.su.se)
-#
-
-if(NOT GMX_DISABLE_GCC41_CHECK)
-
-if(CMAKE_COMPILER_IS_GNUCC)
-    # if we have -dumpversion flag use that, otherwise try the --version
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
-        RESULT_VARIABLE _gcc_dumpversion_res
-        OUTPUT_VARIABLE _gcc_dumpversion_out
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
-    # if gcc returned with error the -dumpversion is not available 
-    if(${_gcc_dumpversion_res} EQUAL 0)
-        if(${_gcc_dumpversion_out} MATCHES ".*4\\.1\\.[0-9]+.*")
-            message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x 
-                family (detected version: ${_gcc_dumpversion_out}). These compilers 
-                contain an optimization related bug which might results in code that 
-                exhibits incorrect behaviour and often leads to exploding systems or 
-                crashes. To disable this check set GMX_DISABLE_GCC41_CHECK=YES.")
-        endif()
-    else()    
-        message(WARNING " The GCC compiler in use does not support the -dumpversion flag. 
-            Will attempt parsing the version from the \"gcc --version\" output.")        
-        execute_process(COMMAND ${CMAKE_C_COMPILER} --version
-            OUTPUT_VARIABLE _gcc_version_out
-            OUTPUT_STRIP_TRAILING_WHITESPACE)            
-        if("${_gcc_version_out}" MATCHES ".*4\\.1\\.[0-9]+.*")
-            message(FATAL_ERROR " The GCC compiler in use seems to belong to the 4.1.x 
-                family. These compiler  compilers contain an optimization related bug 
-                which might results in code that exhibits incorrect behaviour and 
-                often leads to exploding systems or crashes. To disable this check set 
-                GMX_DISABLE_GCC41_CHECK=YES.")
-        endif()
-    endif()
-endif()
-
-endif()
index 994734fb533a9de1b7a1dbad384192e6b1045ebf..ab10fa9947144ef6336d9cf122ae1d8cdb81ff2e 100644 (file)
@@ -70,10 +70,10 @@ CopyrightText[] = {
 };
 
 static const char *
-GPLText[] = {
+LicenseText[] = {
   "This program is free software; you can redistribute it and/or",
-  "modify it under the terms of the GNU General Public License",
-  "as published by the Free Software Foundation; either version 2",
+  "modify it under the terms of the GNU Lesser General Public License",
+  "as published by the Free Software Foundation; either version 2.1",
   "of the License, or (at your option) any later version."
 };
 
index 612ebf62fa30e379f81f9c0bf1d19a021a01faa2..d60a56e91085002bc0f8c626120fdabee94f6d73 100644 (file)
@@ -106,6 +106,7 @@ extern "C" {
  *                 and 3 that no shifts could be applied. Negative numbers
  *                 correspond to errors in the arguments provided.
  */
+GMX_LIBGMX_EXPORT
 void
 F77_FUNC(dsaupd,DSAUPD)(int *     ido, 
                         const char *    bmat, 
@@ -165,6 +166,7 @@ F77_FUNC(dsaupd,DSAUPD)(int *     ido,
  *  \param lworkl  Provide the same argument as you did to dsaupd()
  *  \param info    Provide the same argument as you did to dsaupd()
  */
+GMX_LIBGMX_EXPORT
 void
 F77_FUNC(dseupd,DSEUPD)(int *     rvec, 
                         const char *    howmny, 
index 71d89a9c54194c7aee3367455e21037b63c8f8ec..3b6673c807b195fe68605e232581d1e96be25e82 100644 (file)
  */
 #ifndef GMX_CPUID_H_
 #define GMX_CPUID_H_
+
+#include <stdio.h>
+
 #include "visibility.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
index c21f5b034c669df5531b70eef46490d4356e6074..a7bed9cc2462794d9c8d98bd52b7e0372e1e785f 100644 (file)
@@ -396,6 +396,7 @@ void
 F77_FUNC(dsytrd,DSYTRD)(const char *uplo, int *n, double *  a, int *lda, double *d, 
        double *e, double *tau, double *work, int *lwork, int *info);
 
+GMX_LIBGMX_EXPORT
 void
 F77_FUNC(dsyevr,DSYEVR)(const char *jobz, const char *range, const char *uplo, int *n, 
        double *a, int *lda, double *vl, double *vu, int *
similarity index 89%
rename from include/gmx_x86_simd_macros.h
rename to include/gmx_simd_macros.h
index fd85c6e5e6422db3e173628086db42b14c75bd28..2cd5ed91cb987e38ea40b554b22f042ace3c7c7c 100644 (file)
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
+/* The macros in this file are intended to be used for writing
+ * architecture independent SIMD intrinsics code.
+ * To support a new architecture, adding macros here should be (nearly)
+ * all that is needed.
+ */
+
 /* Undefine all defines used below so we can include this file multiple times
  * with different settings from the same source file.
  */
 
 /* NOTE: floor and blend are NOT available with SSE2 only acceleration */
 
-#undef GMX_X86_SIMD_WIDTH_HERE
+#undef GMX_SIMD_WIDTH_HERE
 
 #undef gmx_epi32
 
  */
 
 #if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
-"You should define GMX_MM128_HERE or GMX_MM256_HERE"
+#error "You should define GMX_MM128_HERE or GMX_MM256_HERE"
 #endif
 
 #if defined GMX_MM128_HERE && defined GMX_MM256_HERE
-"You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
+#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
 #endif
 
 #ifdef GMX_MM128_HERE
 
 #include "gmx_x86_simd_single.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  4
+#define GMX_SIMD_WIDTH_HERE  4
 
 #define gmx_mm_pr  __m128
 
 
 #include "gmx_x86_simd_double.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  2
+#define GMX_SIMD_WIDTH_HERE  2
 
 #define gmx_mm_pr  __m128d
 
 
 #include "gmx_x86_simd_single.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  8
+#define GMX_SIMD_WIDTH_HERE  8
 
 #define gmx_mm_pr  __m256
 
 #define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_ps
 #define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_ps
 
+#define gmx_loaddh_pr     gmx_mm256_load4_ps
+
+/* Half SIMD-width type */
+#define gmx_mm_hpr  __m128
+
+/* Half SIMD-width macros */
+#define gmx_load_hpr      _mm_load_ps
+#define gmx_load1_hpr(x)  _mm_set1_ps((x)[0])
+#define gmx_store_hpr     _mm_store_ps
+#define gmx_add_hpr       _mm_add_ps
+#define gmx_sub_hpr       _mm_sub_ps
+
+#define gmx_sum4_hpr      gmx_mm256_sum4h_m128
+
+/* Conversion between half and full SIMD-width */
+#define gmx_2hpr_to_pr    gmx_mm256_set_m128
+
 #else
 
 #include "gmx_x86_simd_double.h"
 
-#define GMX_X86_SIMD_WIDTH_HERE  4
+#define GMX_SIMD_WIDTH_HERE  4
 
 #define gmx_mm_pr  __m256d
 
index 1e444f46e9478a4ede3a2c98ae07ae7c52e9cc35..9f266e834ff031090a34f3aadd91d315e4ae6208 100644 (file)
@@ -128,6 +128,16 @@ gmx_mm256_set_m128(__m128 hi, __m128 lo)
 }
 
 
+static gmx_inline __m256
+gmx_mm256_load4_ps(float const * p)
+{
+    __m128 a;
+
+    a = _mm_load_ps(p);
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
+}
+
+
 static __m256d
 gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
 {
@@ -147,6 +157,13 @@ gmx_mm256_set_m128d(__m128d hi, __m128d lo)
 }
 
 
+static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
+{
+    __m256 sum;
+
+    sum = _mm256_add_ps(x,y);
+    return _mm_add_ps(_mm256_castps256_ps128(sum),_mm256_extractf128_ps(sum,0x1));
+}
 
 
 static void
index d6e7afc7ab6d1065a9b7173245079faf61766901..64d64466aaa0239de32221304ca942ec2e8fa641 100644 (file)
@@ -75,7 +75,7 @@ void check_multi_large_int(FILE *log,const gmx_multisim_t *ms,
  * The string name is used to print to the log file and in a fatal error
  * if the val's don't match.
  */
-
+GMX_LIBGMX_EXPORT
 void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
                       int nfile, const t_filenm fnm[], gmx_bool bParFn);
 /* Splits the communication into nsim separate simulations
index 2895026cca6c894c0c518c9de1255745e5cf6c04..daa2ec58df3b01a9e49314aef460ce5435f00829 100644 (file)
@@ -121,6 +121,7 @@ real    sign(real x,real y);
 real    cuberoot (real a);
 GMX_LIBGMX_EXPORT
 double  gmx_erfd(double x);
+GMX_LIBGMX_EXPORT
 double  gmx_erfcd(double x);
 GMX_LIBGMX_EXPORT
 float   gmx_erff(float x);
index bff7a88a5285992cf34e6335fe3701ddad8b3507..6d51d00894368fb0fd60d17d0401e5c17daa0a79 100644 (file)
@@ -62,6 +62,7 @@ struct gmx_ana_pos_t;
 typedef struct gmx_ana_nbsearch_t gmx_ana_nbsearch_t;
 
 /** Create a new neighborhood search data structure. */
+GMX_LIBGMX_EXPORT
 int
 gmx_ana_nbsearch_create(gmx_ana_nbsearch_t **d, real cutoff, int maxn);
 /** Free memory allocated for neighborhood search. */
@@ -72,6 +73,7 @@ gmx_ana_nbsearch_free(gmx_ana_nbsearch_t *d);
 int
 gmx_ana_nbsearch_init(gmx_ana_nbsearch_t *d, t_pbc *pbc, int n, rvec x[]);
 /** Initializes neighborhood search for a frame using \c gmx_ana_pos_t.  */
+GMX_LIBGMX_EXPORT
 int
 gmx_ana_nbsearch_pos_init(gmx_ana_nbsearch_t *d, t_pbc *pbc,
                           struct gmx_ana_pos_t *p);
@@ -89,6 +91,7 @@ gmx_ana_nbsearch_pos_is_within(gmx_ana_nbsearch_t *d,
 real
 gmx_ana_nbsearch_mindist(gmx_ana_nbsearch_t *d, rvec x);
 /** Calculates the minimun distance from the reference points. */
+GMX_LIBGMX_EXPORT
 real
 gmx_ana_nbsearch_pos_mindist(gmx_ana_nbsearch_t *d,
                              struct gmx_ana_pos_t *p, int i);
index 35ce9bc5286506d0940d3400ddb8b854dd82e86f..680a88136f9435fa46f746334af55339514e572f 100644 (file)
@@ -129,6 +129,7 @@ GMX_LIBGMX_EXPORT
 void gmx_sumf_sim(int nr,float r[],const gmx_multisim_t *ms);
 /* Calculate the sum over the simulations of an array of floats */
 
+GMX_LIBGMX_EXPORT
 void gmx_sumd_sim(int nr,double r[],const gmx_multisim_t *ms);
 /* Calculate the sum over the simulations of an array of doubles */
 
index 08f878a0198a42ab7af97b093c0da22b64bf363b..9eb6679ca9b635f209a902ef47d6cef00e78dafa 100644 (file)
@@ -57,22 +57,23 @@ files.
 
 /* we put all of these on their own cache line by padding the data structure
    to the size of a cache line on x86 (64 bytes): */
+#define TMPI_SIZEOF_X86_CACHE_LINE 64
 typedef struct tMPI_Atomic
 {
     int value; 
-    char padding[64-sizeof(int)];
+    char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(int)];
 } tMPI_Atomic_t;
 
 typedef struct tMPI_Atomic_ptr
 {
     void* value; 
-    char padding[64-sizeof(void*)];
+    char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(void*)];
 } tMPI_Atomic_ptr_t;
 
 typedef struct tMPI_Spinlock
 {
     unsigned int lock; 
-    char padding[64-sizeof(unsigned int)];
+    char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(unsigned int)];
 } tMPI_Spinlock_t;
 
 
@@ -95,9 +96,12 @@ typedef struct tMPI_Spinlock
    as the 486, and gcc on some Linux versions still target 80386 by default). 
   
    We also specifically check for icc, because intrinsics are not always
-   supported there. */
-#if ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
-     !defined(__INTEL_COMPILER) ) 
+   supported there.
+
+   llvm has issues with inline assembly and also in 32 bits has support for
+   the gcc intrinsics */
+#if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
+      !defined(__INTEL_COMPILER) )  || defined(__llvm__) )
 #include "gcc_intrinsics.h"
 
 #else
@@ -114,7 +118,7 @@ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
     __asm__ __volatile__("lock ; xaddl %0, %1;"
                          :"=r"(i) :"m"(a->value), "0"(i) : "memory");
     return i + __i;
-}  
+}
 
 static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 {
@@ -125,7 +129,7 @@ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 
 static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 {
-    unsigned int prev;
+    int prev;
     
     __asm__ __volatile__("lock ; cmpxchgl %1,%2"
                          : "=a"(prev)
index 42bff59464257bd6417020f890990b54abc78470..d4b32015c954e0e741c71b7637c51d51c89a416a 100644 (file)
@@ -196,6 +196,7 @@ gmx_ana_traj_free(gmx_ana_traj_t *d);
 int
 gmx_ana_add_flags(gmx_ana_traj_t *d, unsigned long flags);
 /** Sets the number of reference groups required. */
+GMX_LIBGMX_EXPORT
 int
 gmx_ana_set_nrefgrps(gmx_ana_traj_t *d, int nrefgrps);
 /** Sets the number of analysis groups required. */
@@ -250,6 +251,7 @@ GMX_LIBGMX_EXPORT
 int
 gmx_ana_get_nanagrps(gmx_ana_traj_t *d, int *nanagrps);
 /** Gets the selection object for a reference selection. */
+GMX_LIBGMX_EXPORT
 int
 gmx_ana_get_refsel(gmx_ana_traj_t *d, int i, gmx_ana_selection_t **sel);
 /** Gets the selection object for a reference selection. */
index 32dd9432d76f2b685b1180eadaaf7589cc39ea34..8a33375c03f04564c62fabe666a5a2ddcf63f8a3 100644 (file)
 extern "C" {
 #endif
 
-/*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */
-enum { nbkNotSet = 0, 
-       nbk4x4_PlainC, 
-       nbk4xN_X86_SIMD128,
-       nbk4xN_X86_SIMD256,
-       nbk8x8x8_CUDA,
-       nbk8x8x8_PlainC };
+#ifdef GMX_X86_SSE2
+/* Use SIMD accelerated nbnxn search and kernels */
+#define GMX_NBNXN_SIMD
+
+#ifdef GMX_X86_AVX_256
+/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */
+#define GMX_NBNXN_SIMD_BITWIDTH  256
+#else
+#define GMX_NBNXN_SIMD_BITWIDTH  128
+#endif
+
+/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ * Currently the 2xNN SIMD kernels only make sense and are only implemented
+ * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8.
+ */
+#define GMX_NBNXN_SIMD_4XN
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
+#define GMX_NBNXN_SIMD_2XNN
+#endif
+
+#endif
+
+
+/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
+typedef enum
+{
+    nbnxnkNotSet = 0, 
+    nbnxnk4x4_PlainC, 
+    nbnxnk4xN_SIMD_4xN,
+    nbnxnk4xN_SIMD_2xNN,
+    nbnxnk8x8x8_CUDA,
+    nbnxnk8x8x8_PlainC,
+    nbnxnkNR
+} nbnxn_kernel_type;
 
 /* Note that _mm_... intrinsics can be converted to either SSE or AVX
  * depending on compiler flags.
  * For gcc we check for __AVX__
  * At least a check for icc should be added (if there is a macro)
  */
-static const char *nbk_name[] =
-  { "not set", "plain C 4x4",
-#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
+static const char *nbnxn_kernel_name[nbnxnkNR] =
+  { "not set", "plain C",
+#if !(defined GMX_X86_SSE2)
+    "not available", "not available",
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
 #ifndef GMX_X86_SSE4_1
-#ifndef GMX_DOUBLE
-    "SSE2 4x4",
+    "SSE2", "SSE2",
 #else
-    "SSE2 4x2",
+    "SSE4.1", "SSE4.1",
 #endif
 #else
-#ifndef GMX_DOUBLE
-    "SSE4.1 4x4",
-#else
-    "SSE4.1 4x2",
+    "AVX-128", "AVX-128",
 #endif
-#endif
-#else
-#ifndef GMX_DOUBLE
-    "AVX-128 4x4",
 #else
-    "AVX-128 4x2",
+    "AVX-256",  "AVX-256",
 #endif
 #endif
-#ifndef GMX_DOUBLE
-    "AVX-256 4x8",
-#else
-    "AVX-256 4x4",
-#endif
-    "CUDA 8x8x8", "plain C 8x8x8" };
+    "CUDA", "plain C" };
 
 enum { ewaldexclTable, ewaldexclAnalytical };
 
@@ -119,9 +137,9 @@ typedef struct {
 
 /* non-bonded data structure with Verlet-type cut-off */
 typedef struct {
-    nbnxn_search_t           nbs;   /* n vs n atom pair searching data          */
-    int                      ngrp;  /* number of interaction groups             */
-    nonbonded_verlet_group_t grp[2];/* local and non-local interaction group    */
+    nbnxn_search_t           nbs;   /* n vs n atom pair searching data       */
+    int                      ngrp;  /* number of interaction groups          */
+    nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */
 
     gmx_bool         bUseGPU;          /* TRUE when GPU acceleration is used */
     nbnxn_cuda_ptr_t cu_nbv;           /* pointer to CUDA nb verlet data     */
index b6bc9650c6d1d367ee4fe5e426958a1d74bd4924..4d337cf1a3f4a49260a2b6becafa5b6724b62df5 100644 (file)
@@ -71,6 +71,14 @@ typedef struct {
     unsigned excl;  /* The exclusion (interaction) bits */
 } nbnxn_cj_t;
 
+/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ * The upper bits contain information for non-bonded kernel optimization.
+ * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ * But three flags can be used to skip interactions, currently only for subc=0
+ * !(shift & NBNXN_CI_DO_LJ(subc))   => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc)    => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
 #define NBNXN_CI_SHIFT          127
 #define NBNXN_CI_DO_LJ(subc)    (1<<(7+3*(subc)))
 #define NBNXN_CI_HALF_LJ(subc)  (1<<(8+3*(subc)))
@@ -79,7 +87,7 @@ typedef struct {
 /* Simple pair-list i-unit */
 typedef struct {
     int ci;             /* i-cluster             */
-    int shift;          /* Shift vector index plus possible flags */
+    int shift;          /* Shift vector index plus possible flags, see above */
     int cj_ind_start;   /* Start index into cj   */
     int cj_ind_end;     /* End index into cj     */
 } nbnxn_ci_t;
index 6c9995f2770bcb528c39216b0d36ac27d83e2e54..410b35e53613db3097fb933c8c9cd913885c52b3 100644 (file)
@@ -183,40 +183,6 @@ static real gmx_software_invsqrt(real x)
 #define INVSQRT_DONE 
 #endif /* gmx_invsqrt */
 
-#ifdef GMX_POWERPC_SQRT
-static real gmx_powerpc_invsqrt(real x)
-{
-  const real  half=0.5;
-  const real  three=3.0;
-  t_convert   result,bit_pattern;
-  unsigned int exp,fract;
-  real        lu;
-  real        y;
-#ifdef GMX_DOUBLE
-  real        y2;
-#endif
-
-  lu = __frsqrte((double)x);
-
-  y=(half*lu*(three-((x*lu)*lu)));
-
-#if (GMX_POWERPC_SQRT==2)
-  /* Extra iteration required */
-  y=(half*y*(three-((x*y)*y)));
-#endif
-
-#ifdef GMX_DOUBLE
-  y2=(half*y*(three-((x*y)*y)));
-
-  return y2;                    /* 10 Flops */
-#else
-  return y;                     /* 5  Flops */
-#endif
-}
-#define gmx_invsqrt(x) gmx_powerpc_invsqrt(x)
-#define INVSQRT_DONE
-#endif /* powerpc_invsqrt */
-
 #ifndef INVSQRT_DONE
 #    ifdef GMX_DOUBLE
 #        ifdef HAVE_RSQRT
index de457e2b1b8b7a0921b3bada649bc0b3556af23b..b57659a9495927e3e7b38ecb9060f5a182907956 100644 (file)
@@ -5,9 +5,8 @@
 # If you only use one shell you can copy that GMXRC.* instead.
 
 
-# only csh/tcsh understand 'set'
-set is_csh = 123
-test "$is_csh" = 123 && goto CSH
+# only csh/tcsh set the variable $shell (note: lower case!)
+test $shell && goto CSH
 
 # if we got here, shell is bsh/bash/zsh/ksh
 # bsh cannot remove part of a variable with %%
index e426784be0b302316b5e446994873966aabfeab3..a4ace20c75582bcb0164f9095be653b3902f757c 100644 (file)
@@ -37,6 +37,30 @@ install(FILES CMakeLists.txt.template
         RENAME CMakeLists.txt
         COMPONENT development)
 
+file(GLOB_RECURSE GROMACS_HEADERS ${CMAKE_SOURCE_DIR}/include *.h)
+add_custom_command(OUTPUT gromacs
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/include gromacs 
+    DEPENDS ${GROMACS_HEADERS})
+add_custom_target(gromacs_include_links DEPENDS gromacs)
+
+option(GMX_BUILD_TEMPLATE "Build gromacs template program" ON)
+mark_as_advanced(GMX_BUILD_TEMPLATE)
+# GMX_PREFER_STATIC_OPENMP=yes is a special case to build binaries
+# to distribute and as the template is not installed it can be
+# ignored.
+# The template is build in a user-like environment, hence we use
+# flags from PKG_CFLAGS. Again GMX_PREFER_STATIC_OPENMP=yes would
+# need special link flags (OpenMP_LINKER_FLAGS), which are not
+# very user-like.
+if (GMX_BUILD_TEMPLATE AND NOT GMX_PREFER_STATIC_OPENMP)
+    add_executable(template template.c)
+    remove_definitions( -DHAVE_CONFIG_H )
+    add_definitions("${PKG_CFLAGS}")
+    target_link_libraries(template gmx)
+    include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+    add_dependencies(template gromacs_include_links)
+endif()
+
 install(FILES README template.c Makefile.pkg
         DESTINATION ${DATA_INSTALL_DIR}/template
         COMPONENT development)
index 5a6450219055c3bc3cf6942d3111cb79d27b7b59..decd7dad65657042dc3da4d66a49b15c5ed61f15 100644 (file)
@@ -34,8 +34,8 @@
 #include <gromacs/pbc.h>
 #include <gromacs/smalloc.h>
 #include <gromacs/statutil.h>
-#include <gromacs/vec.h>
 #include <gromacs/xvgr.h>
+#include <gromacs/gmx_fatal.h>
 
 #include <gromacs/nbsearch.h>
 #include <gromacs/trajana.h>
index b2c9855867af11f142bac69b1f7961690e04df8d..528b0bea7997deb8875b02547b5264f8920cd543 100644 (file)
 /* Turn off all water neighborlist optimization - not used right now */
 #cmakedefine DISABLE_WATER_NLIST
 
-/* Fortran support */
-#cmakedefine GMX_FORTRAN
-
-/* Define to a macro mangling the given C identifier (in lower and upper
-   case), which must not contain underscores, for linking with Fortran. */
-#define F77_FUNC(name,NAME)     @F77_FUNCDEF@
-
-/* As F77_FUNC, but for C identifiers containing underscores. */
-#define F77_FUNC_(name,NAME)    @F77_FUNCDEF_@
-
 /* IEEE754 floating-point format. Memory layout is defined by macros
  * GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER and GMX_IEEE754_BIG_ENDIAN_WORD_ORDER. 
  */
@@ -80,9 +70,6 @@
 /* Use assembly intrinsics kernels for BlueGene */
 #cmakedefine GMX_BLUEGENE
 
-/* Power6 acceleration */
-#cmakedefine GMX_POWER6
-
 /* Work around broken calloc() */
 #cmakedefine GMX_BROKEN_CALLOC
 
 /* Use the GROMACS software 1/sqrt(x) */
 #cmakedefine GMX_SOFTWARE_INVSQRT
 
-/* Use the PowerPC hardware 1/sqrt(x) */
-#cmakedefine GMX_POWERPC_INVSQRT
-
 /* Use sub-counters */
 #cmakedefine GMX_CYCLE_SUBCOUNTERS
 
 /* Build special-purpose mdrun library */
 #cmakedefine GMX_FAHCORE   
 
+/* Disable gromacs quotes */
+#cmakedefine GMX_NO_QUOTES
+
 #ifdef GMX_FAHCORE
 #define FULLINDIRECT 1
 #define USE_FAH_XDR  1
index 574dcee2f7a475255003a94e841ba10c67a78ba2..5354ae8ca1b287b3427d450f1be582896ff094ed 100644 (file)
@@ -126,7 +126,7 @@ gmx_bool be_cool(void)
    * but we dont call this routine often, and it avoids using 
    * a mutex for locking the variable...
    */
-#ifdef GMX_FAHCORE
+#if defined(GMX_FAHCORE) || defined(GMX_NO_QUOTES)
   /*be uncool*/
   return FALSE;
 #else
@@ -237,10 +237,11 @@ void CopyRight(FILE *out,const char *szProgram)
    * name of a file. Otherwise, we won't be able to find the library dir.
    */
 #define NCR (int)asize(CopyrightText)
+/* TODO: Is this exception still needed? */
 #ifdef GMX_FAHCORE
-#define NGPL 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
+#define NLICENSE 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
 #else
-#define NGPL (int)asize(GPLText)
+#define NLICENSE (int)asize(LicenseText)
 #endif
 
   char buf[256],tmpstr[1024];
@@ -270,8 +271,8 @@ void CopyRight(FILE *out,const char *szProgram)
 
   for(i=0; (i<NCR); i++) 
     sp_print(out,CopyrightText[i]);
-  for(i=0; (i<NGPL); i++)
-    sp_print(out,GPLText[i]);
+  for(i=0; (i<NLICENSE); i++)
+    sp_print(out,LicenseText[i]);
 
   fprintf(out,"\n");
 
index 4304807723630695a4f010561d7b5b9c5b405afe..bf7c1302a2a17b9a14649174699b90aa1c72f945 100644 (file)
 #include <unistd.h>
 #endif
 
+#include "gmx_cpuid.h"
 
 
 
-#include "gmx_cpuid.h"
-
+/* For convenience, and to enable configure-time invocation, we keep all architectures
+ * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
+ */
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+#    define GMX_CPUID_X86
+#endif
 
 /* Global constant character strings corresponding to our enumerated types */
 const char *
@@ -209,10 +214,7 @@ compiled_acc = GMX_CPUID_ACCELERATION_NONE;
 #endif
 
 
-/* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
- * if the compiler handles GNU-style inline assembly.
- */
-#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+#ifdef GMX_CPUID_X86
 
 /* Execute CPUID on x86 class CPUs. level sets function to exec, and the
  * contents of register output is returned. See Intel/AMD docs for details.
@@ -231,6 +233,10 @@ execute_x86cpuid(unsigned int   level,
 {
     int rc = 0;
 
+    /* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
+     * if the compiler handles GNU-style inline assembly.
+     */
+
 #if (defined _MSC_VER)
     int CPUInfo[4];
 
@@ -283,7 +289,6 @@ execute_x86cpuid(unsigned int   level,
 #endif
     return rc;
 }
-#endif /* architecture is x86 */
 
 
 /* Identify CPU features common to Intel & AMD - mainly brand string,
@@ -465,6 +470,9 @@ cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
     }
     return 0;
 }
+#endif /* GMX_CPUID_X86 */
+
+
 
 /* Try to find the vendor of the current CPU, so we know what specific
  * detection routine to call.
@@ -480,6 +488,7 @@ cpuid_check_vendor(void)
     /* Set default first */
     vendor = GMX_CPUID_VENDOR_UNKNOWN;
 
+#ifdef GMX_CPUID_X86
     execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
 
     memcpy(vendorstring,&ebx,4);
@@ -495,7 +504,10 @@ cpuid_check_vendor(void)
             vendor = i;
         }
     }
-
+#else
+    vendor = GMX_CPUID_VENDOR_UNKNOWN;
+#endif
+    
     return vendor;
 }
 
@@ -521,12 +533,14 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
 
     switch(cpuid->vendor)
     {
+#ifdef GMX_CPUID_X86
         case GMX_CPUID_VENDOR_INTEL:
             cpuid_check_intel_x86(cpuid);
             break;
         case GMX_CPUID_VENDOR_AMD:
             cpuid_check_amd_x86(cpuid);
             break;
+#endif
         default:
             /* Could not find vendor */
             strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
@@ -706,7 +720,7 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
 enum gmx_cpuid_x86_smt
 gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
 {
-
+#ifdef GMX_CPUID_X86
 #if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
     int            i;
     int            nproc;
@@ -787,6 +801,10 @@ gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
         return GMX_CPUID_X86_SMT_CANNOTDETECT;
     }
 #endif
+#else 
+    /* not x86 */
+    return GMX_CPUID_X86_SMT_CANNOTDETECT;
+#endif
 }
 
 
index 21e0c649b92a5c6e8c80c0580e68b7c3aea45c1f..ad3dd9488a4e661d0254acea985ee7a9873b6715 100644 (file)
@@ -63,6 +63,9 @@
  * ridiculous number. */
 static unsigned int max_gpu_ids_user = 64;
 
+static const char* invalid_gpuid_hint =
+    "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
+
 /* FW decl. */
 void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
 
@@ -173,7 +176,8 @@ static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
     {
         if (idstr[i] < '0' || idstr[i] > '9')
         {
-            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n", idstr[i]);
+            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
+                      idstr[i], invalid_gpuid_hint);
         }
         idlist[i] = idstr[i] - '0';
     }
@@ -492,10 +496,15 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
     bGPUBin      = FALSE;
 #endif
 
-    /* Bail if binary is not compiled with GPU on */
+    /* Bail if binary is not compiled with GPU acceleration, but this is either
+     * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
     if (bForceUseGPU && !bGPUBin)
     {
-        gmx_fatal_collective(FARGS, cr, NULL, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+        gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+    }
+    if (gpu_id != NULL && !bGPUBin)
+    {
+        gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
     }
 
     /* run the detection if the binary was compiled with GPU support */
@@ -545,7 +554,7 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
 
             if (nid == 0)
             {
-                gmx_fatal(FARGS, "Empty GPU ID string passed\n");
+                gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
             }
 
             res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
index 1ad107dfdbe9c174926f5bc66ed9325b8999b468..cc749674305e33aa0a436bb2b08a27cf967025d1 100644 (file)
@@ -55,5 +55,8 @@ CUDA_ADD_LIBRARY(gpu_utils STATIC ${GPU_UTILS_SOURCES}
                  OPTIONS ${_os_def}
                  RELWITHDEBINFO -g
                  DEBUG -g -D_DEBUG_=1 )
+#Because this is a static library linked into the (potential) shared library
+#it should have the export of the shared library.
+SET_TARGET_PROPERTIES(gpu_utils PROPERTIES DEFINE_SYMBOL "gmx_EXPORTS" )
 
 CUDA_BUILD_CLEAN_TARGET()
index 3b4227907e411d21ee86beb2203b98e916dc8e9b..0bc0b0d3cc71aff5d945b0454116d8bc5628e510 100644 (file)
@@ -6,7 +6,7 @@ Description: Gromacs default lib
 URL: http://www.gromacs.org
 Version: @PROJECT_VERSION@
 Requires:
-Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@
+Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
 Libs: -L${libdir} -lgmx@GMX_LIBS_SUFFIX@ -lm
 Cflags: -I${includedir} @PKG_CFLAGS@
 
index e915536c5f95844860c9899bd318a187455d4ce2..242260ac86374d045f508a8955cf061856c0049a 100644 (file)
@@ -77,7 +77,7 @@ gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double *
 
     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC),_mm_load_sd(ptrD));
-    return gmx_mm256_set_m128(t2,t1);
+    return gmx_mm256_set_m128d(t2,t1);
 }
 
 
@@ -201,8 +201,8 @@ gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * g
 {
     __m256d t1,t2;
 
-    t1   = gmx_mm256_set_m128(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
-    t2   = gmx_mm256_set_m128(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
+    t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
+    t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 
     *c6  = _mm256_unpacklo_pd(t1,t2); /* c6d c6c | c6b c6a */
     *c12 = _mm256_unpackhi_pd(t1,t2); /* c12d c12c | c12b c12a */
@@ -230,9 +230,9 @@ gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shif
     ty  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
     tz  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
 
-    *x1 = gmx_mm256_set_m128(tx,tx);
-    *y1 = gmx_mm256_set_m128(ty,ty);
-    *z1 = gmx_mm256_set_m128(tz,tz);
+    *x1 = gmx_mm256_set_m128d(tx,tx);
+    *y1 = gmx_mm256_set_m128d(ty,ty);
+    *z1 = gmx_mm256_set_m128d(tz,tz);
 }
 
 
@@ -265,21 +265,21 @@ gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shif
     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
-    *x1 = gmx_mm256_set_m128(tx,tx);
-    *y1 = gmx_mm256_set_m128(ty,ty);
-    *z1 = gmx_mm256_set_m128(tz,tz);
+    *x1 = gmx_mm256_set_m128d(tx,tx);
+    *y1 = gmx_mm256_set_m128d(ty,ty);
+    *z1 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
-    *x2 = gmx_mm256_set_m128(tx,tx);
-    *y2 = gmx_mm256_set_m128(ty,ty);
-    *z2 = gmx_mm256_set_m128(tz,tz);
+    *x2 = gmx_mm256_set_m128d(tx,tx);
+    *y2 = gmx_mm256_set_m128d(ty,ty);
+    *z2 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
-    *x3 = gmx_mm256_set_m128(tx,tx);
-    *y3 = gmx_mm256_set_m128(ty,ty);
-    *z3 = gmx_mm256_set_m128(tz,tz);
+    *x3 = gmx_mm256_set_m128d(tx,tx);
+    *y3 = gmx_mm256_set_m128d(ty,ty);
+    *z3 = gmx_mm256_set_m128d(tz,tz);
 }
 
 
@@ -315,27 +315,27 @@ gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shif
     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
-    *x1 = gmx_mm256_set_m128(tx,tx);
-    *y1 = gmx_mm256_set_m128(ty,ty);
-    *z1 = gmx_mm256_set_m128(tz,tz);
+    *x1 = gmx_mm256_set_m128d(tx,tx);
+    *y1 = gmx_mm256_set_m128d(ty,ty);
+    *z1 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
-    *x2 = gmx_mm256_set_m128(tx,tx);
-    *y2 = gmx_mm256_set_m128(ty,ty);
-    *z2 = gmx_mm256_set_m128(tz,tz);
+    *x2 = gmx_mm256_set_m128d(tx,tx);
+    *y2 = gmx_mm256_set_m128d(ty,ty);
+    *z2 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
-    *x3 = gmx_mm256_set_m128(tx,tx);
-    *y3 = gmx_mm256_set_m128(ty,ty);
-    *z3 = gmx_mm256_set_m128(tz,tz);
+    *x3 = gmx_mm256_set_m128d(tx,tx);
+    *y3 = gmx_mm256_set_m128d(ty,ty);
+    *z3 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
     ty   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
     tz   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
-    *x4 = gmx_mm256_set_m128(tx,tx);
-    *y4 = gmx_mm256_set_m128(ty,ty);
-    *z4 = gmx_mm256_set_m128(tz,tz);
+    *x4 = gmx_mm256_set_m128d(tx,tx);
+    *y4 = gmx_mm256_set_m128d(ty,ty);
+    *z4 = gmx_mm256_set_m128d(tz,tz);
 }
 
 
@@ -1333,7 +1333,7 @@ gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1));
     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1));
 
-    fix1 = gmx_mm256_set_m128(tB,tA); /* 0 fiz fiy fix */
+    fix1 = gmx_mm256_set_m128d(tB,tA); /* 0 fiz fiy fix */
 
     t1   = _mm256_loadu_pd(fptr);
     t2   = _mm256_loadu_pd(fshiftptr);
@@ -1363,7 +1363,7 @@ gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
     tC   = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
     
-    t1   = gmx_mm256_set_m128(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
+    t1   = gmx_mm256_set_m128d(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
 
     t2   = _mm256_loadu_pd(fptr);
     tD   = _mm_loadu_pd(fptr+4);
index 5e579d1f618c687e3f23df2ebc81b29279cc3abb..4e471c3f7c91a613afb70e3e4d6d8ece9547b52e 100644 (file)
@@ -306,7 +306,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -702,7 +702,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 6f57d4038d5ce09ece063217f78968c0daf9ec95..2999542943fa28f89f0e1bacca86581fd777d7ae 100644 (file)
@@ -435,7 +435,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1071,7 +1071,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ad08857305bf9d76f83296bfdeaf27ebb03de497..6de88eae5b1c0c839d98bb7812d7439649b51221 100644 (file)
@@ -740,7 +740,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1954,7 +1954,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4b005303769bd90e4cc025cf4e362f195529c61d..c1a36e5ec7ff1a2024e691c105959f21b843dcc3 100644 (file)
@@ -478,7 +478,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1195,7 +1195,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c7f3e88822ad6a852a6f03722ab968e1b8da7e23..f15087d8c2d6d81d6ae09ebe5d40d455973cc30f 100644 (file)
@@ -788,7 +788,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2091,7 +2091,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4c3835b2cff175b9931c1e61ec2ea4c5ab57d1b4..e0fe9b7eb5068950827caf58d121953af94d234c 100644 (file)
@@ -286,7 +286,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -644,7 +644,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b8eaf5bb463eb7b46a2e0d9f6f66007dff5c253b..b0242ced74d0a3820d4d1646235ee4cbc59ed77a 100644 (file)
@@ -415,7 +415,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1013,7 +1013,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 04d6e3566b2371d3d72cb0a13754d77a59dd514a..075e7448a1448e37051e79faf6191a0b461f9d96 100644 (file)
@@ -720,7 +720,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1896,7 +1896,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d8ceaeda18ba2fd9a5c26ac93f46582d2ab14809..adb9e10230bfe767b298984cc0c64b2cc067467d 100644 (file)
@@ -449,7 +449,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1109,7 +1109,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 26291eb122534ef0d2ea6e82bfc8ed7c7b63414f..36c78bb7b34544f6a59fafc1a274a461d49ad43d 100644 (file)
@@ -759,7 +759,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2005,7 +2005,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 0d0d6fba50ff82f816d89517b67645b72f21dcb5..ef9f48193e6e7f128cf5f66a2033738f07501012 100644 (file)
@@ -255,7 +255,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -565,7 +565,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 27ef79745ced7ffec86757b06c50d3515e27ac85..1e8c7123fb3440c808cd4fc36b3c1175db06e687 100644 (file)
@@ -384,7 +384,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -934,7 +934,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 0c923102df6696f634dbb8b6feb7cff84dad0da9..be53c0da10b1de5635fd76809a8641589df4226a 100644 (file)
@@ -695,7 +695,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1838,7 +1838,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 7dc8f6e2f94344cc05313779799f264a71c2c622..2e0cffc04fb10dbaa71e93e24fff4760bf7ad2a6 100644 (file)
@@ -384,7 +384,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -934,7 +934,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b21dbd4acf2fdeebb26afc665cd272afe68b73d8..a74295dea453eb39ee8ccf8d8dac74ff8dc29b40 100644 (file)
@@ -695,7 +695,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1838,7 +1838,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 9422be871bb8004b57055f499c02a1df62028dce..5d2ea1a29e0a0abe427eea9a8f21ada987998658 100644 (file)
@@ -298,7 +298,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -680,7 +680,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 15d009fab4d7cd0dd69325cc6f3447a0c3de159c..73911c879c2b015671e07f6082312d4a8925975c 100644 (file)
@@ -395,7 +395,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -955,7 +955,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 64d198b7b8420689ddaafa429e8e2c1ff5361952..f7daf7473fdcda7f9e9eaf7cc9e675365447708d 100644 (file)
@@ -604,7 +604,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1556,7 +1556,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 9653ff01e89d6fdb2fef3caefbfb51fcf5819a8b..208650a52f1136ccd1ddfedc0e2be013ff7aedee 100644 (file)
@@ -430,7 +430,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1054,7 +1054,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 94a19d8aecc647d63dc47162d6bc7b2a0f2b7c38..98226b9f9ff9d9552baafbf9426764e8a1cb3318 100644 (file)
@@ -644,7 +644,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1668,7 +1668,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 693980c7776c72dabdfdeb75164800747e3d4000..d67174bec1f1d0f02facc05de4b00e03441bc7b3 100644 (file)
@@ -262,7 +262,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -580,7 +580,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 57000e7b1eee96aca86001ca24d18bfbf882e31e..a528385eb0e73e8bed5304785a21b1fedb9a4e2b 100644 (file)
@@ -359,7 +359,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -855,7 +855,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 2d71191330e58d3ed67372ce7f60a353c6d90da8..58a632e437234f6fd3c5bf6dabf9176f98c1c26b 100644 (file)
@@ -568,7 +568,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1456,7 +1456,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b4708dd5fc9b8b0bbd25c4860b7888bf6801f930..701d71ffef27a0ed4c9148876d27f8674ef83b5f 100644 (file)
@@ -394,7 +394,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -954,7 +954,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ddb6425c2e029ee2cc8dca6dcbe4ed6876ad6ca4..0fd01e6507d24236295853d80132434c4d295a58 100644 (file)
@@ -608,7 +608,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1568,7 +1568,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c57f5c239c47f827ca41eb67da69b968b2ad0bfa..934ea7dd51f9781f0316d35753e6a96636b4abbc 100644 (file)
@@ -233,7 +233,7 @@ nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -507,7 +507,7 @@ nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 162245432564888cfed2dd91cc193558b58b71a0..e63d4842f67441a4044312048895da4f6ff10c39 100644 (file)
@@ -330,7 +330,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -782,7 +782,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 8462f8620996a675325394652eb60b83fba95b3a..8db5f4bc1bac98cfe93ce8a2e7d345bc41ac1aa7 100644 (file)
@@ -545,7 +545,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1404,7 +1404,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index fcfafbc61a42a60eabdb02cc2aea0c01f63b80cb..a02c629844c1a5ae5dd4a63e31761a2f563fffb7 100644 (file)
@@ -330,7 +330,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -782,7 +782,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 31c3c94014f8258c1fc9f5faa62627cb8a6d9a98..299a85acc61db1a43c019aca48fa56beafc1163f 100644 (file)
@@ -545,7 +545,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1404,7 +1404,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e9d4cd5d25c3f14f04a2fd7ff351ddd1045e0430..afd626dd74a9b4365c04b7fa90207047b7712e9f 100644 (file)
@@ -310,7 +310,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -696,7 +696,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 6de6a74a28a314929c1e681dc3336a57d2d4a464..79c9f51a36e9112b5c9022b9a7a99d63def804d8 100644 (file)
@@ -457,7 +457,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1111,7 +1111,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ac5428beaebc93538aa8031df7fa78c67ba02bb7..a8b8f82c08eb57745055d741893936f15ab89330 100644 (file)
@@ -816,7 +816,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2132,7 +2132,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b16c6db54b4cc0ff835a6d72c9d5e8e4edfa9e2b..27d672d3c120aeb4fe40524a3fb515987a9d8ed8 100644 (file)
@@ -501,7 +501,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1237,7 +1237,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d0351ef6aa9aed92644098cb93390326ad34f587..4915511e113b950e1f8980ace10a13cd7827ad7a 100644 (file)
@@ -865,7 +865,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2271,7 +2271,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 18cdb9503382f34c788b57ca0fcab7bb47f321eb..5db2b6dd21c66d75207622b47f86991783be22b3 100644 (file)
@@ -276,7 +276,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -613,7 +613,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 850fd9fef92d74a8e225d677e6688d92b0cbc68f..aefb72e463fc920a07e8089e95cc3a1e1f7ab233 100644 (file)
@@ -423,7 +423,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1028,7 +1028,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 737096d54596ce312cf45b0d68d72374ea9f98b2..9fd59160c388b5d5ff19596ffa551221b35cabc9 100644 (file)
@@ -788,7 +788,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2070,7 +2070,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 5a3e04a34d1bed1715d67cc3460a1c2b305a8822..f167fc0a70114361ca114cb2537c7ad265eb5684 100644 (file)
@@ -423,7 +423,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1028,7 +1028,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 6fc666c14b484394057ec46f254a6584f87ec374..018c05a12e4a9b06f20cfcbc11b8c68ef0c9ba58 100644 (file)
@@ -788,7 +788,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2070,7 +2070,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 0e4c619dc6980563262f1b72a8d6167dfa8b8ab4..bc79ecfa1c41da09c7385e5fadeca0e254c30931 100644 (file)
@@ -333,7 +333,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -761,7 +761,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c8ac07b56d7c0dc7b7bed23f973561223e438d8b..33ee7f245701b5dc7bc05b476e59bbf53108e2df 100644 (file)
@@ -502,7 +502,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1250,7 +1250,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1f4487647a8297e67c17afec83ce5be6990dc82d..93c18ee9bee9ad8f6d5b2db85d4088c68cb19e99 100644 (file)
@@ -927,7 +927,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2493,7 +2493,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c4d109ba4f9099057bc44187e9064593e5a1eca0..e971ddc79ab6e6835c22ed6bf2c25d1b60a3d1e7 100644 (file)
@@ -558,7 +558,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1413,7 +1413,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c1074761b4b357b9893dc226372e9d0c5ec76909..6a8b0c4f40d62513045f7e5db8088ca6c0de408a 100644 (file)
@@ -988,7 +988,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2669,7 +2669,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 096a790eba0bf23a32fd9fe3a152c4dcb8991b76..79dcfe0d62d1e54b997347731698fcf6b34c7669 100644 (file)
@@ -301,7 +301,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -678,7 +678,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 5137d85711daf465ef12746bd5351fe3625b027c..c75cbb7390342f624f9d7449f9315af8be93705b 100644 (file)
@@ -470,7 +470,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1167,7 +1167,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 6951a9d6562ba59421aed5eb68a970731dc6c42b..cfa8e445e324695e63829240d597cd410087135d 100644 (file)
@@ -901,7 +901,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2431,7 +2431,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 03b5f5fc34b38ccdb03b74a46d0011b59b166045..fcb44ac1f5e3a3876e42133b81b852580e2e768d 100644 (file)
@@ -470,7 +470,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1167,7 +1167,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 9cc88e70259ff71d81bfcd2aeb1e70eae984cf68..4ace2e87a2731e2a4c3cf9d9ab6e35456d972af3 100644 (file)
@@ -901,7 +901,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2431,7 +2431,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 287fb2eef64364f2e406bdaed59124fd6decaedd..6eb8d61bc9fcf3674a5b2cf9c887f4ef00697df9 100644 (file)
@@ -324,7 +324,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -740,7 +740,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4b3078ad9cdb9b11accb513018b2c695c317e490..0cf578d4a80ff808c55ac401cf40a4385a473039 100644 (file)
@@ -451,7 +451,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1097,7 +1097,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 91e8d98a978d8c15c660b9023231254e5998a368..5457ed1e947ab47f82c4685108efed3f99dfd0b6 100644 (file)
@@ -750,7 +750,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1944,7 +1944,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 82b44f266786a4f361275d29a2a5b5728f939863..54bd0d940aeb45847a1138416ffd2c949d8d707c 100644 (file)
@@ -488,7 +488,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1203,7 +1203,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e853d2fc4e719d73b436552987e370f351fd3df7..e885cf9585f471afeef8533ff5a198976ba5c2c8 100644 (file)
@@ -792,7 +792,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2063,7 +2063,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d110386d842f24eb7eac481621016cdccb46d2be..6f249032d2b3fa138720a9f9140d47104b0bfa7d 100644 (file)
@@ -290,7 +290,7 @@ nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -647,7 +647,7 @@ nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1adcedfddbe0b642ab74218d40ef02dffab1b98d..1ec1fd6b247c38f5f6476f481fbd9e7d3d059437 100644 (file)
@@ -417,7 +417,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1004,7 +1004,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b0ee8bd719e05c8f97f6ad3f1e0373c423b7b7da..25563d326e3926146ef14bde87cb304d1fee8f69 100644 (file)
@@ -716,7 +716,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1851,7 +1851,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f8ac075100951d39221ddb3a1e1c0156acd68ea3..3424c8438e7b1e1424a6eba937b2c1a24925727f 100644 (file)
@@ -452,7 +452,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1103,7 +1103,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 8d250c4c2035de05b29c0dca4e4ad4bd6da0a489..2c6cb48220f334a4955265e39a0ae5a744f198fa 100644 (file)
@@ -756,7 +756,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1963,7 +1963,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index cf1fb2f66d7d0a02d4a5ffc6bb6ae696371c2f86..07252ae3fba0d411e5467553a5974614390db56b 100644 (file)
@@ -261,7 +261,7 @@ nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -574,7 +574,7 @@ nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index cb3127b33a28b5931e430c8ba62ab20e2860e31d..fa76a525c47ab12d9ba925b9bda1ad1adae6dd29 100644 (file)
@@ -388,7 +388,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -931,7 +931,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 5b47b24cf2098ab6affde0a27e3d66ee39fc0345..478a0d621ab5a814271a557b3a88fd83adcb8fff 100644 (file)
@@ -693,7 +693,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1799,7 +1799,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ecc8c675f3163a51c754ca640202dc54e0b19e90..d171dc653a75d544b9d3bd74636c036b206fd29a 100644 (file)
@@ -388,7 +388,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -931,7 +931,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 15457ea0f0c7e9c4dc55d90f18ec5b8925b20ffe..47c16d4dc26865b44ebaacc8024bcf3ea5cb3597 100644 (file)
@@ -693,7 +693,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1799,7 +1799,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1ef26a7877f2b2d2c52e44dd427bdc578f978697..93dddf903f5196b121cc8b5741e7473d1dd8d239 100644 (file)
@@ -343,7 +343,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -806,7 +806,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 246f8642309d50126bfedf5576e27e51918940fc..ceac3dcc4f6f8b3bae5afd71033211796cbe0a5b 100644 (file)
@@ -315,7 +315,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -727,7 +727,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 9d4318fdc32b344f29140b3f589f755d2545348d..d240932c4eb824932d4b25954ca6655130729614 100644 (file)
@@ -284,7 +284,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -648,7 +648,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index fb1d4115c07cf2b3c513fed0107737ee48b2caf9..47b713daead1ac800a6f247d0cc73f1d8be1c075 100644 (file)
@@ -282,7 +282,7 @@ nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -638,7 +638,7 @@ nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 559493a0220755b0417fb8581bffdb4545658772..c5b30162fd2ee24706ed6d382acd739b708c6926 100644 (file)
@@ -264,7 +264,7 @@ nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -583,7 +583,7 @@ nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index bbbe149beb962fb692ae6daf8d66d601a2e15c24..ae2c7df49769039f803f6452d57ce888d4fbef4d 100644 (file)
@@ -289,7 +289,7 @@ nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -651,7 +651,7 @@ nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index a2781a5f164a8c8296fbd00f4ea8615d38fb705b..f81b14eef12b06e825c3e061021a41cdd60509a4 100644 (file)
@@ -246,7 +246,7 @@ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -538,7 +538,7 @@ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 45659fc29abf21b13b28796895e91c4db15075ae..3cccb1deb8e662675941b15a9b84bed3d911d23f 100644 (file)
@@ -317,7 +317,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -726,7 +726,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 96c0b08e882b467b3e37ef5d9302a382b2239c16..47dca61994b374b9c27127bd155165e943f648a9 100644 (file)
@@ -434,7 +434,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1057,7 +1057,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 94a92b8f9c2be6cf0a3a3ce36628ca0f6d0fa850..1d9c763f4c8596317fc1a78821ac79a7c7aa5093 100644 (file)
@@ -703,7 +703,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1826,7 +1826,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ad9ffc4366d45b834d5f8223a2e9767a01870540..2c3cede629c5b4686a5562d1d1619f0be9364446 100644 (file)
@@ -468,7 +468,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1154,7 +1154,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 72695842ac4e7b8118c3054615a63fe78b341900..2444bd284deff6e4cf55146910368d0384de1e63 100644 (file)
@@ -752,7 +752,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1965,7 +1965,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 2702549656289db8b9a4652270c9f07b8377071e..2c3b78c50697c282ad216dcfa9452c53ddac4325 100644 (file)
@@ -285,7 +285,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -634,7 +634,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1e48649445a96bdb609647c48f01421d0e21b57e..6a232d552f63f5b316772210f8b1b8a158a917ec 100644 (file)
@@ -402,7 +402,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -965,7 +965,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 16950b998f490433cb9ee5c45216157ca5f8c799..1b7dcc4049d85567882ce42c589a2c70730934e9 100644 (file)
@@ -671,7 +671,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1734,7 +1734,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 802f3ec8e7303c446773e7c5b95fc747995e3b2b..1bb9ff2b316db062f16fc7efe3e462dcfe6ed358 100644 (file)
@@ -446,7 +446,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1091,7 +1091,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f75304f6501f106e9edc4919b8560092b229f577..9be2260c7717bae25f1c05be10c348d8aaf27666 100644 (file)
@@ -720,7 +720,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1873,7 +1873,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4b053cf3bf8cc3901fa964aed73cf338a92d7ae8..4ac2fdb4161c0300c99f771d22344bfb2daae822 100644 (file)
@@ -308,7 +308,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -696,7 +696,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 40902802b6304e8a2392d3fee1c6d8658a987e87..4d60c4858ae39419366e4ad0fec536945c517e4b 100644 (file)
@@ -425,7 +425,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1027,7 +1027,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index a74e5e0c0cb0007047d60324ec232dd247c46d2b..7f0c7a0bf855f34a5b2cb79e1221600015e65067 100644 (file)
@@ -694,7 +694,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1796,7 +1796,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 25a4f01d20e607d9ba33e142eb7ee618879af439..f53b08af1479aee0613e16ce8711f3cd0105b94b 100644 (file)
@@ -470,7 +470,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1156,7 +1156,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 3b0ebc549c962c710841956b1e6b16aa4b530f76..228039714e4be11d0c06365005805eb86d24a137 100644 (file)
@@ -744,7 +744,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1938,7 +1938,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index a1e0ec347dfc98371b1d6ab45e3369038f24743b..9cb4eb85e6b147ae25772d4215da7f6df1d55642 100644 (file)
@@ -251,7 +251,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -551,7 +551,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1184d658d16ee0c9ead4809bff5db1b8e9a70c5c..78fd9c887070dfa27fecd1c5eaf6b393fefd5e24 100644 (file)
@@ -368,7 +368,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -882,7 +882,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ac455d1f6fbefee07df99def12057ea1b9332276..5d3f1f0da216d29c50044d49bd944f91b8406ee1 100644 (file)
@@ -643,7 +643,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1672,7 +1672,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e944f527abae9827556a4035ac0f9faf7e1f7de9..34e6cf2f22724bf179767496611297cf2ca7ca94 100644 (file)
@@ -368,7 +368,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -882,7 +882,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 5fea2fef74229e298efe1fe4b9278f19be80687f..5004f92d110fca3421b15157696d791d5cb4bd32 100644 (file)
@@ -643,7 +643,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1672,7 +1672,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b5c44bc276934f9f0d50271873b5fd0f876a4cf5..e0e67b31f30a49c5d0e3d7d8389e329fc0d46499 100644 (file)
@@ -301,7 +301,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -685,7 +685,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c8131930dd166da6dc0f970eaa0d5ad331cb2f42..1269d37a7f77444d50bbe1023f02cb20be33d9e9 100644 (file)
@@ -398,7 +398,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -958,7 +958,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 321b5cdc32aa55e7c061d07253577ba9df7e95e5..dcc278e90491bca03aa1acf6dfecc15ece40be24 100644 (file)
@@ -607,7 +607,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1553,7 +1553,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d2770f18bc37fadca33422f6597b12687cccde72..a0f4bed4ecd411de0688f11356e53a3636e4fbb7 100644 (file)
@@ -433,7 +433,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1057,7 +1057,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 3276052f6fa52e896be3da3c468e989b317e443c..c708b941ecb71350bac438d7a5089b3517e1e9bc 100644 (file)
@@ -647,7 +647,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1665,7 +1665,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c98761fa0832a18ee90e2fc5ac43a1f0b19be866..01781d9b23577ddc42d2d94072aac62a8f3bf3db 100644 (file)
@@ -265,7 +265,7 @@ nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -585,7 +585,7 @@ nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 17679c4f35dcd1b78034e91935dc45976ffd8ed4..92032419089e26b0f36a8ed3b7baee20fdd6b14f 100644 (file)
@@ -362,7 +362,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -858,7 +858,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1d7e534fa63b4e2cf33a03948ad1ba1bc7d1cc50..aae5b22fcb509785f559a5981663286a6712814a 100644 (file)
@@ -571,7 +571,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1453,7 +1453,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 86ea4775f6a2f97bb1a4f79af63abdd77e6f43fb..98bde4f40bf91130a0c1ec764d5646b2a321bd5d 100644 (file)
@@ -397,7 +397,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -957,7 +957,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index efca627a5346011d0490b0ed2eb7fa21acd04323..7d3ecf9a6425ca0db4faecc8df722d766ccaf584 100644 (file)
@@ -611,7 +611,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1565,7 +1565,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 623a01f5364209cb177d16a831fe3645ca0212d4..21d1c4bc47a922e927e99451a34722b758996f0b 100644 (file)
@@ -236,7 +236,7 @@ nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -512,7 +512,7 @@ nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 2e46f91e0299748ed405ceebd133b2d93af24bb8..92a6ddc8dea11d6d6b487c355763f9d0dd14efec 100644 (file)
@@ -333,7 +333,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -785,7 +785,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d9f54eaec18a6ffb808bd49a2ec4b3d5d5d28ecd..66fcb4274744eb88168679da0fac618f69d7822f 100644 (file)
@@ -548,7 +548,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1401,7 +1401,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 97ea6f238852cea8c93062e7959be6e827120468..ae6785d7837866b5c48e7f08ee0506f7a341bf42 100644 (file)
@@ -333,7 +333,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -785,7 +785,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 82196f0425d7b988e30e3d81993ccb620ba5220d..778fbf763bff8d05e5e723633f61c98781a43747 100644 (file)
@@ -548,7 +548,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1401,7 +1401,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 9b69cf2102f73722005550e2765af80099378316..81f76e0f160386180aff7f4d3ba34c3092d82e7b 100644 (file)
@@ -416,7 +416,7 @@ void
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ace7940aa5fc761e8aff0d773e076a787c232b05..3cbf07e6aa6e0d82d288ab37c1fd7057edfea37d 100644 (file)
@@ -331,7 +331,7 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
                     /* We tried again, and this time there was a copied buffer. 
                        We use that, and indicate that we're not reading from the
                        regular buf. This case should be pretty rare.  */
-                    tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount),-1);
+                    tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount),-1);
                     tMPI_Atomic_memory_barrier_acq();
                     srcbuf=try_again_srcbuf;
                 }
@@ -354,7 +354,7 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
         {
             /* we decrement the read count; potentially releasing the buffer. */
             tMPI_Atomic_memory_barrier_rel();
-            tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
+            tMPI_Atomic_add_return( &(cev->met[rank].buf_readcount), -1);
         }
 #endif
     }
@@ -481,7 +481,7 @@ void tMPI_Wait_for_others(struct coll_env *cev, int myrank)
     else
     {
         /* wait until everybody else is done copying the original buffer. 
-           We use fetch_add because we want to be sure of coherency.
+           We use atomic add-return because we want to be sure of coherency.
            This wait is bound to be very short (otherwise it wouldn't 
            be double-buffering) so we always spin here. */
         /*tMPI_Atomic_memory_barrier_rel();*/
@@ -490,7 +490,7 @@ void tMPI_Wait_for_others(struct coll_env *cev, int myrank)
                                     -100000))
 #endif
 #if 0
-        while (tMPI_Atomic_fetch_add( &(cev->met[myrank].buf_readcount), 0) 
+        while (tMPI_Atomic_add_return( &(cev->met[myrank].buf_readcount), 0) 
                != 0)
 #endif
 #if 1
index a25c6db243452b4dfb844f14f57cc380e18b7d97..bee468ceea3fe522f374dbf7a0f633609eb369b8 100644 (file)
@@ -137,7 +137,7 @@ void* tMPI_Once_wait(tMPI_Comm comm, void* (*function)(void*), void *param,
 
         tMPI_Atomic_memory_barrier_rel();
         /* signal that we're done */
-        tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
+        tMPI_Atomic_add_return(&(cev->coll.current_sync), 1);
         /* we need to keep being in sync */
         csync->syncs++;
     }
index 7e59a300baabf3d351e7f6e09d9416b838a86433..ee3a906a0ff296afd8647136880479a4c8afe02e 100644 (file)
@@ -463,7 +463,7 @@ void tMPI_Start_threads(tmpi_bool main_returns, int N,
                 tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
             }
         }
-        /* the main thread now also runs start_fn if we don't want
+        /* the main thread also runs start_fn if we don't want
            it to return */
         if (!main_returns)
             tMPI_Thread_starter((void*)&(threads[0]));
@@ -480,12 +480,11 @@ int tMPI_Init(int *argc, char ***argv,
     tMPI_Trace_print("tMPI_Init(%p, %p, %p)", argc, argv, start_function);
 #endif
 
-
     if (TMPI_COMM_WORLD==0) /* we're the main process */
     {
         int N=0;
         tMPI_Get_N(argc, argv, "-nt", &N);
-        tMPI_Start_threads(FALSE, N, TMPI_AFFINITY_ALL_CORES, argc, argv, 
+        tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv, 
                            NULL, NULL, start_function);
     }
     else
index d3a9bec8d4746454a7f7af659f1e8938df3bdbfb..459770df281e97c2fdd6f5012b66a616c79e7a28 100644 (file)
@@ -111,7 +111,11 @@ else(GMX_FAHCORE)
 list(APPEND GMX_EXTRA_LIBRARIES gmxpreprocess md ${OpenMP_LINKER_FLAGS})
 
 set(GMX_KERNEL_PROGRAMS
-    grompp tpbconv pdb2gmx g_protonate g_luck gmxdump g_x2top gmxcheck)
+    grompp tpbconv pdb2gmx g_protonate gmxdump g_x2top gmxcheck)
+if (NOT GMX_NO_QUOTES)
+  set(GMX_KERNEL_PROGRAMS ${GMX_KERNEL_PROGRAMS} g_luck)
+endif (NOT GMX_NO_QUOTES)
+
 
 foreach(PROGRAM ${GMX_KERNEL_PROGRAMS})
     add_executable(${PROGRAM} ${PROGRAM}.c main.c)
index 8a1f6ef00d00d85bdf820e3fb77d17e8a12761f9..68e3cf628ce288301a49a83f34d0e03dea767f63 100644 (file)
@@ -78,17 +78,14 @@ void verletbuf_get_list_setup(gmx_bool bGPU,
     }
     else
     {
-#ifndef GMX_X86_SSE2
+#ifndef GMX_NBNXN_SIMD
         list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
 #else
-        int simd_width;
-
-#ifdef GMX_X86_AVX_256
-        simd_width = 256;
-#else
-        simd_width = 128;
+        list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#ifdef GMX_NBNXN_SIMD_2XNN
+        /* We assume the smallest cluster size to be on the safe side */
+        list_setup->cluster_size_j /= 2;
 #endif
-        list_setup->cluster_size_j = simd_width/(sizeof(real)*8);
 #endif
     }
 }
index 8931528f2fedc69a28313e20d1c9f75cd0a580dc..7ae3f9923df8258b1a4bd94bf2f8fcb760a30feb 100644 (file)
@@ -1672,8 +1672,35 @@ int cmain (int argc, char *argv[])
                             &(ir->nkx),&(ir->nky),&(ir->nkz));
   }
 
+  /* MRS: eventually figure out better logic for initializing the fep
+   values that makes declaring the lambda and declaring the state not
+   potentially conflict if not handled correctly. */
+  if (ir->efep != efepNO)
+  {
+      state.fep_state = ir->fepvals->init_fep_state;
+      for (i=0;i<efptNR;i++)
+      {
+          /* init_lambda trumps state definitions*/
+          if (ir->fepvals->init_lambda >= 0)
+          {
+              state.lambda[i] = ir->fepvals->init_lambda;
+          }
+          else
+          {
+              if (ir->fepvals->all_lambda[i] == NULL)
+              {
+                  gmx_fatal(FARGS,"Values of lambda not set for a free energy calculation!");
+              }
+              else
+              {
+                  state.lambda[i] = ir->fepvals->all_lambda[i][state.fep_state];
+              }
+          }
+      }
+  }
+
   if (ir->ePull != epullNO)
-    set_pull_init(ir,sys,state.x,state.box,oenv,opts->pull_start);
+      set_pull_init(ir,sys,state.x,state.box,state.lambda[efptMASS],oenv,opts->pull_start);
   
   if (ir->bRot)
   {
@@ -1716,33 +1743,6 @@ int cmain (int argc, char *argv[])
         }
     }
        
-  /* MRS: eventually figure out better logic for initializing the fep
-   values that makes declaring the lambda and declaring the state not
-   potentially conflict if not handled correctly. */
-  if (ir->efep != efepNO)
-  {
-      state.fep_state = ir->fepvals->init_fep_state;
-      for (i=0;i<efptNR;i++)
-      {
-          /* init_lambda trumps state definitions*/
-          if (ir->fepvals->init_lambda >= 0)
-          {
-              state.lambda[i] = ir->fepvals->init_lambda;
-          }
-          else
-          {
-              if (ir->fepvals->all_lambda[i] == NULL)
-              {
-                  gmx_fatal(FARGS,"Values of lambda not set for a free energy calculation!");
-              }
-              else
-              {
-                  state.lambda[i] = ir->fepvals->all_lambda[i][state.fep_state];
-              }
-          }
-      }
-  }
-
   if (bVerbose) 
     fprintf(stderr,"writing run input file...\n");
 
index c72aea114f6b9f8843dbe23a02252bf5e1ac6a5c..921cb76673dcbb61ba53ebd424f57657136d6473 100644 (file)
@@ -573,7 +573,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
     {
         nstfep = ir->expandedvals->nstexpanded;
     }
-    if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
+    if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
     {
         nstfep = repl_ex_nst;
     }
@@ -1294,6 +1294,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
                 {
                     if (bTrotter)
                     {
+                        m_add(force_vir,shake_vir,total_vir); /* we need the un-dispersion corrected total vir here */
                         trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
                     } 
                     else 
@@ -1575,8 +1576,9 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
         /* at the start of step, randomize the velocities */
         if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
         {
-            gmx_bool bDoAndersenConstr;
-            bDoAndersenConstr = (constr && update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr));
+            gmx_bool bDoAndersenConstr,bIfRandomize;
+            bIfRandomize = update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr);
+            bDoAndersenConstr = (constr && bIfRandomize);
             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
             if (bDoAndersenConstr)
             {
@@ -1958,7 +1960,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
             state->fep_state = lamnew;
             for (i=0;i<efptNR;i++)
             {
-                state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
             }
         }
         /* Remaining runtime */
index ad1e79cb398e567fcf7a00194dabcef0a3c8fa90..687080c3d1387e4b92bc2094ba49b4f0ceb1ddfb 100644 (file)
@@ -268,9 +268,10 @@ static char *search_resrename(int nrr,rtprename_t *rr,
         {
             nn = rr[i].main;
         }
+        
         if (nn[0] == '-')
         {
-            gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? " as a starting terminus" : (bEnd ? " as an ending terminus" : ""));
+            gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
         }
     }
 
index 6367ac25bde5507880f7ce7019f46c5fbb323a67..a208f08c4f2e96178ff8359fafdb04b5e5356abd 100644 (file)
@@ -624,7 +624,8 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
     ic->ewaldcoeff = set->ewaldcoeff;
 
     bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
-    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+    if (pme_lb->cutoff_scheme == ecutsVERLET &&
+        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
     {
         nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
     }
index 20de0ec33e7a35e54f80935c02a99c9a12e702d9..8232f465407186a0a5c4f878279b7d0e160808e7 100644 (file)
@@ -140,7 +140,7 @@ extern void make_pull_groups(t_pull *pull,char **pgnames,
 /* Process the pull parameters after reading the index groups */
 
 GMX_LIBGMXPREPROCESS_EXPORT
-extern void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,
+extern void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box, real lambda,
                          const output_env_t oenv, gmx_bool bStart);
 /* Prints the initial pull group distances in x.
  * If bStart adds the distance to the initial reference location.
index f87aed9fde3ef206aa86f4239a901c35a37d11f6..d892482e45a8859671323772970f5677f7a39192 100644 (file)
@@ -283,7 +283,7 @@ void make_pull_groups(t_pull *pull,char **pgnames,t_blocka *grps,char **gnames)
   }
 }
 
-void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,
+void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,real lambda,
                   const output_env_t oenv,gmx_bool bStart)
 {
   t_mdatoms *md;
@@ -292,21 +292,16 @@ void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,
   t_pbc     pbc;
   int       ndim,g,m;
   double    t_start,tinvrate;
-  real      lambda=0;
   rvec      init;
   dvec      dr,dev;
 
-  /* need to pass in the correct masses if free energy is on*/
-  if (ir->efep)
-  {
-      lambda = ir->fepvals->all_lambda[efptMASS][ir->fepvals->init_fep_state];
-  }
   init_pull(NULL,ir,0,NULL,mtop,NULL,oenv,lambda,FALSE,0); 
   md = init_mdatoms(NULL,mtop,ir->efep);
   atoms2md(mtop,ir,0,NULL,0,mtop->natoms,md);
   if (ir->efep)
-    update_mdatoms(md,ir->fepvals->init_lambda);
-  
+  {
+    update_mdatoms(md,lambda);
+  }
   pull = ir->pull;
   if (pull->eGeom == epullgPOS)
     ndim = 3;
index 54f236e0563cac6154acef40b3bf730f2bd60bf6..150bbd2e70f52c4d7df643df351b2b2aa6cace4f 100644 (file)
@@ -66,7 +66,14 @@ if(GMX_BUILD_OWN_FFTW)
     # disabling GMX_BUILD_OWN_FFTW changes dependencies correctly.
     add_dependencies(md gmxfftw)
 endif()
-set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
+option(GMX_PREFIX_LIBMD "Change install name of libmd to libgmxmd to avoid collision with BSD's/Martin Hinner's libmd, which is used in X11 and zfs" OFF)
+mark_as_advanced(GMX_PREFIX_LIBMD)
+if (GMX_PREFIX_LIBMD)
+  set(MD_PREFIX "gmx")
+else()
+  set(MD_PREFIX)
+endif()
+set_target_properties(md PROPERTIES OUTPUT_NAME "${MD_PREFIX}md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
     COMPILE_FLAGS "${OpenMP_C_FLAGS}")
 
 install(TARGETS md DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
@@ -74,5 +81,5 @@ install(TARGETS md DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libmd.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc @ONLY)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc
         DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
-        RENAME "libmd${GMX_LIBS_SUFFIX}.pc"
+        RENAME "lib${MD_PREFIX}md${GMX_LIBS_SUFFIX}.pc"
         COMPONENT development)
index d88241d6734bd24bb915fb13f209c568d432cff5..0423134c6bf6e14d1b7a8556aec6f6d5b2c8e210 100644 (file)
@@ -228,23 +228,44 @@ static void lincs_update_atoms_noind(int ncons,const int *bla,
     int  b,i,j;
     real mvb,im1,im2,tmp0,tmp1,tmp2;
 
-    for(b=0; b<ncons; b++)
+    if (invmass != NULL)
     {
-        i = bla[2*b];
-        j = bla[2*b+1];
-        mvb = prefac*fac[b];
-        im1 = invmass[i];
-        im2 = invmass[j];
-        tmp0 = r[b][0]*mvb;
-        tmp1 = r[b][1]*mvb;
-        tmp2 = r[b][2]*mvb;
-        x[i][0] -= tmp0*im1;
-        x[i][1] -= tmp1*im1;
-        x[i][2] -= tmp2*im1;
-        x[j][0] += tmp0*im2;
-        x[j][1] += tmp1*im2;
-        x[j][2] += tmp2*im2;
-    } /* 16 ncons flops */
+        for(b=0; b<ncons; b++)
+        {
+            i = bla[2*b];
+            j = bla[2*b+1];
+            mvb = prefac*fac[b];
+            im1 = invmass[i];
+            im2 = invmass[j];
+            tmp0 = r[b][0]*mvb;
+            tmp1 = r[b][1]*mvb;
+            tmp2 = r[b][2]*mvb;
+            x[i][0] -= tmp0*im1;
+            x[i][1] -= tmp1*im1;
+            x[i][2] -= tmp2*im1;
+            x[j][0] += tmp0*im2;
+            x[j][1] += tmp1*im2;
+            x[j][2] += tmp2*im2;
+        } /* 16 ncons flops */
+    }
+    else
+    {
+        for(b=0; b<ncons; b++)
+        {
+            i = bla[2*b];
+            j = bla[2*b+1];
+            mvb = prefac*fac[b];
+            tmp0 = r[b][0]*mvb;
+            tmp1 = r[b][1]*mvb;
+            tmp2 = r[b][2]*mvb;
+            x[i][0] -= tmp0;
+            x[i][1] -= tmp1;
+            x[i][2] -= tmp2;
+            x[j][0] += tmp0;
+            x[j][1] += tmp1;
+            x[j][2] += tmp2;
+        }
+    }        
 }
 
 static void lincs_update_atoms_ind(int ncons,const int *ind,const int *bla,
@@ -256,24 +277,46 @@ static void lincs_update_atoms_ind(int ncons,const int *ind,const int *bla,
     int  bi,b,i,j;
     real mvb,im1,im2,tmp0,tmp1,tmp2;
 
-    for(bi=0; bi<ncons; bi++)
+    if (invmass != NULL)
     {
-        b = ind[bi];
-        i = bla[2*b];
-        j = bla[2*b+1];
-        mvb = prefac*fac[b];
-        im1 = invmass[i];
-        im2 = invmass[j];
-        tmp0 = r[b][0]*mvb;
-        tmp1 = r[b][1]*mvb;
-        tmp2 = r[b][2]*mvb;
-        x[i][0] -= tmp0*im1;
-        x[i][1] -= tmp1*im1;
-        x[i][2] -= tmp2*im1;
-        x[j][0] += tmp0*im2;
-        x[j][1] += tmp1*im2;
-        x[j][2] += tmp2*im2;
-    } /* 16 ncons flops */
+        for(bi=0; bi<ncons; bi++)
+        {
+            b = ind[bi];
+            i = bla[2*b];
+            j = bla[2*b+1];
+            mvb = prefac*fac[b];
+            im1 = invmass[i];
+            im2 = invmass[j];
+            tmp0 = r[b][0]*mvb;
+            tmp1 = r[b][1]*mvb;
+            tmp2 = r[b][2]*mvb;
+            x[i][0] -= tmp0*im1;
+            x[i][1] -= tmp1*im1;
+            x[i][2] -= tmp2*im1;
+            x[j][0] += tmp0*im2;
+            x[j][1] += tmp1*im2;
+            x[j][2] += tmp2*im2;
+        } /* 16 ncons flops */
+    }
+    else
+    {
+        for(bi=0; bi<ncons; bi++)
+        {
+            b = ind[bi];
+            i = bla[2*b];
+            j = bla[2*b+1];
+            mvb = prefac*fac[b];
+            tmp0 = r[b][0]*mvb;
+            tmp1 = r[b][1]*mvb;
+            tmp2 = r[b][2]*mvb;
+            x[i][0] -= tmp0;
+            x[i][1] -= tmp1;
+            x[i][2] -= tmp2;
+            x[j][0] += tmp0;
+            x[j][1] += tmp1;
+            x[j][2] += tmp2;
+        } /* 16 ncons flops */
+    }
 }
 
 static void lincs_update_atoms(struct gmx_lincsdata *li,int th,
@@ -407,35 +450,24 @@ static void do_lincsp(rvec *x,rvec *f,rvec *fp,t_pbc *pbc,
         }
     }
 
-    if (econq != econqForce)
+    /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
+    for(b=b0; b<b1; b++)
     {
-        lincs_update_atoms(lincsd,th,1.0,sol,r,invmass,fp);
+        sol[b] *= blc[b];
     }
-    else
+
+    /* When constraining forces, we should not use mass weighting,
+     * so we pass invmass=NULL, which results in the use of 1 for all atoms.
+     */
+    lincs_update_atoms(lincsd,th,1.0,sol,r,
+                       (econq != econqForce) ? invmass : NULL,fp);
+
+    if (dvdlambda != NULL)
     {
+#pragma omp barrier
         for(b=b0; b<b1; b++)
         {
-            i = bla[2*b];
-            j = bla[2*b+1];
-            mvb = blc[b]*sol[b];
-            tmp0 = r[b][0]*mvb;
-            tmp1 = r[b][1]*mvb;
-            tmp2 = r[b][2]*mvb;
-            fp[i][0] -= tmp0;
-            fp[i][1] -= tmp1;
-            fp[i][2] -= tmp2;
-            fp[j][0] += tmp0;
-            fp[j][1] += tmp1;
-            fp[j][2] += tmp2;
-        }
-
-        if (dvdlambda != NULL)
-        {
-#pragma omp barrier
-            for(b=b0; b<b1; b++)
-            {
-                *dvdlambda -= blc[b]*sol[b]*lincsd->ddist[b];
-            }
+            *dvdlambda -= sol[b]*lincsd->ddist[b];
         }
         /* 10 ncons flops */
     }
@@ -449,7 +481,7 @@ static void do_lincsp(rvec *x,rvec *f,rvec *fp,t_pbc *pbc,
          */
         for(b=b0; b<b1; b++)
         {
-            mvb = lincsd->bllen[b]*blc[b]*sol[b];
+            mvb = lincsd->bllen[b]*sol[b];
             for(i=0; i<DIM; i++)
             {
                 tmp1 = mvb*r[b][i];
index c9a83119a0c6a3c98bd67503b1cd276723a1dc30..2060205e70cf82882745b5a871ca915afb79d3b2 100644 (file)
@@ -178,7 +178,7 @@ static void NHC_trotter(t_grpopts *opts,int nvar, gmx_ekindata_t *ekind,real dtf
 }
 
 static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box, 
-                         gmx_ekindata_t *ekind, tensor vir, real pcorr, real ecorr, t_extmass *MassQ)
+                         gmx_ekindata_t *ekind, tensor vir, real pcorr, t_extmass *MassQ)
 {
 
     real  pscal;
@@ -218,7 +218,7 @@ static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box,
     /* for now, we use Elr = 0, because if you want to get it right, you
        really should be using PME. Maybe print a warning? */
 
-    pscal   = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres);
+    pscal   = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres)+pcorr;
 
     vol = det(box);
     GW = (vol*(MassQ->Winv/PRESFAC))*(DIM*pscal - trace(ir->ref_p));   /* W is in ps^2 * bar * nm^3 */
@@ -905,7 +905,7 @@ void trotter_update(t_inputrec *ir,gmx_large_int_t step, gmx_ekindata_t *ekind,
         case etrtBAROV:
         case etrtBAROV2:
             boxv_trotter(ir,&(state->veta),dt,state->box,ekind,vir,
-                         enerd->term[F_PDISPCORR],enerd->term[F_DISPCORR],MassQ);
+                         enerd->term[F_PDISPCORR],MassQ);
             break;
         case etrtBARONHC:
         case etrtBARONHC2:
index 99ceba34e4c37d193043d46b77af70072e2e9a6b..fd25a5b1c0e65048553c4dd584f2e84868fc96f8 100644 (file)
@@ -53,7 +53,7 @@ FILE* debug;
 #include "gmxcomplex.h"
 #include "gmx_fft.h"
 
-#ifndef GMX_LIB_MPI
+#ifndef GMX_MPI
 double MPI_Wtime();
 #endif
 
index b773dff15f9f2889031140b4f4fd57cab2f674bf..c1955a99548da156ae3e27ad8b764c97997509dd 100644 (file)
@@ -1166,7 +1166,7 @@ static void make_nbf_tables(FILE *fp,const output_env_t oenv,
     nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
     nbl->table_elec.ninteractions = 1;
     nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
-    snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),16);
+    snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),32);
 
     nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
     nbl->table_vdw.format = nbl->table_elec_vdw.format;
@@ -1177,7 +1177,7 @@ static void make_nbf_tables(FILE *fp,const output_env_t oenv,
     nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
     nbl->table_vdw.ninteractions = 2;
     nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
-    snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),16);
+    snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),32);
 
     for(i=0; i<=nbl->table_elec_vdw.n; i++)
     {
@@ -1403,41 +1403,48 @@ static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp)
 static void pick_nbnxn_kernel_cpu(FILE *fp,
                                   const t_commrec *cr,
                                   const gmx_cpuid_t cpuid_info,
+                                  const t_inputrec *ir,
                                   int *kernel_type,
                                   int *ewald_excl)
 {
-    *kernel_type = nbk4x4_PlainC;
+    *kernel_type = nbnxnk4x4_PlainC;
     *ewald_excl  = ewaldexclTable;
 
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD
     {
-        /* On Intel Sandy-Bridge AVX-256 kernels are always faster.
-         * On AMD Bulldozer AVX-256 is much slower than AVX-128.
-         */
-        if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 &&
-           gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD)
-        {
-#ifdef GMX_X86_AVX_256
-            *kernel_type = nbk4xN_X86_SIMD256;
-#else
-            *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+        *kernel_type = nbnxnk4xN_SIMD_4xN;
 #endif
-        }
-        else
+#ifdef GMX_NBNXN_SIMD_2XNN
+        /* We expect the 2xNN kernels to be faster in most cases */
+        *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#endif
+
+#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
         {
-            *kernel_type = nbk4xN_X86_SIMD128;
+            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+             * 10% with HT, 50% without HT, but extra zeros interactions
+             * can compensate. As we currently don't detect the actual use
+             * of HT, switch to 4x8 to avoid a potential performance hit.
+             */
+            *kernel_type = nbnxnk4xN_SIMD_4xN;
         }
-
-        if (getenv("GMX_NBNXN_AVX128") != NULL)
+#endif
+        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
         {
-            *kernel_type = nbk4xN_X86_SIMD128;
+#ifdef GMX_NBNXN_SIMD_4XN
+            *kernel_type = nbnxnk4xN_SIMD_4xN;
+#else
+            gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
         }
-        if (getenv("GMX_NBNXN_AVX256") != NULL)
+        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
         {
-#ifdef GMX_X86_AVX_256
-            *kernel_type = nbk4xN_X86_SIMD256;
+#ifdef GMX_NBNXN_SIMD_2XNN
+            *kernel_type = nbnxnk4xN_SIMD_2xNN;
 #else
-            gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
+            gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
 #endif
         }
 
@@ -1466,6 +1473,7 @@ static void pick_nbnxn_kernel(FILE *fp,
                               const gmx_hw_info_t *hwinfo,
                               gmx_bool use_cpu_acceleration,
                               gmx_bool *bUseGPU,
+                              const t_inputrec *ir,
                               int *kernel_type,
                               int *ewald_excl,
                               gmx_bool bDoNonbonded)
@@ -1475,7 +1483,7 @@ static void pick_nbnxn_kernel(FILE *fp,
 
     assert(kernel_type);
 
-    *kernel_type = nbkNotSet;
+    *kernel_type = nbnxnkNotSet;
     *ewald_excl  = ewaldexclTable;
 
     bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
@@ -1521,7 +1529,7 @@ static void pick_nbnxn_kernel(FILE *fp,
 
     if (bEmulateGPU)
     {
-        *kernel_type = nbk8x8x8_PlainC;
+        *kernel_type = nbnxnk8x8x8_PlainC;
 
         if (bDoNonbonded)
         {
@@ -1530,31 +1538,28 @@ static void pick_nbnxn_kernel(FILE *fp,
     }
     else if (bGPU)
     {
-        *kernel_type = nbk8x8x8_CUDA;
+        *kernel_type = nbnxnk8x8x8_CUDA;
     }
 
-    if (*kernel_type == nbkNotSet)
+    if (*kernel_type == nbnxnkNotSet)
     {
         if (use_cpu_acceleration)
         {
-            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,
+            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir,
                                   kernel_type,ewald_excl);
         }
         else
         {
-            *kernel_type = nbk4x4_PlainC;
+            *kernel_type = nbnxnk4x4_PlainC;
         }
     }
 
     if (bDoNonbonded && fp != NULL)
     {
-        if (MASTER(cr))
-        {
-            fprintf(stderr,"Using %s non-bonded kernels\n",
-                    nbk_name[*kernel_type]);
-        }
-        fprintf(fp,"\nUsing %s non-bonded kernels\n\n",
-                nbk_name[*kernel_type]);
+        fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
+                nbnxn_kernel_name[*kernel_type],
+                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+                nbnxn_kernel_to_cj_size(*kernel_type));
     }
 }
 
@@ -1610,9 +1615,9 @@ static void init_ewald_f_table(interaction_const_t *ic,
     sfree_aligned(ic->tabq_coul_V);
 
     /* Create the original table data in FDV0 */
-    snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16);
-    snew_aligned(ic->tabq_coul_F,ic->tabq_size,16);
-    snew_aligned(ic->tabq_coul_V,ic->tabq_size,16);
+    snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,32);
+    snew_aligned(ic->tabq_coul_F,ic->tabq_size,32);
+    snew_aligned(ic->tabq_coul_V,ic->tabq_size,32);
     table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,ic->tabq_coul_FDV0,
                                 ic->tabq_size,1/ic->tabq_scale,ic->ewaldcoeff);
 }
@@ -1647,9 +1652,9 @@ void init_interaction_const(FILE *fp,
     snew(ic, 1);
 
     /* Just allocate something so we can free it */
-    snew_aligned(ic->tabq_coul_FDV0,16,16);
-    snew_aligned(ic->tabq_coul_F,16,16);
-    snew_aligned(ic->tabq_coul_V,16,16);
+    snew_aligned(ic->tabq_coul_FDV0,16,32);
+    snew_aligned(ic->tabq_coul_F,16,32);
+    snew_aligned(ic->tabq_coul_V,16,32);
 
     ic->rlist       = fr->rlist;
     ic->rlistlong   = fr->rlistlong;
@@ -1754,12 +1759,13 @@ static void init_nb_verlet(FILE *fp,
     {
         nbv->grp[i].nbl_lists.nnbl = 0;
         nbv->grp[i].nbat           = NULL;
-        nbv->grp[i].kernel_type    = nbkNotSet;
+        nbv->grp[i].kernel_type    = nbnxnkNotSet;
 
         if (i == 0) /* local */
         {
             pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
                               &nbv->bUseGPU,
+                              ir,
                               &nbv->grp[i].kernel_type,
                               &nbv->grp[i].ewald_excl,
                               fr->bNonbonded);
@@ -1771,6 +1777,7 @@ static void init_nb_verlet(FILE *fp,
                 /* Use GPU for local, select a CPU kernel for non-local */
                 pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
                                   NULL,
+                                  ir,
                                   &nbv->grp[i].kernel_type,
                                   &nbv->grp[i].ewald_excl,
                                   fr->bNonbonded);
@@ -1834,7 +1841,7 @@ static void init_nb_verlet(FILE *fp,
 
     for(i=0; i<nbv->ngrp; i++)
     {
-        if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
+        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
         {
             nb_alloc = &pmalloc;
             nb_free  = &pfree;
index 4badaf7a02fa152ff8a737da43dd2c342d9abe4e..9f97b759393bb3d34ac09dc91a37b3f738bf7c3c 100644 (file)
@@ -111,10 +111,10 @@ void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes)
             snew(state->cg_p,state->nalloc);
         }
     }
-  if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE) {
+    if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc)) {
     state->nrng  = gmx_rng_n();
     state->nrngi = 1;
-    if (EI_SD(ir->eI) || ir->eI == eiBD) {
+    if (EI_SD(ir->eI) || ir->eI == eiBD || ETC_ANDERSEN(ir->etc)) {
       /* This will be correct later with DD */
       state->nrng  *= nnodes;
       state->nrngi *= nnodes;
@@ -184,7 +184,7 @@ void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec,
 {
     bcast_ir_mtop(cr,inputrec,mtop);
 
-    if (inputrec->eI == eiBD || EI_SD(inputrec->eI)) {
+    if (inputrec->eI == eiBD || EI_SD(inputrec->eI) || ETC_ANDERSEN(inputrec->etc)) {
         /* Make sure the random seeds are different on each node */
         inputrec->ld_seed += cr->nodeid;
     }
index d93a4c2b9ba5b3a43dd5da66361d629759fdb291..c9e881a7ffe1ecb845514296efab74202db2f4ff 100644 (file)
@@ -7,6 +7,6 @@ URL: http://www.gromacs.org
 Version: @PROJECT_VERSION@
 Requires: libgmx@GMX_LIBS_SUFFIX@ @PKG_FFT@
 Libs.private: -lm @CMAKE_THREAD_LIBS_INIT@
-Libs: -L${libdir} -lmd@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@
+Libs: -L${libdir} -l@MD_PREFIX@md@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@
 Cflags: -I${includedir} @PKG_CFLAGS@
 
index aaa3f22c8eb678df6a779eac3708089e373293a4..a11f764914917d3c6019c117815c1567a7f14f95 100644 (file)
@@ -151,8 +151,8 @@ static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
     ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw));
     ma((void **)&out->Vc  ,out->nV*sizeof(*out->Vc  ));
 
-    if (nb_kernel_type == nbk4xN_X86_SIMD128 ||
-        nb_kernel_type == nbk4xN_X86_SIMD256)
+    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
+        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
     {
         cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
         out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
@@ -598,17 +598,25 @@ void nbnxn_atomdata_init(FILE *fp,
     nbat->lj_comb = NULL;
     if (simple)
     {
+        int pack_x;
+
         switch (nb_kernel_type)
         {
-        case nbk4xN_X86_SIMD128:
-            nbat->XFormat = nbatX4;
-            break;
-        case nbk4xN_X86_SIMD256:
-#ifndef GMX_DOUBLE
-            nbat->XFormat = nbatX8;
-#else
-            nbat->XFormat = nbatX4;
-#endif
+        case nbnxnk4xN_SIMD_4xN:
+        case nbnxnk4xN_SIMD_2xNN:
+            pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+                         nbnxn_kernel_to_cj_size(nb_kernel_type));
+            switch (pack_x)
+            {
+                case 4:
+                    nbat->XFormat = nbatX4;
+                    break;
+                case 8:
+                    nbat->XFormat = nbatX8;
+                    break;
+                default:
+                    gmx_incons("Unsupported packing width");
+            }
             break;
         default:
             nbat->XFormat = nbatXYZ;
@@ -1034,14 +1042,14 @@ nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
 #else
 #define GMX_MM128_HERE
 #endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
 
     int       i,s;
     gmx_mm_pr dest_SSE,src_SSE;
 
     if (bDestSet)
     {
-        for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+        for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
         {
             dest_SSE = gmx_load_pr(dest+i);
             for(s=0; s<nsrc; s++)
@@ -1054,7 +1062,7 @@ nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
     }
     else
     {
-        for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+        for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
         {
             dest_SSE = gmx_load_pr(src[0]+i);
             for(s=1; s<nsrc; s++)
index ddf2719dab070a9fe6a1c4be78118195fc6664e6..2b6398c1e4ab80d8a0e5e4242a98226afe885dd7 100644 (file)
@@ -39,4 +39,7 @@ if(GMX_GPU)
             OPTIONS
             RELWITHDEBINFO -g
             DEBUG -g -D_DEBUG_=1)
+    #Because this is a static library linked into the (potential) shared library
+    #it should have the export of the shared library.
+    SET_TARGET_PROPERTIES(nbnxn_cuda PROPERTIES DEFINE_SYMBOL "md_EXPORTS" )
 endif()
index 9d0be66e30eac01cdc808ae482a2a19f146cee75..1a7f5bfcea634b6722f5851c9198f2b9f0a692d7 100644 (file)
@@ -95,29 +95,32 @@ typedef struct {
     int  nsubc_tot;      /* Total number of subcell, used for printing  */
 } nbnxn_grid_t;
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
 #define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd128 {
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+typedef struct nbnxn_x_ci_simd_4xn {
     /* The i-cluster coordinates for simple search */
     gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
     gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
     gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
     gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd128_t;
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-typedef struct nbnxn_x_ci_x86_simd256 {
+} nbnxn_x_ci_simd_4xn_t;
+
+typedef struct nbnxn_x_ci_simd_2xnn {
     /* The i-cluster coordinates for simple search */
     gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
-    gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
     gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
-    gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-} nbnxn_x_ci_x86_simd256_t;
-#undef GMX_MM256_HERE
-#endif
+} nbnxn_x_ci_simd_2xnn_t;
+
 #endif
 
 /* Working data for the actual i-supercell during pair search */
@@ -126,11 +129,9 @@ typedef struct nbnxn_list_work {
 
     float *bb_ci;      /* The bounding boxes, pbc shifted, for each cluster */
     real  *x_ci;       /* The coordinates, pbc shifted, for each atom       */
-#ifdef NBNXN_SEARCH_SSE
-    nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128;
-#ifdef GMX_X86_AVX_256
-    nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+    nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
+    nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
 #endif
     int  cj_ind;       /* The current cj_ind index for the current list     */
     int  cj4_init;     /* The first unitialized cj4 block                   */
@@ -155,17 +156,18 @@ gmx_icell_set_x_t(int ci,
                   nbnxn_list_work_t *work);
 
 static gmx_icell_set_x_t icell_set_x_simple;
-#ifdef NBNXN_SEARCH_SSE
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd128;
-#ifdef GMX_X86_AVX_256
-static gmx_icell_set_x_t icell_set_x_simple_x86_simd256;
-#endif
+#ifdef GMX_NBNXN_SIMD
+static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
+static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
 #endif
 static gmx_icell_set_x_t icell_set_x_supersub;
 #ifdef NBNXN_SEARCH_SSE
 static gmx_icell_set_x_t icell_set_x_supersub_sse8;
 #endif
 
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
+
 /* Local cycle count struct for profiling */
 typedef struct {
     int          count;
index 97a0ef84b9dc535806fa6e60a7935f913619edb4..ce5a6734c8953ba4ab7bb48f1890e226871e84ed 100644 (file)
@@ -108,7 +108,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
     real       *nbfp_i;
     int        n,ci,ci_sh;
     int        ish,ishf;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
 
@@ -213,8 +213,15 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef CALC_ENERGIES
 #ifndef ENERGY_GROUPS
@@ -237,8 +244,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
             }
         }
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
 #ifdef CALC_ENERGIES
             real Vc_sub_self;
similarity index 83%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c
index 8018f65f5ea3b10948f7289e597a456f1e46d79a..0106d4d7801b1d716ae1199a0cea87c61b143a48 100644 (file)
 #include "../nbnxn_consts.h"
 #include "nbnxn_kernel_common.h"
 
-#ifdef GMX_X86_SSE2
+#ifdef GMX_NBNXN_SIMD_2XNN
 
-#include "nbnxn_kernel_x86_simd128.h"
+#include "nbnxn_kernel_simd_2xnn.h"
 
-/* Include all flavors of the 128-bit SSE or AVX kernel loops */
+/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
 
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
 #define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
 
 /* Analytical reaction-field kernels */
 #define CALC_COUL_RF
 
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 
 #undef CALC_COUL_RF
 
 #define CALC_COUL_TAB
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_TAB
 #define CALC_COUL_EWALD
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_2xnn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_EWALD
@@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 
 enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_ener
 static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_energrp
 static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_noener
 static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log,
                                   const real *VSvdw,const real *VSc,
                                   real *Vvdw,real *Vc)
 {
+    const int simd_width   = GMX_SIMD_WIDTH_HERE;
+    const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
     int ng_p2,i,j,j0,j1,c,s;
 
-#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
-
     ng_p2 = (1<<ng_2log);
 
     /* The size of the x86 SIMD energy group buffer array is:
-     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+     * ng*ng*ng_p2*unrollj_half*simd_width
      */
     for(i=0; i<ng; i++)
     {
@@ -163,34 +170,34 @@ static void reduce_group_energies(int ng,int ng_2log,
         {
             for(j0=0; j0<ng; j0++)
             {
-                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
-                for(s=0; s<SIMD_WIDTH_HALF; s++)
+                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width/2;
+                for(s=0; s<unrollj_half; s++)
                 {
                     Vvdw[i*ng+j0] += VSvdw[c+0];
                     Vvdw[i*ng+j1] += VSvdw[c+1];
                     Vc  [i*ng+j0] += VSc  [c+0];
                     Vc  [i*ng+j1] += VSc  [c+1];
-                    c += SIMD_WIDTH + 2;
+                    c += simd_width/2 + 2;
                 }
             }
         }
     }
 }
 
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_NBNXN_SIMD_2XNN */
 
 void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec, 
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw)
-#ifdef GMX_X86_SSE2
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t       *nbl_list,
+                       const nbnxn_atomdata_t     *nbat,
+                       const interaction_const_t  *ic,
+                       int                        ewald_excl,
+                       rvec                       *shift_vec, 
+                       int                        force_flags,
+                       int                        clearF,
+                       real                       *fshift,
+                       real                       *Vc,
+                       real                       *Vvdw)
+#ifdef GMX_NBNXN_SIMD_2XNN
 {
     int              nnbl;
     nbnxn_pairlist_t **nbl;
@@ -320,6 +327,6 @@ nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
 }
 #else
 {
-    gmx_incons("nbnxn_kernel_x86_simd128 called while GROMACS was configured without SSE enabled");
+    gmx_incons("nbnxn_kernel_simd_2xnn called while GROMACS was configured without 2x(N+N) SIMD kernels enabled");
 }
 #endif
similarity index 69%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.h
index 9dd757dd75d547498db490f8e2bf7da192920e2d..c3a6b3b6ee90a4ef70c3f824e7c483692df68272 100644 (file)
@@ -35,8 +35,8 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-#ifndef _nbnxn_kernel_x86_simd128_h
-#define _nbnxn_kernel_x86_simd128_h
+#ifndef _nbnxn_kernel_simd_2xnn_h
+#define _nbnxn_kernel_simd_2xnn_h
 
 #include "typedefs.h"
 
 extern "C" {
 #endif
 
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels determine 4xN cluster interactions for SIMD width 2*N
+ * by packing 2*N j-atom variables in SIMD registers.
+ */
 void
-nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec,
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw);
+nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t       *nbl_list,
+                       const nbnxn_atomdata_t     *nbat,
+                       const interaction_const_t  *ic,
+                       int                        ewald_excl,
+                       rvec                       *shift_vec,
+                       int                        force_flags,
+                       int                        clearF,
+                       real                       *fshift,
+                       real                       *Vc,
+                       real                       *Vvdw);
 
 #ifdef __cplusplus
 }
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_includes.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_includes.h
new file mode 100644 (file)
index 0000000..67e63b1
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This files includes all x86 SIMD kernel flavors.
+ * Only the Electrostatics type and optionally the VdW cut-off check
+ * need to be set before including this file.
+ */
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_simd_2xnn_outer.h"
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
new file mode 100644 (file)
index 0000000..cab66c3
--- /dev/null
@@ -0,0 +1,752 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel duplicates the data for N j-particles in
+ * 2xN wide SIMD registers to do operate on 2 i-particles at once.
+ * This leads to 4/2=2 sets of most instructions. Therefore we call
+ * this kernel 2x(N+N) = 2xnn
+ *
+ * This 2xnn kernel is basically the 4xn equivalent with half the registers
+ * and instructions removed.
+ *
+ * An alternative would be to load to different cluster of N j-particles
+ * into SIMD registers, giving a 4x(N+N) kernel. This doubles the amount
+ * of instructions, which could lead to better scheduling. But we actually
+ * observed worse scheduling for the AVX-256 4x8 normal analytical PME
+ * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
+ * It could be worth trying this option, but it takes some more effort.
+ * This 2xnn kernel is basically the 4xn equivalent with
+ */
+
+
+/* When calculating RF or Ewald interactions we calculate the electrostatic
+ * forces on excluded atom pairs here in the non-bonded loops.
+ * But when energies and/or virial is required we calculate them
+ * separately to as then it is easier to separate the energy and virial
+ * contributions.
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+/* Without exclusions and energies we only need to mask the cut-off,
+ * this can be faster with blendv (only available with SSE4.1 and later).
+ */
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS
+/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
+ * With gcc this is slower, except for RF on Sandy Bridge.
+ * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
+ */
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
+#define CUTOFF_BLENDV
+#endif
+/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
+ * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
+ * Tested with icc 13.
+ */
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
+#define CUTOFF_BLENDV
+#endif
+#endif
+
+        {
+            int        cj,aj,ajx,ajy,ajz;
+
+#ifdef ENERGY_GROUPS
+            /* Energy group indices for two atoms packed into one int */
+            int        egp_jj[UNROLLJ/2];
+#endif
+
+#ifdef CHECK_EXCLS
+            /* Interaction (non-exclusion) mask of all 1's or 0's */
+            gmx_mm_pr  int_SSE0;
+            gmx_mm_pr  int_SSE2;
+#endif
+
+            gmx_mm_pr  jxSSE,jySSE,jzSSE;
+            gmx_mm_pr  dx_SSE0,dy_SSE0,dz_SSE0;
+            gmx_mm_pr  dx_SSE2,dy_SSE2,dz_SSE2;
+            gmx_mm_pr  tx_SSE0,ty_SSE0,tz_SSE0;
+            gmx_mm_pr  tx_SSE2,ty_SSE2,tz_SSE2;
+            gmx_mm_pr  rsq_SSE0,rinv_SSE0,rinvsq_SSE0;
+            gmx_mm_pr  rsq_SSE2,rinv_SSE2,rinvsq_SSE2;
+#ifndef CUTOFF_BLENDV
+            /* wco: within cut-off, mask of all 1's or 0's */
+            gmx_mm_pr  wco_SSE0;
+            gmx_mm_pr  wco_SSE2;
+#endif
+#ifdef VDW_CUTOFF_CHECK
+            gmx_mm_pr  wco_vdw_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  wco_vdw_SSE2;
+#endif
+#endif
+#ifdef CALC_COULOMB
+#ifdef CHECK_EXCLS
+            /* 1/r masked with the interaction mask */
+            gmx_mm_pr  rinv_ex_SSE0;
+            gmx_mm_pr  rinv_ex_SSE2;
+#endif
+            gmx_mm_pr  jq_SSE;
+            gmx_mm_pr  qq_SSE0;
+            gmx_mm_pr  qq_SSE2;
+#ifdef CALC_COUL_TAB
+            /* The force (PME mesh force) we need to subtract from 1/r^2 */
+            gmx_mm_pr  fsub_SSE0;
+            gmx_mm_pr  fsub_SSE2;
+#endif
+#ifdef CALC_COUL_EWALD
+            gmx_mm_pr  brsq_SSE0,brsq_SSE2;
+            gmx_mm_pr  ewcorr_SSE0,ewcorr_SSE2;
+#endif
+
+            /* frcoul = (1/r - fsub)*r */
+            gmx_mm_pr  frcoul_SSE0;
+            gmx_mm_pr  frcoul_SSE2;
+#ifdef CALC_COUL_TAB
+            /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
+            gmx_mm_pr  r_SSE0,rs_SSE0,rf_SSE0,frac_SSE0;
+            gmx_mm_pr  r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
+            /* Table index: rs truncated to an int */
+#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
+            gmx_epi32  ti_SSE0,ti_SSE2;
+#else
+            __m128i    ti_SSE0,ti_SSE2;
+#endif
+            /* Linear force table values */
+            gmx_mm_pr  ctab0_SSE0,ctab1_SSE0;
+            gmx_mm_pr  ctab0_SSE2,ctab1_SSE2;
+#ifdef CALC_ENERGIES
+            /* Quadratic energy table value */
+            gmx_mm_pr  ctabv_SSE0;
+            gmx_mm_pr  ctabv_SSE2;
+#endif
+#endif
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+            /* The potential (PME mesh) we need to subtract from 1/r */
+            gmx_mm_pr  vc_sub_SSE0;
+            gmx_mm_pr  vc_sub_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+            /* Electrostatic potential */
+            gmx_mm_pr  vcoul_SSE0;
+            gmx_mm_pr  vcoul_SSE2;
+#endif
+#endif
+            /* The force times 1/r */
+            gmx_mm_pr  fscal_SSE0;
+            gmx_mm_pr  fscal_SSE2;
+
+#ifdef CALC_LJ
+#ifdef LJ_COMB_LB
+            /* LJ sigma_j/2 and sqrt(epsilon_j) */
+            gmx_mm_pr  hsig_j_SSE,seps_j_SSE;
+            /* LJ sigma_ij and epsilon_ij */
+            gmx_mm_pr  sig_SSE0,eps_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  sig_SSE2,eps_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+            gmx_mm_pr  sig2_SSE0,sig6_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  sig2_SSE2,sig6_SSE2;
+#endif
+#endif /* LJ_COMB_LB */
+#endif /* CALC_LJ */
+
+#ifdef LJ_COMB_GEOM
+            gmx_mm_pr  c6s_j_SSE,c12s_j_SSE;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+            /* Index for loading LJ parameters, complicated when interleaving */
+            int         aj2;
+#endif
+
+#ifndef FIX_LJ_C
+            /* LJ C6 and C12 parameters, used with geometric comb. rule */
+            gmx_mm_pr  c6_SSE0,c12_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  c6_SSE2,c12_SSE2;
+#endif
+#endif
+
+            /* Intermediate variables for LJ calculation */
+#ifndef LJ_COMB_LB
+            gmx_mm_pr  rinvsix_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  rinvsix_SSE2;
+#endif
+#endif
+#ifdef LJ_COMB_LB
+            gmx_mm_pr  sir_SSE0,sir2_SSE0,sir6_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  sir_SSE2,sir2_SSE2,sir6_SSE2;
+#endif
+#endif
+
+            gmx_mm_pr  FrLJ6_SSE0,FrLJ12_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  FrLJ6_SSE2,FrLJ12_SSE2;
+#endif
+#ifdef CALC_ENERGIES
+            gmx_mm_pr  VLJ6_SSE0,VLJ12_SSE0,VLJ_SSE0;
+#ifndef HALF_LJ
+            gmx_mm_pr  VLJ6_SSE2,VLJ12_SSE2,VLJ_SSE2;
+#endif
+#endif
+#endif /* CALC_LJ */
+
+            /* j-cluster index */
+            cj            = l_cj[cjind].cj;
+
+            /* Atom indices (of the first atom in the cluster) */
+            aj            = cj*UNROLLJ;
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+#if UNROLLJ == STRIDE
+            aj2           = aj*2;
+#else
+            aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+#endif
+#if UNROLLJ == STRIDE
+            ajx           = aj*DIM;
+#else
+            ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+            ajy           = ajx + STRIDE;
+            ajz           = ajy + STRIDE;
+
+#ifdef CHECK_EXCLS
+            {
+                /* Load integer interaction mask */
+                /* With AVX there are no integer operations, so cast to real */
+                gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
+                /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
+                 * With gcc we don't need the cast, but it's faster.
+                 */
+#define cast_cvt(x)  _mm256_cvtepi32_ps(_mm256_castps_si256(x))
+                int_SSE0  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE);
+                int_SSE2  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask2)),zero_SSE);
+#undef cast_cvt
+            }
+#endif
+            /* load j atom coordinates */
+            jxSSE         = gmx_loaddh_pr(x+ajx);
+            jySSE         = gmx_loaddh_pr(x+ajy);
+            jzSSE         = gmx_loaddh_pr(x+ajz);
+
+            /* Calculate distance */
+            dx_SSE0       = gmx_sub_pr(ix_SSE0,jxSSE);
+            dy_SSE0       = gmx_sub_pr(iy_SSE0,jySSE);
+            dz_SSE0       = gmx_sub_pr(iz_SSE0,jzSSE);
+            dx_SSE2       = gmx_sub_pr(ix_SSE2,jxSSE);
+            dy_SSE2       = gmx_sub_pr(iy_SSE2,jySSE);
+            dz_SSE2       = gmx_sub_pr(iz_SSE2,jzSSE);
+
+            /* rsq = dx*dx+dy*dy+dz*dz */
+            rsq_SSE0      = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+            rsq_SSE2      = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+#ifndef CUTOFF_BLENDV
+            wco_SSE0      = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+            wco_SSE2      = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+#endif
+
+#ifdef CHECK_EXCLS
+#ifdef EXCL_FORCES
+            /* Only remove the (sub-)diagonal to avoid double counting */
+#if UNROLLJ == UNROLLI
+            if (cj == ci_sh)
+            {
+                wco_SSE0  = gmx_and_pr(wco_SSE0,diag_SSE0);
+                wco_SSE2  = gmx_and_pr(wco_SSE2,diag_SSE2);
+            }
+#else
+#error "only UNROLLJ == UNROLLI currently supported in the joined kernels"
+#endif
+#else /* EXCL_FORCES */
+            /* Remove all excluded atom pairs from the list */
+            wco_SSE0      = gmx_and_pr(wco_SSE0,int_SSE0);
+            wco_SSE2      = gmx_and_pr(wco_SSE2,int_SSE2);
+#endif
+#endif
+
+#ifdef COUNT_PAIRS
+            {
+                int i,j;
+                real tmp[UNROLLJ];
+                for(i=0; i<UNROLLI; i++)
+                {
+                    gmx_storeu_pr(tmp,i==0 ? wco_SSE0 : (i==1 ? wco_SSE1 : (i==2 ? wco_SSE2 : wco_SSE3)));
+                    for(j=0; j<UNROLLJ; j++)
+                    {
+                        if (!(tmp[j] == 0))
+                        {
+                            npair++;
+                        }
+                    }
+                }
+            }
+#endif
+
+#ifdef CHECK_EXCLS
+            /* For excluded pairs add a small number to avoid r^-6 = NaN */
+            rsq_SSE0      = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE));
+            rsq_SSE2      = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE));
+#endif
+
+            /* Calculate 1/r */
+            rinv_SSE0     = gmx_invsqrt_pr(rsq_SSE0);
+            rinv_SSE2     = gmx_invsqrt_pr(rsq_SSE2);
+
+#ifdef CALC_COULOMB
+            /* Load parameters for j atom */
+            jq_SSE        = gmx_loaddh_pr(q+aj);
+            qq_SSE0       = gmx_mul_pr(iq_SSE0,jq_SSE);
+            qq_SSE2       = gmx_mul_pr(iq_SSE2,jq_SSE);
+#endif
+
+#ifdef CALC_LJ
+
+#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
+            load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+#ifndef HALF_LJ
+            load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+#endif
+#endif /* not defined any LJ rule */
+
+#ifdef LJ_COMB_GEOM
+            c6s_j_SSE     = gmx_loaddh_pr(ljc+aj2+0);
+            c12s_j_SSE    = gmx_loaddh_pr(ljc+aj2+STRIDE);
+            c6_SSE0       = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE );
+#ifndef HALF_LJ
+            c6_SSE2       = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE );
+#endif
+            c12_SSE0      = gmx_mul_pr(c12s_SSE0,c12s_j_SSE);
+#ifndef HALF_LJ
+            c12_SSE2      = gmx_mul_pr(c12s_SSE2,c12s_j_SSE);
+#endif
+#endif /* LJ_COMB_GEOM */
+
+#ifdef LJ_COMB_LB
+            hsig_j_SSE    = gmx_loaddh_pr(ljc+aj2+0);
+            seps_j_SSE    = gmx_loaddh_pr(ljc+aj2+STRIDE);
+
+            sig_SSE0      = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE);
+            eps_SSE0      = gmx_mul_pr(seps_i_SSE0,seps_j_SSE);
+#ifndef HALF_LJ
+            sig_SSE2      = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE);
+            eps_SSE2      = gmx_mul_pr(seps_i_SSE2,seps_j_SSE);
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+
+#ifndef CUTOFF_BLENDV
+            rinv_SSE0     = gmx_and_pr(rinv_SSE0,wco_SSE0);
+            rinv_SSE2     = gmx_and_pr(rinv_SSE2,wco_SSE2);
+#else
+            /* We only need to mask for the cut-off: blendv is faster */
+            rinv_SSE0     = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0));
+            rinv_SSE2     = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2));
+#endif
+
+            rinvsq_SSE0   = gmx_mul_pr(rinv_SSE0,rinv_SSE0);
+            rinvsq_SSE2   = gmx_mul_pr(rinv_SSE2,rinv_SSE2);
+
+#ifdef CALC_COULOMB
+            /* Note that here we calculate force*r, not the usual force/r.
+             * This allows avoiding masking the reaction-field contribution,
+             * as frcoul is later multiplied by rinvsq which has been
+             * masked with the cut-off check.
+             */
+
+#ifdef EXCL_FORCES
+            /* Only add 1/r for non-excluded atom pairs */
+            rinv_ex_SSE0  = gmx_and_pr(rinv_SSE0,int_SSE0);
+            rinv_ex_SSE2  = gmx_and_pr(rinv_SSE2,int_SSE2);
+#else
+            /* No exclusion forces, we always need 1/r */
+#define     rinv_ex_SSE0    rinv_SSE0
+#define     rinv_ex_SSE2    rinv_SSE2
+#endif
+
+#ifdef CALC_COUL_RF
+            /* Electrostatic interactions */
+            frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE)));
+            frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE)));
+
+#ifdef CALC_ENERGIES
+            vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE)));
+            vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE)));
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+            /* We need to mask (or limit) rsq for the cut-off,
+             * as large distances can cause an overflow in gmx_pmecorrF/V.
+             */
+#ifndef CUTOFF_BLENDV
+            brsq_SSE0     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE0,wco_SSE0));
+            brsq_SSE2     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE2,wco_SSE2));
+#else
+            /* Strangely, putting mul on a separate line is slower (icc 13) */
+            brsq_SSE0     = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0)));
+            brsq_SSE2     = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2)));
+#endif
+            ewcorr_SSE0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0),beta_SSE);
+            ewcorr_SSE2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2),beta_SSE);
+            frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(ewcorr_SSE0,brsq_SSE0)));
+            frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(ewcorr_SSE2,brsq_SSE2)));
+
+#ifdef CALC_ENERGIES
+            vc_sub_SSE0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0),beta_SSE);
+            vc_sub_SSE2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2),beta_SSE);
+#endif
+
+#endif /* CALC_COUL_EWALD */
+
+#ifdef CALC_COUL_TAB
+            /* Electrostatic interactions */
+            r_SSE0        = gmx_mul_pr(rsq_SSE0,rinv_SSE0);
+            r_SSE2        = gmx_mul_pr(rsq_SSE2,rinv_SSE2);
+            /* Convert r to scaled table units */
+            rs_SSE0       = gmx_mul_pr(r_SSE0,invtsp_SSE);
+            rs_SSE2       = gmx_mul_pr(r_SSE2,invtsp_SSE);
+            /* Truncate scaled r to an int */
+            ti_SSE0       = gmx_cvttpr_epi32(rs_SSE0);
+            ti_SSE2       = gmx_cvttpr_epi32(rs_SSE2);
+#ifdef GMX_X86_SSE4_1
+            /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
+            rf_SSE0       = gmx_floor_pr(rs_SSE0);
+            rf_SSE2       = gmx_floor_pr(rs_SSE2);
+#else
+            rf_SSE0       = gmx_cvtepi32_pr(ti_SSE0);
+            rf_SSE2       = gmx_cvtepi32_pr(ti_SSE2);
+#endif
+            frac_SSE0     = gmx_sub_pr(rs_SSE0,rf_SSE0);
+            frac_SSE2     = gmx_sub_pr(rs_SSE2,rf_SSE2);
+
+            /* Load and interpolate table forces and possibly energies.
+             * Force and energy can be combined in one table, stride 4: FDV0
+             * or in two separate tables with stride 1: F and V
+             * Currently single precision uses FDV0, double F and V.
+             */
+#ifndef CALC_ENERGIES
+            load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0);
+            load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2);
+#else
+#ifdef TAB_FDV0
+            load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+            load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#else
+            load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
+            load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
+#endif
+#endif
+            fsub_SSE0     = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0));
+            fsub_SSE2     = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2));
+            frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0)));
+            frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2)));
+
+#ifdef CALC_ENERGIES
+            vc_sub_SSE0   = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0)));
+            vc_sub_SSE2   = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2)));
+#endif
+#endif /* CALC_COUL_TAB */
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+#ifndef NO_SHIFT_EWALD
+            /* Add Ewald potential shift to vc_sub for convenience */
+#ifdef CHECK_EXCLS
+            vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0));
+            vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2));
+#else
+            vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE);
+            vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE);
+#endif
+#endif
+            
+            vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0));
+            vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2));
+#endif
+
+#ifdef CALC_ENERGIES
+            /* Mask energy for cut-off and diagonal */
+            vcoul_SSE0    = gmx_and_pr(vcoul_SSE0,wco_SSE0);
+            vcoul_SSE2    = gmx_and_pr(vcoul_SSE2,wco_SSE2);
+#endif
+
+#endif /* CALC_COULOMB */
+
+#ifdef CALC_LJ
+            /* Lennard-Jones interaction */
+
+#ifdef VDW_CUTOFF_CHECK
+            wco_vdw_SSE0  = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE);
+#ifndef HALF_LJ
+            wco_vdw_SSE2  = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE);
+#endif
+#else
+            /* Same cut-off for Coulomb and VdW, reuse the registers */
+#define     wco_vdw_SSE0    wco_SSE0
+#define     wco_vdw_SSE2    wco_SSE2
+#endif
+
+#ifndef LJ_COMB_LB
+            rinvsix_SSE0  = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0));
+#ifdef EXCL_FORCES
+            rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+            rinvsix_SSE2  = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2));
+#ifdef EXCL_FORCES
+            rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+            rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+            rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+            FrLJ6_SSE0    = gmx_mul_pr(c6_SSE0,rinvsix_SSE0);
+#ifndef HALF_LJ
+            FrLJ6_SSE2    = gmx_mul_pr(c6_SSE2,rinvsix_SSE2);
+#endif
+            FrLJ12_SSE0   = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0));
+#ifndef HALF_LJ
+            FrLJ12_SSE2   = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2));
+#endif
+#endif /* not LJ_COMB_LB */
+
+#ifdef LJ_COMB_LB
+            sir_SSE0      = gmx_mul_pr(sig_SSE0,rinv_SSE0);
+#ifndef HALF_LJ
+            sir_SSE2      = gmx_mul_pr(sig_SSE2,rinv_SSE2);
+#endif
+            sir2_SSE0     = gmx_mul_pr(sir_SSE0,sir_SSE0);
+#ifndef HALF_LJ
+            sir2_SSE2     = gmx_mul_pr(sir_SSE2,sir_SSE2);
+#endif
+            sir6_SSE0     = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0));
+#ifdef EXCL_FORCES
+            sir6_SSE0     = gmx_and_pr(sir6_SSE0,int_SSE0);
+#endif
+#ifndef HALF_LJ
+            sir6_SSE2     = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2));
+#ifdef EXCL_FORCES
+            sir6_SSE2     = gmx_and_pr(sir6_SSE2,int_SSE2);
+#endif
+#endif
+#ifdef VDW_CUTOFF_CHECK
+            sir6_SSE0     = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+            sir6_SSE2     = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2);
+#endif
+#endif
+            FrLJ6_SSE0    = gmx_mul_pr(eps_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+            FrLJ6_SSE2    = gmx_mul_pr(eps_SSE2,sir6_SSE2);
+#endif
+            FrLJ12_SSE0   = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0);
+#ifndef HALF_LJ
+            FrLJ12_SSE2   = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2);
+#endif
+#if defined CALC_ENERGIES
+            /* We need C6 and C12 to calculate the LJ potential shift */
+            sig2_SSE0     = gmx_mul_pr(sig_SSE0,sig_SSE0);
+#ifndef HALF_LJ
+            sig2_SSE2     = gmx_mul_pr(sig_SSE2,sig_SSE2);
+#endif
+            sig6_SSE0     = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0));
+#ifndef HALF_LJ
+            sig6_SSE2     = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2));
+#endif
+            c6_SSE0       = gmx_mul_pr(eps_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+            c6_SSE2       = gmx_mul_pr(eps_SSE2,sig6_SSE2);
+#endif
+            c12_SSE0      = gmx_mul_pr(c6_SSE0,sig6_SSE0);
+#ifndef HALF_LJ
+            c12_SSE2      = gmx_mul_pr(c6_SSE2,sig6_SSE2);
+#endif
+#endif
+#endif /* LJ_COMB_LB */
+
+#endif /* CALC_LJ */
+            
+#ifdef CALC_ENERGIES
+#ifdef ENERGY_GROUPS
+            /* Extract the group pair index per j pair.
+             * Energy groups are stored per i-cluster, so things get
+             * complicated when the i- and j-cluster size don't match.
+             */
+            {
+                int egps_j;
+#if UNROLLJ == 2
+                egps_j    = nbat->energrp[cj>>1];
+                egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+#else
+                /* We assume UNROLLI <= UNROLLJ */
+                int jdi;
+                for(jdi=0; jdi<UNROLLJ/UNROLLI; jdi++)
+                {
+                    int jj;
+                    egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+                    for(jj=0; jj<(UNROLLI/2); jj++)
+                    {
+                        egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+                    }
+                }
+#endif
+            }
+#endif
+
+#ifdef CALC_COULOMB
+#ifndef ENERGY_GROUPS
+            vctotSSE      = gmx_add_pr(vctotSSE, gmx_add_pr(vcoul_SSE0,vcoul_SSE2));
+#else
+            add_ener_grp_halves(vcoul_SSE0,vctp[0],vctp[1],egp_jj);
+            add_ener_grp_halves(vcoul_SSE2,vctp[2],vctp[3],egp_jj);
+#endif
+#endif
+
+#ifdef CALC_LJ
+            /* Calculate the LJ energies */
+            VLJ6_SSE0     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE)));
+#ifndef HALF_LJ
+            VLJ6_SSE2     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE)));
+#endif
+            VLJ12_SSE0    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE)));
+#ifndef HALF_LJ
+            VLJ12_SSE2    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE)));
+#endif
+
+            VLJ_SSE0      = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0);
+#ifndef HALF_LJ
+            VLJ_SSE2      = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2);
+#endif
+            /* The potential shift should be removed for pairs beyond cut-off */
+            VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0);
+#ifndef HALF_LJ
+            VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2);
+#endif
+#ifdef CHECK_EXCLS
+            /* The potential shift should be removed for excluded pairs */
+            VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,int_SSE0);
+#ifndef HALF_LJ
+            VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,int_SSE2);
+#endif
+#endif
+#ifndef ENERGY_GROUPS
+            VvdwtotSSE    = gmx_add_pr(VvdwtotSSE,
+#ifndef HALF_LJ
+                                       gmx_add_pr(VLJ_SSE0,VLJ_SSE2)
+#else
+                                       VLJ_SSE0
+#endif
+                                      );
+#else
+            add_ener_grp_halves(VLJ_SSE0,vvdwtp[0],vvdwtp[1],egp_jj);
+#ifndef HALF_LJ
+            add_ener_grp_halves(VLJ_SSE2,vvdwtp[2],vvdwtp[3],egp_jj);
+#endif
+#endif
+#endif /* CALC_LJ */
+#endif /* CALC_ENERGIES */
+
+#ifdef CALC_LJ
+            fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,
+#ifdef CALC_COULOMB
+                                                   gmx_add_pr(frcoul_SSE0,
+#else
+                                                   (
+#endif
+                                                    gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0)));
+#else
+            fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0);
+#endif /* CALC_LJ */
+#if defined CALC_LJ && !defined HALF_LJ
+            fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,
+#ifdef CALC_COULOMB
+                                                   gmx_add_pr(frcoul_SSE2,
+#else
+                                                   (
+#endif
+                                                    gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2)));
+#else
+            /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
+            fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2);
+#endif
+
+            /* Calculate temporary vectorial force */
+            tx_SSE0       = gmx_mul_pr(fscal_SSE0,dx_SSE0);
+            tx_SSE2       = gmx_mul_pr(fscal_SSE2,dx_SSE2);
+            ty_SSE0       = gmx_mul_pr(fscal_SSE0,dy_SSE0);
+            ty_SSE2       = gmx_mul_pr(fscal_SSE2,dy_SSE2);
+            tz_SSE0       = gmx_mul_pr(fscal_SSE0,dz_SSE0);
+            tz_SSE2       = gmx_mul_pr(fscal_SSE2,dz_SSE2);
+
+            /* Increment i atom force */
+            fix_SSE0      = gmx_add_pr(fix_SSE0,tx_SSE0);
+            fix_SSE2      = gmx_add_pr(fix_SSE2,tx_SSE2);
+            fiy_SSE0      = gmx_add_pr(fiy_SSE0,ty_SSE0);
+            fiy_SSE2      = gmx_add_pr(fiy_SSE2,ty_SSE2);
+            fiz_SSE0      = gmx_add_pr(fiz_SSE0,tz_SSE0);
+            fiz_SSE2      = gmx_add_pr(fiz_SSE2,tz_SSE2);
+
+            /* Decrement j atom force */
+            gmx_store_hpr(f+ajx,
+                         gmx_sub_hpr( gmx_load_hpr(f+ajx), gmx_sum4_hpr(tx_SSE0,tx_SSE2) ));
+            gmx_store_hpr(f+ajy,
+                         gmx_sub_hpr( gmx_load_hpr(f+ajy), gmx_sum4_hpr(ty_SSE0,ty_SSE2) ));
+            gmx_store_hpr(f+ajz,
+                         gmx_sub_hpr( gmx_load_hpr(f+ajz), gmx_sum4_hpr(tz_SSE0,tz_SSE2) ));
+        }
+
+#undef  rinv_ex_SSE0
+#undef  rinv_ex_SSE2
+
+#undef  wco_vdw_SSE0
+#undef  wco_vdw_SSE2
+
+#undef  CUTOFF_BLENDV
+
+#undef  EXCL_FORCES
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
new file mode 100644 (file)
index 0000000..78242d7
--- /dev/null
@@ -0,0 +1,694 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* GMX_MM256_HERE should be set before including this file */
+#include "gmx_simd_macros.h"
+
+/* Horizontal sum of a 4-wide SIMD value that has been stored to memory */
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+/* 2x(N+N) layout: UNROLLI i-atoms are processed as two pairs, each pair
+ * occupying one half of a SIMD register of width 2*UNROLLJ. */
+#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
+
+#if defined GMX_MM256_HERE
+#define STRIDE     4
+#endif 
+
+#ifdef GMX_MM256_HERE
+#ifndef GMX_DOUBLE
+/* single precision 2x(4+4) kernel */
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#define TAB_FDV0
+#else
+#error "unsupported kernel configuration"
+#endif
+#endif
+
+/* Exclusion-field value of a cj entry that has no excluded pairs;
+ * the outer loop below uses it to skip the exclusion-checking inner loop. */
+#define SIMD_MASK_ALL   0xffffffff
+
+#include "nbnxn_kernel_simd_utils.h"
+
+/* All functionality defines are set here, except for:
+ * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
+ * CHECK_EXCLS, which is set just before including the inner loop contents.
+ * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
+ * set before calling the kernel function. We might want to move that
+ * to inside the n-loop and have a different combination rule for different
+ * ci's, as no combination rule gives a 50% performance hit for LJ.
+ */
+
+/* We always calculate shift forces, because it's cheap anyhow */
+#define CALC_SHIFTFORCES
+
+/* Assumes all LJ parameters are identical */
+/* #define FIX_LJ_C */
+
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
+
+#if defined LJ_COMB_GEOM
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
+#else
+#if defined LJ_COMB_LB
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
+#else
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
+#endif
+#endif
+
+#ifdef CALC_COUL_RF
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
+#endif
+#ifdef CALC_COUL_TAB
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
+#else
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
+#endif
+#endif
+/* Outer loop of the 2x(4+4) SIMD non-bonded kernel.  Iterates over the
+ * i-cluster list in nbl, loads i-atom coordinates/charges/LJ parameters
+ * into SIMD registers (two i-atoms per register, one per 128-bit lane),
+ * runs the included inner loop over j-clusters and reduces the
+ * accumulated i-forces (and, with CALC_ENERGIES, energies) back into the
+ * plain-real output arrays f, fshift, Vvdw and Vc.
+ */
+static void
+#ifndef CALC_ENERGIES
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,noener)
+#else
+#ifndef ENERGY_GROUPS
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener)
+#else
+NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
+#endif
+#endif
+#undef NBK_FUNC_NAME
+#undef NBK_FUNC_NAME_C
+#undef NBK_FUNC_NAME_C_LJC
+                            (const nbnxn_pairlist_t     *nbl,
+                             const nbnxn_atomdata_t     *nbat,
+                             const interaction_const_t  *ic,
+                             rvec                       *shift_vec, 
+                             real                       *f
+#ifdef CALC_SHIFTFORCES
+                             ,
+                             real                       *fshift
+#endif
+#ifdef CALC_ENERGIES
+                             ,
+                             real                       *Vvdw,
+                             real                       *Vc
+#endif
+                            )
+{
+    const nbnxn_ci_t   *nbln;
+    const nbnxn_cj_t   *l_cj;
+    const int          *type;
+    const real         *q;
+    const real         *shiftvec;
+    const real         *x;
+    const real         *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
+    real       facel;
+    real       *nbfp_ptr;
+    int        nbfp_stride;
+    int        n,ci,ci_sh;
+    int        ish,ish3;
+    gmx_bool   do_LJ,half_LJ,do_coul;
+    int        sci,scix,sciy,sciz,sci2;
+    int        cjind0,cjind1,cjind;
+    int        ip,jp;
+
+#ifdef ENERGY_GROUPS
+    /* Per-i-atom pointers into the energy-group output buffers */
+    int        Vstride_i;
+    int        egps_ishift,egps_imask;
+    int        egps_jshift,egps_jmask,egps_jstride;
+    int        egps_i;
+    real       *vvdwtp[UNROLLI];
+    real       *vctp[UNROLLI];
+#endif
+    
+    gmx_mm_pr  shX_SSE;
+    gmx_mm_pr  shY_SSE;
+    gmx_mm_pr  shZ_SSE;
+    gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
+    gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
+    gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
+    gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+    __m128     fix_SSE,fiy_SSE,fiz_SSE;
+#else
+    __m256d    fix_SSE,fiy_SSE,fiz_SSE;
+#endif
+#else
+    __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
+    __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
+#endif
+
+    /* AVX: use floating point masks, as there are no integer instructions */
+    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
+    gmx_mm_pr  mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
+
+    /* Diagonal masks for the j-cluster that coincides with the i-cluster;
+     * bit patterns match the 2x(4+4) packing of i-atom pairs (0,1) and
+     * (2,3).  NOTE(review): assumed correct for cluster size 4 -- confirm
+     * against the inner-loop diagonal/exclusion handling. */
+    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
+    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
+
+#ifdef GMX_X86_SSE4_1
+    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
+#endif
+
+    gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
+    gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
+    gmx_mm_pr  iq_SSE2=gmx_setzero_pr();
+    gmx_mm_pr  mrc_3_SSE;
+#ifdef CALC_ENERGIES
+    gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
+#endif
+
+#ifdef CALC_COUL_TAB
+    /* Coulomb table variables */
+    gmx_mm_pr  invtsp_SSE;
+    const real *tab_coul_F;
+#ifndef TAB_FDV0
+    const real *tab_coul_V;
+#endif
+#ifdef GMX_MM256_HERE
+    /* Over-sized scratch arrays; the aligned pointers ti0/ti2 into them
+     * are set up below, before the outer loop. */
+    int        ti0_array[2*UNROLLJ-1],*ti0;
+    int        ti2_array[2*UNROLLJ-1],*ti2;
+#endif
+#ifdef CALC_ENERGIES
+    gmx_mm_pr  mhalfsp_SSE;
+#endif
+#endif
+
+#ifdef CALC_COUL_EWALD
+    gmx_mm_pr beta2_SSE,beta_SSE;
+#endif
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+    gmx_mm_pr  sh_ewald_SSE;
+#endif
+
+#ifdef LJ_COMB_LB
+    const real *ljc;
+
+    gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
+    gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
+#else
+#ifdef FIX_LJ_C
+    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
+    real       *pvdw_c6,*pvdw_c12;
+    gmx_mm_pr  c6_SSE0,c12_SSE0;
+    gmx_mm_pr  c6_SSE2,c12_SSE2;
+#endif
+
+#ifdef LJ_COMB_GEOM
+    const real *ljc;
+
+    gmx_mm_pr  c6s_SSE0,c12s_SSE0;
+    gmx_mm_pr  c6s_SSE1,c12s_SSE1;
+    gmx_mm_pr  c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
+    gmx_mm_pr  c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
+#endif
+#endif /* LJ_COMB_LB */
+
+    gmx_mm_pr  vctotSSE,VvdwtotSSE;
+    gmx_mm_pr  sixthSSE,twelvethSSE;
+
+    gmx_mm_pr  avoid_sing_SSE;
+    gmx_mm_pr  rc2_SSE;
+#ifdef VDW_CUTOFF_CHECK
+    gmx_mm_pr  rcvdw2_SSE;
+#endif
+
+#ifdef CALC_ENERGIES
+    gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;
+
+    /* cppcheck-suppress unassignedVariable */
+    real       tmpsum_array[15],*tmpsum;
+#endif
+#ifdef CALC_SHIFTFORCES
+    /* cppcheck-suppress unassignedVariable */
+    real       shf_array[15],*shf;
+#endif
+
+    int ninner;
+
+#ifdef COUNT_PAIRS
+    int npair=0;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+    ljc = nbat->lj_comb;
+#else
+    /* No combination rule used */
+#ifndef GMX_DOUBLE
+    nbfp_ptr    = nbat->nbfp_s4;
+#define NBFP_STRIDE  4
+#else
+    nbfp_ptr    = nbat->nbfp;
+#define NBFP_STRIDE  2
+#endif
+    nbfp_stride = NBFP_STRIDE;
+#endif
+
+#ifdef CALC_COUL_TAB
+#ifdef GMX_MM256_HERE
+    /* Generate aligned table pointers */
+    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+#endif
+
+    invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
+#ifdef CALC_ENERGIES
+    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+#endif
+
+#ifdef TAB_FDV0
+    tab_coul_F = ic->tabq_coul_FDV0;
+#else
+    tab_coul_F = ic->tabq_coul_F;
+    tab_coul_V = ic->tabq_coul_V;
+#endif
+#endif /* CALC_COUL_TAB */
+
+#ifdef CALC_COUL_EWALD
+    beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+    beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);
+#endif
+
+#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+#endif
+
+    q                   = nbat->q;
+    type                = nbat->type;
+    facel               = ic->epsfac;
+    shiftvec            = shift_vec[0];
+    x                   = nbat->x;
+
+    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+
+    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+    rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+#ifdef VDW_CUTOFF_CHECK
+    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+#endif
+
+#ifdef CALC_ENERGIES
+    sixthSSE    = gmx_set1_pr(1.0/6.0);
+    twelvethSSE = gmx_set1_pr(1.0/12.0);
+
+    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
+    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+#endif
+
+    mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+
+#ifdef CALC_ENERGIES
+    hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+    
+    moh_rc_SSE = gmx_set1_pr(-ic->c_rf); 
+#endif
+
+    /* 32-byte align the reduction scratch buffers */
+#ifdef CALC_ENERGIES
+    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+#endif
+#ifdef CALC_SHIFTFORCES
+    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+#endif
+
+#ifdef FIX_LJ_C
+    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+
+    for(jp=0; jp<UNROLLJ; jp++)
+    {
+        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+
+        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+    }
+    c6_SSE0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+    c6_SSE1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+    c6_SSE2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+    c6_SSE3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+    c12_SSE0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+    c12_SSE1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+    c12_SSE2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+    c12_SSE3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+#endif /* FIX_LJ_C */
+
+#ifdef ENERGY_GROUPS
+    egps_ishift  = nbat->neg_2log;
+    egps_imask   = (1<<egps_ishift) - 1;
+    egps_jshift  = 2*nbat->neg_2log;
+    egps_jmask   = (1<<egps_jshift) - 1;
+    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+    /* Major division is over i-particle energy groups, determine the stride */
+    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+#endif
+
+    l_cj = nbl->cj;
+
+    ninner = 0;
+    for(n=0; n<nbl->nci; n++)
+    {
+        nbln = &nbl->ci[n];
+
+        ish              = (nbln->shift & NBNXN_CI_SHIFT);
+        ish3             = ish*3;
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
+        ci               = nbln->ci;
+        /* ci_sh is the i-cluster index only when there is no periodic
+         * shift; it is compared against cj to detect the diagonal. */
+        ci_sh            = (ish == CENTRAL ? ci : -1);
+
+        shX_SSE = gmx_load1_pr(shiftvec+ish3);
+        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
+        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+
+#if UNROLLJ <= 4
+        sci              = ci*STRIDE;
+        scix             = sci*DIM;
+        sci2             = sci*2;
+#else
+        sci              = (ci>>1)*STRIDE;
+        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
+        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
+        sci             += (ci & 1)*(STRIDE>>1);
+#endif
+
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
+        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
+
+#ifdef ENERGY_GROUPS
+        egps_i = nbat->energrp[ci];
+        {
+            int ia,egp_ia;
+
+            for(ia=0; ia<UNROLLI; ia++)
+            {
+                egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
+                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+                vctp[ia]   = Vc   + egp_ia*Vstride_i;
+            }
+        }
+#endif
+        /* When the first j-cluster in the list is the diagonal one,
+         * subtract the qi*qi Coulomb self-interaction once per i-atom;
+         * the form of Vc_sub_self depends on the electrostatics flavor. */
+#if defined CALC_ENERGIES
+#if UNROLLJ == 4
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+#endif
+#if UNROLLJ == 2
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
+#endif
+#if UNROLLJ == 8
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+#endif
+        {
+            int  ia;
+            real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+            Vc_sub_self = 0.5*ic->c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef TAB_FDV0
+            Vc_sub_self = 0.5*tab_coul_F[2];
+#else
+            Vc_sub_self = 0.5*tab_coul_V[0];
+#endif
+#endif
+#ifdef CALC_COUL_EWALD
+            /* beta/sqrt(pi) */
+            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
+#endif
+
+            for(ia=0; ia<UNROLLI; ia++)
+            {
+                real qi;
+
+                qi = q[sci+ia];
+#ifdef ENERGY_GROUPS
+                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+#else
+                Vc[0]
+#endif
+                    -= facel*qi*qi*Vc_sub_self;
+            }
+        }
+#endif
+
+        /* Broadcast x[0] into the lower 128-bit lane and x[1] into the
+         * upper lane: one 256-bit register then holds two i-atoms, each
+         * duplicated 4 times. */
+#define gmx_load2_hpr(x)  _mm256_insertf128_ps(gmx_load1_pr(x),gmx_load1_hpr(x+1),1)
+
+        /* Load i atom data */
+        sciy             = scix + STRIDE;
+        sciz             = sciy + STRIDE;
+        ix_SSE0          = gmx_add_pr(gmx_load2_hpr(x+scix)  ,shX_SSE);
+        ix_SSE2          = gmx_add_pr(gmx_load2_hpr(x+scix+2),shX_SSE);
+        iy_SSE0          = gmx_add_pr(gmx_load2_hpr(x+sciy)  ,shY_SSE);
+        iy_SSE2          = gmx_add_pr(gmx_load2_hpr(x+sciy+2),shY_SSE);
+        iz_SSE0          = gmx_add_pr(gmx_load2_hpr(x+sciz)  ,shZ_SSE);
+        iz_SSE2          = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
+
+        if (do_coul)
+        {
+            gmx_mm_pr facel_SSE;
+
+            facel_SSE    = gmx_set1_pr(facel);
+
+            iq_SSE0      = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci));
+            iq_SSE2      = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci+2));
+        }
+
+#ifdef LJ_COMB_LB
+        hsig_i_SSE0      = gmx_load2_hpr(ljc+sci2+0);
+        hsig_i_SSE2      = gmx_load2_hpr(ljc+sci2+2);
+        seps_i_SSE0      = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+        seps_i_SSE2      = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+#else
+#ifdef LJ_COMB_GEOM
+        c6s_SSE0         = gmx_load2_hpr(ljc+sci2+0);
+        if (!half_LJ)
+        {
+            c6s_SSE2     = gmx_load2_hpr(ljc+sci2+2);
+        }
+        c12s_SSE0        = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+        if (!half_LJ)
+        {
+            c12s_SSE2    = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+        }
+#else
+        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
+        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+        if (!half_LJ)
+        {
+            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+        }
+#endif
+#endif
+
+        /* Zero the potential energy for this list */
+        VvdwtotSSE       = gmx_setzero_pr();
+        vctotSSE         = gmx_setzero_pr();
+
+        /* Clear i atom forces */
+        fix_SSE0           = gmx_setzero_pr();
+        fix_SSE2           = gmx_setzero_pr();
+        fiy_SSE0           = gmx_setzero_pr();
+        fiy_SSE2           = gmx_setzero_pr();
+        fiz_SSE0           = gmx_setzero_pr();
+        fiz_SSE2           = gmx_setzero_pr();
+
+        cjind = cjind0;
+
+        /* Currently all kernels use (at least half) LJ */
+#define CALC_LJ
+        /* For each flavor: first run the inner loop with exclusion
+         * checking for the leading j-clusters that have excluded pairs
+         * (excl != SIMD_MASK_ALL), then the cheaper variant without
+         * exclusion checking for the remainder. */
+        if (half_LJ)
+        {
+#define CALC_COULOMB
+#define HALF_LJ
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+            }
+#undef HALF_LJ
+#undef CALC_COULOMB
+        }
+        else if (do_coul)
+        {
+#define CALC_COULOMB
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+            }
+#undef CALC_COULOMB
+        }
+        else
+        {
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_simd_2xnn_inner.h"
+            }
+        }
+#undef CALC_LJ
+        ninner += cjind1 - cjind0;
+
+        /* Add accumulated i-forces to the force array */
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+#define gmx_load_ps4  _mm_load_ps
+#define gmx_store_ps4 _mm_store_ps
+#define gmx_add_ps4   _mm_add_ps
+#else
+#define gmx_load_ps4  _mm256_load_pd
+#define gmx_store_ps4 _mm256_store_pd
+#define gmx_add_ps4   _mm256_add_pd
+#endif
+        GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0,fix_SSE2,fix_SSE);
+        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+
+        GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0,fiy_SSE2,fiy_SSE);
+        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+
+        GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0,fiz_SSE2,fiz_SSE);
+        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+
+#ifdef CALC_SHIFTFORCES
+        gmx_store_ps4(shf,fix_SSE);
+        fshift[ish3+0] += SUM_SIMD4(shf);
+        gmx_store_ps4(shf,fiy_SSE);
+        fshift[ish3+1] += SUM_SIMD4(shf);
+        gmx_store_ps4(shf,fiz_SSE);
+        fshift[ish3+2] += SUM_SIMD4(shf);
+#endif
+#else
+        /* NOTE(review): this branch references fix_SSE1/fiy_SSE1/fiz_SSE1
+         * and the *_SSE3 variants, which are not declared in this file;
+         * it is dead code as long as UNROLLJ >= 4 here, but it will not
+         * compile if a configuration with UNROLLJ < 4 is ever enabled. */
+        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
+        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
+        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+
+        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
+        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
+        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+
+        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
+        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
+        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+
+#ifdef CALC_SHIFTFORCES
+        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
+        fshift[ish3+0] += shf[0] + shf[1];
+        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
+        fshift[ish3+1] += shf[0] + shf[1];
+        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
+        fshift[ish3+2] += shf[0] + shf[1];
+#endif
+#endif
+               
+#ifdef CALC_ENERGIES
+        /* Reduce the SIMD energy accumulators to scalars via the aligned
+         * tmpsum buffer */
+        if (do_coul)
+        {
+            gmx_store_pr(tmpsum,vctotSSE);
+            *Vc += SUM_SIMD(tmpsum);
+        }
+               
+        gmx_store_pr(tmpsum,VvdwtotSSE);
+        *Vvdw += SUM_SIMD(tmpsum);
+#endif
+               
+               /* Outer loop uses 6 flops/iteration */
+       }
+
+#ifdef COUNT_PAIRS
+    printf("atom pairs %d\n",npair);
+#endif
+}
+
+#undef gmx_load2_hpr
+
+/* gmx_load_ps4/gmx_store_ps4/gmx_add_ps4 are defined in the i-force
+ * reduction inside the kernel above; undefine all three so that the next
+ * kernel flavor including this file can redefine them cleanly.
+ * Fix: the original undefined gmx_store_ps4 twice and left gmx_add_ps4
+ * defined. */
+#undef gmx_load_ps4
+#undef gmx_store_ps4
+#undef gmx_add_ps4
+
+#undef CALC_SHIFTFORCES
+
+#undef UNROLLI
+#undef UNROLLJ
+#undef STRIDE
+#undef TAB_FDV0
+#undef NBFP_STRIDE
similarity index 84%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c
index eec30d3da2cc3295070c89e961a342167fd47a07..470d27dcf7f96b6412979f1ecbbd3fd7f7dc4319 100644 (file)
 #include "../nbnxn_consts.h"
 #include "nbnxn_kernel_common.h"
 
-#ifdef GMX_X86_AVX_256
+#ifdef GMX_NBNXN_SIMD_4XN
 
-#include "nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernel_simd_4xn.h"
 
-/* Include all flavors of the 256-bit AVX kernel loops */
+/* Include all flavors of the SSE or AVX 4xN kernel loops */
 
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
 #define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
 
 /* Analytical reaction-field kernels */
 #define CALC_COUL_RF
 
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 
 #undef CALC_COUL_RF
 
 #define CALC_COUL_TAB
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_TAB
 #define CALC_COUL_EWALD
 
 /* Single cut-off: rcoulomb = rvdw */
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 
 /* Twin cut-off: rcoulomb >= rvdw */
 #define VDW_CUTOFF_CHECK
-#include "nbnxn_kernel_x86_simd_includes.h"
+#include "nbnxn_kernel_simd_4xn_includes.h"
 #undef VDW_CUTOFF_CHECK
 
 #undef CALC_COUL_EWALD
@@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 
 enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_ener
 static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_energrp
 static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
   { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 #undef NBK_FN
 
-#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_noener
 static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
 { { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
   { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
@@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log,
                                   const real *VSvdw,const real *VSc,
                                   real *Vvdw,real *Vc)
 {
+    const int simd_width   = GMX_SIMD_WIDTH_HERE;
+    const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
     int ng_p2,i,j,j0,j1,c,s;
 
-#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
-#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
-
     ng_p2 = (1<<ng_2log);
 
     /* The size of the x86 SIMD energy group buffer array is:
-     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
+     * ng*ng*ng_p2*unrollj_half*simd_width
      */
     for(i=0; i<ng; i++)
     {
@@ -163,34 +170,34 @@ static void reduce_group_energies(int ng,int ng_2log,
         {
             for(j0=0; j0<ng; j0++)
             {
-                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
-                for(s=0; s<SIMD_WIDTH_HALF; s++)
+                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width;
+                for(s=0; s<unrollj_half; s++)
                 {
                     Vvdw[i*ng+j0] += VSvdw[c+0];
                     Vvdw[i*ng+j1] += VSvdw[c+1];
                     Vc  [i*ng+j0] += VSc  [c+0];
                     Vc  [i*ng+j1] += VSc  [c+1];
-                    c += SIMD_WIDTH + 2;
+                    c += simd_width + 2;
                 }
             }
         }
     }
 }
 
-#endif /* GMX_X86_AVX_256 */
+#endif /* GMX_NBNXN_SIMD_4XN */
 
 void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec, 
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw)
-#ifdef GMX_X86_AVX_256
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t       *nbl_list,
+                      const nbnxn_atomdata_t     *nbat,
+                      const interaction_const_t  *ic,
+                      int                        ewald_excl,
+                      rvec                       *shift_vec, 
+                      int                        force_flags,
+                      int                        clearF,
+                      real                       *fshift,
+                      real                       *Vc,
+                      real                       *Vvdw)
+#ifdef GMX_NBNXN_SIMD_4XN
 {
     int              nnbl;
     nbnxn_pairlist_t **nbl;
@@ -320,6 +327,6 @@ nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
 }
 #else
 {
-    gmx_incons("nbnxn_kernel_x86_simd256 called while GROMACS was configured without AVX enabled");
+    gmx_incons("nbnxn_kernel_simd_4xn called while GROMACS was configured without 4xN SIMD kernels enabled");
 }
 #endif
similarity index 71%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.h
index 9a8e53c9022ae7221cfc27c829896ff81f835623..a9d4d19802adc75155db56133959ecdaf9a74056 100644 (file)
@@ -35,8 +35,8 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-#ifndef _nbnxn_kernel_x86_simd256_h
-#define _nbnxn_kernel_x86_simd256_h
+#ifndef _nbnxn_kernel_simd_4xn_h
+#define _nbnxn_kernel_simd_4xn_h
 
 #include "typedefs.h"
 
 extern "C" {
 #endif
 
-/* Wrapper call for the non-bonded cluster vs cluster kernels */
+/* Wrapper call for the non-bonded cluster vs cluster kernels.
+ * These kernels determine 4xN cluster interactions for SIMD width N.
+ */
 void
-nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
-                         const nbnxn_atomdata_t     *nbat,
-                         const interaction_const_t  *ic,
-                         int                        ewald_excl,
-                         rvec                       *shift_vec,
-                         int                        force_flags,
-                         int                        clearF,
-                         real                       *fshift,
-                         real                       *Vc,
-                         real                       *Vvdw);
+nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t       *nbl_list,
+                      const nbnxn_atomdata_t     *nbat,
+                      const interaction_const_t  *ic,
+                      int                        ewald_excl,
+                      rvec                       *shift_vec,
+                      int                        force_flags,
+                      int                        clearF,
+                      real                       *fshift,
+                      real                       *Vc,
+                      real                       *Vvdw);
 
 #ifdef __cplusplus
 }
similarity index 86%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_includes.h
index 15c52fb097c1105719cca0f32b01d323c6acc5a7..07da218f247e9987683ef2d43557ee7dd28210da 100644 (file)
 /* Include the force+energy kernels */
 #define CALC_ENERGIES
 #define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_GEOM
 #define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef CALC_ENERGIES
 
 /* Include the force+energygroups kernels */
 #define CALC_ENERGIES
 #define ENERGY_GROUPS
 #define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_GEOM
 #define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef ENERGY_GROUPS
 #undef CALC_ENERGIES
 
 /* Include the force only kernels */
 #define LJ_COMB_GEOM
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_GEOM
 #define LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
 #undef LJ_COMB_LB
-#include "nbnxn_kernel_x86_simd_outer.h"
+#include "nbnxn_kernel_simd_4xn_outer.h"
similarity index 99%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
index f1a0b9ccc82996fa94c6b750b637b6181d6f1e49..1676f1f43dc38c85901a37d8744d2afe45bdfee7 100644 (file)
@@ -35,8 +35,9 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/* This is the innermost loop contents for the n vs n atom
- * SSE2 single precision kernels.
+/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
+ * This flavor of the kernel calculates interactions of 4 i-atoms
+ * with N j-atoms stored in N wide SIMD registers.
  */
 
 
             gmx_mm_pr  r_SSE1,rs_SSE1,rf_SSE1,frac_SSE1;
             gmx_mm_pr  r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
             gmx_mm_pr  r_SSE3,rs_SSE3,rf_SSE3,frac_SSE3;
-            /* Table index: rs converted to an int */ 
+            /* Table index: rs truncated to an int */
 #if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
             gmx_epi32  ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
 #else
             jxSSE         = gmx_load_pr(x+ajx);
             jySSE         = gmx_load_pr(x+ajy);
             jzSSE         = gmx_load_pr(x+ajz);
-            
+
             /* Calculate distance */
             dx_SSE0       = gmx_sub_pr(ix_SSE0,jxSSE);
             dy_SSE0       = gmx_sub_pr(iy_SSE0,jySSE);
             dx_SSE3       = gmx_sub_pr(ix_SSE3,jxSSE);
             dy_SSE3       = gmx_sub_pr(iy_SSE3,jySSE);
             dz_SSE3       = gmx_sub_pr(iz_SSE3,jzSSE);
-            
+
             /* rsq = dx*dx+dy*dy+dz*dz */
             rsq_SSE0      = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
             rsq_SSE1      = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1);
similarity index 91%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
index 0644f9776cd49ada8300b9f5c3f93c7df1732b01..1545d40380c8d48fcc21b16b96477343ef881dfe 100644 (file)
  */
 
 /* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
 
 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
 
 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ    GMX_X86_SIMD_WIDTH_HERE
+#define UNROLLJ    GMX_SIMD_WIDTH_HERE
 
 #if defined GMX_MM128_HERE || defined GMX_DOUBLE
 #define STRIDE     4
 
 #ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
 #define SUM_SIMD(x) SUM_SIMD4(x)
 #define TAB_FDV0
 #else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
 #define SUM_SIMD(x) (x[0]+x[1])
 #endif
 #endif
 
 #ifdef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
 #define TAB_FDV0
 #else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
 #define SUM_SIMD(x) SUM_SIMD4(x)
 #endif
 #endif
 
 #define SIMD_MASK_ALL   0xffffffff
 
-#include "nbnxn_kernel_x86_simd_utils.h"
+#include "nbnxn_kernel_simd_utils.h"
 
 /* All functionality defines are set here, except for:
  * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
 /* Assumes all LJ parameters are identical */
 /* #define FIX_LJ_C */
 
-#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e
+/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
+ * with all combinations of electrostatics (coul), LJ combination rules (ljc)
+ * and energy calculations (ene), depending on the defines set.
+ */
+
+#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
 
 #if defined LJ_COMB_GEOM
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
 #else
 #if defined LJ_COMB_LB
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
 #else
-#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
+#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
 #endif
 #endif
 
 #ifdef CALC_COUL_RF
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
 #endif
 #ifdef CALC_COUL_TAB
 #ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
 #else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
 #endif
 #endif
 #ifdef CALC_COUL_EWALD
 #ifndef VDW_CUTOFF_CHECK
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
 #else
-#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e)
-#endif
-#endif
-
-#ifdef GMX_MM128_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
+#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
 #endif
-#ifdef GMX_MM256_HERE
-#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
 #endif
 
 static void
 #ifndef CALC_ENERGIES
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,noener)
 #else
 #ifndef ENERGY_GROUPS
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,ener)
 #else
-NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
+NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 #endif
 #endif
 #undef NBK_FUNC_NAME
@@ -169,7 +167,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
     int        nbfp_stride;
     int        n,ci,ci_sh;
     int        ish,ish3;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        sci,scix,sciy,sciz,sci2;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
@@ -205,7 +203,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
     __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
 #endif
 
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
     __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
     __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
@@ -218,7 +216,8 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
     __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
     __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
 #endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
     /* AVX: use floating point masks, as there are no integer instructions */
 #ifndef GMX_DOUBLE
     gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
@@ -232,7 +231,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #endif
 #endif
 
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
     __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
     __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
@@ -248,7 +247,8 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
     __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
     __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 #endif
-#else /* GMX_MM256_HERE */
+#endif
+#ifdef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
     gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
     gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
@@ -266,7 +266,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #endif
 #endif
 
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
     __m128i    zeroi_SSE = _mm_setzero_si128();
 #endif
 #ifdef GMX_X86_SSE4_1
@@ -477,7 +477,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
     egps_jshift  = 2*nbat->neg_2log;
     egps_jmask   = (1<<egps_jshift) - 1;
     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
-    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    /* Major division is over i-particle energy groups, determine the stride */
     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 #endif
 
@@ -490,9 +490,8 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 
         ish              = (nbln->shift & NBNXN_CI_SHIFT);
         ish3             = ish*3;
-        cjind0           = nbln->cj_ind_start;      
-        cjind1           = nbln->cj_ind_end;    
-        /* Currently only works super-cells equal to sub-cells */
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
@@ -511,8 +510,15 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
         sci             += (ci & 1)*(STRIDE>>1);
 #endif
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef ENERGY_GROUPS
         egps_i = nbat->energrp[ci];
@@ -571,7 +577,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
         }
 #endif
 
-               /* Load i atom data */
+        /* Load i atom data */
         sciy             = scix + STRIDE;
         sciz             = sciy + STRIDE;
         ix_SSE0          = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
@@ -587,8 +593,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
         iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
         iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
             iq_SSE0      = gmx_set1_pr(facel*q[sci]);
             iq_SSE1      = gmx_set1_pr(facel*q[sci+1]);
@@ -661,13 +666,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #define CHECK_EXCLS
             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
                 cjind++;
             }
 #undef CHECK_EXCLS
             for(; (cjind<cjind1); cjind++)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
             }
 #undef HALF_LJ
 #undef CALC_COULOMB
@@ -678,13 +683,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #define CHECK_EXCLS
             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
                 cjind++;
             }
 #undef CHECK_EXCLS
             for(; (cjind<cjind1); cjind++)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
             }
 #undef CALC_COULOMB
         }
@@ -693,13 +698,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #define CHECK_EXCLS
             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
                 cjind++;
             }
 #undef CHECK_EXCLS
             for(; (cjind<cjind1); cjind++)
             {
-#include "nbnxn_kernel_x86_simd_inner.h"
+#include "nbnxn_kernel_simd_4xn_inner.h"
             }
         }
 #undef CALC_LJ
similarity index 90%
rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
index 0010a4fbe2f741f85bc612bf76a3b5e3075e66f9..45ab2aedcc9205f3d7d86d41cb3fbef186d09ef2 100644 (file)
     i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
     o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
 }
+#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE)                  \
+{                                                                       \
+    i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps());                \
+    i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps());                \
+    i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
+    i_SSE2 = _mm256_permute_ps(i_SSE0,0xB1); /* 10110001 binary */       \
+    o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+}
 #else
 #define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
 {                                                                       \
@@ -228,6 +236,23 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
     GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
 }
 
+#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE)                \
+{                                                                       \
+    __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2];                     \
+    int p;                                                              \
+                                                                        \
+    for(p=0; p<2*UNROLLJ; p++)                                            \
+    {                                                                   \
+        /* Here we load 4 aligned floats, but we need just 2 */         \
+        clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);          \
+    }                                                                   \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+                                                                        \
+    GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE);                     \
+    GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
+}
+
 #endif
 
 #if defined GMX_MM128_HERE && defined GMX_DOUBLE
@@ -474,7 +499,7 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
 /* Add energy register to possibly multiple terms in the energy array.
  * This function is the same for SSE/AVX single/double.
  */
-static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
+static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,const int *offset_jj)
 {
     int jj;
 
@@ -486,9 +511,39 @@ static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
     {
         gmx_mm_pr v_SSE;
 
-        v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*UNROLLJ);
-        gmx_store_pr(v+offset_jj[jj]+jj*UNROLLJ,gmx_add_pr(v_SSE,e_SSE));
+        v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+        gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE,gmx_add_pr(v_SSE,e_SSE));
     }
 }
 
+#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8
+/* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
+ * a single SIMD register.
+ */
+static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
+                                       real *v0,real *v1,const int *offset_jj)
+{
+    gmx_mm_hpr e_SSE0,e_SSE1;
+    int jj;
+
+    e_SSE0 = _mm256_extractf128_ps(e_SSE,0);
+    e_SSE1 = _mm256_extractf128_ps(e_SSE,1);
+
+    for(jj=0; jj<(UNROLLJ/2); jj++)
+    {
+        gmx_mm_hpr v_SSE;
+
+        v_SSE = gmx_load_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+        gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE0));
+    }
+    for(jj=0; jj<(UNROLLJ/2); jj++)
+    {
+        gmx_mm_hpr v_SSE;
+
+        v_SSE = gmx_load_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+        gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE1));
+    }
+}
+#endif
+
 #endif /* _nbnxn_kernel_sse_utils_h_ */
index f356973898af2f48b8d0cdacac50b0d71ddb2a31..dd6b2c31df62d58d00fa3327c7dd63a9a12d71fa 100644 (file)
 
 #ifndef GMX_DOUBLE
 #define NBNXN_SEARCH_SSE_SINGLE
-#include "gmx_x86_simd_single.h"
-#else
-#include "gmx_x86_simd_double.h"
 #endif
 
+/* Include basic SSE2 stuff */
+#include <emmintrin.h>
+
 #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
 #define NBNXN_8BB_SSE
 #endif
@@ -94,6 +94,9 @@
 #define STRIDE_8BB        4
 #define STRIDE_8BB_2LOG   2
 
+#endif /* NBNXN_SEARCH_SSE */
+
+#ifdef GMX_NBNXN_SIMD
 
 /* The functions below are macros as they are performance sensitive */
 
 #define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
 
 /* The j-cluster size is matched to the SIMD width */
-#ifndef GMX_DOUBLE
-/* 128 bits can hold 4 floats */
-#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
-#define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
-#define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
-/* 256 bits can hold 8 floats */
-#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
-#define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
-#define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
 #else
-/* 128 bits can hold 2 doubles */
-#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
-#define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
-#define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
-/* 256 bits can hold 4 doubles */
-#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
-#define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
-#define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
+#endif
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#ifdef GMX_DOUBLE
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
+#else
+#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
+#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
+#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
+/* Half SIMD with j-cluster size */
+#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
+#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
+#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
+#endif
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
 #endif
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* GMX_NBNXN_SIMD */
 
 
 /* Interaction masks for 4xN atom interactions.
@@ -253,12 +266,12 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 {
     switch (nb_kernel_type)
     {
-    case nbk4x4_PlainC:
-    case nbk4xN_X86_SIMD128:
-    case nbk4xN_X86_SIMD256:
+    case nbnxnk4x4_PlainC:
+    case nbnxnk4xN_SIMD_4xN:
+    case nbnxnk4xN_SIMD_2xNN:
         return NBNXN_CPU_CLUSTER_I_SIZE;
-    case nbk8x8x8_CUDA:
-    case nbk8x8x8_PlainC:
+    case nbnxnk8x8x8_CUDA:
+    case nbnxnk8x8x8_PlainC:
         /* The cluster size for super/sub lists is only set here.
          * Any value should work for the pair-search and atomdata code.
          * The kernels, of course, might require a particular value.
@@ -273,24 +286,33 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 
 int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 {
+    int nbnxn_simd_width=0;
+    int cj_size=0;
+
+#ifdef GMX_NBNXN_SIMD
+    nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+#endif
+
     switch (nb_kernel_type)
     {
-    case nbk4x4_PlainC:
-        return NBNXN_CPU_CLUSTER_I_SIZE;
-    case nbk4xN_X86_SIMD128:
-        /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
-        return 16/sizeof(real);
-    case nbk4xN_X86_SIMD256:
-        /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
-        return 32/sizeof(real);
-    case nbk8x8x8_CUDA:
-    case nbk8x8x8_PlainC:
-        return nbnxn_kernel_to_ci_size(nb_kernel_type);
+    case nbnxnk4x4_PlainC:
+        cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
+        break;
+    case nbnxnk4xN_SIMD_4xN:
+        cj_size = nbnxn_simd_width;
+        break;
+    case nbnxnk4xN_SIMD_2xNN:
+        cj_size = nbnxn_simd_width/2;
+        break;
+    case nbnxnk8x8x8_CUDA:
+    case nbnxnk8x8x8_PlainC:
+        cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
+        break;
     default:
         gmx_incons("unknown kernel type");
     }
 
-    return 0;
+    return cj_size;
 }
 
 static int ci_to_cj(int na_cj_2log,int ci)
@@ -307,20 +329,20 @@ static int ci_to_cj(int na_cj_2log,int ci)
 
 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 {
-    if (nb_kernel_type == nbkNotSet)
+    if (nb_kernel_type == nbnxnkNotSet)
     {
         gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
     }
 
     switch (nb_kernel_type)
     {
-    case nbk8x8x8_CUDA:
-    case nbk8x8x8_PlainC:
+    case nbnxnk8x8x8_CUDA:
+    case nbnxnk8x8x8_PlainC:
         return FALSE;
 
-    case nbk4x4_PlainC:
-    case nbk4xN_X86_SIMD128:
-    case nbk4xN_X86_SIMD256:
+    case nbnxnk4x4_PlainC:
+    case nbnxnk4xN_SIMD_4xN:
+    case nbnxnk4xN_SIMD_2xNN:
         return TRUE;
 
     default:
@@ -2360,18 +2382,16 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 
     snew(nbl->work,1);
 #ifdef NBNXN_BBXXXX
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
 #else
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
-#endif
-    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
-#ifdef NBNXN_SEARCH_SSE
-    snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
-#ifdef GMX_X86_AVX_256
-    snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
 #endif
+    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+#ifdef GMX_NBNXN_SIMD
+    snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
+    snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
 #endif
-    snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
+    snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
 }
 
 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
@@ -2626,7 +2646,6 @@ static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 }
 
-#ifdef NBNXN_SEARCH_SSE
 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
 static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
 {
@@ -2639,7 +2658,6 @@ static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
 #endif
 }
 
-#ifdef GMX_X86_AVX_256
 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
 static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
 {
@@ -2647,12 +2665,23 @@ static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
              NBNXN_INT_MASK_ALL));
-#else              /* cj-size = 2 */
+#else              /* cj-size = 4 */
     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 #endif
 }
+
+#ifdef GMX_NBNXN_SIMD
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define get_imask_x86_simd_4xn  get_imask_x86_simd128
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define get_imask_x86_simd_4xn  get_imask_x86_simd256
+#define get_imask_x86_simd_2xnn get_imask_x86_simd128
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
 #endif
-#endif /* NBNXN_SEARCH_SSE */
 
 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
  * Checks bounding box distances and possibly atom pair distances.
@@ -2773,23 +2802,11 @@ static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
     }
 }
 
-#ifdef NBNXN_SEARCH_SSE
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM128_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S  PACK_X4
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM128_HERE
-#ifdef GMX_X86_AVX_256
-/* Include make_cluster_list_x86_simd128/256 */
-#define GMX_MM256_HERE
-#include "gmx_x86_simd_macros.h"
-#define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
-#include "nbnxn_search_x86_simd.h"
-#undef STRIDE_S
-#undef GMX_MM256_HERE
+#ifdef GMX_NBNXN_SIMD_4XN
+#include "nbnxn_search_simd_4xn.h"
 #endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+#include "nbnxn_search_simd_2xnn.h"
 #endif
 
 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
@@ -3345,13 +3362,17 @@ static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
     {
         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
 
-        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+        /* The counts below are used for non-bonded pair/flop counts
+         * and should therefore match the available kernel setups.
+         */
+        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
         {
-            nbl->work->ncj_hlj += jlen;
+            nbl->work->ncj_noq += jlen;
         }
-        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
         {
-            nbl->work->ncj_noq += jlen;
+            nbl->work->ncj_hlj += jlen;
         }
 
         nbl->nci++;
@@ -4495,7 +4516,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 
                                     switch (nb_kernel_type)
                                     {
-                                    case nbk4x4_PlainC:
+                                    case nbnxnk4x4_PlainC:
                                         check_subcell_list_space_simple(nbl,cl-cf+1);
 
                                         make_cluster_list_simple(gridj,
@@ -4505,30 +4526,30 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                                                                  rl2,rbb2,
                                                                  &ndistc);
                                         break;
-#ifdef NBNXN_SEARCH_SSE
-                                    case nbk4xN_X86_SIMD128:
+#ifdef GMX_NBNXN_SIMD_4XN
+                                    case nbnxnk4xN_SIMD_4xN:
                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
-                                        make_cluster_list_x86_simd128(gridj,
-                                                                      nbl,ci,cf,cl,
-                                                                      (gridi == gridj && shift == CENTRAL),
-                                                                      nbat->x,
-                                                                      rl2,rbb2,
-                                                                      &ndistc);
+                                        make_cluster_list_simd_4xn(gridj,
+                                                                   nbl,ci,cf,cl,
+                                                                   (gridi == gridj && shift == CENTRAL),
+                                                                   nbat->x,
+                                                                   rl2,rbb2,
+                                                                   &ndistc);
                                         break;
-#ifdef GMX_X86_AVX_256
-                                    case nbk4xN_X86_SIMD256:
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+                                    case nbnxnk4xN_SIMD_2xNN:
                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
-                                        make_cluster_list_x86_simd256(gridj,
-                                                                      nbl,ci,cf,cl,
-                                                                      (gridi == gridj && shift == CENTRAL),
-                                                                      nbat->x,
-                                                                      rl2,rbb2,
-                                                                      &ndistc);
+                                        make_cluster_list_simd_2xnn(gridj,
+                                                                   nbl,ci,cf,cl,
+                                                                   (gridi == gridj && shift == CENTRAL),
+                                                                   nbat->x,
+                                                                   rl2,rbb2,
+                                                                   &ndistc);
                                         break;
 #endif
-#endif
-                                    case nbk8x8x8_PlainC:
-                                    case nbk8x8x8_CUDA:
+                                    case nbnxnk8x8x8_PlainC:
+                                    case nbnxnk8x8x8_CUDA:
                                         check_subcell_list_space_supersub(nbl,cl-cf+1);
                                         for(cj=cf; cj<=cl; cj++)
                                         {
@@ -4728,15 +4749,15 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs,
     {
         switch (nb_kernel_type)
         {
-#ifdef NBNXN_SEARCH_SSE
-        case nbk4xN_X86_SIMD128:
-            nbs->icell_set_x = icell_set_x_x86_simd128;
-            break;
-#ifdef GMX_X86_AVX_256
-        case nbk4xN_X86_SIMD256:
-            nbs->icell_set_x = icell_set_x_x86_simd256;
+#ifdef GMX_NBNXN_SIMD_4XN
+        case nbnxnk4xN_SIMD_4xN:
+            nbs->icell_set_x = icell_set_x_simd_4xn;
             break;
 #endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+        case nbnxnk4xN_SIMD_2xNN:
+            nbs->icell_set_x = icell_set_x_simd_2xnn;
+            break;
 #endif
         default:
             nbs->icell_set_x = icell_set_x_simple;
diff --git a/src/mdlib/nbnxn_search_simd_2xnn.h b/src/mdlib/nbnxn_search_simd_2xnn.h
new file mode 100644 (file)
index 0000000..04dd501
--- /dev/null
@@ -0,0 +1,262 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
+#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#else
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+#endif
+#include "gmx_simd_macros.h"
+
+#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S  (GMX_SIMD_WIDTH_HERE/2)
+#else
+#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
+{
+    gmx_mm_hpr a_SSE;
+
+    a_SSE = _mm_load_ps(a);
+
+    return gmx_2hpr_to_pr(a_SSE,a_SSE);
+}
+
+static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a,real shift)
+{
+    gmx_mm_hpr a0,a1;
+
+    a0 = _mm_set1_ps(a[0] + shift);
+    a1 = _mm_set1_ps(a[1] + shift);
+
+    return gmx_2hpr_to_pr(a1,a0);
+}
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_2xnn(int ci,
+                      real shx,real shy,real shz,
+                      int na_c,
+                      int stride,const real *x,
+                      nbnxn_list_work_t *work)
+{
+    int  ia;
+    nbnxn_x_ci_simd_2xnn_t *x_ci;
+
+    x_ci = work->x_ci_simd_2xnn;
+
+    ia = X_IND_CI_SIMD_2XNN(ci);
+
+    x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
+    x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
+    x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
+    x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
+    x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
+    x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
+}
+
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
+ * for coordinates in packed format.
+ * Checks bounding box distances and possibly atom pair distances.
+ * This is an accelerated version of make_cluster_list_simple.
+ */
+static gmx_inline void
+make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
+                            nbnxn_pairlist_t *nbl,
+                            int ci,int cjf,int cjl,
+                            gmx_bool remove_sub_diag,
+                            const real *x_j,
+                            real rl2,float rbb2,
+                            int *ndistc)
+{
+    const nbnxn_x_ci_simd_2xnn_t *work;
+    const float *bb_ci;
+
+    gmx_mm_pr  jx_SSE,jy_SSE,jz_SSE;
+
+    gmx_mm_pr  dx_SSE0,dy_SSE0,dz_SSE0;
+    gmx_mm_pr  dx_SSE2,dy_SSE2,dz_SSE2;
+
+    gmx_mm_pr  rsq_SSE0;
+    gmx_mm_pr  rsq_SSE2;
+
+    gmx_mm_pr  wco_SSE0;
+    gmx_mm_pr  wco_SSE2;
+    gmx_mm_pr  wco_any_SSE;
+
+    gmx_mm_pr  rc2_SSE;
+
+    gmx_bool   InRange;
+    float      d2;
+    int        xind_f,xind_l,cj;
+
+    cjf = CI_TO_CJ_SIMD_2XNN(cjf);
+    cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
+
+    work = nbl->work->x_ci_simd_2xnn;
+
+    bb_ci = nbl->work->bb_ci;
+
+    rc2_SSE   = gmx_set1_pr(rl2);
+
+    InRange = FALSE;
+    while (!InRange && cjf <= cjl)
+    {
+        d2 = subc_bb_dist2_sse(4,0,bb_ci,cjf,gridj->bbj);
+        *ndistc += 2;
+
+        /* Check if the distance is within the distance where
+         * we use only the bounding box distance rbb,
+         * or within the cut-off and there is at least one atom pair
+         * within the cut-off.
+         */
+        if (d2 < rbb2)
+        {
+            InRange = TRUE;
+        }
+        else if (d2 < rl2)
+        {
+            xind_f  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
+
+            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
+            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
+            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
+
+            /* Calculate distance */
+            dx_SSE0            = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+            dy_SSE0            = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+            dz_SSE0            = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+            dx_SSE2            = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+            dy_SSE2            = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+            dz_SSE2            = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+            /* rsq = dx*dx+dy*dy+dz*dz */
+            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+            wco_any_SSE        = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+            InRange            = gmx_movemask_pr(wco_any_SSE);
+
+            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+        }
+        if (!InRange)
+        {
+            cjf++;
+        }
+    }
+    if (!InRange)
+    {
+        return;
+    }
+
+    InRange = FALSE;
+    while (!InRange && cjl > cjf)
+    {
+        d2 = subc_bb_dist2_sse(4,0,bb_ci,cjl,gridj->bbj);
+        *ndistc += 2;
+        
+        /* Check if the distance is within the distance where
+         * we use only the bounding box distance rbb,
+         * or within the cut-off and there is at least one atom pair
+         * within the cut-off.
+         */
+        if (d2 < rbb2)
+        {
+            InRange = TRUE;
+        }
+        else if (d2 < rl2)
+        {
+            xind_l  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
+
+            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
+            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
+            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
+
+            /* Calculate distance */
+            dx_SSE0            = gmx_sub_pr(work->ix_SSE0,jx_SSE);
+            dy_SSE0            = gmx_sub_pr(work->iy_SSE0,jy_SSE);
+            dz_SSE0            = gmx_sub_pr(work->iz_SSE0,jz_SSE);
+            dx_SSE2            = gmx_sub_pr(work->ix_SSE2,jx_SSE);
+            dy_SSE2            = gmx_sub_pr(work->iy_SSE2,jy_SSE);
+            dz_SSE2            = gmx_sub_pr(work->iz_SSE2,jz_SSE);
+
+            /* rsq = dx*dx+dy*dy+dz*dz */
+            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
+            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
+
+            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
+            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
+
+            wco_any_SSE        = gmx_or_pr(wco_SSE0,wco_SSE2);
+
+            InRange            = gmx_movemask_pr(wco_any_SSE);
+
+            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
+        }
+        if (!InRange)
+        {
+            cjl--;
+        }
+    }
+
+    if (cjf <= cjl)
+    {
+        for(cj=cjf; cj<=cjl; cj++)
+        {
+            /* Store cj and the interaction mask */
+            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
+            nbl->cj[nbl->ncj].excl = get_imask_x86_simd_2xnn(remove_sub_diag,ci,cj);
+            nbl->ncj++;
+        }
+        /* Increase the closing index in i super-cell list */
+        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
+    }
+}
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
similarity index 76%
rename from src/mdlib/nbnxn_search_x86_simd.h
rename to src/mdlib/nbnxn_search_simd_4xn.h
index a6af973d8a929d9f2c4ad33a8823a5994b623351..60742fb7196a16f90df3601808f4827995dbd96c 100644 (file)
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file.
- * gmx_sse_or_avh.h should be included before including this file.
- */
-
-/* Copies PBC shifted i-cell packed atom coordinates to working array */
-#ifdef GMX_MM128_HERE
-static void icell_set_x_x86_simd128
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+#define GMX_MM128_HERE
 #else
-#ifdef GMX_MM256_HERE
-static void icell_set_x_x86_simd256
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
 #else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 #endif
 #endif
-                                   (int ci,
-                                    real shx,real shy,real shz,
-                                    int na_c,
-                                    int stride,const real *x,
-                                    nbnxn_list_work_t *work)
-{
-    int  ia;
-#ifdef GMX_MM128_HERE
-    nbnxn_x_ci_x86_simd128_t *x_ci;
-
-    x_ci = work->x_ci_x86_simd128;
+#include "gmx_simd_macros.h"
 
-    ia = X_IND_CI_S128(ci);
+#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S  (GMX_SIMD_WIDTH_HERE)
 #else
-    nbnxn_x_ci_x86_simd256_t *x_ci;
+#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
+#endif
+
+/* Copies PBC shifted i-cell packed atom coordinates to working array */
+static gmx_inline void
+icell_set_x_simd_4xn(int ci,
+                     real shx,real shy,real shz,
+                     int na_c,
+                     int stride,const real *x,
+                     nbnxn_list_work_t *work)
+{
+    int  ia;
+    nbnxn_x_ci_simd_4xn_t *x_ci;
 
-    x_ci = work->x_ci_x86_simd256;
+    x_ci = work->x_ci_simd_4xn;
 
-    ia = X_IND_CI_S256(ci);
-#endif
+    ia = X_IND_CI_SIMD_4XN(ci);
 
     x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S    ] + shx);
     x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S    ] + shy);
@@ -85,34 +82,21 @@ static void icell_set_x_x86_simd256
     x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
 }
 
-/* SSE or AVX code for making a pair list of cell ci vs cell cjf-cjl
+/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
  * for coordinates in packed format.
  * Checks bouding box distances and possibly atom pair distances.
  * This is an accelerated version of make_cluster_list_simple.
  */
-#ifdef GMX_MM128_HERE
-static void make_cluster_list_x86_simd128
-#else
-#ifdef GMX_MM256_HERE
-static void make_cluster_list_x86_simd256
-#else
-"error: GMX_MM128_HERE or GMX_MM256_HERE not defined"
-#endif
-#endif
-                                         (const nbnxn_grid_t *gridj,
-                                          nbnxn_pairlist_t *nbl,
-                                          int ci,int cjf,int cjl,
-                                          gmx_bool remove_sub_diag,
-                                          const real *x_j,
-                                          real rl2,float rbb2,
-                                          int *ndistc)
+static gmx_inline void
+make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
+                           nbnxn_pairlist_t *nbl,
+                           int ci,int cjf,int cjl,
+                           gmx_bool remove_sub_diag,
+                           const real *x_j,
+                           real rl2,float rbb2,
+                           int *ndistc)
 {
-#ifdef GMX_MM128_HERE
-    const nbnxn_x_ci_x86_simd128_t *work;
-#else
-    const nbnxn_x_ci_x86_simd256_t *work;
-#endif
-
+    const nbnxn_x_ci_simd_4xn_t *work;
     const float *bb_ci;
 
     gmx_mm_pr  jx_SSE,jy_SSE,jz_SSE;
@@ -139,17 +123,10 @@ static void make_cluster_list_x86_simd256
     float      d2;
     int        xind_f,xind_l,cj;
 
-#ifdef GMX_MM128_HERE
-    cjf = CI_TO_CJ_S128(cjf);
-    cjl = CI_TO_CJ_S128(cjl+1) - 1;
-
-    work = nbl->work->x_ci_x86_simd128;
-#else
-    cjf = CI_TO_CJ_S256(cjf);
-    cjl = CI_TO_CJ_S256(cjl+1) - 1;
+    cjf = CI_TO_CJ_SIMD_4XN(cjf);
+    cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
 
-    work = nbl->work->x_ci_x86_simd256;
-#endif
+    work = nbl->work->x_ci_simd_4xn;
 
     bb_ci = nbl->work->bb_ci;
 
@@ -172,11 +149,8 @@ static void make_cluster_list_x86_simd256
         }
         else if (d2 < rl2)
         {
-#ifdef GMX_MM128_HERE
-            xind_f  = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjf);
-#else
-            xind_f  = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjf);
-#endif
+            xind_f  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
+
             jx_SSE  = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
             jy_SSE  = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
             jz_SSE  = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
@@ -213,7 +187,7 @@ static void make_cluster_list_x86_simd256
             
             InRange            = gmx_movemask_pr(wco_any_SSE);
 
-            *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
         }
         if (!InRange)
         {
@@ -242,11 +216,8 @@ static void make_cluster_list_x86_simd256
         }
         else if (d2 < rl2)
         {
-#ifdef GMX_MM128_HERE
-            xind_l  = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjl);
-#else
-            xind_l  = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjl);
-#endif
+            xind_l  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
+
             jx_SSE  = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
             jy_SSE  = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
             jz_SSE  = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
@@ -282,7 +253,7 @@ static void make_cluster_list_x86_simd256
             
             InRange            = gmx_movemask_pr(wco_any_SSE);
 
-            *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
         }
         if (!InRange)
         {
@@ -295,16 +266,15 @@ static void make_cluster_list_x86_simd256
         for(cj=cjf; cj<=cjl; cj++)
         {
             /* Store cj and the interaction mask */
-#ifdef GMX_MM128_HERE
-            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_S128(gridj->cell0) + cj;
-            nbl->cj[nbl->ncj].excl = get_imask_x86_simd128(remove_sub_diag,ci,cj);
-#else
-            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_S256(gridj->cell0) + cj;
-            nbl->cj[nbl->ncj].excl = get_imask_x86_simd256(remove_sub_diag,ci,cj);
-#endif
+            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
+            nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag,ci,cj);
             nbl->ncj++;
         }
         /* Increase the closing index in i super-cell list */
         nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
     }
 }
+
+#undef STRIDE_S
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
index b57d35cf4d74830a0c86247c1e53415d6d7f370b..eb9f636140116ec2adad8531460842b5057c0013 100644 (file)
@@ -93,8 +93,8 @@
 #include "nbnxn_atomdata.h"
 #include "nbnxn_search.h"
 #include "nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
-#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
 #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 
 #ifdef GMX_LIB_MPI
@@ -620,13 +620,13 @@ static void do_nb_verlet(t_forcerec *fr,
         gmx_incons("Invalid cut-off scheme passed!");
     }
 
-    if (nbvg->kernel_type != nbk8x8x8_CUDA)
+    if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
     {
         wallcycle_sub_start(wcycle, ewcsNONBONDED);
     }
     switch (nbvg->kernel_type)
     {
-        case nbk4x4_PlainC:
+        case nbnxnk4x4_PlainC:
             nbnxn_kernel_ref(&nbvg->nbl_lists,
                              nbvg->nbat, ic,
                              fr->shift_vec,
@@ -639,38 +639,38 @@ static void do_nb_verlet(t_forcerec *fr,
                              enerd->grpp.ener[egLJSR]);
             break;
         
-        case nbk4xN_X86_SIMD128:
-            nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
-                                     nbvg->nbat, ic,
-                                     nbvg->ewald_excl,
-                                     fr->shift_vec,
-                                     flags,
-                                     clearF,
-                                     fr->fshift[0],
-                                     enerd->grpp.ener[egCOULSR],
-                                     fr->bBHAM ?
-                                     enerd->grpp.ener[egBHAMSR] :
-                                     enerd->grpp.ener[egLJSR]);
+        case nbnxnk4xN_SIMD_4xN:
+            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+                                  nbvg->nbat, ic,
+                                  nbvg->ewald_excl,
+                                  fr->shift_vec,
+                                  flags,
+                                  clearF,
+                                  fr->fshift[0],
+                                  enerd->grpp.ener[egCOULSR],
+                                  fr->bBHAM ?
+                                  enerd->grpp.ener[egBHAMSR] :
+                                  enerd->grpp.ener[egLJSR]);
             break;
-        case nbk4xN_X86_SIMD256:
-            nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
-                                     nbvg->nbat, ic,
-                                     nbvg->ewald_excl,
-                                     fr->shift_vec,
-                                     flags,
-                                     clearF,
-                                     fr->fshift[0],
-                                     enerd->grpp.ener[egCOULSR],
-                                     fr->bBHAM ?
-                                     enerd->grpp.ener[egBHAMSR] :
-                                     enerd->grpp.ener[egLJSR]);
+        case nbnxnk4xN_SIMD_2xNN:
+            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+                                   nbvg->nbat, ic,
+                                   nbvg->ewald_excl,
+                                   fr->shift_vec,
+                                   flags,
+                                   clearF,
+                                   fr->fshift[0],
+                                   enerd->grpp.ener[egCOULSR],
+                                   fr->bBHAM ?
+                                   enerd->grpp.ener[egBHAMSR] :
+                                   enerd->grpp.ener[egLJSR]);
             break;
 
-        case nbk8x8x8_CUDA:
+        case nbnxnk8x8x8_CUDA:
             nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
             break;
 
-        case nbk8x8x8_PlainC:
+        case nbnxnk8x8x8_PlainC:
             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                  nbvg->nbat, ic,
                                  fr->shift_vec,
@@ -688,7 +688,7 @@ static void do_nb_verlet(t_forcerec *fr,
             gmx_incons("Invalid nonbonded kernel type passed!");
 
     }
-    if (nbvg->kernel_type != nbk8x8x8_CUDA)
+    if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
     {
         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
     }
@@ -785,7 +785,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
     bDoForces     = (flags & GMX_FORCE_FORCES);
     bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
     bUseGPU       = fr->nbv->bUseGPU;
-    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
+    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
 
     if (bStateChanged)
     {
@@ -1000,7 +1000,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
 
             wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
 
-            if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
+            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
             {
                 /* initialize non-local pair-list on the GPU */
                 nbnxn_cuda_init_pairlist(nbv->cu_nbv,
index 202a1a5b2ccd2a73cbb37a1e8c8aba00f0103567..fafcf008da510e49ab692fc1bff758bcf64b4779 100644 (file)
@@ -1085,7 +1085,7 @@ t_forcetable make_tables(FILE *out,const output_env_t oenv,
    * numbers per nx+1 data points. For performance reasons we want
    * the table data to be aligned to 16-byte.
    */
-  snew_aligned(table.data, 12*(nx+1)*sizeof(real),16);
+  snew_aligned(table.data, 12*(nx+1)*sizeof(real),32);
 
   for(k=0; (k<etiNR); k++) {
     if (tabsel[k] != etabUSER) {
@@ -1203,7 +1203,7 @@ t_forcetable make_gb_table(FILE *out,const output_env_t oenv,
         * to do this :-)
         */
        
-       snew_aligned(table.data,4*nx,16);
+       snew_aligned(table.data,4*nx,32);
        
        init_table(out,nx,nx0,table.scale,&(td[0]),!bReadTab);
        
@@ -1363,7 +1363,7 @@ t_forcetable make_atf_table(FILE *out,const output_env_t oenv,
         * to do this :-)
         */
        
-    snew_aligned(table.data,4*nx,16);
+    snew_aligned(table.data,4*nx,32);
 
        copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,1.0,table.data);
        
index 3f290d6dceae9129b6a5c383d8cdba0c0da80ead..51ba5123b50aa561d843d9901b47730aa5fefcc2 100644 (file)
@@ -97,20 +97,20 @@ int gmx_g_angle(int argc,char *argv[])
 {
   static const char *desc[] = {
     "[TT]g_angle[tt] computes the angle distribution for a number of angles",
-    "or dihedrals. This way you can check whether your simulation",
-    "is correct. With option [TT]-ov[tt] you can plot the average angle of",
-    "a group of angles as a function of time. With the [TT]-all[tt] option",
-    "the first graph is the average, the rest are the individual angles.[PAR]",
+    "or dihedrals.[PAR]",
+    "With option [TT]-ov[tt], you can plot the average angle of",
+    "a group of angles as a function of time. With the [TT]-all[tt] option,",
+    "the first graph is the average and the rest are the individual angles.[PAR]",
     "With the [TT]-of[tt] option, [TT]g_angle[tt] also calculates the fraction of trans",
     "dihedrals (only for dihedrals) as function of time, but this is",
-    "probably only fun for a selected few.[PAR]",
-    "With option [TT]-oc[tt] a dihedral correlation function is calculated.[PAR]",
-    "It should be noted that the index file should contain",
-    "atom-triples for angles or atom-quadruplets for dihedrals.",
+    "probably only fun for a select few.[PAR]",
+    "With option [TT]-oc[tt], a dihedral correlation function is calculated.[PAR]",
+    "It should be noted that the index file must contain",
+    "atom triplets for angles or atom quadruplets for dihedrals.",
     "If this is not the case, the program will crash.[PAR]",
-    "With option [TT]-or[tt] a trajectory file is dumped containing cos and",
-    "sin of selected dihedral angles which subsequently can be used as",
-    "input for a PCA analysis using [TT]g_covar[tt].[PAR]",
+    "With option [TT]-or[tt], a trajectory file is dumped containing cos and",
+    "sin of selected dihedral angles, which subsequently can be used as",
+    "input for a principal components analysis using [TT]g_covar[tt].[PAR]",
     "Option [TT]-ot[tt] plots when transitions occur between",
     "dihedral rotamers of multiplicity 3 and [TT]-oh[tt]",
     "records a histogram of the times between such transitions,",
index 574a85bc5679bf477c3387dc34604c6ba6a4b03d..4995d9ca263b9ecd377dc017423386b6afb8ff41 100644 (file)
@@ -1567,15 +1567,14 @@ static void check_input(
 /* Returns TRUE when "opt" is needed at launch time */
 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
 {
-    /* Apart from the input .tpr we need all options that were set
+    /* Apart from the input .tpr and the error log we need all options that were set
      * on the command line and that do not start with -b */
-    if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2))
+    if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2) || 0 == strncmp(opt,"-err", 4))
+    {
         return FALSE;
+    }
 
-    if (bSet)
-        return TRUE;
-    else
-        return FALSE;
+    return bSet;
 }