Merge release-4-6 into master
authorRoland Schulz <roland@utk.edu>
Tue, 15 Jan 2013 21:00:15 +0000 (16:00 -0500)
committerRoland Schulz <roland@utk.edu>
Tue, 15 Jan 2013 22:50:22 +0000 (17:50 -0500)
On top of resolving conflicts:
- Removed any newly added GMX.*EXPORT
  (no visibility yet in master)
- Adapted gmxTestCXX11.cmake to the new approach of setting flags
  without caching (see the sketch below)
- Removed deleted g_dih from symlinking and legacymodules.cpp
- Fixed gmx_cpuid.c (also part of I0ad9ca77b)
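
The non-caching flag approach referenced above amounts roughly to the
following (a minimal sketch, not the literal gmxTestCXX11.cmake contents):

    # Old approach: test results were force-written into the cache once:
    #   set(CMAKE_CXX_FLAGS "${CXX11_FLAG} ${CMAKE_CXX_FLAGS}" CACHE STRING "..." FORCE)
    # New approach: the test reports into a plain variable ...
    gmx_test_cxx11(GMX_CXX11 GMX_CXX11_FLAGS)
    # ... which is appended once, at the end of configuration, unless
    # the user opts out:
    if(NOT GMX_SKIP_DEFAULT_CFLAGS)
        set(CMAKE_CXX_FLAGS "${GMX_CXX11_FLAGS} ${CMAKE_CXX_FLAGS}")
    endif()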

Changes not applied:
- Template building (c22f43f2932c0332c939cfc44)
- libmd rename (3f231be160c46)

Conflicts:
CMakeLists.txt (mostly easy - moved comment about Windows and
    BUILD_SHARED_LIBS, and changes related to gmxTestCXX11)
cmake/gmxCFlags.cmake
include/nbsearch.h (only Export ignored)
include/trajana.h (only Export ignored)
share/template/CMakeLists.txt (changes not applied)
src/gmxlib/CMakeLists.txt (changes not applied)
src/gmxlib/cuda_tools/copyrite_gpu.cu (moved to src/gromacs)
src/gmxlib/gpu_utils/CMakeLists.txt (only exports - ignored)
src/gromacs/legacyheaders/edsam.h
src/gromacs/legacyheaders/gmx_ana.h
src/gromacs/legacyheaders/gmx_cpuid.h
src/gromacs/libgromacs.pc.cmakein
src/gromacs/mdlib/edsam.c
src/gromacs/mdlib/forcerec.c
src/gromacs/selection/compiler.cpp
src/kernel/CMakeLists.txt
src/mdlib/CMakeLists.txt (ignored export & rename)
src/mdlib/libmd.pc.cmakein (only rename - ignored)
src/mdlib/nbnxn_cuda/CMakeLists.txt (only Export ignored)
src/tools/CMakeLists.txt
src/tools/gmx_dih.c
src/tools/gmx_sans.c

Change-Id: I28541d3c871feb7261c685793b36045e1806014d

215 files changed:
CMakeLists.txt
admin/programs.txt
cmake/FindFFTW.cmake
cmake/Platform/BlueGeneL-static-XL-C.cmake [moved from cmake/Toolchain-BlueGeneL-xlc.cmake with 100% similarity]
cmake/Platform/BlueGeneP-static-XL-C.cmake [moved from cmake/Toolchain-BlueGeneP.cmake with 100% similarity]
cmake/Platform/BlueGeneQ-base.cmake [new file with mode: 0644]
cmake/Platform/BlueGeneQ-static-XL-C.cmake [new file with mode: 0644]
cmake/Platform/BlueGeneQ-static-XL-CXX.cmake [new file with mode: 0644]
cmake/Platform/BlueGeneQ-static.cmake [new file with mode: 0644]
cmake/TestAVXMaskload.c [new file with mode: 0644]
cmake/TestQPX.c [new file with mode: 0644]
cmake/gmxCFlags.cmake
cmake/gmxManageBlueGene.cmake [new file with mode: 0644]
cmake/gmxManageGPU.cmake
cmake/gmxManageMPI.cmake
cmake/gmxTestAVXMaskload.cmake [new file with mode: 0644]
cmake/gmxTestCXX11.cmake
scripts/GMXRC.cmakein
share/html/online.html
share/html/online/g_dih.html [deleted file]
share/html/online/gro.html
share/html/online/mdp_opt.html
share/top/links.dat
src/config.h.cmakein
src/gromacs/CMakeLists.txt
src/gromacs/gmxlib/atomprop.c
src/gromacs/gmxlib/checkpoint.c
src/gromacs/gmxlib/copyrite.c
src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu [new file with mode: 0644]
src/gromacs/gmxlib/filenm.c
src/gromacs/gmxlib/gmx_cpuid.c
src/gromacs/gmxlib/gmx_detect_hardware.c
src/gromacs/gmxlib/gmx_omp_nthreads.c
src/gromacs/gmxlib/ifunc.c
src/gromacs/gmxlib/mshift.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
src/gromacs/gmxlib/sfactor.c
src/gromacs/gmxlib/thread_mpi/collective.c
src/gromacs/gmxlib/thread_mpi/once.c
src/gromacs/gmxlib/thread_mpi/tmpi_init.c
src/gromacs/gmxpreprocess/readir.c
src/gromacs/legacyheaders/edsam.h
src/gromacs/legacyheaders/gmx_ana.h
src/gromacs/legacyheaders/gmx_cpuid.h
src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_double.h
src/gromacs/legacyheaders/gmx_simd_macros.h
src/gromacs/legacyheaders/gmx_x86_avx_128_fma.h
src/gromacs/legacyheaders/gmx_x86_avx_256.h
src/gromacs/legacyheaders/main.h
src/gromacs/legacyheaders/mshift.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
src/gromacs/legacyheaders/types/filenm.h
src/gromacs/legacyheaders/types/graph.h
src/gromacs/legacyheaders/types/nb_verlet.h
src/gromacs/legacyheaders/types/nbnxn_pairlist.h
src/gromacs/legacyheaders/types/state.h
src/gromacs/legacyheaders/vec.h
src/gromacs/libgromacs.pc.cmakein
src/gromacs/mdlib/constr.c
src/gromacs/mdlib/coupling.c
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/edsam.c
src/gromacs/mdlib/fft5d.h
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/gmx_wallcycle.c
src/gromacs/mdlib/init.c
src/gromacs/mdlib/nbnxn_atomdata.c
src/gromacs/mdlib/nbnxn_consts.h
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
src/gromacs/mdlib/nbnxn_internal.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/sim_util.c
src/gromacs/mdlib/tables.c
src/programs/gmx/CreateLinks.cmake.cmakein
src/programs/gmx/legacymodules.cpp
src/programs/gmxcheck/gmxcheck.c
src/programs/grompp/convparm.c
src/programs/mdrun/md.c
src/programs/mdrun/mdrun.c
src/programs/mdrun/repl_ex.c
src/programs/mdrun/runner.c
src/programs/pdb2gmx/hizzie.c
src/programs/pdb2gmx/pdb2gmx.c
src/tools/CMakeLists.txt
src/tools/gmx_angle.c
src/tools/gmx_dih.c [deleted file]
src/tools/gmx_genpr.c
src/tools/gmx_helix.c
src/tools/gmx_make_edi.c
src/tools/gmx_rmsf.c
src/tools/gmx_tune_pme.c

index ae3d23362f1309761ac092ded1cc76a34cf556bd..c81554dda3d8c8b79af41ff4acaea356381bd1d0 100644 (file)
@@ -2,10 +2,9 @@ cmake_minimum_required(VERSION 2.8)
 # Keep CMake suitably quiet on Cygwin
 set(CMAKE_LEGACY_CYGWIN_WIN32 0) # Remove when CMake >= 2.8.4 is required
 
-# override bugs on OS X where Cmake picks gcc (GNU) for C instead of system default cc (Clang).
-if(APPLE)
-    set(CMAKE_C_COMPILER_INIT "cc")
-endif(APPLE)
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+# This needs to be set before project() in order to pick up toolchain files
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Platform)
 
 project(Gromacs)
 include(Dart)
@@ -52,9 +51,6 @@ endif()
 # provide backward compatibility of software written against the Gromacs API.
 set(API_VERSION ${NUM_VERSION})
 
-# Cmake modules/macros are in a subdirectory to keep this file cleaner
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
     set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
 endif()
@@ -109,47 +105,10 @@ if(CMAKE_HOST_UNIX)
 endif()
 
 ########################################################################
-set(CMAKE_PREFIX_PATH "" CACHE STRING "Extra locations to search for external libraries and tools (give directory without lib, bin, or include)")
-# Fix stupid flags on Windows
-########################################################################
-SET(SHARED_LIBS_DEFAULT ON) 
-IF( WIN32 AND NOT CYGWIN)
-  option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
-  mark_as_advanced(GMX_PREFER_STATIC_LIBS)
-  SET(SHARED_LIBS_DEFAULT OFF)  #is currently not working on Windows
-  # This makes windows.h not declare min/max as macros that would break
-  # C++ code using std::min/std::max.
-  add_definitions(-DNOMINMAX)
-
-  IF (GMX_PREFER_STATIC_LIBS)
-    #Only setting Debug and Release flags. Others configurations current not used.
-    STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
-    SET(CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE} CACHE STRING "" FORCE)
-    STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
-    STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-    SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
-    STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-    SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
-  ENDIF()
-
-  #Workaround for cmake bug 13174. Replace deprecated options.
-  IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
-    STRING(REPLACE /GZ /RTC1 CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    SET(CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG} CACHE STRING "" FORCE)
-  ENDIF()
-  IF( CMAKE_CXX_COMPILER_ID MATCHES "Intel" )
-    STRING(REPLACE /GZ /RTC1 CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-    STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-    SET(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} CACHE STRING "" FORCE)
-
-    STRING(REPLACE /GX /EHsc CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-    SET(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} CACHE STRING "" FORCE)
-  ENDIF()
-ENDIF()
-
+option(GMX_COOL_QUOTES "Enable Gromacs cool quotes" ON)
+mark_as_advanced(GMX_COOL_QUOTES)
 
-########################################################################
+set(CMAKE_PREFIX_PATH "" CACHE STRING "Extra locations to search for external libraries and tools (give directory without lib, bin, or include)")
 # User input options                                                   #
 ########################################################################
 option(GMX_DOUBLE "Use double precision (much slower, use only if you really need it)" OFF)
@@ -157,8 +116,6 @@ option(GMX_MPI    "Build a parallel (message-passing) version of GROMACS" OFF)
 option(GMX_THREAD_MPI  "Build a thread-MPI-based multithreaded version of GROMACS (not compatible with MPI)" ON)
 option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
 mark_as_advanced(GMX_SOFTWARE_INVSQRT)
-option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" OFF)
-mark_as_advanced(GMX_POWERPC_INVSQRT)
 option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
 mark_as_advanced(GMX_FAHCORE)
 
@@ -172,14 +129,18 @@ mark_as_advanced(GMX_OPENMM)
 include(gmxDetectAcceleration)
 if(NOT DEFINED GMX_CPU_ACCELERATION)
     if(CMAKE_CROSSCOMPILING)
-        set(GMX_SUGGESTED_CPU_ACCELERATION "None")
+        if("${CMAKE_SYSTEM_NAME}" MATCHES "BlueGeneQ")
+            set(GMX_SUGGESTED_CPU_ACCELERATION "IBM_QPX")
+        else()
+            set(GMX_SUGGESTED_CPU_ACCELERATION "None")
+        endif()
     else(CMAKE_CROSSCOMPILING)
         gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
     endif(CMAKE_CROSSCOMPILING)
 endif(NOT DEFINED GMX_CPU_ACCELERATION)
 
 set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
-    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
+    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
 
 set(GMX_FFT_LIBRARY "fftw3" 
     CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
@@ -216,6 +177,9 @@ endif()
 option(GMX_CYCLE_SUBCOUNTERS "Enable cycle subcounters to get a more detailed cycle timings" OFF)
 mark_as_advanced(GMX_CYCLE_SUBCOUNTERS)
 
+option(GMX_SKIP_DEFAULT_CFLAGS "Don't automatically add suggested/required compiler flags." OFF)
+mark_as_advanced(GMX_SKIP_DEFAULT_CFLAGS)
+
 ######################################################################
 # Compiler tests
 # These need to be done early (before further tests).
@@ -229,6 +193,16 @@ mark_as_advanced(GMX_CYCLE_SUBCOUNTERS)
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
 
+# First exclude compilers known not to work with OpenMP although they claim to support it:
+# gcc 4.2.1 and gcc-llvm 4.2.1 (also claims to be 4.2.1) on Mac OS X
+# This fixes redmine 900 and needs to run before OpenMP flags are set below.
+message("CMAKE_COMPILER_IS_GNUCC: ${CMAKE_COMPILER_IS_GNUCC}")
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND
+    CMAKE_COMPILER_IS_GNUCC AND C_COMPILER_VERSION VERSION_LESS 4.3)
+    message(STATUS "OpenMP multithreading not supported with gcc/llvm-gcc 4.2 on Mac OS X, disabled")
+    set(GMX_OPENMP OFF CACHE BOOL
+        "OpenMP multithreading not not supported with gcc/llvm-gcc 4.2 on Mac OS X, disabled!" FORCE)
+endif()
+
 # OpenMP check must come before other CFLAGS!
 if(GMX_OPENMP)
     find_package(OpenMP)
@@ -328,14 +303,6 @@ endif(GMX_DOUBLE)
 if(GMX_SOFTWARE_INVSQRT)
   set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
 endif(GMX_SOFTWARE_INVSQRT)
-if(GMX_POWERPC_INVSQRT)
-  set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
-endif(GMX_POWERPC_INVSQRT)
-
-########################################################################
-#Process MPI settings
-########################################################################
-include(gmxManageMPI)
 
 #######################################################################
 # Check for options incompatible with OpenMM build                    #
@@ -445,27 +412,96 @@ include(TestBigEndian)
 test_big_endian(GMX_INTEGER_BIG_ENDIAN)
 
 
+if(APPLE OR CYGWIN OR ${CMAKE_SYSTEM_NAME} MATCHES "Linux|.*BSD")
+    # Maybe Solaris should be here? Patch this if you know!
+    SET(SHARED_LIBS_DEFAULT ON)
+elseif(WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "BlueGene")
+    # Support for shared libs on native Windows is a bit new. Its
+    # default might change later if/when we sort things out. Also,
+    # Cray should go here. What variable value can detect it?
+    SET(SHARED_LIBS_DEFAULT OFF)
+else()
+    message(STATUS "Defaulting to building static libraries")
+    SET(SHARED_LIBS_DEFAULT OFF)
+endif()
+
+# Management of GROMACS options for specific toolchains should go
+# here. Because the initial settings for some of the main options have
+# already happened, but things like library detection and MPI compiler
+# feature detection have not, the docstrings for any over-rides of
+# GROMACS defaults or user settings will make sense. Also, any
+# toolchain-related reasons for choosing whether to detect various
+# things can be sorted out now, before the detection takes place.
+if(${CMAKE_SYSTEM_NAME} MATCHES BlueGene)
+    include(gmxManageBlueGene)
+endif()
+
+if(UNIX AND GMX_PREFER_STATIC_LIBS AND SHARED_LIBS_DEFAULT)
+    if(BUILD_SHARED_LIBS)
+        # Warn the user about the combination. But don't overwrite the request.
+        message(WARNING "Searching for static libraries requested, and building shared Gromacs libraries requested. This might cause problems linking later.")
+    elseif(NOT DEFINED BUILD_SHARED_LIBS)
+        # Change default to OFF. Don't warn if it's already off.
+        message(WARNING "Searching for static libraries requested, so the GROMACS libraries will also be built statically (BUILD_SHARED_LIBS=OFF)")
+        set(SHARED_LIBS_DEFAULT OFF)
+    endif()
+endif()
+
+# By now, all tool chains should have spoken up if they care about
+# the setting of SHARED_LIBS_DEFAULT.
+option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic e.g. with MPI, or on some HPC systems)" ${SHARED_LIBS_DEFAULT})
 
+########################################################################
+#Process MPI settings
+########################################################################
+include(gmxManageMPI)
 
 ########################################################################
 # Find external packages                                               #
 ########################################################################
-if(UNIX)
-    if(GMX_PREFER_STATIC_LIBS)
-        # On Linux .a is the static library suffix, on Mac OS X .lib can also
-        # be used, so we'll add both to the preference list.
-        SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib;.a" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-        if(SHARED_LIBS_DEFAULT)
-            if(BUILD_SHARED_LIBS) #Warn the user about the combination. But don't overwrite the request.
-                message(WARNING "Static libraries requested, and shared Gromacs libraries requested.")
-            elseif(NOT DEFINED BUILD_SHARED_LIBS) #Change default to OFF. Don't warn if it's already off.
-                message(WARNING "Static libraries requested, the GROMACS libraries will also be build static (BUILD_SHARED_LIBS=OFF)")
-                set(SHARED_LIBS_DEFAULT OFF)
-            endif()
-        endif()
-    endif()
+if(UNIX AND GMX_PREFER_STATIC_LIBS)
+    # On Linux .a is the static library suffix, on Mac OS X .lib can also
+    # be used, so we'll add both to the preference list.
+    SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib;.a" ${CMAKE_FIND_LIBRARY_SUFFIXES})
 endif()
-option(BUILD_SHARED_LIBS "Enable shared libraries (can be problematic with MPI, Windows)" ${SHARED_LIBS_DEFAULT})
+
+IF( WIN32 AND NOT CYGWIN)
+  # This makes windows.h not declare min/max as macros that would break
+  # C++ code using std::min/std::max.
+  add_definitions(-DNOMINMAX)
+
+  if (NOT BUILD_SHARED_LIBS)
+      option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
+      if(NOT GMX_PREFER_STATIC_LIBS)
+          message(WARNING "Shared system libraries requested, and static Gromacs libraries requested.")
+      endif()
+  else()
+      message(FATAL_ERROR "BUILD_SHARED_LIBS not yet working for Windows in the master branch")
+      option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" OFF)
+      if(GMX_PREFER_STATIC_LIBS)
+          #this combination segfaults (illegal passing of file handles)
+          message(FATAL_ERROR "Static system libraries requested, and shared Gromacs libraries requested.")
+      endif()
+      add_definitions(-DUSE_VISIBILITY -DTMPI_USE_VISIBILITY)
+      set(PKG_CFLAGS "${PKG_CFLAGS} -DUSE_VISIBILITY -DTMPI_USE_VISIBILITY")
+  endif()
+  mark_as_advanced(GMX_PREFER_STATIC_LIBS)
+
+  IF (GMX_PREFER_STATIC_LIBS)
+      #Only setting Debug and Release flags. Other configurations are currently not used.
+      STRING(REPLACE /MD /MT CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+      STRING(REPLACE /MD /MT CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+      if(CMAKE_CXX_COMPILER_LOADED)
+          STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+          STRING(REPLACE /MD /MT CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+      endif()
+  ENDIF()
+  IF( CMAKE_C_COMPILER_ID MATCHES "Intel" )
+    if(BUILD_SHARED_LIBS) #not sure why incremental building with shared libs doesn't work
+        STRING(REPLACE "/INCREMENTAL:YES" "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+    endif()
+  ENDIF()
+ENDIF()
 
 option(GMX_XML "Use libxml2 to parse xml files" ON)
 if (GMX_XML)
@@ -653,8 +689,7 @@ gmx_test__isfinite(HAVE__ISFINITE)
 gmx_test__finite(HAVE__FINITE)
 
 include(gmxTestCXX11)
-gmx_test_cxx11(GMX_CXX11 CXX11_FLAG)
-set(GROMACS_CXX_FLAGS "${CXX11_FLAG} ${GROMACS_CXX_FLAGS}")
+gmx_test_cxx11(GMX_CXX11 GMX_CXX11_FLAGS)
 if(CXX11_FLAG AND GMX_GPU)
     #FIXME: add proper solution for progate all but cxx11 flag
     set(CUDA_PROPAGATE_HOST_FLAGS no)
@@ -668,26 +703,29 @@ if(NOT GMX_SYSTEM_XDR)
     set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_INTERNAL_XDR")
 endif(NOT GMX_SYSTEM_XDR)
 
+# Include the AVX test source, used if the AVX flags are set below
+include(gmxTestAVXMaskload)
+
 # Process nonbonded accelerated kernels settings
 string(TOUPPER ${GMX_CPU_ACCELERATION} ${GMX_CPU_ACCELERATION})
 if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
     # nothing to do
 elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
 
-    GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" GROMACS_C_FLAGS)
+    GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" ACCELERATION_C_FLAGS)
     if(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
-        GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
     endif(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
 
-    GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" GROMACS_CXX_FLAGS)
+    GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" ACCELERATION_CXX_FLAGS)
     if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
-        GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+        GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
     endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
 
     # We dont warn for lacking SSE2 flag support, since that is probably standard today.
 
     # Only test the include after we have tried to add the correct flag for SSE2 support
-    check_include_file(emmintrin.h  HAVE_EMMINTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(emmintrin.h  HAVE_EMMINTRIN_H ${ACCELERATION_C_FLAGS})
 
     if(NOT HAVE_EMMINTRIN_H)
         message(FATAL_ERROR "Cannot find emmintrin.h, which is required for SSE2 intrinsics support.")
@@ -702,34 +740,36 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
 
 elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
 
-    GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" GROMACS_C_FLAGS)
+    GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" ACCELERATION_C_FLAGS)
     if (NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
-        GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" ACCELERATION_C_FLAGS)
     endif(NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
     if (NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
-        message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
         # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
-        # intrinsics when SSE2 support is enabled, so we try that instead.
+        # intrinsics when SSE2 support is enabled, so we try that first instead.
        if (GMX_NATIVE_WINDOWS)
-            GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+            GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
+            message(WARNING "Neither SSE4.1 or SSE2 seems to be supported by your Windows compiler. Something is likely broken.")
+        else()
+            message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance")
         endif()
     endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
 
     GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAG)
     if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
-       GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
+        GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
     endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
     if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
         message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
         # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
         # intrinsics when SSE2 support is enabled, so we try that instead.
         if (GMX_NATIVE_WINDOWS)
-            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
         endif()
     endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 
     # This must come after we have added the -msse4.1 flag on some platforms.
-    check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${ACCELERATION_C_FLAGS})
 
     if(NOT HAVE_SMMINTRIN_H)
         message(FATAL_ERROR "Cannot find smmintrin.h, which is required for SSE4.1 intrinsics support.")
@@ -747,17 +787,17 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
 
     # Set the AVX compiler flag for both these choices!
 
-    GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" GROMACS_C_FLAGS)
+    GMX_TEST_CFLAG(GNU_AVX_CFLAG "-mavx" ACCELERATION_C_FLAGS)
     if (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
-        GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" ACCELERATION_C_FLAGS)
     endif (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
     if (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
         message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
     endif (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
 
-    GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" GROMACS_CXX_FLAGS)
+    GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
     if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
-       GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" GROMACS_CXX_FLAGS)
+        GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
     endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
     if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
        message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
@@ -765,24 +805,27 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
 
     # Set the FMA4 flags (MSVC doesn't require any)
     if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
-        GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
+        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+            message(FATAL_ERROR "Clang up to at least version 3.2 produces incorrect code for AVX_128_FMA. Sorry, but you will have to select a different compiler or acceleration.")
+        endif()
+        GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" ACCELERATION_C_FLAGS)
         if (NOT GNU_FMA_CFLAG)
             message(WARNING "No C FMA4 flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
         endif(NOT GNU_FMA_CFLAG)
-        GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" GROMACS_C_FLAGS)
+        GMX_TEST_CFLAG(GNU_XOP_CFLAG "-mxop" ACCELERATION_C_FLAGS)
         # No big deal if we do not have xop, so no point yelling warnings about it.
         if (CMAKE_CXX_COMPILER_LOADED)
-            GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" ACCELERATION_CXX_FLAGS)
             if (NOT GNU_FMA_CXXFLAG)
                 message(WARNING "No C++ FMA flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
             endif (NOT GNU_FMA_CXXFLAG)
-            GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" GROMACS_CXX_FLAGS)
+            GMX_TEST_CXXFLAG(GNU_XOP_CXXFLAG "-mxop" ACCELERATION_CXX_FLAGS)
             # No big deal if we do not have xop, so no point yelling warnings about it.
         endif()
     endif()
 
     # Only test the header after we have tried to add the flag for AVX support
-    check_include_file(immintrin.h  HAVE_IMMINTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(immintrin.h  HAVE_IMMINTRIN_H ${ACCELERATION_C_FLAGS})
 
     if(NOT HAVE_IMMINTRIN_H)
         message(FATAL_ERROR "Cannot find immintrin.h, which is required for AVX intrinsics support. Consider switching compiler.")
@@ -791,15 +834,15 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
     if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_256")
         try_compile(TEST_AVX ${CMAKE_BINARY_DIR}
             "${CMAKE_SOURCE_DIR}/cmake/TestAVX.c"
-            COMPILE_DEFINITIONS "${GROMACS_C_FLAGS}")
+            COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}")
         if(NOT TEST_AVX)
             message(FATAL_ERROR "Cannot compile AVX intrinsics. Consider switching compiler.")
         endif()
     endif()
 
     # GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
-    check_include_file(x86intrin.h HAVE_X86INTRIN_H ${GROMACS_C_FLAGS})
-    check_include_file(intrin.h HAVE_INTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(x86intrin.h HAVE_X86INTRIN_H ${ACCELERATION_C_FLAGS})
+    check_include_file(intrin.h HAVE_INTRIN_H ${ACCELERATION_C_FLAGS})
 
     # The user should not be able to set this orthogonally to the acceleration
     set(GMX_X86_SSE4_1 1)
@@ -821,34 +864,40 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
         endif()
     endif()
 
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
-# GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
-    if (NOT ACCELERATION_QUIETLY)
-      message(STATUS "Configuring for BlueGene")
+    # Unfortunately gcc-4.5.2 and gcc-4.6.0 have a bug where they use the wrong data type for the
+    # formal mask parameter of the maskload/maskstore intrinsics. Check whether it is present, since we can work around it.
+    gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG)
+
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX")
+    # Used on BlueGene/Q
+    if (CMAKE_C_COMPILER_ID MATCHES "XL")
+        GMX_TEST_CFLAG(XLC_BLUEGENEQ_CFLAG "-qarch=qp -qtune=qp" ACCELERATION_C_FLAGS)
+        try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
+            "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c"
+            COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}")
+        if(NOT TEST_QPX)
+            message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics.")
+        endif()
+    endif()
+    if (CMAKE_CXX_COMPILER_ID MATCHES "XL" AND CMAKE_CXX_COMPILER_LOADED)
+        GMX_TEST_CXXFLAG(XLC_BLUEGENEQ_CXXFLAG "-qarch=qp -qtune=qp" ACCELERATION_CXX_FLAGS)
+        try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
+            "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c"
+            COMPILE_DEFINITIONS "${ACCELERATION_CXX_FLAGS}")
+        if(NOT TEST_QPX)
+            message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics.")
+        endif()
     endif()
-    set(GMX_BLUEGENE 1)
-    if (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
-        set(SHARED_LIBS_DEFAULT OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
-        set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
-    endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
-    set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
-    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
-    set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
-    set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI not compatible with BlueGene, disabled!" FORCE)
-    set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
-# Access to /etc/passwd is not available on the back end of BlueGene,
-# despite being detected by CMake. This can cause linker warnings
-# about harmless things in src/gmxlib/string2.h.
-    set(HAVE_PWD_H OFF)
-# The automatic testing for endianness does not work for the BlueGene cross-compiler
-    set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
-    set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
-    set(GMX_POWER6 1)
-    set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
-    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
+
+    if (TEST_QPX)
+        message(WARNING "IBM QPX acceleration was selected and could be compiled, but the accelerated kernels are not yet available.")
+        set(GMX_CPU_ACCELERATION_IBM_QPX 1)
+    else()
+        message(FATAL_ERROR "Cannot compile IBM QPX intrinsics without the XL compiler. If you are compiling for BlueGene/Q, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C' to set up the tool chain.")
+    endif()
+
 else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
-    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene")
+    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
 endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
 set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
 
@@ -1019,24 +1068,27 @@ if(GMX_FAHCORE)
   set(COREWRAP_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/../corewrap" CACHE STRING 
       "Path to swindirect.h")
   include_directories(${COREWRAP_INCLUDE_DIR})
+  set_property(CACHE GMX_COOL_QUOTES PROPERTY VALUE OFF)
 endif(GMX_FAHCORE)
 
 # # # # # # # # # # NO MORE TESTS AFTER THIS LINE! # # # # # # # # # # #
 # these are set after everything else
-if (NOT DEFINED GROMACS_C_FLAGS_SET)
-    set(GROMACS_C_FLAGS_SET true CACHE INTERNAL "Whether to reset the C flags" 
-        FORCE)
-    set(CMAKE_C_FLAGS "${GROMACS_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING 
-        "Flags used by the compiler during all build types" FORCE)
-    set(CMAKE_CXX_FLAGS "${GROMACS_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING 
-        "Flags used by the compiler during all build types" FORCE)
-    set(CMAKE_EXE_LINKER_FLAGS 
-        "${GROMACS_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" 
-        CACHE STRING "Linker flags for creating executables" FORCE) 
-    set(CMAKE_SHARED_LINKER_FLAGS 
-        "${GROMACS_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" 
-        CACHE STRING "Linker flags for creating shared libraries" FORCE) 
-endif (NOT DEFINED GROMACS_C_FLAGS_SET)
+if (NOT GMX_SKIP_DEFAULT_CFLAGS)
+    set(CMAKE_C_FLAGS "${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMX_CXX11_FLAGS} ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+else()
+    message("Recommended flags which are not added because GMX_SKIP_DEFAULT_CFLAGS=yes:")
+    message("CMAKE_C_FLAGS: ${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CFLAGS}")
+    message("CMAKE_C_FLAGS_RELEASE: ${GMXC_CFLAGS_RELEASE}")
+    message("CMAKE_C_FLAGS_DEBUG: ${GMXC_CFLAGS_DEBUG}")
+    message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMX_CXX11_FLAGS} ${GMXC_CXXFLAGS}")
+    message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
+    message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
+    message("CMAKE_EXE_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+    message("CMAKE_SHARED_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+endif()
 
 if(NOT GMX_OPENMP)
     #Unset all OpenMP flags in case OpenMP was disabled either by the user
@@ -1047,6 +1099,7 @@ else()
     set(GMX_EXE_LINKER_FLAGS ${GMX_EXE_LINKER_FLAGS} ${OpenMP_LINKER_FLAGS})
     set(GMX_SHARED_LINKER_FLAGS ${GMX_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS})
 endif()
+set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
 
 ######################################
 # Output compiler and CFLAGS used
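
The practical upshot of the flag reorganization above: suggested flags now
live in plain variables (ACCELERATION_C_FLAGS, GMX_CXX11_FLAGS,
MPI_COMPILE_FLAGS, and the GMXC_* variables from gmxCFlags.cmake) and are
appended to the CMAKE_*_FLAGS variables in one place, without caching. A
user who wants full control can configure with, for example,

    cmake .. -DGMX_SKIP_DEFAULT_CFLAGS=ON

in which case the recommended flags are only printed, not applied.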
index d6b05f0f6ac0087ba1002082df75c4d93198f1cc..1aaa7a5f9692c48f8076886cf47509cb431c268e 100644 (file)
@@ -88,7 +88,6 @@ END
 HEAD|Analyzing bonded interactions
 g_angle|calculates distributions and correlations for angles and dihedrals
 g_bond|calculates bond length distributions
-g_dih|analyzes dihedral transitions
 mk_angndx|generates index files for g_angle
 END
 
index be6e1eb3d95ae1dcebb528b36ca19e63546791cf..a7fb8ec54e0ca427f477554643e43710eee8e69b 100644 (file)
@@ -100,4 +100,4 @@ if (${FFTW}_FOUND)
 endif (${FFTW}_FOUND)
 set(${FFTW}_HAVE_SIMD FALSE CACHE BOOL "If ${${FFTW}_PKG} was built with SIMD support")
 
-mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD)
+mark_as_advanced(${FFTW}_INCLUDE_DIR ${FFTW}_LIBRARY ${FFTW}_HAVE_SIMD ${FFTW}_HAVE_AVX)
diff --git a/cmake/Platform/BlueGeneQ-base.cmake b/cmake/Platform/BlueGeneQ-base.cmake
new file mode 100644 (file)
index 0000000..dd17ab6
--- /dev/null
@@ -0,0 +1,120 @@
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+#
+# BlueGeneQ base platform file.
+#
+# NOTE: Do not set your platform to "BlueGeneQ-base".  This file is included
+# by the real platform files.  Use one of these two platforms instead:
+#
+#     BlueGeneQ-dynamic  For dynamically linked builds
+#     BlueGeneQ-static   For statically linked builds
+#
+# This platform file tries its best to adhere to the behavior of the MPI
+# compiler wrappers included with the latest BG/Q drivers.
+#
+
+
+#
+# For BG/Q builds, we're cross compiling, but we don't want to re-root things
+# (e.g. with CMAKE_FIND_ROOT_PATH) because users may have libraries anywhere on
+# the shared filesystems, and this may lie outside the root.  Instead, we set the
+# system directories so that the various system BG/Q CNK library locations are
+# searched first.  This is not the clearest thing in the world, given IBM's driver
+# layout, but this should cover all the standard ones.
+#
+set(CMAKE_SYSTEM_LIBRARY_PATH
+  /bgsys/drivers/ppcfloor/comm/xl/lib                       # default comm layer (used by mpi compiler wrappers)
+  /bgsys/drivers/ppcfloor/spi/lib/                          # other low-level stuff
+  /bgsys/drivers/ppcfloor/gnu-linux/powerpc64-bgq-linux/lib # CNK Linux image -- standard runtime libs, pthread, etc.
+)
+
+#
+# This adds directories that find commands should specifically ignore for cross compiles.
+# Most of these directories are the include and lib directories for the frontend on BG/Q systems.
+# Not ignoring these can cause things like FindX11 to find a frontend PPC version mistakenly.
+# We use this on BG instead of re-rooting because backend libraries are typically strewn about
+# the filesystem, and we can't re-root ALL backend libraries to a single place.
+#
+set(CMAKE_SYSTEM_IGNORE_PATH
+  /lib             /lib64             /include
+  /usr/lib         /usr/lib64         /usr/include
+  /usr/local/lib   /usr/local/lib64   /usr/local/include
+  /usr/X11/lib     /usr/X11/lib64     /usr/X11/include
+  /usr/lib/X11     /usr/lib64/X11     /usr/include/X11
+  /usr/X11R6/lib   /usr/X11R6/lib64   /usr/X11R6/include
+  /usr/X11R7/lib   /usr/X11R7/lib64   /usr/X11R7/include
+)
+
+#
+# Indicate that this is a unix-like system
+#
+set(UNIX 1)
+
+#
+# Library prefixes, suffixes, extra libs.
+#
+set(CMAKE_LINK_LIBRARY_SUFFIX "")
+set(CMAKE_STATIC_LIBRARY_PREFIX "lib")     # lib
+set(CMAKE_STATIC_LIBRARY_SUFFIX ".a")      # .a
+
+set(CMAKE_SHARED_LIBRARY_PREFIX "lib")     # lib
+set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")     # .so
+set(CMAKE_EXECUTABLE_SUFFIX "")            # .exe
+set(CMAKE_DL_LIBS "dl")
+
+#
+# This macro needs to be called for dynamic library support.  Unfortunately on BG/Q,
+# we can't support both static and dynamic links in the same platform file.  The
+# dynamic link platform file needs to call this explicitly to set up dynamic linking.
+#
+macro(__BlueGeneQ_set_dynamic_flags compiler_id lang)
+  if (${compiler_id} STREQUAL XL)
+    # Flags for XL compilers if we explicitly detected XL
+    set(CMAKE_SHARED_LIBRARY_${lang}_FLAGS           "-qpic")
+    set(CMAKE_SHARED_LIBRARY_CREATE_${lang}_FLAGS    "-qmkshrobj -qnostaticlink")
+    set(BG/Q_${lang}_DYNAMIC_EXE_FLAGS                "-qnostaticlink -qnostaticlink=libgcc")
+  else()
+    # Assume flags for GNU compilers (if the ID is GNU *or* anything else).
+    set(CMAKE_SHARED_LIBRARY_${lang}_FLAGS           "-fPIC")
+    set(CMAKE_SHARED_LIBRARY_CREATE_${lang}_FLAGS    "-shared")
+    set(BG/Q_${lang}_DYNAMIC_EXE_FLAGS                "-dynamic")
+  endif()
+
+  # Both toolchains use the GNU linker on BG/Q, so these options are shared.
+  set(CMAKE_SHARED_LIBRARY_RUNTIME_${lang}_FLAG      "-Wl,-rpath,")
+  set(CMAKE_SHARED_LIBRARY_RPATH_LINK_${lang}_FLAG   "-Wl,-rpath-link,")
+  set(CMAKE_SHARED_LIBRARY_SONAME_${lang}_FLAG       "-Wl,-soname,")
+  set(CMAKE_EXE_EXPORTS_${lang}_FLAG                 "-Wl,--export-dynamic")
+  set(CMAKE_SHARED_LIBRARY_LINK_${lang}_FLAGS        "")  # +s, flag for exe link to use shared lib
+  set(CMAKE_SHARED_LIBRARY_RUNTIME_${lang}_FLAG_SEP  ":") # : or empty
+
+  set(BG/Q_${lang}_DEFAULT_EXE_FLAGS
+    "<FLAGS> <CMAKE_${lang}_LINK_FLAGS> <LINK_FLAGS> <OBJECTS>  -o <TARGET> <LINK_LIBRARIES>")
+  set(CMAKE_${lang}_LINK_EXECUTABLE
+    "<CMAKE_${lang}_COMPILER> ${BG/Q_${lang}_DYNAMIC_EXE_FLAGS} ${BG/Q_${lang}_DEFAULT_EXE_FLAGS}")
+endmacro()
+
+#
+# This macro needs to be called for static builds.  Right now it just sets
+# up the default executable link line.
+#
+macro(__BlueGeneQ_set_static_flags compiler_id lang)
+  set(BG/Q_${lang}_DEFAULT_EXE_FLAGS
+    "<FLAGS> <CMAKE_${lang}_LINK_FLAGS> <LINK_FLAGS> <OBJECTS>  -o <TARGET> <LINK_LIBRARIES>")
+  set(CMAKE_${lang}_LINK_EXECUTABLE
+    "<CMAKE_${lang}_COMPILER> ${BG/Q_${lang}_DEFAULT_EXE_FLAGS}")
+endmacro()
diff --git a/cmake/Platform/BlueGeneQ-static-XL-C.cmake b/cmake/Platform/BlueGeneQ-static-XL-C.cmake
new file mode 100644 (file)
index 0000000..b2100b5
--- /dev/null
@@ -0,0 +1,26 @@
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+include(BlueGeneQ-static)
+__BlueGeneQ_set_static_flags(XL C)
+
+set(CMAKE_SYSTEM_NAME BlueGeneQ-static)
+# xl.ndebug is appropriate for production calculations. For debugging,
+# use xl to add back error checks and assertions
+set(CMAKE_C_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpicc)
+set(CMAKE_C_FLAGS_RELEASE "-O4 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+
+mark_as_advanced(CMAKE_XL_CreateExportList) # No idea what spams this
diff --git a/cmake/Platform/BlueGeneQ-static-XL-CXX.cmake b/cmake/Platform/BlueGeneQ-static-XL-CXX.cmake
new file mode 100644 (file)
index 0000000..d43ccb3
--- /dev/null
@@ -0,0 +1,26 @@
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+include(BlueGeneQ-static)
+__BlueGeneQ_set_static_flags(XL CXX)
+
+set(CMAKE_SYSTEM_NAME BlueGeneQ-static CACHE STRING "Cross-compiling for BlueGene/Q" FORCE)
+# xl.ndebug is appropriate for production calculations. For debugging,
+# use xl to add back error checks and assertions
+set(CMAKE_CXX_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpicxx)
+set(CMAKE_CXX_FLAGS_RELEASE "-O4 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+
+mark_as_advanced(CMAKE_XL_CreateExportList) # No idea what spams this
diff --git a/cmake/Platform/BlueGeneQ-static.cmake b/cmake/Platform/BlueGeneQ-static.cmake
new file mode 100644 (file)
index 0000000..f5bd5b4
--- /dev/null
@@ -0,0 +1,20 @@
+
+#=============================================================================
+# Copyright 2010 Kitware, Inc.
+# Copyright 2010 Todd Gamblin <tgamblin@llnl.gov>
+# Copyright 2012 Julien Bigot <julien.bigot@cea.fr>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+include(BlueGeneQ-base)
+set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+set(CMAKE_FIND_LIBRARY_PREFIXES "lib")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
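
Pinning the find-library prefix and suffix means every library lookup on this platform resolves to a static archive, consistent with the static-only toolchains above. A sketch (library name illustrative, not part of the diff):

    # On BlueGeneQ-static this can only match libfftw3f.a, never libfftw3f.so
    find_library(FFTWF_LIBRARY fftw3f)
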
diff --git a/cmake/TestAVXMaskload.c b/cmake/TestAVXMaskload.c
new file mode 100644 (file)
index 0000000..61777b0
--- /dev/null
@@ -0,0 +1,17 @@
+#include<immintrin.h>
+int main()
+{
+    __m256d a;
+    __m256i mask;
+    double  d[4]={1,2,3,4};
+
+    a = _mm256_setzero_pd();
+    mask = _mm256_castpd_si256(a);
+
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+    a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
+#else
+    a = _mm256_maskload_pd(d,mask);
+#endif
+}
+
diff --git a/cmake/TestQPX.c b/cmake/TestQPX.c
new file mode 100644 (file)
index 0000000..229c35b
--- /dev/null
@@ -0,0 +1,6 @@
+int main()
+{
+    vector4double one = vec_splats(1.0);
+    vector4double zero = vec_sub(one,one);
+    return 0;
+}
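
vector4double, vec_splats() and vec_sub() are IBM XL intrinsics for the BlueGene/Q QPX vector unit, so this probe compiles only with an XL compiler targeting BG/Q. A sketch of how such a probe is typically consumed (the actual call site is elsewhere in the build system, not in this section):

    try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
                "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c")
    if(NOT TEST_QPX)
        message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics")
    endif()
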
index 8c0de1ffb069b2a59656e6f0c03f7dafaf7f4dbc..3809d02e957f946827a1444b9881d9c6f340fa57 100644 (file)
@@ -30,17 +30,6 @@ MACRO(gmx_c_flags)
 
     # gcc
     if(CMAKE_COMPILER_IS_GNUCC)
-
-        #Fix for LLVM OpenMP bug (redmine 900). Needs to run before OpenMP flags are set below.
-        if(GMX_OPENMP)
-            exec_program(${CMAKE_C_COMPILER} ARGS --version OUTPUT_VARIABLE _compiler_output)
-            if(_compiler_output MATCHES "llvm.*4\\.2")
-                message(STATUS "OpenMP multithreading not supported with llvm-gcc 4.2, disabled")
-                set(GMX_OPENMP OFF CACHE BOOL
-                    "OpenMP multithreading not not supported with llvm-gcc 4.2, disabled!" FORCE)
-            endif()
-        endif()
-
         #flags are added in reverse order and -Wno* need to appear after -Wall
         if(NOT GMX_OPENMP)
             GMX_TEST_CFLAG(CFLAGS_PRAGMA "-Wno-unknown-pragmas" GMXC_CFLAGS)
@@ -77,12 +66,10 @@ MACRO(gmx_c_flags)
             GMX_TEST_CFLAG(CFLAGS_WARN "-Wall" GMXC_CFLAGS)
             GMX_TEST_CFLAG(CFLAGS_STDGNU "-std=gnu99" GMXC_CFLAGS)
             GMX_TEST_CFLAG(CFLAGS_OPT "-ip -funroll-all-loops" GMXC_CFLAGS_RELEASE)
-            GMX_TEST_CFLAG(CFLAGS_SSE2 "-msse2" GMXC_CFLAGS_RELEASE)
             GMX_TEST_CFLAG(CFLAGS_X86 "-mtune=core2" GMXC_CFLAGS_RELEASE)
             GMX_TEST_CFLAG(CFLAGS_IA64 "-mtune=itanium2" GMXC_CFLAGS_RELEASE)
         else()
             GMX_TEST_CFLAG(CFLAGS_WARN "/W2" GMXC_CFLAGS)
-            GMX_TEST_CFLAG(CFLAGS_SSE2 "/arch:SSE2" GMXC_CFLAGS_RELEASE)
             GMX_TEST_CFLAG(CFLAGS_X86 "/Qip" GMXC_CFLAGS_RELEASE)
         endif()
     endif()
@@ -94,13 +81,11 @@ MACRO(gmx_c_flags)
             endif()
             GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wall" GMXC_CXXFLAGS)
             GMX_TEST_CXXFLAG(CXXFLAGS_OPT "-ip -funroll-all-loops" GMXC_CXXFLAGS_RELEASE)
-            GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "-msse2" GMXC_CXXFLAGS_RELEASE)
             GMX_TEST_CXXFLAG(CXXFLAGS_X86 "-mtune=core2" GMXC_CXXFLAGS_RELEASE)
             GMX_TEST_CXXFLAG(CXXFLAGS_IA64 "-mtune=itanium2" 
                               GMXC_CXXFLAGS_RELEASE)
         else()
             GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W2" GMXC_CXXFLAGS)
-            GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "/arch:SSE2" GMXC_CXXFLAGS_RELEASE)
             GMX_TEST_CXXFLAG(CXXFLAGS_X86 "/Qip" GMXC_CXXFLAGS_RELEASE)
         endif()
     endif()
@@ -181,36 +166,19 @@ MACRO(gmx_c_flags)
 
     # now actually set the flags:
     # C
-    if ( NOT DEFINED GMXCFLAGS_SET AND NOT DEFINED ENV{CFLAGS} )
-        set(GMXCFLAGS_SET true CACHE INTERNAL "Whether to reset the C flags" 
-            FORCE)
-        
-        set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}" 
-            CACHE STRING "Flags used by the compiler during all build types." 
-            FORCE)
-        set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}" 
-            CACHE STRING "Flags used by the compiler during release builds." 
-            FORCE)
-        set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}" 
-            CACHE STRING "Flags used by the compiler during debug builds." 
-            FORCE)
+    if ( NOT GMX_SKIP_DEFAULT_CFLAGS )
+        set(CMAKE_C_FLAGS "${GMXC_CFLAGS} ${CMAKE_C_FLAGS}")
+        set(CMAKE_C_FLAGS_RELEASE "${GMXC_CFLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}")
+        set(CMAKE_C_FLAGS_DEBUG "${GMXC_CFLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}")
     endif()
 
     # C++
-    if ( NOT DEFINED GMXCXXFLAGS_SET AND NOT DEFINED ENV{CXXFLAGS} )
-        set(GMXCXXFLAGS_SET true CACHE INTERNAL "Whether to reset the C++ flags" 
-            FORCE)
-        set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}" 
-            CACHE STRING "Flags used by the compiler during all build types." 
-            FORCE)
+    if ( NOT GMX_SKIP_DEFAULT_CFLAGS )
+        set(CMAKE_CXX_FLAGS "${GMXC_CXXFLAGS} ${CMAKE_CXX_FLAGS}")
         set(CMAKE_CXX_FLAGS_RELEASE 
-            "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}" 
-            CACHE STRING "Flags used by the compiler during release builds." 
-            FORCE)
+            "${GMXC_CXXFLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}")
         set(CMAKE_CXX_FLAGS_DEBUG 
-            "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}" 
-            CACHE STRING "Flags used by the compiler during debug builds." 
-            FORCE)
+            "${GMXC_CXXFLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}")
     endif()
 ENDMACRO(gmx_c_flags)
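
The rewrite above drops the GMXCFLAGS_SET/GMXCXXFLAGS_SET cache bookkeeping: the GROMACS defaults are now prepended to the plain CMAKE_<LANG>_FLAGS variables on every configure, and GMX_SKIP_DEFAULT_CFLAGS opts out of them entirely. A sketch of the opt-out path (flag values illustrative):

    set(GMX_SKIP_DEFAULT_CFLAGS ON)   # e.g. -DGMX_SKIP_DEFAULT_CFLAGS=ON
    set(CMAKE_C_FLAGS "-O2 -g")       # used verbatim; gmx_c_flags() leaves it alone
    include(gmxCFlags)
    gmx_c_flags()
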
 
diff --git a/cmake/gmxManageBlueGene.cmake b/cmake/gmxManageBlueGene.cmake
new file mode 100644 (file)
index 0000000..fb9b9c9
--- /dev/null
@@ -0,0 +1,73 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# Managing configuration for all kinds of BlueGene systems
+# BlueGene/L is probably obsolete, but does no harm
+# BlueGene/P needs testing, but hasn't changed
+# BlueGene/Q works
+message(STATUS "Configuring for BlueGene")
+
+if (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
+    # BlueGene/L never had shared lib support.
+    set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
+endif()
+if (${CMAKE_SYSTEM_NAME} MATCHES "BlueGene.*static")
+    # BlueGene/P claims shared library support, but Mark Abraham never
+    # got it to work. BlueGene/Q claims it, but discourages it for
+    # performance reasons. So unless information to the contrary ever
+    # comes to light, we should not mess about giving the user options
+    # that are useless when they've already selected a static toolchain.
+    set(BUILD_SHARED_LIBS OFF CACHE BOOL "Static BlueGene build toolchain selected, so shared libraries are disabled" FORCE)
+endif()
+
+set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
+set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
+set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on BlueGene" FORCE)
+
+# It is conceivable you could use ThreadMPI on BlueGene/Q by using its
+# facility to run lots of jobs on small chunks of the machine. You
+# certainly need proper MPI to use a whole chunk of the machine that
+# the scheduler will allocate.
+set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI generally not compatible with BlueGene, defaulting to disabled!")
+set(GMX_MPI ON CACHE BOOL "MPI is normally required on BlueGene" FORCE)
+
+# Access to /etc/passwd is not available on the back end of BlueGeneP
+# (at least), despite being detected by CMake. This can cause linker
+# warnings about harmless things in src/gmxlib/string2.h.
+set(HAVE_PWD_H OFF)
+
+# The automatic testing for endianness does not work for the BlueGene cross-compiler
+set(GMX_FLOAT_FORMAT_IEEE754 1 CACHE INTERNAL "" FORCE)
+set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big-endian floating-point byte order (by default)" FORCE)
+set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big-endian floating-point word order (by default)" FORCE)
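
The MATCHES test above keys off the CMAKE_SYSTEM_NAME values set by the static Platform files (e.g. cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-CXX), so any static BlueGene toolchain disables shared libraries. A standalone sketch of the regex:

    # covers BlueGeneP-static, BlueGeneQ-static, ...
    if("BlueGeneQ-static" MATCHES "BlueGene.*static")
        message(STATUS "static BlueGene toolchain detected")   # printed
    endif()
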
index 629191d2960aa97a6b63c8402c3fab6aa004ba8a..8b7c761a2f61451d95b44f58aa11cabb46b151ed 100644 (file)
@@ -18,14 +18,14 @@ if(GMX_GPU_AUTO AND GMX_DOUBLE)
 endif()
 
 # detect GPUs in the build host machine
-if (GMX_GPU OR GMX_GPU_AUTO AND NOT GMX_GPU_DETECTION_DONE)
+if ((GMX_GPU OR GMX_GPU_AUTO) AND NOT GMX_GPU_DETECTION_DONE)
     include(gmxDetectGpu)
     gmx_detect_gpu()
 endif()
 
 # We need to call find_package even when we've already done the detection/setup
 if(GMX_GPU OR GMX_GPU_AUTO)
-    if(NOT GMX_GPU AND GMX_GPU_AUTO AND GMX_GPU_DETECTION_DONE)
+    if(NOT GMX_GPU AND NOT GMX_DETECT_GPU_AVAILABLE)
         # Stay quiet when detection has occurred and found no GPU.
         # Noise is acceptable when there is a GPU or the user required one.
         set(FIND_CUDA_QUIETLY QUIET)
@@ -46,7 +46,7 @@ endif()
 # - ON , FALSE: The user requested GPU builds, will require CUDA and will fail
 #               if it is not available.
 # - ON , TRUE : Can't happen (GMX_GPU=ON can only be user-set at this point)
-if(GMX_GPU OR GMX_GPU_AUTO AND NOT GMX_GPU_DETECTION_DONE)
+if((GMX_GPU OR GMX_GPU_AUTO) AND NOT GMX_GPU_DETECTION_DONE)
     if (EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
         set(CUDA_FOUND TRUE CACHE INTERNAL "Whether the CUDA toolkit was found" FORCE)
     else()
@@ -108,6 +108,9 @@ endif()
 # user turns GMX_GPU=OFF after a failed cmake pass, these variables will be
 # left behind in the cache.
 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_SDK_ROOT_DIR CUDA_VERBOSE_BUILD)
+if(NOT GMX_GPU)
+    mark_as_advanced(CUDA_TOOLKIT_ROOT_DIR)
+endif()
 
 macro(gmx_gpu_setup)
     # set up nvcc options
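
The parentheses added in the two conditions above matter because CMake's if() gives AND higher precedence than OR: without them, GMX_GPU=ON made the detection branch run even after detection was already done. A self-contained sketch of the difference:

    set(GMX_GPU ON)
    set(GMX_GPU_AUTO OFF)
    set(GMX_GPU_DETECTION_DONE ON)
    if(GMX_GPU OR GMX_GPU_AUTO AND NOT GMX_GPU_DETECTION_DONE)
        message("old form runs")   # printed: parsed as ON OR (OFF AND OFF)
    endif()
    if((GMX_GPU OR GMX_GPU_AUTO) AND NOT GMX_GPU_DETECTION_DONE)
        message("new form runs")   # not printed: (ON OR OFF) AND OFF
    endif()
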
index acdea5a39af67cdeb635677c14d41d1cd414de43..36defad2f52a8c919f629596f61aa5dde0d4006e 100644 (file)
@@ -22,8 +22,8 @@ if(GMX_MPI)
       endif()
       find_package(MPI)
       if(${${MPI_PREFIX}_FOUND})
-        set(GROMACS_C_FLAGS ${GROMACS_C_FLAGS} ${${MPI_PREFIX}_COMPILE_FLAGS})
-        set(GROMACS_LINKER_FLAGS ${GROMACS_LINKER_FLAGS} ${${MPI_PREFIX}_LINK_FLAGS})
+        set(MPI_COMPILE_FLAGS ${${MPI_PREFIX}_COMPILE_FLAGS})
+        set(MPI_LINKER_FLAGS ${${MPI_PREFIX}_LINK_FLAGS})
         include_directories(${${MPI_PREFIX}_INCLUDE_PATH})
         list(APPEND GMX_EXTRA_LIBRARIES ${${MPI_PREFIX}_LIBRARIES})
       endif()
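
Keeping the MPI flags in their own MPI_COMPILE_FLAGS/MPI_LINKER_FLAGS variables, instead of folding them into the global GROMACS_* flags, lets later CMake code attach them only where they are needed. A hypothetical consumer (target name illustrative, not from this diff):

    set_target_properties(mdrun PROPERTIES
                          COMPILE_FLAGS "${MPI_COMPILE_FLAGS}"
                          LINK_FLAGS    "${MPI_LINKER_FLAGS}")
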
diff --git a/cmake/gmxTestAVXMaskload.cmake b/cmake/gmxTestAVXMaskload.cmake
new file mode 100644 (file)
index 0000000..a80920d
--- /dev/null
@@ -0,0 +1,72 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+#  GMX_TEST_AVX_GCC_MASKLOAD_BUG(VARIABLE)
+#
+#  VARIABLE will be set if the compiler is a buggy version
+#  of GCC (prior to 4.5.3, and maybe 4.6) that has an incorrect second
+#  argument to the AVX _mm256_maskload_ps() intrinsic.
+#
+#  You need to use this variable in a cmakedefine, and then handle
+#  the case separately in your code - no automatic cure, unfortunately.
+#
+MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG AVX_CFLAGS VARIABLE)
+    IF(NOT DEFINED ${VARIABLE})
+        MESSAGE(STATUS "Checking for gcc AVX maskload bug") 
+        # some compilers like clang accept both cases, 
+        # so first try a normal compile to avoid flagging those as buggy.
+        TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+                    "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+                    COMPILE_DEFINITIONS "${AVX_CFLAGS}" )
+        IF(${VARIABLE}_COMPILEOK)
+            SET(${VARIABLE} 0 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+            MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+        ELSE()
+            TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+                        "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+                         COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" )
+            IF(${VARIABLE}_COMPILEOK)
+                SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+                MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around")
+            ELSE()
+                MESSAGE(WARNING "Cannot compile AVX code - assuming gcc AVX maskload bug not present." )
+                MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+            ENDIF()
+        ENDIF()
+    ENDIF(NOT DEFINED ${VARIABLE})
+ENDMACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE)
+
+
+
+
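
A sketch of the intended call site; the resulting variable feeds the GMX_X86_AVX_GCC_MASKLOAD_BUG #cmakedefine added to src/config.h.cmakein later in this change (the AVX flag variable name here is illustrative):

    include(gmxTestAVXMaskload)
    gmx_test_avx_gcc_maskload_bug("${GROMACS_AVX_FLAG}" GMX_X86_AVX_GCC_MASKLOAD_BUG)
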
index ceea9ca0a76acbf0cacd21cce318252c30e8a83a..f7cb20d2ea89bbbe20eb8ae3c93683cca399bc84 100644 (file)
@@ -1,18 +1,16 @@
 include(CheckCXXSourceCompiles)
 MACRO(GMX_TEST_CXX11 VARIABLE FLAG)
-    IF(NOT DEFINED HAVE_${VARIABLE})
-        MESSAGE(STATUS "Checking for C++11 support")
-        if(NOT WIN32)
-            set(CXX11_FLAG "-std=c++0x")
-        else()
-            set(CXX11_FLAG "/Qstd=c++0x")
-        endif()
-        CHECK_CXX_COMPILER_FLAG("${CXX11_FLAG}" CXXFLAG_STD_CXX0X)
-        if(NOT CXXFLAG_STD_CXX0X)
-            set(CXX11_FLAG "")
-        endif()
-        set(CMAKE_REQUIRED_DEFINITIONS "${CXX11_FLAG}")
-        check_cxx_source_compiles(
+    if(NOT WIN32)
+        set(CXX11_FLAG "-std=c++0x")
+    else()
+        set(CXX11_FLAG "/Qstd=c++0x")
+    endif()
+    CHECK_CXX_COMPILER_FLAG("${CXX11_FLAG}" CXXFLAG_STD_CXX0X)
+    if(NOT CXXFLAG_STD_CXX0X)
+        set(CXX11_FLAG "")
+    endif()
+    set(CMAKE_REQUIRED_DEFINITIONS "${CXX11_FLAG}")
+    check_cxx_source_compiles(
 "#include <vector>
 #include <memory>
 #include <utility>
@@ -28,16 +26,9 @@ int main() {
   std::vector<A> v2;
   v2.push_back(A());  //requires default move constructor
   v2.push_back(A(new int(5))); //detects bug in ICC
-}" HAVE_${VARIABLE})
-        set(CMAKE_REQUIRED_DEFINITIONS "")
-        if(HAVE_${VARIABLE})
-            set(${VARIABLE} 1 CACHE INTERNAL "Result of C++11 support test" FORCE)
-            set(${FLAG} ${CXX11_FLAG} CACHE INTERNAL "Compiler flag for C++11 support" FORCE)
-            MESSAGE(STATUS "Checking for C++11 support - yes")
-        else()
-            set(${VARIABLE} 0 CACHE INTERNAL "Result of C++11 support test" FORCE)
-            set(${FLAG} "" CACHE INTERNAL "Compiler flag for C++11 support" FORCE)
-            MESSAGE(STATUS "Checking for C++11 support - no")
-        endif()
-    ENDIF(NOT DEFINED HAVE_${VARIABLE})
+}" ${VARIABLE})
+    set(CMAKE_REQUIRED_DEFINITIONS "")
+    if(${VARIABLE})
+        set(${FLAG} ${CXX11_FLAG})
+    endif()
 ENDMACRO()
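
With the caching and the HAVE_ prefix removed, the macro now re-evaluates on every configure and sets its two outputs as plain variables, matching this merge's move away from cached flags. Usage stays a simple call (variable names illustrative):

    include(gmxTestCXX11)
    gmx_test_cxx11(GMX_CXX11 GMX_CXX11_FLAGS)
    if(GMX_CXX11)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GMX_CXX11_FLAGS}")
    endif()
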
index 9917088959a64eb63ea7fee6b1894c1e0c0497b1..bc5f737fba02a62c080b97ccee65776302071f56 100644 (file)
@@ -5,9 +5,8 @@
 # If you only use one shell you can copy that GMXRC.* instead.
 
 
-# only csh/tcsh understand 'set'
-set is_csh = 123
-test "$is_csh" = 123 && goto CSH
+# only csh/tcsh set the variable $shell (note: lower case!)
+test $shell && goto CSH
 
 # if we got here, shell is bsh/bash/zsh/ksh
 . @BIN_INSTALL_DIR@/GMXRC.bash
index 6c532bb98f0849c17b8fcaec81d8bdf4877cb027..2da5590a0862102e9432f4ce7c7767ef1ef71e73 100644 (file)
@@ -66,7 +66,6 @@ Thu 26 Aug 2010</B></td>
 <br><a href=online/g_density.html>g_density</a>
 <br><a href=online/g_densmap.html>g_densmap</a>
 <br><a href=online/g_dielectric.html>g_dielectric</a>
-<br><a href=online/g_dih.html>g_dih</a>
 <br><a href=online/g_dipoles.html>g_dipoles</a>
 <br><a href=online/g_disre.html>g_disre</a>
 <br><a href=online/g_dist.html>g_dist</a>
@@ -282,7 +281,6 @@ Thu 26 Aug 2010</B></td>
 <TR><TD><A HREF="online/g_bond.html">g_bond</A></TD><TD>calculates bond length distributions</TD>
 <TR><TD><A HREF="online/mk_angndx.html">mk_angndx</A></TD><TD>generates index files for g_angle</TD>
 <TR><TD><A HREF="online/g_angle.html">g_angle</A></TD><TD>calculates distributions and correlations for angles and dihedrals</TD>
-<TR><TD><A HREF="online/g_dih.html">g_dih</A></TD><TD>analyzes dihedral transitions</TD>
 </TABLE>
 
 <A NAME="HNR11">
diff --git a/share/html/online/g_dih.html b/share/html/online/g_dih.html
deleted file mode 100644 (file)
index e61741b..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-<HTML>
-<HEAD>
-<TITLE>g_dih</TITLE>
-<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
-<TABLE WIDTH="98%" NOBORDER >
-<TR><TD WIDTH=400>
-<TABLE WIDTH=400 NOBORDER>
-<TD WIDTH=116>
-<a href="http://www.gromacs.org/"><img SRC="../images/gmxlogo_small.png"BORDER=0 </a></td>
-<td ALIGN=LEFT VALIGN=TOP WIDTH=280><br><h2>g_dih</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
-</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p><B>VERSION 4.5<br>
-Thu 26 Aug 2010</B></td></tr></TABLE>
-<HR>
-<H3>Description</H3>
-<p>
-g_dih can do two things. The default is to analyze dihedral transitions
-by merely computing all the dihedral angles defined in your topology
-for the whole trajectory. When a dihedral flips over to another minimum
-an angle/time plot is made.<p>
-The opther option is to discretize the dihedral space into a number of
-bins, and group each conformation in dihedral space in the
-appropriate bin. The output is then given as a number of dihedral
-conformations sorted according to occupancy.
-<P>
-<H3>Files</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>filename</TH><TH>type</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-f</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">    traj.xtc</a></tt> </TD><TD> Input </TD><TD> Trajectory: <a href="xtc.html">xtc</a> <a href="trr.html">trr</a> <a href="trj.html">trj</a> <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> cpt </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-s</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">   topol.tpr</a></tt> </TD><TD> Input </TD><TD> Run input file: <a href="tpr.html">tpr</a> <a href="tpb.html">tpb</a> <a href="tpa.html">tpa</a> </TD></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-o</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="out.html">   hello.out</a></tt> </TD><TD> Output </TD><TD> Generic output file </TD></TR>
-</TABLE>
-<P>
-<H3>Other options</H3>
-<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TR><TH>option</TH><TH>type</TH><TH>default</TH><TH>description</TH></TR>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> Print help info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]version</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> Print version info and quit </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0     </tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0     </tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>0     </tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-[no]sa</tt></b> </TD><TD ALIGN=RIGHT> gmx_bool </TD><TD ALIGN=RIGHT> <tt>no    </tt> </TD><TD> Perform cluster analysis in dihedral space instead of analysing dihedral transitions. </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-mult</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> mulitiplicity for dihedral angles (by default read from topology) </TD></TD>
-</TABLE>
-<P>
-<hr>
-<div ALIGN=RIGHT>
-<font size="-1"><a href="http://www.gromacs.org">http://www.gromacs.org</a></font><br>
-<font size="-1"><a href="mailto:gromacs@gromacs.org">gromacs@gromacs.org</a></font><br>
-</div>
-</BODY>
index 4853808f5988ee3f49e268b2570fe5aad7dce768..963099586feaef488b35e235075c33d426932881 100644 (file)
@@ -67,7 +67,7 @@ without using the GROMACS libraries you can use the following formats:
 
 <dl>
 <dt>C format 
-<dd><tt>"%5d%5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
+<dd><tt>"%5d%-5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"</tt>
 <dt>Fortran format 
 <dd><tt>(i5,2a5,i5,3f8.3,3f8.4)</tt>
 <dt>Pascal format
index 88a27c21bf34d45a72144314c9157cfe4758afb1..b15510e463173d0c6991c9ee1b32641a3827bbc4 100644 (file)
@@ -51,8 +51,8 @@ IF YOU'RE NOT SURE ABOUT WHAT YOU'RE DOING, DON'T DO IT!
 wall-density, wall-ewald-zfac)
 <li><A HREF="#pull"><b>COM pulling</b></A> (pull, ...)
 <li><A HREF="#nmr"><b>NMR refinement</b></A> (disre, disre-weighting, disre-mixed, disre-fc, disre-tau, nstdisreout, orire, orire-fc, orire-tau, orire-fitgrp, nstorireout)
-<li><A HREF="#free"><b>Free energy calculations</b></A> (free-energy, nstdhdl, dhdl-print-energy, init-lambda, delta-lambda, fep-lambdas, coul-lambdas, vdw-lambdas, bonded-lambdas, restraint-lambdas, mass-lambdas, sc-alpha, sc-coul, sc-power, sc-r-power, sc-sigma, couple-moltype, couple-lambda0, couple-lambda1, couple-intramol)
-<li><A HREF="#expanded"><b>Expanded ensemble simulation</b></A> (lmc-stats, lmc-mc-move, lmc-seed, lmc-gibbsdelta, mc-temperature, nst-transition-matrix,init-lambda-weights,initial-wl-delta,wl-scale,wl-ratio,symmetrized-transition-matrix,lmc-forced-nstart,weight-c-range,mininum-var-min,lmc-weights-equil,weight-equil-wl-delta,weight-equil-number-all-lambda,weight-equil-number-steps,weight-equil-number-samples,weight-equil-count-ratio,simulated-tempering,simulated-tempering-scaling,sim-temp-low,sim-temp-high)
+<li><A HREF="#free"><b>Free energy calculations</b></A> (free-energy, nstdhdl, dhdl-print-energy, init-lambda, delta-lambda, fep-lambdas, coul-lambdas, vdw-lambdas, bonded-lambdas, restraint-lambdas, mass-lambdas, temperature-lambdas, sc-alpha, sc-coul, sc-power, sc-r-power, sc-sigma, couple-moltype, couple-lambda0, couple-lambda1, couple-intramol)
+<li><A HREF="#expanded"><b>Expanded ensemble simulation</b></A> (lmc-stats, lmc-mc-move, lmc-seed, lmc-gibbsdelta, mc-temperature, nst-transition-matrix, init-lambda-weights, initial-wl-delta, wl-scale, wl-ratio, symmetrized-transition-matrix, lmc-forced-nstart, mininum-var-min, lmc-weights-equil, weight-equil-wl-delta, weight-equil-number-all-lambda, weight-equil-number-steps, weight-equil-number-samples, weight-equil-count-ratio, simulated-tempering, simulated-tempering-scaling, sim-temp-low, sim-temp-high)
 <li><A HREF="#neq"><b>Non-equilibrium MD</b></A> (acc-grps, accelerate, freezegrps, freezedim, cos-acceleration, deform)
 <li><A HREF="#ef"><b>Electric fields</b></A> (E-x, E-xt, E-y, E-yt, E-z, E-zt )
 <li><A HREF="#qmmm"><b>Mixed quantum/classical dynamics</b></A> (QMMM, QMMM-grps, QMMMscheme, QMmethod, QMbasis, QMcharge, Qmmult, CASorbitals, CASelectrons, SH)
@@ -542,7 +542,7 @@ For dynamics without temperature coupling or to override the buffer size,
 use <b>verlet-buffer-drift</b>=-1 and set <b>rlist</b> manually.</dd>
 
 <dt><b>rlist: (1) [nm]</b></dt>
-<dd>Cut-off distance for the short-range neighbor list, should be &ge; 0.
+<dd>Cut-off distance for the short-range neighbor list.
 With <b>cutoff-scheme</b>=<b>Verlet</b>, this is by default set by the
 <b>verlet-buffer-drift</b> option and the value of <b>rlist</b> is ignored.</dd>
 
@@ -716,8 +716,8 @@ affect the forces or the sampling.</dd>
 <dt><b>rcoulomb-switch: (0) [nm]</b></dt>
 <dd>where to start switching the Coulomb potential</dd>
 
-<dt><b>rcoulomb: (-1) [nm]</b></dt>
-<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx-->, should be &ge; 0</dd>
+<dt><b>rcoulomb: (1) [nm]</b></dt>
+<dd>distance for the Coulomb <!--Idx-->cut-off<!--EIdx--></dd>
 
 <dt><b>epsilon-r: (1)</b></dt>
 <dd>The relative <!--Idx-->dielectric constant<!--EIdx-->.
@@ -787,8 +787,8 @@ affect the forces or the sampling.</dd>
 <dt><b>rvdw-switch: (0) [nm]</b></dt>
 <dd>where to start switching the LJ potential</dd>
 
-<dt><b>rvdw: (-1) [nm]</b></dt>
-<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx-->, should be &ge; 0</dd>
+<dt><b>rvdw: (1) [nm]</b></dt>
+<dd>distance for the LJ or Buckingham <!--Idx-->cut-off<!--EIdx--></dd>
 
 <dt><b>DispCorr:</b></dt>
 <dd><dl compact></dd>
@@ -1497,6 +1497,12 @@ potentials are used for the LJ and Coulomb interactions.</dd>
 <dd>starting value for the lambda state (integer).  Specifies which column of the lambda vector should be used.</dd>
 <dt><b>delta-lambda: (0)</b></dt>
 <dd>increment per time step for lambda</dd>
+<dt><b>fep-lambdas: ()</b></dt>
+<dd>Zero, one or more lambda values for which Delta H values will
+be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
+Free energy differences between different lambda values can then
+be determined with <tt>g_bar</tt>. <b>fep-lambdas</b> is different from the other -lambdas keywords because
+all components of the lambda vector that are not specified will use <b>fep-lambdas</b>.</dd>
 <dt><b>coul-lambdas: ()</b></dt>
 <dd>Zero, one or more lambda values for which Delta H values will
 be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
@@ -1522,14 +1528,6 @@ Only the particle masses are controlled with this component of the lambda vector
 be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
 Only the temperatures are controlled with this component of the lambda vector.
 Note that these lambdas should not be used for replica exchange, only for simulated tempering.</dd>
-<dt><b>fep-lambdas: ()</b></dt>
-<dd>Zero, one or more lambda values for which Delta H values will
-be determined and written to dhdl.xvg every <b>nstdhdl</b> steps.
-Free energy differences between different lambda values can then
-be determined with <tt>g_bar</tt>. <b>fep-lambdas</b> is different from the other -lambdas keywords because
-all components of the lambda vector that are not specified will use <b>fep-lambdas</b>.</dd>
-<dt><b>dhdl-derivatives: (yes)</b></dt>
-<dd>If yes (the default), the derivatives of the Hamiltonian with respect to lambda at each <b>nstdhdl</b> step are written out. These values are needed for interpolation of linear energy differences with <tt>g_bar</tt> (although the same can also be achieved with the right <b>foreign lambda</b> setting, that may not be as flexible), or with thermodynamic integration</dd>
 <dt><b>sc-alpha: (0)</b></dt>
 <dd>the soft-core parameter, a value of 0 results in linear interpolation of
 the LJ and Coulomb interactions</dd>
@@ -1565,7 +1563,7 @@ the molecule definition in the topology.</dd>
 <dt><b>q</b></dt>
 <dd>the Van der Waals interactions are turned off at lambda=0; soft-core interactions will be required to avoid singularities
 <dt><b>none</b></dt>
-<dd>the Van der Waals interactions are turned off and the charges are zero at lambda=0; soft-core interactions will be required to avoid singularities
+<dd>the Van der Waals interactions are turned off and the charges are zero at lambda=0; soft-core interactions will be required to avoid singularities.
 </dl>
 <dt><b>couple-lambda1:</b></dt>
 <dd> analogous to <b>couple-lambda0</b>, but for lambda=1
@@ -1579,6 +1577,10 @@ the molecule definition in the topology.</dd>
 <dt><b>nstdhdl: (100)</b></dt>
 <dd>the frequency for writing dH/dlambda and possibly Delta H to dhdl.xvg,
 0 means no output, should be a multiple of <b>nstcalcenergy</b>.</dd>
+<dt><b>dhdl-derivatives: (yes)</b></dt>
+<dd>If yes (the default), the derivatives of the Hamiltonian with respect to lambda at each <b>nstdhdl</b> step are written out. These values are needed for interpolation of linear energy differences with <tt>g_bar</tt> (although the same can also be achieved with the right <b>foreign lambda</b> setting, which may not be as flexible), or with thermodynamic integration.</dd>
+<dt><b>dhdl-print-energy: (no)</b></dt>
+<dd> Include the total energy in the dhdl file.  This information is needed for later analysis if the states of interest in the free energy calculation are at different temperatures.  If all are at the same temperature, this information is not needed.</dd>
 <dt><b>separate-dhdl-file: (yes)</b></dt>
 <dd><dl compact>
 <dt><b>yes</b></dt>
@@ -1601,17 +1603,18 @@ simulations.  Must be a multiple of <b>nstcalcenergy</b>.</dd>
 <dt><b>lmc-stats:</b></dt>
 <dd><dl compact>
 <dt><b>no</b></dt>
-<dd>No Monte Carlo in state space</dd>
+<dd>No Monte Carlo in state space is performed.</dd>
 <dt><b>metropolis-transition</b></dt>
-<dd> Uses the Metropolis weights to update the expanded ensemble weight of the state.
+<dd> Uses the Metropolis weights to update the expanded ensemble weight of each state.
 Min{1,exp(-(beta_new u_new - beta_old u_old))}</dd>
 <dt><b>barker-transition</b></dt>
-<dd> Uses the Barker transition critera to update the expanded ensemble weight of the state.</dd>
+<dd> Uses the Barker transition criteria to update the expanded ensemble weight of each state i, defined by
+exp(-beta_new u_new)/[exp(-beta_new u_new)+exp(-beta_old u_old)]</dd>
 <dt><b>wang-landau</b></dt>
-<dd>Uses the Wang-Landau algorithm (in state space) to update the expanded ensemble weights.</dd>
+<dd>Uses the Wang-Landau algorithm (in state space, not energy space) to update the expanded ensemble weights.</dd>
 <dt><b>min-variance</b></dt>
-<dd>Uses the minimum variance updating method of Escobedo et al to update the expanded ensemble weights. Weights
-will not be the free energies, but will rather emphasize states that need more sampling to give even uncertainty.
+<dd>Uses the minimum variance updating method of Escobedo et al. to update the expanded ensemble weights. Weights
+will not be the free energies, but will rather emphasize states that need more sampling to give even uncertainty.</dd>
 </dl>
 <dt><b>lmc-mc-move:</b></dt>
 <dd><dl compact>
@@ -1621,7 +1624,7 @@ will not be the free energies, but will rather emphasize states that need more s
 <dd> Randomly chooses a new state up or down, then uses the Metropolis criteria to decide whether to accept or reject:
 Min{1,exp(-(beta_new u_new - beta_old u_old))}</dd>
 <dt><b>barker-transition</b></dt>
-<dd> Randomly chooses a new state up or down, then uses the Barker transition critera to decide whether to accept or reject: exp(-beta_new u_new)/[exp(-beta_new u_new)+exp(-beta_old u_old)] </dd>
+<dd> Randomly chooses a new state up or down, then uses the Barker transition critera to decide whether to accept or reject: exp(-beta_new u_new)/[exp(-beta_new u_new)+exp(-beta_old u_old)]</dd>
 <dt><b>gibbs</b></dt>
 <dd> Uses the conditional weights of the state given the coordinate (exp(-beta_i u_i) / sum_k exp(-beta_k u_k)) to
 decide which state to move to.</dd>
@@ -1629,35 +1632,69 @@ decide which state to move to.</dd>
 <dd>
 <dd> Uses the conditional weights of the state given the coordinate (exp(-beta_i u_i) / sum_k exp(-beta_k u_k)) to
 decide which state to move to, EXCLUDING the current state, then uses a rejection step to ensure detailed
-balance. Always more efficient that Gibbs, though marginally so in many situations.</dd>
+balance. Always more efficient than Gibbs, though only marginally so in many situations, such as when only the nearest neighbors have decent phase space overlap.</dd>
 </dl>
-
 <dt><b>lmc-seed:</b></dt>
-<dd> random seed to use for Monte Carlo moves in state space.  If not specified, <b>ld-seed</b> is used instead. </dd>
+<dd> random seed to use for Monte Carlo moves in state space.  If not specified, <b>ld-seed</b> is used instead.</dd>
 <dt><b>mc-temperature:</b></dt>
 <dd> Temperature used for acceptance/rejection for Monte Carlo moves. If not specified, the temperature of the
 simulation specified in the first group of <b>ref_t</b> is used.</dd>
-
-<dt><b>wl-scale: (0.8)</b></dt>
 <dt><b>wl-ratio: (0.8)</b></dt>
-<dt><b>init-wl-delta: (1.0) </b></dt>
-<dt><b>wl-oneovert: (no) </b></dt>
+<dd>The cutoff for resetting the histogram of state occupancies and scaling the free energy incrementor as delta -> delta*<b>wl-scale</b>. Define Nratio = (number of samples at each histogram bin) / (average number of samples over all bins). A <b>wl-ratio</b> of 0.8 means that the histogram is only considered flat if all Nratio &gt; 0.8 AND simultaneously all 1/Nratio &gt; 0.8.</dd>
+<dt><b>wl-scale: (0.8)</b></dt>
+<dd> Each time the histogram is considered flat, the current value of the Wang-Landau incrementor for the free energies is multiplied by <b>wl-scale</b>.  Value must be between 0 and 1.</dd>
+<dt><b>init-wl-delta: (1.0)</b></dt>
+<dd>The initial value of the Wang-Landau incrementor in kT. Some value near 1 kT is usually most efficient, though sometimes a value of 2-3 in units of kT works better if the free energy differences are large.</dd>
+<dt><b>wl-oneovert: (no)</b></dt>
+<dd>Set the Wang-Landau incrementor to scale with 1/(simulation time) in the large-sample limit. There is significant evidence that the standard Wang-Landau algorithms in state space presented here result in free energies getting 'burned in' to incorrect values that depend on the initial state. When <b>wl-oneovert</b> is true, once the incrementor becomes less than 1/N, where N is the number of samples collected (and thus proportional to the data collection time, hence '1 over t'), the Wang-Landau incrementor is set to 1/N, decreasing every step.  Once this occurs, <b>wl-ratio</b> is ignored, but the weights will still stop updating when the equilibration criterion set in <b>lmc-weights-equil</b> is reached.</dd>
 <dt><b>lmc-repeats: (1)</b></dt>
-<dt><b>lmc-gibbsdelta: (-1) </b></dt>
-<dt><b>lmc-forced-nstart: (0) </b></dt>
+<dd>Controls the number of times that each Monte Carlo swap type is performed each iteration. In the limit of large numbers of Monte Carlo repeats, all methods converge to Gibbs sampling.  The value will generally not need to be different from 1.</dd>
+<dt><b>lmc-gibbsdelta: (-1)</b></dt>
+<dd> Limit Gibbs sampling to selected numbers of neighboring states. It is sometimes inefficient to perform Gibbs sampling over all of the states that are defined.  A positive value of <b>lmc-gibbsdelta</b> means that only states plus or minus <b>lmc-gibbsdelta</b> are considered in exchanges up and down. A value of -1 means that all states are considered.  For fewer than 100 states, it is probably not that expensive to include all states.</dd>
+<dt><b>lmc-forced-nstart: (0)</b></dt>
+<dd> Force initial state space sampling to generate weights. In order to come up with reasonable initial weights, this setting allows the simulation to drive from the initial to the final lambda state, with <b>lmc-forced-nstart</b> steps at each state before moving on to the next lambda state. If <b>lmc-forced-nstart</b> is sufficiently long (thousands of steps, perhaps), then the weights will be close to correct.  However, in most cases, it is probably better to simply run the standard weight equilibration algorithms.</dd>
 <dt><b>nst-transition-matrix: (-1)</b></dt>
 <dd>Frequency of outputting the expanded ensemble transition matrix.  A negative number means it will only be printed at the end of the simulation.</dd>
 <dt><b>symmetrized-transition-matrix: (no) </b></dt>
-<dd>Whether to symmetrize the empirical transition matrix</dd>
-<dt><b>mininum-var-min</b></dt>
-<dt><b>weight-c-range</b></dt>
-
+<dd>Whether to symmetrize the empirical transition matrix. In the limit of infinite sampling the matrix will be symmetric, but for short timescales it will deviate from symmetry because of statistical noise.  Forced symmetrization, by using the matrix T_sym = 1/2 (T + transpose(T)), removes problems like the existence of (small magnitude) negative eigenvalues.</dd>
+<dt><b>mininum-var-min: (100)</b></dt>
+<dd> The <b>min-variance</b> strategy (an option of <b>lmc-stats</b>) is only valid for larger numbers of samples, and can get stuck if too few samples are used at each state.  <b>mininum-var-min</b> is the minimum number of samples required at each state before the <b>min-variance</b> strategy is activated, if selected.</dd>
+<dt><b>init-lambda-weights: </b></dt>
+<dd>The initial weights (free energies) used for the expanded ensemble states.  Default is a vector of zero weights. The format is similar to the lambda vector settings in <b>fep-lambdas</b>, except the weights can be any floating point number.  Units are kT. Its length must match the lambda vector length.</dd>
+<dt><b>lmc-weights-equil: (no)</b></dt>
+<dd><dl compact>
+<dt><b>no</b></dt>
+<dd>Expanded ensemble weights continue to be updated throughout the simulation.</dd>
+<dt><b>yes</b></dt>
+<dd>The input expanded ensemble weights are treated as equilibrated, and are not updated throughout the simulation.</dd>
+<dt><b>wl-delta</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the Wang-Landau incrementor falls below the value specified by <b>weight-equil-wl-delta</b>.</dd>
+<dt><b>number-all-lambda</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the number of samples at all of the lambda states is greater than the value specified by <b>weight-equil-number-all-lambda</b>.</dd>
+<dt><b>number-steps</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the number of steps is greater than the level specified by <b>weight-equil-number-steps</b>.</dd>
+<dt><b>number-samples</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the number of total samples across all lambda states is greater than the level specified by <b>weight-equil-number-samples</b>.</dd>
+<dt><b>count-ratio</b></dt>
+<dd>Expanded ensemble weight updating is stopped when the ratio of samples at the least sampled lambda state and most sampled lambda state is greater than the value specified by <b>weight-equil-count-ratio</b>.</dd>
+</dl>
 <dt><b>simulated-tempering: (no)</b></dt>
-<dt><b>simulated-tempering-scaling: ()</b></dt>
-<dt><b>sim-temp-low: (300):</b></dt>
-<dd>Low temperature for simulated tempering</dd>
-<dt><b>sim-temp-high: (300):</b></dt>
-<dd>High temperature for simulated tempering</dd>
+<dd>Turn simulated tempering on or off. Simulated tempering is implemented as expanded ensemble sampling with different temperatures instead of different Hamiltonians.</dd>
+<dt><b>sim-temp-low: (300)</b></dt>
+<dd>Low temperature for simulated tempering.</dd>
+<dt><b>sim-temp-high: (300)</b></dt>
+<dd>High temperature for simulated tempering.</dd>
+<dt><b>simulated-tempering-scaling: (linear)</b></dt>
+<dd>Controls the way that the temperatures at intermediate lambdas are calculated from the <b>temperature-lambdas</b> part of the lambda vector.</dd>
+<dd><dl compact>
+<dt><b>linear</b></dt>
+<dd>Linearly interpolates the temperatures using the values of <b>temperature-lambdas</b>, i.e. if <b>sim-temp-low</b>=300 and <b>sim-temp-high</b>=400, then lambda=0.5 corresponds to a temperature of 350. A nonlinear set of temperatures can always be implemented with uneven spacing in lambda.</dd>
+<dt><b>geometric</b></dt>
+<dd> Interpolates temperatures geometrically between <b>sim-temp-low</b> and <b>sim-temp-high</b>. The ith state has temperature <b>sim-temp-low</b> * (<b>sim-temp-high</b>/<b>sim-temp-low</b>)^(i/(ntemps-1)).  Should give roughly equal exchange for constant heat capacity, though of course simulations that involve protein folding have very high heat capacity peaks.</dd>
+<dt><b>exponential</b></dt>
+<dd> Interpolates temperatures exponentially between <b>sim-temp-low</b> and <b>sim-temp-high</b>. The ith state has temperature
+<b>sim-temp-low</b> + (<b>sim-temp-high</b>-<b>sim-temp-low</b>)*((exp(<b>temperature-lambdas</b>[i])-1)/(exp(1.0)-1)).</dd>
+</dl>
 </dl>
 
 <A NAME="neq"><br>
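
Restating the acceptance rules and the geometric temperature ladder documented in the expanded-ensemble options above, in LaTeX notation (u is the reduced potential of the indicated state, N = ntemps):

    P_{\mathrm{Metropolis}} = \min\left\{1,\, e^{-(\beta_{\mathrm{new}} u_{\mathrm{new}} - \beta_{\mathrm{old}} u_{\mathrm{old}})}\right\}
    P_{\mathrm{Barker}} = \frac{e^{-\beta_{\mathrm{new}} u_{\mathrm{new}}}}{e^{-\beta_{\mathrm{new}} u_{\mathrm{new}}} + e^{-\beta_{\mathrm{old}} u_{\mathrm{old}}}}
    T_i^{\mathrm{geometric}} = T_{\mathrm{low}} \left(T_{\mathrm{high}}/T_{\mathrm{low}}\right)^{i/(N-1)}
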
@@ -2013,12 +2050,22 @@ reals to your subroutine. Check the inputrec definition in
 <A HREF="#vel">gen-vel</A><br>
 <A HREF="#pp">include</A><br>
 <A HREF="#free">init-lambda</A><br>
+<A HREF="#expanded">init-lambda-weights</A><br>
 <A HREF="#run">init-step</A><br>
+<A HREF="#expanded">initial-wl-delta</A><br>
 <A HREF="#run">integrator</A><br>
 <A HREF="#ld">ld-seed</A><br>
 <A HREF="#bond2">lincs-iter</A><br>
 <A HREF="#bond2">lincs-order</A><br>
 <A HREF="#bond2">lincs-warnangle</A><br>
+<A HREF="#expanded">lmc-forced-nstart</A><br>
+<A HREF="#expanded">lmc-gibbsdelta</A><br>
+<A HREF="#expanded">lmc-mc-move</A><br>
+<A HREF="#expanded">lmc-seed</A><br>
+<A HREF="#expanded">lmc-stats</A><br>
+<A HREF="#expanded">lmc-weights-equil</A><br>
+<A HREF="#expanded">mc-temperature</A><br>
+<A HREF="#expanded">mininum-var-min</A><br>
 <A HREF="#bond2">morse</A><br>
 <A HREF="#em">nbfgscorr</A><br>
 <A HREF="#xmdrun">niter</A><br>
@@ -2037,6 +2084,7 @@ reals to your subroutine. Check the inputrec definition in
 <A HREF="#out">nstvout</A><br>
 <A HREF="#out">nstxout</A><br>
 <A HREF="#out">nstxtcout</A><br>
+<A HREF="#expanded">nst-transition-matrix</A><br>
 <A HREF="#nl">ns-type</A><br>
 <A HREF="#wall">nwall</A><br>
 <A HREF="#ewald">optimize-fft</A><br>
@@ -2065,6 +2113,11 @@ reals to your subroutine. Check the inputrec definition in
 <A HREF="#free">sc-power</A><br>
 <A HREF="#free">sc-sigma</A><br>
 <A HREF="#bond2">shake-tol</A><br>
+<A HREF="#expanded">sim-temp-low</A><br>
+<A HREF="#expanded">sim-temp-high</A><br>
+<A HREF="#expanded">simulated-tempering</A><br>
+<A HREF="#expanded">simulated-tempering-scaling</A><br>
+<A HREF="#expanded">symmetrized-transition-matrix</A><br>
 <A HREF="#table">table-extension</A><br>
 <A HREF="#pc">tau-p</A><br>
 <A HREF="#tc">tau-t</A><br>
@@ -2093,6 +2146,13 @@ reals to your subroutine. Check the inputrec definition in
 <A HREF="#walls">wall-ewald-zfac</A><br>
 <A HREF="#walls">wall-r-linpot</A><br>
 <A HREF="#walls">wall-type</A><br>
+<A HREF="#expanded">weight-equil-count-ratio</A><br>
+<A HREF="#expanded">weight-equil-number-all-lambda</A><br>
+<A HREF="#expanded">weight-equil-number-samples</A><br>
+<A HREF="#expanded">weight-equil-number-steps</A><br>
+<A HREF="#expanded">weight-equil-wl-delta</A><br>
+<A HREF="#expanded">wl-ratio</A><br>
+<A HREF="#expanded">wl-scale</A><br>
 </multicol>
 
 <hr>
index 4a2a7d9817c713c771d88f412f36ca44b976bcf2..443f1f9f2453562a17e71f1aa2cb942adec29247 100644 (file)
@@ -22,7 +22,6 @@ g_coord
 g_covar
 g_density
 g_dielectric
-g_dih
 g_dipoles
 g_disre
 g_dist
index f4dc1148b4bb4c2e0b6d53ad1c30f276879ecb7f..2a5577e8a2d3c60a9fc945af9b33f86577585175 100644 (file)
@@ -54,9 +54,6 @@
  */
 #cmakedefine GMX_FLOAT_FORMAT_IEEE754
 
-/* Use assembly intrinsics kernels for BlueGene */
-#cmakedefine GMX_BLUEGENE
-
 /* Work around broken calloc() */
 #cmakedefine GMX_BROKEN_CALLOC
 
@@ -93,6 +90,9 @@
 /* AVX 256-bit instructions available */
 #cmakedefine GMX_X86_AVX_256
 
+/* GCC bug in AVX maskload/maskstore arguments - worked around internally */
+#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG
+
 /* SSE2 was selected as CPU acceleration level */
 #cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
 
 /* AVX 256-bit was selected as CPU acceleration level */
 #cmakedefine GMX_CPU_ACCELERATION_X86_AVX_256
 
+/* IBM QPX was selected as CPU acceleration type (e.g. BlueGene/Q) */
+#cmakedefine GMX_CPU_ACCELERATION_IBM_QPX
+
 /* String for CPU acceleration choice (for writing to log files and stdout) */
 #define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
 
 /* Use the GROMACS software 1/sqrt(x) */
 #cmakedefine GMX_SOFTWARE_INVSQRT
 
-/* Use the PowerPC hardware 1/sqrt(x) */
-#cmakedefine GMX_POWERPC_INVSQRT
-
 /* Use sub-counters */
 #cmakedefine GMX_CYCLE_SUBCOUNTERS
 
 /* Build special-purpose mdrun library */
 #cmakedefine GMX_FAHCORE   
 
+/* Enable gromacs quotes */
+#cmakedefine GMX_COOL_QUOTES
+
 #ifdef GMX_FAHCORE
 #define FULLINDIRECT 1
 #define USE_FAH_XDR  1
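
These #cmakedefine entries only take effect once CMake generates the real header; a minimal sketch of that step (output path assumed to mirror the source layout):

    configure_file(${CMAKE_SOURCE_DIR}/src/config.h.cmakein
                   ${CMAKE_BINARY_DIR}/src/config.h)
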
index edf009283fce31d616cc92188a9ddb933cf5157d..425cc60d89b7b24c083de6c8dded03726c38eeee 100644 (file)
@@ -55,10 +55,6 @@ if(GMX_USE_GCC44_BUG_WORKAROUND)
    gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
 endif()
 
-if(GMX_GPU)
-    include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif()
-
 add_library(libgromacs ${LIBGROMACS_SOURCES})
 if (GMX_GIT_VERSION_INFO)
     add_dependencies(libgromacs gmx_version)
index a2bc622e11992ae3712cbae583846bf27b585b30..ffcbdce9e235fe009d0166740754c734811f42a3 100644 (file)
@@ -158,8 +158,6 @@ static void add_prop(aprop_t *ap,gmx_residuetype_t restype,
        ap->bAvail[i] = FALSE;
       }
     }
-    upstring(atomnm);
-    upstring(resnm);
     ap->atomnm[ap->nprop] = strdup(atomnm);
     ap->resnm[ap->nprop]  = strdup(resnm);
     j = ap->nprop;
@@ -319,9 +317,7 @@ gmx_bool gmx_atomprop_query(gmx_atomprop_t aps,
   else { 
     strncpy(atomname,atomnm,MAXQ-1);
   }
-  upstring(atomname);
   strncpy(resname,resnm,MAXQ-1);
-  upstring(resname);
   
   j = get_prop_index(&(ap->prop[eprop]),ap->restype,resname,
                     atomname,&bExact);
index 5ae98d0047faf2763f632fe0d5134eccc59aa19a..704310a81adf5ecc2e74f40176c2735aa23af84f 100644 (file)
@@ -86,7 +86,7 @@ gmx_ctime_r(const time_t *clock,char *buf, int n);
  * But old code can not read a new entry that is present in the file
  * (but can read a new format when new entries are not present).
  */
-static const int cpt_version = 14;
+static const int cpt_version = 15;
 
 
 const char *est_names[estNR]=
@@ -299,6 +299,39 @@ static void do_cpt_double_err(XDR *xd,const char *desc,double *f,FILE *list)
     }
 }
 
+static void do_cpt_real_err(XDR *xd,const char *desc,real *f)
+{
+    bool_t res=0;
+
+#ifdef GMX_DOUBLE
+    res = xdr_double(xd,f);
+#else
+    res = xdr_float(xd,f);
+#endif
+    if (res == 0)
+    {
+        cp_error();
+    }
+}
+
+static void do_cpt_n_rvecs_err(XDR *xd,const char *desc,int n, rvec f[],FILE *list)
+{
+    int i,j;
+
+    for (i=0; i<n; i++)
+    {
+        for (j=0; j<DIM; j++)
+        {
+            do_cpt_real_err(xd, desc, &f[i][j]);
+        }
+    }
+
+    if (list)
+    {
+        pr_rvecs(list,0,desc,f,n);
+    }
+}
+
 /* If nval >= 0, nval is used; on read this should match the passed value.
  * If nval < 0, *nptr is used; on read the value is stored in nptr
  */
@@ -754,6 +787,7 @@ static void do_cpt_header(XDR *xd,gmx_bool bRead,int *file_version,
                           int *natoms,int *ngtc, int *nnhpres, int *nhchainlength,
                           int *nlambda, int *flags_state,
                           int *flags_eks,int *flags_enh, int *flags_dfh,
+                          int *nED,
                           FILE *list)
 {
     bool_t res=0;
@@ -892,6 +926,15 @@ static void do_cpt_header(XDR *xd,gmx_bool bRead,int *file_version,
     } else {
         *flags_dfh = 0;
     }
+
+    if (*file_version >= 15)
+    {
+        do_cpt_int_err(xd,"ED data sets",nED,list);
+    }
+    else
+    {
+        *nED = 0;
+    }
 }
 
 static int do_cpt_footer(XDR *xd,gmx_bool bRead,int file_version)
@@ -1162,6 +1205,71 @@ static int do_cpt_df_hist(XDR *xd,gmx_bool bRead,int fflags,df_history_t *dfhist
     return ret;
 }
 
+
+/* This function stores the last whole configuration of the reference and
+ * average structure in the .cpt file
+ */
+static int do_cpt_EDstate(XDR *xd,gmx_bool bRead,
+        edsamstate_t *EDstate, FILE *list)
+{
+    int i,j;
+    int ret=0;
+    char buf[STRLEN];
+
+
+    EDstate->bFromCpt = bRead;
+
+    if (EDstate->nED <= 0)
+    {
+        return ret;
+    }
+
+    /* When reading, init_edsam has not been called yet,
+     * so we have to allocate memory first. */
+    if (bRead)
+    {
+        snew(EDstate->nref    , EDstate->nED);
+        snew(EDstate->old_sref, EDstate->nED);
+        snew(EDstate->nav     , EDstate->nED);
+        snew(EDstate->old_sav , EDstate->nED);
+    }
+
+    /* Read/write the last whole conformation of SREF and SAV for each ED dataset (usually only one) */
+    for (i=0; i< EDstate->nED; i++)
+    {
+        /* Reference structure SREF */
+        sprintf(buf, "ED%d # of atoms in reference structure", i+1);
+        do_cpt_int_err(xd, buf, &EDstate->nref[i],list);
+        sprintf(buf, "ED%d x_ref", i+1);
+        if (bRead)
+        {
+            snew(EDstate->old_sref[i], EDstate->nref[i]);
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref[i], list);
+        }
+        else
+        {
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nref[i], EDstate->old_sref_p[i], list);
+        }
+
+        /* Average structure SAV */
+        sprintf(buf, "ED%d # of atoms in average structure", i+1);
+        do_cpt_int_err(xd, buf, &EDstate->nav[i] ,list);
+        sprintf(buf, "ED%d x_av", i+1);
+        if (bRead)
+        {
+            snew(EDstate->old_sav[i], EDstate->nav[i]);
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav[i], list);
+        }
+        else
+        {
+            do_cpt_n_rvecs_err(xd, buf, EDstate->nav[i], EDstate->old_sav_p[i], list);
+        }
+    }
+
+    return ret;
+}
+
+
 static int do_cpt_files(XDR *xd, gmx_bool bRead, 
                         gmx_file_position_t **p_outputfiles, int *nfiles, 
                         FILE *list, int file_version)
@@ -1401,6 +1509,7 @@ void write_checkpoint(const char *fn,gmx_bool bNumberAndKeep,
                   DOMAINDECOMP(cr) ? cr->dd->nc : NULL,&npmenodes,
                   &state->natoms,&state->ngtc,&state->nnhpres,
                   &state->nhchainlength,&(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+                  &state->edsamstate.nED,
                   NULL);
     
     sfree(version);
@@ -1413,6 +1522,7 @@ void write_checkpoint(const char *fn,gmx_bool bNumberAndKeep,
        (do_cpt_ekinstate(gmx_fio_getxdr(fp),FALSE,flags_eks,&state->ekinstate,NULL) < 0)||
        (do_cpt_enerhist(gmx_fio_getxdr(fp),FALSE,flags_enh,&state->enerhist,NULL) < 0)  ||
        (do_cpt_df_hist(gmx_fio_getxdr(fp),FALSE,flags_dfh,&state->dfhist,NULL) < 0)  ||
+       (do_cpt_EDstate(gmx_fio_getxdr(fp),FALSE,&state->edsamstate,NULL) < 0)      ||
        (do_cpt_files(gmx_fio_getxdr(fp),FALSE,&outputfiles,&noutputfiles,NULL,
                      file_version) < 0))
     {
@@ -1656,7 +1766,8 @@ static void read_checkpoint(const char *fn,FILE **pfplog,
                   &eIntegrator_f,simulation_part,step,t,
                   &nppnodes_f,dd_nc_f,&npmenodes_f,
                   &natoms,&ngtc,&nnhpres,&nhchainlength,&nlambda,
-                  &fflags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+                  &fflags,&flags_eks,&flags_enh,&flags_dfh,
+                  &state->edsamstate.nED,NULL);
 
     if (bAppendOutputFiles &&
         file_version >= 13 && double_prec != GMX_CPT_BUILD_DP)
@@ -1845,6 +1956,12 @@ static void read_checkpoint(const char *fn,FILE **pfplog,
         cp_error();
     }
 
+    ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+    if (ret)
+    {
+        cp_error();
+    }
+
     if (file_version < 6)
     {
         const char *warn="Reading checkpoint file in old format, assuming that the run that generated this file started at step 0, if this is not the case the averages stored in the energy file will be incorrect.";
@@ -2081,7 +2198,8 @@ static void read_checkpoint_data(t_fileio *fp,int *simulation_part,
                   &version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
                   &eIntegrator,simulation_part,step,t,&nppnodes,dd_nc,&npme,
                   &state->natoms,&state->ngtc,&state->nnhpres,&state->nhchainlength,
-                  &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,NULL);
+                  &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
+                  &state->edsamstate.nED,NULL);
     ret =
         do_cpt_state(gmx_fio_getxdr(fp),TRUE,state->flags,state,bReadRNG,NULL);
     if (ret)
@@ -2107,6 +2225,12 @@ static void read_checkpoint_data(t_fileio *fp,int *simulation_part,
         cp_error();
     }
 
+    ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state->edsamstate,NULL);
+    if (ret)
+    {
+        cp_error();
+    }
+
     ret = do_cpt_files(gmx_fio_getxdr(fp),TRUE,
                        outputfiles != NULL ? outputfiles : &files_loc,
                        outputfiles != NULL ? nfiles : &nfiles_loc,
@@ -2217,7 +2341,7 @@ void list_checkpoint(const char *fn,FILE *out)
                   &eIntegrator,&simulation_part,&step,&t,&nppnodes,dd_nc,&npme,
                   &state.natoms,&state.ngtc,&state.nnhpres,&state.nhchainlength,
                   &(state.dfhist.nlambda),&state.flags,
-                  &flags_eks,&flags_enh,&flags_dfh,out);
+                  &flags_eks,&flags_enh,&flags_dfh,&state.edsamstate.nED,out);
     ret = do_cpt_state(gmx_fio_getxdr(fp),TRUE,state.flags,&state,TRUE,out);
     if (ret)
     {
@@ -2238,6 +2362,12 @@ void list_checkpoint(const char *fn,FILE *out)
         ret = do_cpt_df_hist(gmx_fio_getxdr(fp),TRUE,
                              flags_dfh,&state.dfhist,out);
     }
+
+    if (ret == 0)
+    {
+        ret = do_cpt_EDstate(gmx_fio_getxdr(fp),TRUE,&state.edsamstate,out);
+    }
+
     if (ret == 0)
     {
                do_cpt_files(gmx_fio_getxdr(fp),TRUE,&outputfiles,&nfiles,out,file_version);
index 20b96fe8ac18b17671c89470691fde189562d0e7..734a11164a233335b5c527ed8e457e0fa673a3bc 100644 (file)
 #ifdef HAVE_LIBMKL
 #include <mkl.h>
 #endif
-#ifdef GMX_GPU
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#endif
 #ifdef GMX_FFT_FFTW3
 #include <fftw3.h>
 #endif
@@ -123,11 +119,11 @@ gmx_bool be_cool(void)
   * but we don't call this routine often, and it avoids using
    * a mutex for locking the variable...
    */
-#ifdef GMX_FAHCORE
+#ifdef GMX_COOL_QUOTES
+  return (getenv("GMX_NO_QUOTES") == NULL);
+#else
   /*be uncool*/
   return FALSE;
-#else
-  return (getenv("GMX_NO_QUOTES") == NULL);
 #endif
 }
 
@@ -649,12 +645,10 @@ const char *GromacsVersion()
   return _gmx_ver_string;
 }
 
+void gmx_print_version_info_gpu(FILE *fp);
+
 void gmx_print_version_info(FILE *fp)
 {
-#ifdef GMX_GPU
-    int cuda_driver,cuda_runtime;
-#endif
-
     fprintf(fp, "Gromacs version:    %s\n", _gmx_ver_string);
 #ifdef GMX_GIT_VERSION_INFO
     fprintf(fp, "GIT SHA1 hash:      %s\n", _gmx_full_git_hash);
@@ -673,6 +667,7 @@ void gmx_print_version_info(FILE *fp)
 #else
     fprintf(fp, "Precision:          single\n");
 #endif
+    fprintf(fp, "Memory model:       %lu bit\n",8*sizeof(void *));
 
 #ifdef GMX_THREAD_MPI
     fprintf(fp, "MPI library:        thread_mpi\n");
@@ -747,13 +742,7 @@ void gmx_print_version_info(FILE *fp)
             __INTEL_MKL__,__INTEL_MKL_MINOR__,__INTEL_MKL_UPDATE__);
 #endif
 #ifdef GMX_GPU
-    fprintf(fp, "CUDA compiler:      %s\n",CUDA_NVCC_COMPILER_INFO);
-    cuda_driver = 0;
-    cudaDriverGetVersion(&cuda_driver);
-    cuda_runtime = 0;
-    cudaRuntimeGetVersion(&cuda_runtime);
-    fprintf(fp, "CUDA driver:        %d.%d\n",cuda_driver/1000, cuda_driver%100);
-    fprintf(fp, "CUDA runtime:       %d.%d\n",cuda_runtime/1000, cuda_runtime%100);
+    gmx_print_version_info_gpu(fp);
 #endif
 
 }
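
Moving the CUDA calls into a .cu file keeps <cuda.h> out of the plain-C build; the C side only sees the forward declaration above. A minimal sketch of the pattern, with hypothetical file and function names:

    /* info.c -- plain C, compiled without any CUDA headers */
    #include <stdio.h>
    void print_gpu_info(FILE *fp);   /* defined in info_gpu.cu */

    /* info_gpu.cu -- compiled by nvcc; extern "C" disables C++ name
     * mangling so the C translation unit above can link against it */
    extern "C" void print_gpu_info(FILE *fp)
    {
        fprintf(fp, "GPU details would be printed here\n");
    }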
diff --git a/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
new file mode 100644 (file)
index 0000000..001ca53
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+#include "buildinfo.h"
+
+extern "C" void gmx_print_version_info_gpu(FILE *fp) 
+{
+    int cuda_driver,cuda_runtime;
+    fprintf(fp, "CUDA compiler:      %s\n",CUDA_NVCC_COMPILER_INFO);
+    cuda_driver = 0;
+    cudaDriverGetVersion(&cuda_driver);
+    cuda_runtime = 0;
+    cudaRuntimeGetVersion(&cuda_runtime);
+    fprintf(fp, "CUDA driver:        %d.%d\n",cuda_driver/1000, cuda_driver%100);
+    fprintf(fp, "CUDA runtime:       %d.%d\n",cuda_runtime/1000, cuda_runtime%100);
+}
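
CUDA reports driver and runtime versions as a single integer; dividing by 1000 yields the major version and the remainder modulo 100 the minor part, which is what the fprintf calls above do. A stand-alone check of that decoding (hypothetical value, no CUDA needed):

    #include <stdio.h>

    int main(void)
    {
        int encoded = 5000;   /* what cudaRuntimeGetVersion() reports for CUDA 5.0 */
        printf("CUDA runtime:       %d.%d\n", encoded/1000, encoded%100);   /* 5.0 */
        return 0;
    }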
index d42f37e11c273556f1423cdc50b5ec6437208b5d..474167a57ec3418347d3e16ae32d2fcb82065bcb 100644 (file)
@@ -194,7 +194,6 @@ static const t_deffile
     { eftASC, ".m2p", "ps",     NULL, "Input file for mat2ps"},
     { eftXDR, ".mtx", "hessian","-m", "Hessian matrix"},
     { eftASC, ".edi", "sam",    NULL, "ED sampling input"},
-    { eftASC, ".edo", "sam",    NULL, "ED sampling output"},
     { eftASC, ".hat", "gk", NULL, "Fourier transform of spread function" },
     { eftASC, ".cub", "pot",  NULL, "Gaussian cube file" },
     { eftASC, ".xpm", "root", NULL, "X PixMap compatible matrix file" },
index 71bf75ac7e6ba34671cdc5d131c79961390021c4..dfe5c298e5192f93b39981739aef1e43e6b04783 100644 (file)
 #ifdef _MSC_VER
 /* MSVC definition for __cpuid() */
 #include <intrin.h>
+/* sysinfo functions */
+#include <windows.h>
 #endif
 #ifdef HAVE_UNISTD_H
 /* sysconf() definition */
 #include <unistd.h>
 #endif
 
+#include "gmx_cpuid.h"
 
 
 
-#include "gmx_cpuid.h"
-
+/* For convenience, and to enable configure-time invocation, we keep all architectures
+ * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
+ */
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+#    define GMX_CPUID_X86
+#endif
 
 /* Global constant character strings corresponding to our enumerated types */
 const char *
@@ -121,6 +128,20 @@ struct gmx_cpuid
     int                        stepping;
     /* Not using gmx_bool here, since this file must be possible to compile without simple.h */
     char                       feature[GMX_CPUID_NFEATURES];
+    
+    /* Basic CPU topology information. For x86 this is a bit complicated since the topology differs between
+     * operating systems and sometimes even between settings. For most other architectures you can likely just check
+     * the documentation and then write static information to these arrays rather than detecting on-the-fly.
+     */
+    int                        have_cpu_topology;
+    int                        nproc;               /* total number of logical processors from OS */
+    int                        npackages;
+    int                        ncores_per_package;
+    int                        nhwthreads_per_core;
+    int *                      package_id;
+    int *                      core_id;             /* Local core id in each package */
+    int *                      hwthread_id;         /* Local hwthread id in each core */
+    int *                      locality_order;      /* Processor indices sorted in locality order */
 };
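
These are parallel arrays indexed by logical processor as enumerated by the OS. A worked example for a hypothetical single-package, dual-core machine with SMT enabled:

    /* proc i:            0  1  2  3
     * package_id[i]:     0  0  0  0
     * core_id[i]:        0  0  1  1
     * hwthread_id[i]:    0  1  0  1
     *
     * npackages == 1, ncores_per_package == 2, nhwthreads_per_core == 2,
     * and locality_order lists OS processor indices package-major, then
     * core-major, then hwthread-major: {0, 1, 2, 3} in this layout. */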
 
 
@@ -195,10 +216,7 @@ compiled_acc = GMX_CPUID_ACCELERATION_NONE;
 #endif
 
 
-/* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
- * if the compiler handles GNU-style inline assembly.
- */
-#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+#ifdef GMX_CPUID_X86
 
 /* Execute CPUID on x86 class CPUs. level sets function to exec, and the
  * contents of register output is returned. See Intel/AMD docs for details.
@@ -217,6 +235,10 @@ execute_x86cpuid(unsigned int   level,
 {
     int rc = 0;
 
+    /* Currently CPUID is only supported (1) if we can use the __cpuid intrinsic on MSVC, or (2)
+     * if the compiler handles GNU-style inline assembly.
+     */
+
 #if (defined _MSC_VER)
     int CPUInfo[4];
 
@@ -269,7 +291,6 @@ execute_x86cpuid(unsigned int   level,
 #endif
     return rc;
 }
-#endif /* architecture is x86 */
 
 
 /* Identify CPU features common to Intel & AMD - mainly brand string,
@@ -371,17 +392,107 @@ cpuid_check_common_x86(gmx_cpuid_t                cpuid)
         execute_x86cpuid(0x80000007,0,&eax,&ebx,&ecx,&edx);
         cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC]  = (edx & (1 << 8))  != 0;
     }
-
     return 0;
 }
 
+/* This routine returns the number of unique elements found in the array,
+ * and renumbers these starting from 0. For example, the array {0,1,2,8,9,10,8,9,10,0,1,2}
+ * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the
+ * number of unique elements.
+ */
+static int
+cpuid_renumber_elements(int *data, int n)
+{
+    int *unique;
+    int  i,j,nunique,found;
+
+    unique = malloc(sizeof(int)*n);
+    
+    nunique=0;
+    for(i=0;i<n;i++)
+    {
+        for(j=0,found=0;j<nunique && !found;j++)
+        {
+            found = (data[i]==unique[j]);
+        }
+        if(!found)
+        {
+            /* Insert in sorted order! */
+            for(j=nunique++;j>0 && unique[j-1]>data[i];j--)
+            {
+                unique[j]=unique[j-1];
+            }
+            unique[j]=data[i];
+        }
+    }
+    /* renumber */
+    for(i=0;i<n;i++)
+    {
+        for(j=0;j<nunique;j++)
+        {
+            if(data[i]==unique[j])
+            {
+                data[i]=j;
+            }
+        }
+    }
+    return nunique;
+}
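
A quick stand-alone check of the example from the comment above (hypothetical harness; since the function is static, this assumes it is compiled in the same translation unit):

    #include <stdio.h>

    int main(void)
    {
        int data[12] = {0,1,2,8,9,10,8,9,10,0,1,2};
        int n = cpuid_renumber_elements(data, 12);
        int i;
        printf("nunique = %d, data =", n);        /* nunique = 6 */
        for (i = 0; i < 12; i++)
        {
            printf(" %d", data[i]);               /* 0 1 2 3 4 5 3 4 5 0 1 2 */
        }
        printf("\n");
        return 0;
    }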
+
+/* APIC IDs, or everything you wanted to know about your x86 cores but were afraid to ask...
+ *
+ * Raw APIC IDs are unfortunately somewhat dirty. For technical reasons they are assigned
+ * in power-of-2 chunks, and even then there are no guarantees about specific numbers - all
+ * we know is that the part for each thread/core/package is unique, and how many bits are
+ * reserved for that part. 
+ * This routine does internal renumbering so we get contiguous indices, and it also
+ * decodes the actual numbers of packages, cores-per-package, and hwthreads-per-core.
+ */
+static void
+cpuid_x86_decode_apic_id(gmx_cpuid_t cpuid,int *apic_id,int core_bits,int hwthread_bits)
+{
+    int i,idx;
+    int hwthread_mask,core_mask_after_shift;
+    
+    cpuid->hwthread_id     = malloc(sizeof(int)*cpuid->nproc);
+    cpuid->core_id         = malloc(sizeof(int)*cpuid->nproc);
+    cpuid->package_id      = malloc(sizeof(int)*cpuid->nproc);
+    cpuid->locality_order  = malloc(sizeof(int)*cpuid->nproc);
+
+    hwthread_mask         = (1 << hwthread_bits) - 1;
+    core_mask_after_shift = (1 << core_bits) - 1;
+    
+    for(i=0;i<cpuid->nproc;i++)
+    {
+        cpuid->hwthread_id[i] = apic_id[i] & hwthread_mask;
+        cpuid->core_id[i]     = (apic_id[i] >> hwthread_bits) & core_mask_after_shift;
+        cpuid->package_id[i]  = apic_id[i] >> (core_bits + hwthread_bits);
+    }
+    
+    cpuid->npackages            = cpuid_renumber_elements(cpuid->package_id,cpuid->nproc);
+    cpuid->ncores_per_package   = cpuid_renumber_elements(cpuid->core_id,cpuid->nproc);
+    cpuid->nhwthreads_per_core  = cpuid_renumber_elements(cpuid->hwthread_id,cpuid->nproc);
+    
+    /* Create a locality order array, i.e. first all resources in package0, which in turn
+     * are sorted so we first have all resources in core0, where threads are sorted in order, etc.
+     */
+    for(i=0;i<cpuid->nproc;i++)
+    {
+        idx = (cpuid->package_id[i]*cpuid->ncores_per_package + cpuid->core_id[i])*cpuid->nhwthreads_per_core + cpuid->hwthread_id[i];
+        cpuid->locality_order[idx]=i;
+    }
+}
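
A worked example of the bit-field split above (field widths hypothetical): with hwthread_bits = 1 and core_bits = 2, APIC id 11 (binary 1011) decodes to hwthread 1, core 1, package 1:

    #include <stdio.h>

    int main(void)
    {
        int apic_id = 11, hwthread_bits = 1, core_bits = 2;
        int hwthread = apic_id & ((1 << hwthread_bits) - 1);
        int core     = (apic_id >> hwthread_bits) & ((1 << core_bits) - 1);
        int package  = apic_id >> (core_bits + hwthread_bits);
        printf("package %d, core %d, hwthread %d\n", package, core, hwthread);
        return 0;
    }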
+
+
 /* Detection of AMD-specific CPU features */
 static int
 cpuid_check_amd_x86(gmx_cpuid_t                cpuid)
 {
     int                       max_stdfn,max_extfn;
     unsigned int              eax,ebx,ecx,edx;
-
+    int                       hwthread_bits,core_bits;
+    int *                     apic_id;
+    
     cpuid_check_common_x86(cpuid);
 
     execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
@@ -399,7 +510,66 @@ cpuid_check_amd_x86(gmx_cpuid_t                cpuid)
         cpuid->feature[GMX_CPUID_FEATURE_X86_XOP]         = (ecx & (1 << 11)) != 0;
         cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4]        = (ecx & (1 << 16)) != 0;
     }
-
+    
+    /* Query APIC information on AMD */
+    if(max_extfn>=0x80000008)
+    {
+#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
+        /* Linux */
+        unsigned int   i;
+        cpu_set_t      cpuset,save_cpuset;
+        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
+        apic_id      = malloc(sizeof(int)*cpuid->nproc);
+        sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+        /* Get APIC id from each core */
+        CPU_ZERO(&cpuset);
+        for(i=0;i<cpuid->nproc;i++)
+        {
+            CPU_SET(i,&cpuset);
+            sched_setaffinity(0,sizeof(cpu_set_t),&cpuset);
+            execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx);
+            apic_id[i]=ebx >> 24;
+            CPU_CLR(i,&cpuset);
+        }
+        /* Reset affinity to the value it had when calling this routine */
+        sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+#define CPUID_HAVE_APIC
+#elif defined GMX_NATIVE_WINDOWS
+        /* Windows */
+        DWORD_PTR     i;
+        SYSTEM_INFO   sysinfo;
+        unsigned int  save_affinity,affinity;
+        GetSystemInfo( &sysinfo );
+        cpuid->nproc  = sysinfo.dwNumberOfProcessors;
+        apic_id       = malloc(sizeof(int)*cpuid->nproc);
+        /* Get previous affinity mask */
+        save_affinity = SetThreadAffinityMask(GetCurrentThread(),1);
+        for(i=0;i<cpuid->nproc;i++)
+        {
+            SetThreadAffinityMask(GetCurrentThread(),(((DWORD_PTR)1)<<i));
+            Sleep(0);
+            execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx);
+            apic_id[i]=ebx >> 24;
+        }
+        SetThreadAffinityMask(GetCurrentThread(),save_affinity);
+#define CPUID_HAVE_APIC
+#endif
+#ifdef CPUID_HAVE_APIC
+        /* AMD does not support SMT yet - there are no hwthread bits in apic ID */
+        hwthread_bits = 0;
+        /* Get number of core bits in apic ID - try modern extended method first */
+        execute_x86cpuid(0x80000008,0,&eax,&ebx,&ecx,&edx);
+        core_bits = (ecx >> 12) & 0xf;
+        if(core_bits==0)
+        {
+            /* Legacy method for old single/dual core AMD CPUs */
+            int i = ecx & 0xF;
+            for(core_bits=0;(i>>core_bits)>0;core_bits++) ;
+        }
+        cpuid_x86_decode_apic_id(cpuid,apic_id,core_bits,hwthread_bits);
+        cpuid->have_cpu_topology = 1;
+#endif
+    }
     return 0;
 }
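
In the legacy branch above, ecx & 0xF holds the core count minus one, and the loop computes how many bits are needed to address that many cores. For a hypothetical quad-core part:

    #include <stdio.h>

    int main(void)
    {
        int i = 3;   /* core count minus one on a quad-core CPU */
        int core_bits;
        for (core_bits = 0; (i >> core_bits) > 0; core_bits++)
            ;
        printf("core_bits = %d\n", core_bits);   /* 2: enough for cores 0..3 */
        return 0;
    }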
 
@@ -409,8 +579,9 @@ cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
 {
     unsigned int              max_stdfn,max_extfn;
     unsigned int              eax,ebx,ecx,edx;
-    unsigned int              i;
     unsigned int              max_logical_cores,max_physical_cores;
+    int                       hwthread_bits,core_bits;
+    int *                     apic_id;
 
     cpuid_check_common_x86(cpuid);
 
@@ -449,8 +620,64 @@ cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
             cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0;
         }
     }
+    
+    if(max_stdfn>=0xB)
+    {
+        /* Query x2 APIC information from cores */
+#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
+        /* Linux */
+        unsigned int   i;
+        cpu_set_t      cpuset,save_cpuset;
+        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
+        apic_id      = malloc(sizeof(int)*cpuid->nproc);
+        sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+        /* Get x2APIC ID from each hardware thread */
+        CPU_ZERO(&cpuset);
+        for(i=0;i<cpuid->nproc;i++)
+        {
+            CPU_SET(i,&cpuset);
+            sched_setaffinity(0,sizeof(cpu_set_t),&cpuset);
+            execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
+            apic_id[i]=edx;
+            CPU_CLR(i,&cpuset);
+        }
+        /* Reset affinity to the value it had when calling this routine */
+        sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset);
+#define CPUID_HAVE_APIC
+#elif defined GMX_NATIVE_WINDOWS
+        /* Windows */
+        DWORD_PTR     i;
+        SYSTEM_INFO   sysinfo;
+        unsigned int  save_affinity,affinity;
+        GetSystemInfo( &sysinfo );
+        cpuid->nproc  = sysinfo.dwNumberOfProcessors;
+        apic_id       = malloc(sizeof(int)*cpuid->nproc);
+        /* Get previous affinity mask */
+        save_affinity = SetThreadAffinityMask(GetCurrentThread(),1);
+        for(i=0;i<cpuid->nproc;i++)
+        {
+            SetThreadAffinityMask(GetCurrentThread(),(((DWORD_PTR)1)<<i));
+            Sleep(0);
+            execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
+            apic_id[i]=edx;
+        }
+        SetThreadAffinityMask(GetCurrentThread(),save_affinity);
+#define CPUID_HAVE_APIC
+#endif
+#ifdef CPUID_HAVE_APIC
+        execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
+        hwthread_bits    = eax & 0x1F;
+        execute_x86cpuid(0xB,1,&eax,&ebx,&ecx,&edx);
+        core_bits        = (eax & 0x1F) - hwthread_bits;
+        cpuid_x86_decode_apic_id(cpuid,apic_id,core_bits,hwthread_bits);
+        cpuid->have_cpu_topology = 1;
+#endif
+    }
     return 0;
 }
+#endif /* GMX_CPUID_X86 */
+
+
 
 /* Try to find the vendor of the current CPU, so we know what specific
  * detection routine to call.
@@ -466,6 +693,7 @@ cpuid_check_vendor(void)
     /* Set default first */
     vendor = GMX_CPUID_VENDOR_UNKNOWN;
 
+#ifdef GMX_CPUID_X86
     execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
 
     memcpy(vendorstring,&ebx,4);
@@ -481,12 +709,68 @@ cpuid_check_vendor(void)
             vendor = i;
         }
     }
-
+#else
+    vendor = GMX_CPUID_VENDOR_UNKNOWN;
+#endif
+    
     return vendor;
 }
 
 
 
+int
+gmx_cpuid_topology(gmx_cpuid_t        cpuid,
+                   int *              nprocessors,
+                   int *              npackages,
+                   int *              ncores_per_package,
+                   int *              nhwthreads_per_core,
+                   const int **       package_id,
+                   const int **       core_id,
+                   const int **       hwthread_id,
+                   const int **       locality_order)
+{
+    int rc;
+    
+    if(cpuid->have_cpu_topology)
+    {
+        *nprocessors          = cpuid->nproc;
+        *npackages            = cpuid->npackages;
+        *ncores_per_package   = cpuid->ncores_per_package;
+        *nhwthreads_per_core  = cpuid->nhwthreads_per_core;
+        *package_id           = cpuid->package_id;
+        *core_id              = cpuid->core_id;
+        *hwthread_id          = cpuid->hwthread_id;
+        *locality_order       = cpuid->locality_order;
+        rc = 0;
+    }
+    else
+    {
+        rc = -1;
+    }
+    return rc;
+}
+
+
+enum gmx_cpuid_x86_smt
+gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
+{
+    enum gmx_cpuid_x86_smt rc;
+    
+    if(cpuid->have_cpu_topology)
+    {
+        rc = (cpuid->nhwthreads_per_core>1) ? GMX_CPUID_X86_SMT_ENABLED : GMX_CPUID_X86_SMT_DISABLED;
+    }
+    else if(cpuid->vendor==GMX_CPUID_VENDOR_AMD || gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)==0)
+    {
+        rc = GMX_CPUID_X86_SMT_DISABLED;
+    }
+    else
+    {
+        rc = GMX_CPUID_X86_SMT_CANNOTDETECT;
+    }
+    return rc;
+}
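
A minimal usage sketch for the new topology query above (hypothetical caller; a return value of -1 simply means detection was not available on this platform):

    #include <stdio.h>

    static void report_topology(gmx_cpuid_t cpuid)
    {
        int        nproc, npackages, ncores, nhwthreads;
        const int *package_id, *core_id, *hwthread_id, *locality_order;

        if (gmx_cpuid_topology(cpuid, &nproc, &npackages, &ncores, &nhwthreads,
                               &package_id, &core_id, &hwthread_id,
                               &locality_order) == 0)
        {
            printf("%d packages x %d cores x %d hwthreads = %d processors\n",
                   npackages, ncores, nhwthreads, nproc);
        }
        else
        {
            printf("CPU topology not detected; falling back to defaults\n");
        }
    }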
+
 
 int
 gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
@@ -502,24 +786,35 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
     {
         cpuid->feature[i]=0;
     }
-
+    cpuid->have_cpu_topology   = 0;
+    cpuid->nproc               = 0;
+    cpuid->npackages           = 0;
+    cpuid->ncores_per_package  = 0;
+    cpuid->nhwthreads_per_core = 0;
+    cpuid->package_id          = NULL;
+    cpuid->core_id             = NULL;
+    cpuid->hwthread_id         = NULL;
+    cpuid->locality_order      = NULL;
+    
     cpuid->vendor = cpuid_check_vendor();
-
+    
     switch(cpuid->vendor)
     {
+#ifdef GMX_CPUID_X86
         case GMX_CPUID_VENDOR_INTEL:
             cpuid_check_intel_x86(cpuid);
             break;
         case GMX_CPUID_VENDOR_AMD:
             cpuid_check_amd_x86(cpuid);
             break;
+#endif
         default:
             /* Could not find vendor */
             strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
             cpuid->family         = 0;
             cpuid->model          = 0;
             cpuid->stepping       = 0;
-
+            
             for(i=0;i<GMX_CPUID_NFEATURES;i++)
             {
                 cpuid->feature[i]=0;
@@ -689,94 +984,6 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
 }
 
 
-enum gmx_cpuid_x86_smt
-gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
-{
-
-#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
-    int            i;
-    int            nproc;
-    cpu_set_t      cpuset,save_cpuset;
-    int *          apic_id;
-    unsigned int   eax,ebx,ecx,edx;
-    int            core_shift_bits;
-    int            smt_found;
-
-    if( gmx_cpuid_vendor(cpuid)!=GMX_CPUID_VENDOR_INTEL ||
-       gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)==0)
-    {
-        return GMX_CPUID_X86_SMT_DISABLED;
-    }
-
-    /* Check cpuid max standard function */
-    execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx);
-
-    /* Early CPUs that do not support function 11 do not support SMT either */
-    if(eax<0xB)
-    {
-        return GMX_CPUID_X86_SMT_DISABLED;
-    }
-
-    /* If we got here, it is a modern Intel CPU that supports detection, as does our OS */
-
-    /* How many processors? */
-    nproc = sysconf(_SC_NPROCESSORS_ONLN);
-
-    apic_id      = malloc(sizeof(int)*nproc);
-
-    sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset);
-
-    /* Get x2APIC ID from each hardware thread */
-    CPU_ZERO(&cpuset);
-    for(i=0;i<nproc;i++)
-    {
-        CPU_SET(i,&cpuset);
-        sched_setaffinity(0,sizeof(cpu_set_t),&cpuset);
-        execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx);
-        apic_id[i]=edx;
-        CPU_CLR(i,&cpuset);
-    }
-    /* Reset affinity to the value it had when calling this routine */
-    sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset);
-
-    core_shift_bits = eax & 0x1F;
-
-    /* Check if there is any other APIC id that is identical to [0], apart from
-     * the hardware thread bit.
-     */
-    smt_found  = 0;
-    for(i=1;i<nproc && smt_found==0;i++)
-    {
-        smt_found = (apic_id[i]>>core_shift_bits == apic_id[0] >> core_shift_bits);
-    }
-
-    free(apic_id);
-
-    if(smt_found==1)
-    {
-        return GMX_CPUID_X86_SMT_ENABLED;
-    }
-    else
-    {
-        return GMX_CPUID_X86_SMT_DISABLED;
-    }
-#else
-    /* Do the trivial stuff first. If Hyper-Threading isn't even supported it
-     * cannot be enabled, no matter what OS detection we use!
-     */
-    if(0==gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT))
-    {
-        return GMX_CPUID_X86_SMT_DISABLED;
-    }
-    else
-    {
-        return GMX_CPUID_X86_SMT_CANNOTDETECT;
-    }
-#endif
-}
-
-
-
 
 #ifdef GMX_CPUID_STANDALONE
 /* Stand-alone program to enable queries of CPU features from Cmake.
index 9985f3dfb121658c23e79792007dc239c3ac8677..e7c2d65e1e198742921394b90671956468d54023 100644 (file)
@@ -125,7 +125,7 @@ static void print_gpu_use_stats(FILE *fplog,
     }
     else
     {
-        sprintf(sbuf, "%d GPU%s %sselected to be used for this run: ",
+        sprintf(sbuf, "%d GPU%s %sselected for this run: ",
                 ngpu, (ngpu > 1) ? "s" : "",
                 gpu_info->bUserSet ? "user-" : "auto-");
         for (i = 0; i < ngpu; i++)
@@ -163,7 +163,7 @@ static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
         if (idstr[i] < '0' || idstr[i] > '9')
         {
             gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
-                      invalid_gpuid_hint, idstr[i]);
+                      idstr[i], invalid_gpuid_hint);
         }
         idlist[i] = idstr[i] - '0';
     }
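
For reference, the plain GPU-ID string parsed above is one decimal digit per PP rank. A hypothetical call (the function is static, and the assignment of *nid happens in code outside this hunk):

    int nid;
    int idlist[4];
    parse_gpu_id_plain_string("0011", &nid, idlist);
    /* presumably leaves nid == 4 and idlist == {0, 0, 1, 1}; any
     * non-digit character aborts with the gmx_fatal() message whose
     * argument order is fixed above */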
index 67a6e207c731536e4e83d22d8e4e195f8be965dc..673025b7697dad412210da03097c2329f1b1cb78 100644 (file)
@@ -379,7 +379,7 @@ void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
 #endif
 
     /* inform the user about the settings */
-    if (SIMMASTER(cr) && bOMP)
+    if (bOMP)
     {
 #ifdef GMX_THREAD_MPI
         const char *mpi_str="per tMPI thread";
@@ -390,15 +390,15 @@ void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
         /* for group scheme we print PME threads info only */
         if (bFullOmpSupport)
         {
-            fprintf(stderr, "Using %d OpenMP thread%s %s\n",
-                    modth.gnth,modth.gnth > 1 ? "s" : "",
-                    cr->nnodes > 1 ? mpi_str : "");
+            md_print_info(cr, fplog, "Using %d OpenMP thread%s %s\n",
+                          modth.gnth,modth.gnth > 1 ? "s" : "",
+                          cr->nnodes > 1 ? mpi_str : "");
         }
         if (bSepPME && modth.gnth_pme != modth.gnth)
         {
-            fprintf(stderr, "Using %d OpenMP thread%s %s for PME\n",
-                    modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "",
-                    cr->nnodes > 1 ? mpi_str : "");
+            md_print_info(cr, fplog, "Using %d OpenMP thread%s %s for PME\n",
+                          modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "",
+                          cr->nnodes > 1 ? mpi_str : "");
         }
     }
 
index d99803d8287c5538ed9c6eb500aedd75d87ce5ac..d5473499fa268b6a7c1f06e5a96d2de0d075544f 100644 (file)
@@ -109,7 +109,7 @@ const t_interaction_function interaction_function[F_NRE]=
   def_bonded  ("RBDIHS",   "Ryckaert-Bell.",  4, 6, 6,  eNR_RB, rbdihs            ),
   def_bonded  ("FOURDIHS", "Fourier Dih.",    4, 4, 4,  eNR_FOURDIH, rbdihs       ),
   def_bonded  ("IDIHS",    "Improper Dih.",   4, 2, 2,  eNR_IMPROPER,idihs        ),
-  def_bonded  ("PIDIHS",   "Improper Dih.",   4, 3, 3,  eNR_PROPER, pdihs         ),
+  def_bonded  ("PIDIHS",   "Improper Dih.",   4, 3, 3,  eNR_IMPROPER, pdihs       ),
   def_bondedt ("TABDIHS", "Tab. Dih.",        4, 2, 2,  eNR_TABDIHS, tab_dihs     ),
   def_bonded  ("CMAP",  "CMAP Dih.",          5, -1, -1,  eNR_CMAP,   unimplemented ),
   def_bonded  ("GB12",     "GB 1-2 Pol.",     2, 4, 0,  eNR_GB,     unimplemented ),
index 7fb2314fdb64c2939a82096fc22b29540a0d91ed..e41e2e977721dccdb8cec31de4c17791db946676 100644 (file)
@@ -321,10 +321,13 @@ void mk_graph_ilist(FILE *fplog,
   int     i,nbtot;
   gmx_bool    bMultiPart;
 
-  if (at_start != 0) {
-    gmx_incons("In mk_graph_ilist at_start can not be != 0");
-  }
-  g->natoms = at_end;
+  /* The naming is somewhat confusing, but we need g->at0 and g->at1
+   * for shifting coordinates to a new array (not in place) when
+   * some atoms are not connected by the graph, which runs from
+   * g->at_start (>= g->at0) to g->at_end (<= g->at1).
+   */
+  g->at0 = at_start;
+  g->at1 = at_end;
 
   snew(nbond,at_end);
   nbtot = calc_start_end(fplog,g,ilist,at_start,at_end,nbond);
@@ -390,7 +393,7 @@ void mk_graph_ilist(FILE *fplog,
 
   sfree(nbond);
 
-  snew(g->ishift,g->natoms);
+  snew(g->ishift,g->at1);
 
   if (gmx_debug_at)
     p_graph(debug,"graph",g);
@@ -609,7 +612,7 @@ void mk_mshift(FILE *log,t_graph *g,int ePBC,matrix box,rvec x[])
    * at all. If we return without doing this for a system without bonds
    * (i.e. only settles) all water molecules are moved to the opposite octant
    */
-  for(i=0; (i<g->natoms); i++) {
+  for(i=g->at0; (i<g->at1); i++) {
       g->ishift[i][XX]=g->ishift[i][YY]=g->ishift[i][ZZ]=0;
   }
     
@@ -708,7 +711,7 @@ void shift_x(t_graph *g,matrix box,rvec x[],rvec x_s[])
   g1 = g->at_end;
   is = g->ishift;
   
-  for(j=0; j<g0; j++) {
+  for(j=g->at0; j<g0; j++) {
     copy_rvec(x[j],x_s[j]);
   }
 
@@ -751,7 +754,7 @@ void shift_x(t_graph *g,matrix box,rvec x[],rvec x_s[])
      }
   }       
 
-  for(j=g1; j<g->natoms; j++) {
+  for(j=g1; j<g->at1; j++) {
     copy_rvec(x[j],x_s[j]);
   }
 }
@@ -808,7 +811,7 @@ void unshift_x(t_graph *g,matrix box,rvec x[],rvec x_s[])
   g1 = g->at_end;
   is = g->ishift;
 
-  for(j=0; j<g0; j++) {
+  for(j=g->at0; j<g0; j++) {
     copy_rvec(x_s[j],x[j]);
   }
 
@@ -834,7 +837,7 @@ void unshift_x(t_graph *g,matrix box,rvec x[],rvec x_s[])
       }
   }
 
-  for(j=g1; j<g->natoms; j++) {
+  for(j=g1; j<g->at1; j++) {
     copy_rvec(x_s[j],x[j]);
   }
 }
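
The hunks above make shift_x()/unshift_x() handle atoms outside the connected range by plain copying, so only graph atoms receive PBC shifts. In sketch form (g0 == g->at_start, g1 == g->at_end):

    /* Atoms in [g->at0, g0) and [g1, g->at1) are not in the graph and
     * are copied unchanged; only [g0, g1) is shifted via g->ishift. */
    for (j = g->at0; j < g0; j++)
    {
        copy_rvec(x[j], x_s[j]);
    }
    /* ... shift atoms in [g0, g1) ... */
    for (j = g1; j < g->at1; j++)
    {
        copy_rvec(x[j], x_s[j]);
    }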
index b86c3eee5715d688527f130c666757938c52d5e2..0f076850006660da3ff8fb52aacdaac864c8bbc0 100644 (file)
@@ -9,16 +9,16 @@
  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  * a full list of developers and information, check out http://www.gromacs.org
  *
- * This program is free software; you can redistribute it and/or modify it under 
- * the terms of the GNU Lesser General Public License as published by the Free 
- * Software Foundation; either version 2 of the License, or (at your option) any 
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
  * later version.
  * As a special exception, you may use this file as part of a free software
  * library without restriction.  Specifically, if other files instantiate
  * templates or use macros or inline functions from this file, or you compile
  * this file and link it with other files to produce an executable, this
  * file does not by itself cause the resulting executable to be covered by
- * the GNU Lesser General Public License.  
+ * the GNU Lesser General Public License.
  *
  * In plain-speak: do not worry about classes/macros/templates either - only
  * changes to the library have to be LGPL, not an application linking with it.
@@ -73,16 +73,16 @@ gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                               __m128d xmm1)
 {
     __m128d t2;
-    
+
     t2       = _mm_unpackhi_pd(xmm1,xmm1);
-    _mm_store_sd(ptrA,xmm1);                                           
-    _mm_store_sd(ptrB,t2);                                         
+    _mm_store_sd(ptrA,xmm1);
+    _mm_store_sd(ptrB,t2);
 }
 
 static void
 gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 {
-    _mm_store_sd(ptrA,xmm1);                                        
+    _mm_store_sd(ptrA,xmm1);
 }
 
 
@@ -92,7 +92,7 @@ gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                   double * gmx_restrict ptrB, __m128d xmm1)
 {
     __m128d t1;
-    
+
     t1   = _mm_unpackhi_pd(xmm1,xmm1);
     xmm1 = _mm_add_sd(xmm1,_mm_load_sd(ptrA));
     t1   = _mm_add_sd(t1,_mm_load_sd(ptrB));
@@ -104,7 +104,7 @@ static void
 gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 {
     __m128d tmp;
-    
+
     tmp = gmx_mm_load_1real_pd(ptrA);
     tmp = _mm_add_sd(tmp,xmm1);
     gmx_mm_store_1real_pd(ptrA,tmp);
@@ -119,12 +119,12 @@ gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                              __m128d * gmx_restrict c12)
 {
     __m128d t1,t2,t3;
-    
+
     /* The c6/c12 array should be aligned */
     t1   = _mm_loadu_pd(p1);
     t2   = _mm_loadu_pd(p2);
-    *c6  = _mm_unpacklo_pd(t1,t2);  
-    *c12 = _mm_unpackhi_pd(t1,t2);                    
+    *c6  = _mm_unpacklo_pd(t1,t2);
+    *c12 = _mm_unpackhi_pd(t1,t2);
 }
 
 static gmx_inline void
@@ -139,21 +139,21 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz;
-    
+
     mem_xy  = _mm_loadu_pd(xyz);
     mem_z   = _mm_load_sd(xyz+2);
     mem_sxy = _mm_loadu_pd(xyz_shift);
     mem_sz  = _mm_load_sd(xyz_shift+2);
-    
+
     mem_xy  = _mm_add_pd(mem_xy,mem_sxy);
     mem_z   = _mm_add_pd(mem_z,mem_sz);
-    
+
     *x1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
     *y1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
     *z1  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
@@ -162,30 +162,30 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
-    
+
     t1  = _mm_loadu_pd(xyz);
     t2  = _mm_loadu_pd(xyz+2);
     t3  = _mm_loadu_pd(xyz+4);
     t4  = _mm_loadu_pd(xyz+6);
     t5  = _mm_load_sd(xyz+8);
-    
+
     sxy = _mm_loadu_pd(xyz_shift);
     sz  = _mm_load_sd(xyz_shift+2);
     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-    
+
     t1  = _mm_add_pd(t1,sxy);
     t2  = _mm_add_pd(t2,szx);
     t3  = _mm_add_pd(t3,syz);
     t4  = _mm_add_pd(t4,sxy);
     t5  = _mm_add_sd(t5,sz);
-    
+
     *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
@@ -200,33 +200,33 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
-    
+
     t1  = _mm_loadu_pd(xyz);
     t2  = _mm_loadu_pd(xyz+2);
     t3  = _mm_loadu_pd(xyz+4);
     t4  = _mm_loadu_pd(xyz+6);
     t5  = _mm_loadu_pd(xyz+8);
     t6  = _mm_loadu_pd(xyz+10);
-    
+
     sxy = _mm_loadu_pd(xyz_shift);
     sz  = _mm_load_sd(xyz_shift+2);
     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-    
+
     t1  = _mm_add_pd(t1,sxy);
     t2  = _mm_add_pd(t2,szx);
     t3  = _mm_add_pd(t3,syz);
     t4  = _mm_add_pd(t4,sxy);
     t5  = _mm_add_pd(t5,szx);
     t6  = _mm_add_pd(t6,syz);
-    
+
     *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
@@ -247,9 +247,9 @@ static gmx_inline void
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
 }
 
 static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
 }
 
 static gmx_inline void
@@ -313,7 +313,7 @@ gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     t1           = _mm_loadu_pd(ptrA);
     t2           = _mm_loadu_pd(ptrB);
     t3           = _mm_loadu_pd(ptrA+2);
@@ -382,106 +382,16 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 
 
 /* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
-{
-    __m128d t1,t2;
-    
-    t1 = _mm_loadu_pd(ptrA);
-    t2 = _mm_load_sd(ptrA+2);
-    
-    t1 = _mm_sub_pd(t1,xy);
-    t2 = _mm_sub_sd(t2,z);
-    
-    _mm_storeu_pd(ptrA,t1);
-    _mm_store_sd(ptrA+2,t2);
-}
-
-
-static void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-    
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-    
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-    
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-    
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-    
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-    
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-    
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
-
-
 static void
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1)
 {
     __m128d t1,t2,t3;
-    
+
     t1           = _mm_load_sd(ptrA);
     t2           = _mm_load_sd(ptrA+1);
     t3           = _mm_load_sd(ptrA+2);
-    
+
     t1           = _mm_sub_sd(t1,x1);
     t2           = _mm_sub_sd(t2,y1);
     t3           = _mm_sub_sd(t3,z1);
@@ -491,26 +401,53 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_t1          = _mm_sub_pd(_t1,_x1);\
+_t2          = _mm_sub_pd(_t2,_z1);\
+_t3          = _mm_sub_pd(_t3,_y2);\
+_t4          = _mm_sub_pd(_t4,_x3);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
-                                       __m128d x3, __m128d y3, __m128d z3) 
+                                       __m128d x3, __m128d y3, __m128d z3)
 {
     __m128d t1,t2,t3,t4,t5;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
     t4          = _mm_loadu_pd(ptrA+6);
     t5          = _mm_load_sd(ptrA+8);
-    
+
     x1          = _mm_unpacklo_pd(x1,y1);
     z1          = _mm_unpacklo_pd(z1,x2);
     y2          = _mm_unpacklo_pd(y2,z2);
     x3          = _mm_unpacklo_pd(x3,y3);
     /* nothing to be done for z3 */
-    
+
     t1          = _mm_sub_pd(t1,x1);
     t2          = _mm_sub_pd(t2,z1);
     t3          = _mm_sub_pd(t3,y2);
@@ -522,31 +459,58 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6,t4);
     _mm_store_sd(ptrA+8,t5);
 }
-
-
+#endif
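
The macro/function split here and in the variants below works around a 32-bit MSVC limitation: at most three __m128/__m128d arguments can be passed by value (error C2719), so on that compiler the body must be expanded inline as a macro. A minimal illustration of the failing construct (assumption: this is the compiler behavior motivating the guard):

    #include <emmintrin.h>

    /* Compiles with GCC/Clang and 64-bit MSVC, but 32-bit MSVC rejects
     * the fourth by-value __m128d parameter with error C2719. */
    static __m128d add_four(__m128d a, __m128d b, __m128d c, __m128d d)
    {
        return _mm_add_pd(_mm_add_pd(a, b), _mm_add_pd(c, d));
    }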
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_z3          = _mm_unpacklo_pd(_z3,_x4);\
+_y4          = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
                                        __m128d x3, __m128d y3, __m128d z3,
-                                       __m128d x4, __m128d y4, __m128d z4) 
+                                       __m128d x4, __m128d y4, __m128d z4)
 {
     __m128d t1,t2,t3,t4,t5,t6;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
     t4          = _mm_loadu_pd(ptrA+6);
     t5          = _mm_loadu_pd(ptrA+8);
     t6          = _mm_loadu_pd(ptrA+10);
-    
+
     x1          = _mm_unpacklo_pd(x1,y1);
     z1          = _mm_unpacklo_pd(z1,x2);
     y2          = _mm_unpacklo_pd(y2,z2);
     x3          = _mm_unpacklo_pd(x3,y3);
     z3          = _mm_unpacklo_pd(z3,x4);
     y4          = _mm_unpacklo_pd(y4,z4);
-    
+
     _mm_storeu_pd(ptrA,    _mm_sub_pd( t1,x1 ));
     _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2,z1 ));
     _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3,y2 ));
@@ -554,28 +518,30 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
 }
+#endif
+
 
 static void
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1)
 {
     __m128d t1,t2,t3,t4,t5,t6,t7;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_load_sd(ptrA+2);
     t3          = _mm_loadu_pd(ptrB);
     t4          = _mm_load_sd(ptrB+2);
-    
+
     t5          = _mm_unpacklo_pd(x1,y1);
     t6          = _mm_unpackhi_pd(x1,y1);
     t7          = _mm_unpackhi_pd(z1,z1);
-    
+
     t1          = _mm_sub_pd(t1,t5);
     t2          = _mm_sub_sd(t2,z1);
-    
+
     t3          = _mm_sub_pd(t3,t6);
     t4          = _mm_sub_sd(t4,t7);
-    
+
     _mm_storeu_pd(ptrA,t1);
     _mm_store_sd(ptrA+2,t2);
     _mm_storeu_pd(ptrB,t3);
@@ -583,15 +549,63 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrB);\
+_t7          = _mm_loadu_pd(ptrB+2);\
+_t8          = _mm_loadu_pd(ptrB+4);\
+_t9          = _mm_loadu_pd(ptrB+6);\
+_t10         = _mm_load_sd(ptrB+8);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpackhi_pd(_z3,_z3);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_t6          = _mm_sub_pd(_t6,_tB);\
+_t7          = _mm_sub_pd(_t7,_tD);\
+_t8          = _mm_sub_pd(_t8,_tF);\
+_t9          = _mm_sub_pd(_t9,_tH);\
+_t10         = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
-                                       __m128d x3, __m128d y3, __m128d z3) 
+                                       __m128d x3, __m128d y3, __m128d z3)
 {
     __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
@@ -602,7 +616,7 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     t8          = _mm_loadu_pd(ptrB+4);
     t9          = _mm_loadu_pd(ptrB+6);
     t10         = _mm_load_sd(ptrB+8);
-    
+
     tA          = _mm_unpacklo_pd(x1,y1);
     tB          = _mm_unpackhi_pd(x1,y1);
     tC          = _mm_unpacklo_pd(z1,x2);
@@ -612,19 +626,19 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     tG          = _mm_unpacklo_pd(x3,y3);
     tH          = _mm_unpackhi_pd(x3,y3);
     tI          = _mm_unpackhi_pd(z3,z3);
-    
+
     t1          = _mm_sub_pd(t1,tA);
     t2          = _mm_sub_pd(t2,tC);
     t3          = _mm_sub_pd(t3,tE);
     t4          = _mm_sub_pd(t4,tG);
     t5          = _mm_sub_sd(t5,z3);
-    
+
     t6          = _mm_sub_pd(t6,tB);
     t7          = _mm_sub_pd(t7,tD);
     t8          = _mm_sub_pd(t8,tF);
     t9          = _mm_sub_pd(t9,tH);
     t10         = _mm_sub_sd(t10,tI);
-    
+
     _mm_storeu_pd(ptrA,t1);
     _mm_storeu_pd(ptrA+2,t2);
     _mm_storeu_pd(ptrA+4,t3);
@@ -636,18 +650,76 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6,t9);
     _mm_store_sd(ptrB+8,t10);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_t7          = _mm_loadu_pd(ptrB);\
+_t8          = _mm_loadu_pd(ptrB+2);\
+_t9          = _mm_loadu_pd(ptrB+4);\
+_t10         = _mm_loadu_pd(ptrB+6);\
+_t11         = _mm_loadu_pd(ptrB+8);\
+_t12         = _mm_loadu_pd(ptrB+10);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpacklo_pd(_z3,_x4);\
+_tJ          = _mm_unpackhi_pd(_z3,_x4);\
+_tK          = _mm_unpacklo_pd(_y4,_z4);\
+_tL          = _mm_unpackhi_pd(_y4,_z4);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_pd(_t5,_tI);\
+_t6          = _mm_sub_pd(_t6,_tK);\
+_t7          = _mm_sub_pd(_t7,_tB);\
+_t8          = _mm_sub_pd(_t8,_tD);\
+_t9          = _mm_sub_pd(_t9,_tF);\
+_t10         = _mm_sub_pd(_t10,_tH);\
+_t11         = _mm_sub_pd(_t11,_tJ);\
+_t12         = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
                                        __m128d x3, __m128d y3, __m128d z3,
-                                       __m128d x4, __m128d y4, __m128d z4) 
+                                       __m128d x4, __m128d y4, __m128d z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
     __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
-    
+
     t1          = _mm_loadu_pd(ptrA);
     t2          = _mm_loadu_pd(ptrA+2);
     t3          = _mm_loadu_pd(ptrA+4);
@@ -660,7 +732,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     t10         = _mm_loadu_pd(ptrB+6);
     t11         = _mm_loadu_pd(ptrB+8);
     t12         = _mm_loadu_pd(ptrB+10);
-    
+
     tA          = _mm_unpacklo_pd(x1,y1);
     tB          = _mm_unpackhi_pd(x1,y1);
     tC          = _mm_unpacklo_pd(z1,x2);
@@ -673,21 +745,21 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     tJ          = _mm_unpackhi_pd(z3,x4);
     tK          = _mm_unpacklo_pd(y4,z4);
     tL          = _mm_unpackhi_pd(y4,z4);
-    
+
     t1          = _mm_sub_pd(t1,tA);
     t2          = _mm_sub_pd(t2,tC);
     t3          = _mm_sub_pd(t3,tE);
     t4          = _mm_sub_pd(t4,tG);
     t5          = _mm_sub_pd(t5,tI);
     t6          = _mm_sub_pd(t6,tK);
-    
+
     t7          = _mm_sub_pd(t7,tB);
     t8          = _mm_sub_pd(t8,tD);
     t9          = _mm_sub_pd(t9,tF);
     t10         = _mm_sub_pd(t10,tH);
     t11         = _mm_sub_pd(t11,tJ);
     t12         = _mm_sub_pd(t12,tL);
-    
+
     _mm_storeu_pd(ptrA,  t1);
     _mm_storeu_pd(ptrA+2,t2);
     _mm_storeu_pd(ptrA+4,t3);
@@ -701,7 +773,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8,t11);
     _mm_storeu_pd(ptrB+10,t12);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -711,14 +783,41 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
 {
     fix1 = _mm_hadd_pd(fix1,fiy1);
     fiz1 = _mm_hadd_pd(fiz1,fiz1);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
     _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));
-    
+
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+_mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+fix1 = _mm_add_pd(fix1,fix3);\
+_t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+fiz1 = _mm_add_sd(fiz1,_t2);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -727,32 +826,63 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       double * gmx_restrict fshiftptr)
 {
     __m128d t1,t2;
-    
+
     fix1 = _mm_hadd_pd(fix1,fiy1);
     fiz1 = _mm_hadd_pd(fiz1,fix2);
     fiy2 = _mm_hadd_pd(fiy2,fiz2);
     fix3 = _mm_hadd_pd(fix3,fiy3);
     fiz3 = _mm_hadd_pd(fiz3,fiz3);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
     _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
     _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
     _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
     _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));
-    
+
     fix1 = _mm_add_pd(fix1,fix3);
     t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
     fix1 = _mm_add_pd(fix1,t1); /* x and y sums */
-    
+
     t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));
     fiz1 = _mm_add_sd(fiz1,fiz3);
     fiz1 = _mm_add_sd(fiz1,t2); /* z sum */
-    
+
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
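
The #else branch is the normal path; the macro branch exists because the
32-bit MSVC ABI passes at most three __m128/__m256 values in registers and
cannot pass further SIMD arguments by value on the stack (it typically fails
with error C2719), so every helper taking more than three vector parameters
must become a macro on that target. A minimal sketch of the failure mode,
using a hypothetical helper that is not part of this change:

/* Hypothetical four-argument helper: compiles fine on x86-64, but
 * 32-bit MSVC rejects the fourth by-value __m128d parameter (C2719),
 * which is why the kernels above fall back to macros on _M_IX86. */
#include <emmintrin.h>

static __m128d
sum4_pd(__m128d a, __m128d b, __m128d c, __m128d d)
{
    return _mm_add_pd(_mm_add_pd(a, b), _mm_add_pd(c, d));
}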
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fix4);\
+fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+_mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+_mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+_t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+fix3 = _mm_add_pd(fix3,_t2);\
+fix1 = _mm_add_pd(fix1,fix3);\
+fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -762,35 +892,35 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       double * gmx_restrict fshiftptr)
 {
     __m128d t1,t2;
-    
+
     fix1 = _mm_hadd_pd(fix1,fiy1);
     fiz1 = _mm_hadd_pd(fiz1,fix2);
     fiy2 = _mm_hadd_pd(fiy2,fiz2);
     fix3 = _mm_hadd_pd(fix3,fiy3);
     fiz3 = _mm_hadd_pd(fiz3,fix4);
     fiy4 = _mm_hadd_pd(fiy4,fiz4);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
     _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
     _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
     _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));
     _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));
     _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));
-    
+
     t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
     fix1 = _mm_add_pd(fix1,t1);
     t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));
     fix3 = _mm_add_pd(fix3,t2);
     fix1 = _mm_add_pd(fix1,fix3); /* x and y sums */
-    
+
     fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));
     fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));
     fiz1 = _mm_add_sd(fiz1,fiz3); /* z sum */
-    
+
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
+#endif
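
Both variants above implement the same contract: each __m128d argument holds
two partial force sums for one component of one atom, the hadd ladder reduces
them, the twelve per-atom sums are accumulated into fptr, and the per-component
totals over all four atoms go into the shift-force vector. A scalar sketch of
that contract, assuming the fptr layout x1,y1,z1,...,z4:

/* Scalar reference for the 4-atom i-force update above: reduce each
 * two-element partial sum, add it to the force array, and fold the
 * same value into the x/y/z shift-force accumulators. */
static void
update_iforce_4atom_ref(const double f[12][2], double *fptr, double *fshiftptr)
{
    int i;
    for (i = 0; i < 12; i++)
    {
        double s = f[i][0] + f[i][1];
        fptr[i]          += s;
        fshiftptr[i % 3] += s;
    }
}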
 
 
 static gmx_inline void
@@ -806,7 +936,7 @@ gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
 {
     pot1 = _mm_hadd_pd(pot1,pot2);
     pot2 = _mm_unpackhi_pd(pot1,pot1);
-    
+
     _mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA)));
     _mm_store_sd(ptrB,_mm_add_sd(pot2,_mm_load_sd(ptrB)));
 }
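
gmx_mm_update_2pot_pd is the simplest instance of the same reduction pattern:
hadd turns {A0,A1} and {B0,B1} into {A0+A1, B0+B1}, unpackhi peels off the
upper lane, and the two sums are added to *ptrA and *ptrB. A scalar sketch:

/* Scalar reference for the swizzled two-potential update above. */
static void
update_2pot_ref(const double pot1[2], double *ptrA,
                const double pot2[2], double *ptrB)
{
    *ptrA += pot1[0] + pot1[1];
    *ptrB += pot2[0] + pot2[1];
}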
index 8fe321d85cb9078763daf769b05c822a24afc382..7b663ed73392fb6e8463ed9e6dfb6829a975db6e 100644
@@ -120,10 +120,10 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
 
@@ -142,10 +142,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
@@ -180,11 +180,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
@@ -227,10 +227,10 @@ gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float *
 {
     __m128 t1,t2,t3,t4;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
-    t1             = _mm_maskload_ps(ptrA,mask);
-    t2             = _mm_maskload_ps(ptrB,mask);
-    t3             = _mm_maskload_ps(ptrC,mask);
-    t4             = _mm_maskload_ps(ptrD,mask);
+    t1             = gmx_mm_maskload_ps(ptrA,mask);
+    t2             = gmx_mm_maskload_ps(ptrB,mask);
+    t3             = gmx_mm_maskload_ps(ptrC,mask);
+    t4             = gmx_mm_maskload_ps(ptrD,mask);
     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
     *x1           = t1;
     *y1           = t2;
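
The switch from _mm_maskload_ps to gmx_mm_maskload_ps matches the new
cmake/TestAVXMaskload.c probe: some gcc versions declare the mask argument of
_mm_maskload_ps as __m128 instead of __m128i. The wrapper is presumably a thin
cast shim along these lines (the config macro name here is an assumption):

/* Sketch of the assumed wrapper, keyed off a configure-time check of
 * the compiler's _mm_maskload_ps signature: */
#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
#    define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
#else
#    define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), (mask))
#endif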
@@ -348,6 +348,72 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+    __m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+    _t13         = _mm_unpackhi_ps(_x1,_y1);\
+    _x1          = _mm_unpacklo_ps(_x1,_y1);\
+    _t14         = _mm_unpackhi_ps(_z1,_x2);\
+    _z1          = _mm_unpacklo_ps(_z1,_x2);\
+    _t15         = _mm_unpackhi_ps(_y2,_z2);\
+    _y2          = _mm_unpacklo_ps(_y2,_z2);\
+    _t16         = _mm_unpackhi_ps(_x3,_y3);\
+    _x3          = _mm_unpacklo_ps(_x3,_y3);\
+    _t17         = _mm_permute_ps(_z3,_MM_SHUFFLE(0,0,0,1));\
+    _t18         = _mm_movehl_ps(_z3,_z3);\
+    _t19         = _mm_permute_ps(_t18,_MM_SHUFFLE(0,0,0,1));\
+    _t20         = _mm_movelh_ps(_x1,_z1);\
+    _t21         = _mm_movehl_ps(_z1,_x1);\
+    _t22         = _mm_movelh_ps(_t13,_t14);\
+    _t14         = _mm_movehl_ps(_t14,_t13);\
+    _t23         = _mm_movelh_ps(_y2,_x3);\
+    _t24         = _mm_movehl_ps(_x3,_y2);\
+    _t25         = _mm_movelh_ps(_t15,_t16);\
+    _t16         = _mm_movehl_ps(_t16,_t15);\
+    _t1          = _mm_loadu_ps(ptrA);\
+    _t2          = _mm_loadu_ps(ptrA+4);\
+    _t3          = _mm_load_ss(ptrA+8);\
+    _t1          = _mm_sub_ps(_t1,_t20);\
+    _t2          = _mm_sub_ps(_t2,_t23);\
+    _t3          = _mm_sub_ss(_t3,_z3);\
+    _mm_storeu_ps(ptrA,_t1);\
+    _mm_storeu_ps(ptrA+4,_t2);\
+    _mm_store_ss(ptrA+8,_t3);\
+    _t4          = _mm_loadu_ps(ptrB);\
+    _t5          = _mm_loadu_ps(ptrB+4);\
+    _t6          = _mm_load_ss(ptrB+8);\
+    _t4          = _mm_sub_ps(_t4,_t21);\
+    _t5          = _mm_sub_ps(_t5,_t24);\
+    _t6          = _mm_sub_ss(_t6,_t17);\
+    _mm_storeu_ps(ptrB,_t4);\
+    _mm_storeu_ps(ptrB+4,_t5);\
+    _mm_store_ss(ptrB+8,_t6);\
+    _t7          = _mm_loadu_ps(ptrC);\
+    _t8          = _mm_loadu_ps(ptrC+4);\
+    _t9          = _mm_load_ss(ptrC+8);\
+    _t7          = _mm_sub_ps(_t7,_t22);\
+    _t8          = _mm_sub_ps(_t8,_t25);\
+    _t9          = _mm_sub_ss(_t9,_t18);\
+    _mm_storeu_ps(ptrC,_t7);\
+    _mm_storeu_ps(ptrC+4,_t8);\
+    _mm_store_ss(ptrC+8,_t9);\
+    _t10         = _mm_loadu_ps(ptrD);\
+    _t11         = _mm_loadu_ps(ptrD+4);\
+    _t12         = _mm_load_ss(ptrD+8);\
+    _t10         = _mm_sub_ps(_t10,_t14);\
+    _t11         = _mm_sub_ps(_t11,_t16);\
+    _t12         = _mm_sub_ss(_t12,_t19);\
+    _mm_storeu_ps(ptrD,_t10);\
+    _mm_storeu_ps(ptrD+4,_t11);\
+    _mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
@@ -414,8 +480,79 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_store_ss(ptrD+8,t12);
 }
-
-
+#endif
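
Macro and function branches again share one contract: transpose three rvecs
spread across the four SIMD lanes and subtract lane j from the nine floats at
the j-th pointer. A scalar reference, assuming lane j of every coordinate
vector belongs to ptrA..ptrD in order:

/* Scalar reference for the 3-rvec, 4-pointer decrement above. */
static void
decrement_3rvec_4ptr_ref(float *ptr[4],
                         const float x1[4], const float y1[4], const float z1[4],
                         const float x2[4], const float y2[4], const float z2[4],
                         const float x3[4], const float y3[4], const float z3[4])
{
    int j;
    for (j = 0; j < 4; j++)
    {
        ptr[j][0] -= x1[j]; ptr[j][1] -= y1[j]; ptr[j][2] -= z1[j];
        ptr[j][3] -= x2[j]; ptr[j][4] -= y2[j]; ptr[j][5] -= z2[j];
        ptr[j][6] -= x3[j]; ptr[j][7] -= y3[j]; ptr[j][8] -= z3[j];
    }
}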
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+    __m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+    __m128 _t23,_t24;\
+    _t13         = _mm_unpackhi_ps(_x1,_y1);\
+    _x1          = _mm_unpacklo_ps(_x1,_y1);\
+    _t14         = _mm_unpackhi_ps(_z1,_x2);\
+    _z1          = _mm_unpacklo_ps(_z1,_x2);\
+    _t15         = _mm_unpackhi_ps(_y2,_z2);\
+    _y2          = _mm_unpacklo_ps(_y2,_z2);\
+    _t16         = _mm_unpackhi_ps(_x3,_y3);\
+    _x3          = _mm_unpacklo_ps(_x3,_y3);\
+    _t17         = _mm_unpackhi_ps(_z3,_x4);\
+    _z3          = _mm_unpacklo_ps(_z3,_x4);\
+    _t18         = _mm_unpackhi_ps(_y4,_z4);\
+    _y4          = _mm_unpacklo_ps(_y4,_z4);\
+    _t19         = _mm_movelh_ps(_x1,_z1);\
+    _z1          = _mm_movehl_ps(_z1,_x1);\
+    _t20         = _mm_movelh_ps(_t13,_t14);\
+    _t14         = _mm_movehl_ps(_t14,_t13);\
+    _t21         = _mm_movelh_ps(_y2,_x3);\
+    _x3          = _mm_movehl_ps(_x3,_y2);\
+    _t22         = _mm_movelh_ps(_t15,_t16);\
+    _t16         = _mm_movehl_ps(_t16,_t15);\
+    _t23         = _mm_movelh_ps(_z3,_y4);\
+    _y4          = _mm_movehl_ps(_y4,_z3);\
+    _t24         = _mm_movelh_ps(_t17,_t18);\
+    _t18         = _mm_movehl_ps(_t18,_t17);\
+    _t1          = _mm_loadu_ps(ptrA);\
+    _t2          = _mm_loadu_ps(ptrA+4);\
+    _t3          = _mm_loadu_ps(ptrA+8);\
+    _t1          = _mm_sub_ps(_t1,_t19);\
+    _t2          = _mm_sub_ps(_t2,_t21);\
+    _t3          = _mm_sub_ps(_t3,_t23);\
+    _mm_storeu_ps(ptrA,_t1);\
+    _mm_storeu_ps(ptrA+4,_t2);\
+    _mm_storeu_ps(ptrA+8,_t3);\
+    _t4          = _mm_loadu_ps(ptrB);\
+    _t5          = _mm_loadu_ps(ptrB+4);\
+    _t6          = _mm_loadu_ps(ptrB+8);\
+    _t4          = _mm_sub_ps(_t4,_z1);\
+    _t5          = _mm_sub_ps(_t5,_x3);\
+    _t6          = _mm_sub_ps(_t6,_y4);\
+    _mm_storeu_ps(ptrB,_t4);\
+    _mm_storeu_ps(ptrB+4,_t5);\
+    _mm_storeu_ps(ptrB+8,_t6);\
+    _t7          = _mm_loadu_ps(ptrC);\
+    _t8          = _mm_loadu_ps(ptrC+4);\
+    _t9          = _mm_loadu_ps(ptrC+8);\
+    _t7          = _mm_sub_ps(_t7,_t20);\
+    _t8          = _mm_sub_ps(_t8,_t22);\
+    _t9          = _mm_sub_ps(_t9,_t24);\
+    _mm_storeu_ps(ptrC,_t7);\
+    _mm_storeu_ps(ptrC+4,_t8);\
+    _mm_storeu_ps(ptrC+8,_t9);\
+    _t10         = _mm_loadu_ps(ptrD);\
+    _t11         = _mm_loadu_ps(ptrD+4);\
+    _t12         = _mm_loadu_ps(ptrD+8);\
+    _t10         = _mm_sub_ps(_t10,_t14);\
+    _t11         = _mm_sub_ps(_t11,_t16);\
+    _t12         = _mm_sub_ps(_t12,_t18);\
+    _mm_storeu_ps(ptrD,_t10);\
+    _mm_storeu_ps(ptrD+4,_t11);\
+    _mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
@@ -488,7 +625,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_storeu_ps(ptrD+8,t12);
 }
-
+#endif
 
 static gmx_inline void
 gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
@@ -516,6 +653,38 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4;\
+\
+    fix1 = _mm_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+    fix1 = _mm_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+    _t4 = _mm_load_ss(fshiftptr+2);\
+    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+    _t3 = _mm_permute_ps(_t3  ,_MM_SHUFFLE(1,2,0,0));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _mm_store_ss(fshiftptr+2,_t1);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -554,8 +723,43 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t1);
     _mm_storeh_pi((__m64 *)(fshiftptr),t1);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5;\
+\
+    fix1 = _mm_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm_hadd_ps(fiz3,fix4);\
+    fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+    fix1 = _mm_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+    _t5 = _mm_load_ss(fshiftptr+2);\
+    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+    _t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));\
+    _t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));\
+    _t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));\
+    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+    _t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _t5 = _mm_add_ps(_t5,_t1);\
+    _mm_store_ss(fshiftptr+2,_t5);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -598,7 +802,7 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t5);
     _mm_storeh_pi((__m64 *)(fshiftptr),t5);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -621,22 +825,4 @@ gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
 }
 
 
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_permute_ps(pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_permute_ps(pot1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
 #endif /* _kernelutil_x86_avx_128_fma_single_h_ */
index e915536c5f95844860c9899bd318a187455d4ce2..c00b6dad84608efb553e9b70546edf65055a9f7c 100644
@@ -77,7 +77,7 @@ gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double *
 
     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC),_mm_load_sd(ptrD));
-    return gmx_mm256_set_m128(t2,t1);
+    return gmx_mm256_set_m128d(t2,t1);
 }
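
gmx_mm256_set_m128d replaces the old untyped gmx_mm256_set_m128 throughout the
double-precision kernels, so __m128d halves are no longer passed where __m128
is expected. Its definition is not shown in this diff; by analogy with the
single-precision helper it is presumably:

/* Assumed definition: build a __m256d from two __m128d halves,
 * "lo" in lanes 0-1 and "hi" in lanes 2-3. */
#define gmx_mm256_set_m128d(hi, lo) \
    _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), (hi), 0x1)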
 
 
@@ -201,8 +201,8 @@ gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * g
 {
     __m256d t1,t2;
 
-    t1   = gmx_mm256_set_m128(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
-    t2   = gmx_mm256_set_m128(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
+    t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
+    t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 
     *c6  = _mm256_unpacklo_pd(t1,t2); /* c6d c6c | c6b c6a */
     *c12 = _mm256_unpackhi_pd(t1,t2); /* c12d c12c | c12b c12a */
@@ -211,10 +211,10 @@ gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * g
 
 static gmx_inline void
 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1,
-                                            __m256d * gmx_restrict y1,
-                                            __m256d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1,
+        __m256d * gmx_restrict y1,
+        __m256d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;
 
@@ -230,18 +230,18 @@ gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shif
     ty  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
     tz  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
 
-    *x1 = gmx_mm256_set_m128(tx,tx);
-    *y1 = gmx_mm256_set_m128(ty,ty);
-    *z1 = gmx_mm256_set_m128(tz,tz);
+    *x1 = gmx_mm256_set_m128d(tx,tx);
+    *y1 = gmx_mm256_set_m128d(ty,ty);
+    *z1 = gmx_mm256_set_m128d(tz,tz);
 }
 
 
 static gmx_inline void
 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+        __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+        __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;
 
@@ -265,31 +265,31 @@ gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shif
     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
-    *x1 = gmx_mm256_set_m128(tx,tx);
-    *y1 = gmx_mm256_set_m128(ty,ty);
-    *z1 = gmx_mm256_set_m128(tz,tz);
+    *x1 = gmx_mm256_set_m128d(tx,tx);
+    *y1 = gmx_mm256_set_m128d(ty,ty);
+    *z1 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
-    *x2 = gmx_mm256_set_m128(tx,tx);
-    *y2 = gmx_mm256_set_m128(ty,ty);
-    *z2 = gmx_mm256_set_m128(tz,tz);
+    *x2 = gmx_mm256_set_m128d(tx,tx);
+    *y2 = gmx_mm256_set_m128d(ty,ty);
+    *z2 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
-    *x3 = gmx_mm256_set_m128(tx,tx);
-    *y3 = gmx_mm256_set_m128(ty,ty);
-    *z3 = gmx_mm256_set_m128(tz,tz);
+    *x3 = gmx_mm256_set_m128d(tx,tx);
+    *y3 = gmx_mm256_set_m128d(ty,ty);
+    *z3 = gmx_mm256_set_m128d(tz,tz);
 }
 
 
 static gmx_inline void
 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
-                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+        __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+        __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
+        __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;
 
@@ -315,27 +315,27 @@ gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shif
     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
-    *x1 = gmx_mm256_set_m128(tx,tx);
-    *y1 = gmx_mm256_set_m128(ty,ty);
-    *z1 = gmx_mm256_set_m128(tz,tz);
+    *x1 = gmx_mm256_set_m128d(tx,tx);
+    *y1 = gmx_mm256_set_m128d(ty,ty);
+    *z1 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
-    *x2 = gmx_mm256_set_m128(tx,tx);
-    *y2 = gmx_mm256_set_m128(ty,ty);
-    *z2 = gmx_mm256_set_m128(tz,tz);
+    *x2 = gmx_mm256_set_m128d(tx,tx);
+    *y2 = gmx_mm256_set_m128d(ty,ty);
+    *z2 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
-    *x3 = gmx_mm256_set_m128(tx,tx);
-    *y3 = gmx_mm256_set_m128(ty,ty);
-    *z3 = gmx_mm256_set_m128(tz,tz);
+    *x3 = gmx_mm256_set_m128d(tx,tx);
+    *y3 = gmx_mm256_set_m128d(ty,ty);
+    *z3 = gmx_mm256_set_m128d(tz,tz);
     tx   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
     ty   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
     tz   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
-    *x4 = gmx_mm256_set_m128(tx,tx);
-    *y4 = gmx_mm256_set_m128(ty,ty);
-    *z4 = gmx_mm256_set_m128(tz,tz);
+    *x4 = gmx_mm256_set_m128d(tx,tx);
+    *y4 = gmx_mm256_set_m128d(ty,ty);
+    *z4 = gmx_mm256_set_m128d(tz,tz);
 }
 
 
@@ -352,27 +352,6 @@ gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 }
 
 
-static void
-gmx_mm256_load_2rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3;
-
-    t1            = _mm256_loadu_pd(p1);                         /* x2 z1 | y1 x1 */
-    t2            = _mm256_castpd128_pd256(_mm_loadu_pd(p1+4));  /*  -  - | z2 y2 */
-
-    *x1           = t1;
-    *y2           = t2;
-
-    t3            = gmx_mm256_unpack128hi_pd(t1,t1);
-
-    *z1           = t3;
-    *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
-    *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
-    *x2           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
-}
-
 static void
 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
@@ -408,7 +387,7 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
     t1            = _mm256_loadu_pd(p1);
     t2            = _mm256_loadu_pd(p1+4);
     t3            = _mm256_loadu_pd(p1+8);
-    
+
     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
@@ -419,7 +398,7 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
     *z1           = t4;
     *x3           = t5;
     *y4           = t6;
-    
+
     *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
     *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
     *x4           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
@@ -429,128 +408,12 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 }
 
 
-static void
-gmx_mm256_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
-{
-    __m256d tA,tB,tC;
-
-    tA           = _mm256_loadu_pd(ptrA); /*  - z1 | y1 x1 */
-    tB           = _mm256_loadu_pd(ptrB); /*  - z2 | y2 x2 */
-
-    tC           = _mm256_unpacklo_pd(tA,tB);  /* z2 z1 | x2 x1 */
-
-    *x1          = tC;
-    *y1          = _mm256_unpackhi_pd(tA,tB);
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(tC,0x1));
-}
-
-
-static void
-gmx_mm256_load_2rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3,t4,t5;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));        /*   -   -  | z2a y2a */
-    t4           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));        /*   -   -  | z2b y2b */
-    
-    t5           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-    *y2          = _mm256_unpacklo_pd(t3,t4);      /*   -   -  | y2b y2a */
-    *z2          = _mm256_unpackhi_pd(t3,t4);      /*   -   -  | z2b z2a */
-    *x1          = t5;
-    *y1          = t1;
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-}
-
-
-static void
-gmx_mm256_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
-    t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
-    t5           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8));        /*   -   -  |  -  z3a */
-    t6           = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8));        /*   -   -  |  -  z3b */
-
-    t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-
-    t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
-    t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
-
-    *z3          = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | z3b z3a */
-
-    *x1          = t7;
-    *y1          = t1;
-    *y2          = t2;
-    *z2          = t3;
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-    *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
-    *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-}
-
-
-static void
-gmx_mm256_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
-                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
-    t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
-    t5           = _mm256_loadu_pd(ptrA+8);        /*  z4a y4a | x4a z3a */
-    t6           = _mm256_loadu_pd(ptrB+8);        /*  z4b y4b | x4b z3b */
-
-    t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-
-    t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
-    t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
-
-    t4           = _mm256_unpacklo_pd(t5,t6);      /*  y4b y4a | z3b z3a */
-    t5           = _mm256_unpackhi_pd(t5,t6);      /*  z4b z4a | x4b x4a */
-
-    *x1          = t7;
-    *y1          = t1;
-    *y2          = t2;
-    *z2          = t3;
-    *z3          = t4;
-    *x4          = t5;
-
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-    *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
-    *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-    *y4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t4,0x1));;
-    *z4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
-}
-
-
-
 static void
 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 {
-     __m256d t1,t2,t3,t4,t5,t6;
+    __m256d t1,t2,t3,t4,t5,t6;
 
     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
@@ -567,40 +430,6 @@ gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
     *z1          = gmx_mm256_unpack128hi_pd(t5,t1);
 }
 
-static void
-gmx_mm256_load_2rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
-
-    t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
-    t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
-    t5           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));      /*   -   -  | z2a y2a */
-    t6           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));      /*   -   -  | z2b y2b */
-    t7           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrC+4));      /*   -   -  | z2c y2c */
-    t8           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrD+4));      /*   -   -  | z2d y2d */
-
-    t9           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t10          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-    t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
-    t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
-    t3           = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | y2b y2a */
-    t4           = _mm256_unpackhi_pd(t5,t6);      /*   -   -  | z2b z2a */
-    t5           = _mm256_unpacklo_pd(t7,t8);      /*   -   -  | y2d y2c */
-    t6           = _mm256_unpackhi_pd(t7,t8);      /*   -   -  | z2d z2c */
-
-    *x1          = gmx_mm256_unpack128lo_pd(t9,t1);
-    *y1          = gmx_mm256_unpack128lo_pd(t10,t2);
-    *z1          = gmx_mm256_unpack128hi_pd(t9,t1);
-
-    *x2          = gmx_mm256_unpack128hi_pd(t10,t2);
-    *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
-    *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
-}
 
 
 static void
@@ -705,375 +534,10 @@ gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
 
 
 
-/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
-{
-    __m256d t1,t2;
-
-    t1  = _mm256_loadu_pd(ptrA);
-    t2  = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
-    t1  = _mm256_sub_pd(t1,t2);
-    /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
-    _mm256_storeu_pd(ptrA,t1);
-}
-
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                            __m256d xyz1, __m256d xyz2, __m256d xyz3)
-{
-    __m256d t1,t2;
-    __m256d tA,tB;
-    __m128d tC;
-
-    tA   = _mm256_loadu_pd(ptrA);
-    tB   = _mm256_loadu_pd(ptrA+4);
-    tC   = _mm_load_sd(ptrA+8);
-
-    /* xyz1:  -  z1 | y1 x1 */
-    /* xyz2:  -  z2 | y2 x2 */
-    /* xyz3:  -  z3 | y3 x3 */
-
-    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
-    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
-    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
-    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
-    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
-    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
-
-    tA   = _mm256_sub_pd(tA,xyz1);
-    tB   = _mm256_sub_pd(tB,xyz2);
-    tC   = _mm_sub_sd(tC, _mm256_castpd256_pd128(t2));
-
-    _mm256_storeu_pd(ptrA,tA);
-    _mm256_storeu_pd(ptrA+4,tB);
-    _mm_store_sd(ptrA+8,tC);
-}
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                            __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
-{
-    __m256d t1,t2,t3;
-    __m256d tA,tB,tC;
-
-    tA   = _mm256_loadu_pd(ptrA);
-    tB   = _mm256_loadu_pd(ptrA+4);
-    tC   = _mm256_loadu_pd(ptrA+8);
-
-    /* xyz1:  -  z1 | y1 x1 */
-    /* xyz2:  -  z2 | y2 x2 */
-    /* xyz3:  -  z3 | y3 x3 */
-    /* xyz4:  -  z4 | y4 x4 */
-
-    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
-    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
-    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
-    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
-    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
-    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
-    xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1));  /*  z4 -  | x4 y4 */
-    t3   = _mm256_permute2f128_pd(xyz4,xyz4,0x21);    /*  x4 y4 | z4 - */
-    t3   = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0)); /* z4 y4| x4 - */
-    xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1)); /*  xz y4 | x4 z3 */
-
-    tA   = _mm256_sub_pd(tA,xyz1);
-    tB   = _mm256_sub_pd(tB,xyz2);
-    tC   = _mm256_sub_pd(tC,xyz4);
-
-    _mm256_storeu_pd(ptrA,tA);
-    _mm256_storeu_pd(ptrA+4,tB);
-    _mm256_storeu_pd(ptrA+8,tC);
-}
-
-
-
-static void
-gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1)
-{
-    __m128d t1,t2,t3;
-
-    t1           = _mm_sub_sd(_mm256_castpd256_pd128(x1),_mm_load_sd(ptrA));
-    t2           = _mm_sub_sd(_mm256_castpd256_pd128(y1),_mm_load_sd(ptrA+1));
-    t3           = _mm_sub_sd(_mm256_castpd256_pd128(z1),_mm_load_sd(ptrA+2));
-    _mm_store_sd(ptrA,t1);
-    _mm_store_sd(ptrA+1,t2);
-    _mm_store_sd(ptrA+2,t3);
-}
-
-
-static void
-gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1;
-    __m128d tA;
-    t1          = _mm256_loadu_pd(ptrA);
-    tA          = _mm_loadu_pd(ptrA+4);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2a z1a | y1a x1a */
-
-    t1          = _mm256_sub_pd(x1,t1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm_storeu_pd(ptrA+4,tA);
-}
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
-{
-    __m256d t1,t2;
-    __m128d tA;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrA+4);
-    tA          = _mm_load_sd(ptrA+8);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-    y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
-    t1          = _mm256_sub_pd(t1,x1);
-    t2          = _mm256_sub_pd(t2,y2);
-    tA          = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrA+4,t2);
-    _mm_store_sd(ptrA+8,tA);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
-{
-    __m256d t1,t2,t3;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrA+4);
-    t3          = _mm256_loadu_pd(ptrA+8);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    z3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
-    y4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-    y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
-    z3          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */
-
-    t1          = _mm256_sub_pd(t1,x1);
-    t2          = _mm256_sub_pd(t2,y2);
-    t3          = _mm256_sub_pd(t3,z3);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrA+4,t2);
-    _mm256_storeu_pd(ptrA+8,t3);
-}
-
-static void
-gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1)
-{
-    __m256d t1,t2,t3,t4;
-    __m256i mask;
-
-    t3          = _mm256_loadu_pd(ptrA);
-    t4          = _mm256_loadu_pd(ptrB);
-
-    t1          = _mm256_unpacklo_pd(x1,y1);   /*  -  - | y1a x1a */
-    t2          = _mm256_unpackhi_pd(x1,y1);   /*  -  - | y1b x1b */
-
-    t1          = gmx_mm256_unpack128lo_pd(t1,z1); /*  -  z1a | y1a x1a */
-    z1          = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
-    t2          = gmx_mm256_unpack128lo_pd(t2,z1); /* z1b z1a | y1b x1b */
-
-    /* Construct a mask without executing any data loads */
-    mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
-                                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
-
-    t3          = _mm256_sub_pd(t3,t1);
-    t4          = _mm256_sub_pd(t4,t2);
-
-    /* Careful with potentially overlapping stores, need to be masked */
-    _mm256_maskstore_pd(ptrA,mask,t3);
-    _mm256_maskstore_pd(ptrB,mask,t4);
-}
-
-static void
-gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1,t2,t5;
-    __m128d t3,t4;
-
-    t1          = _mm256_loadu_pd(ptrA); 
-    t2          = _mm256_loadu_pd(ptrB); 
-    t3          = _mm_loadu_pd(ptrA+4);
-    t4          = _mm_loadu_pd(ptrB+4);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t1          = _mm256_sub_pd(t1,z2);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
-    t4          = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));
-
-    /* Careful with potentially overlapping stores, need to be masked */
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm_storeu_pd(ptrA+4,t3);
-    _mm_storeu_pd(ptrB+4,t4);
-}
-
-static void
-gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
-{
-    __m256d t1,t2,t3,t4,t5,t6;
-    __m128d tA,tB;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB);
-    t3          = _mm256_loadu_pd(ptrA+4);
-    t4          = _mm256_loadu_pd(ptrB+4);
-    tA          = _mm_load_sd(ptrA+8);
-    tB          = _mm_load_sd(ptrB+8);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
-
-    t6          = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* - - | - z3b */
-
-    y3          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t5          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */     
-    x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
-    t1          = _mm256_sub_pd(t1,y3);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm256_sub_pd(t3,t5);  
-    t4          = _mm256_sub_pd(t4,x1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
-    tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrA+4,t3);
-    _mm256_storeu_pd(ptrB+4,t4);
-    _mm_store_sd(ptrA+8,tA);
-    _mm_store_sd(ptrB+8,tB);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB); 
-    t3          = _mm256_loadu_pd(ptrA+4);
-    t4          = _mm256_loadu_pd(ptrB+4);
-    t5          = _mm256_loadu_pd(ptrA+8);
-    t6          = _mm256_loadu_pd(ptrB+8);
-
-    t7          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
-
-    y3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
-    z3          = _mm256_unpackhi_pd(z3,x4); /*  -   -  | x4b z3b */
-    x4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
-    y4          = _mm256_unpackhi_pd(y4,z4); /*  -   -  | z4b y4b */
-
-    z4          = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t7          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
-    x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
-    x2          = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
-    y2          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */
-
-    t1          = _mm256_sub_pd(t1,z4);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm256_sub_pd(t3,t7);
-    t4          = _mm256_sub_pd(t4,x1);
-    t5          = _mm256_sub_pd(t5,x2);
-    t6          = _mm256_sub_pd(t6,y2);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrA+4,t3);
-    _mm256_storeu_pd(ptrB+4,t4);
-    _mm256_storeu_pd(ptrA+8,t5);
-    _mm256_storeu_pd(ptrB+8,t6);
-}
-
-
-
 static void
 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1)
 {
     __m256d t1,t2,tA,tB,tC,tD;
     __m256i mask;
@@ -1088,7 +552,7 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
 
     /* Construct a mask without executing any data loads */
     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
-                                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
+                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
 
     tA          = _mm256_loadu_pd(ptrA);
     tB          = _mm256_loadu_pd(ptrB);
@@ -1106,65 +570,77 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
     _mm256_maskstore_pd(ptrD,mask,tD);
 }
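
The mask construction above is worth spelling out: comparing zero to itself
with _CMP_EQ_OQ produces all-ones lanes without any memory traffic, and the
blend immediate 0x7 keeps only the three low lanes, so the masked stores can
never touch the fourth double past each pointer. As a standalone sketch:

/* Standalone sketch of the no-load mask idiom used above: all bits set
 * in the three low lanes, zero in the top lane, from register ops only. */
#include <immintrin.h>

static __m256i
make_xyz_store_mask(void)
{
    __m256d zero = _mm256_setzero_pd();
    __m256d ones = _mm256_cmp_pd(zero, zero, _CMP_EQ_OQ); /* all-ones */
    return _mm256_castpd_si256(_mm256_blend_pd(zero, ones, 0x7));
}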
 
-static void
-gmx_mm256_decrement_2rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1,t2,t3,t4,t5,t6;
-    __m128d tA,tB,tC,tD,tE,tF;
 
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB);
-    t3          = _mm256_loadu_pd(ptrC);
-    t4          = _mm256_loadu_pd(ptrD);
-    tA          = _mm_loadu_pd(ptrA+4);
-    tB          = _mm_loadu_pd(ptrB+4);
-    tC          = _mm_loadu_pd(ptrC+4);
-    tD          = _mm_loadu_pd(ptrD+4);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
-    y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
-    x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
 
-    t6          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    z2          = gmx_mm256_unpack128hi_pd(t5,y1); /* x2c z1c | y1c x1c */
-    t5          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-    y1          = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */
-
-    tE          = _mm256_extractf128_pd(x2,0x1); /* z2c y2c */
-    tF          = _mm256_extractf128_pd(y2,0x1); /* z2d y2d */
-
-    t1          = _mm256_sub_pd(t1,t6);
-    t2          = _mm256_sub_pd(t2,t5);
-    t3          = _mm256_sub_pd(t3,z2);
-    t4          = _mm256_sub_pd(t4,y1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(x2));
-    tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(y2));
-    tC          = _mm_sub_pd(tC,tE);
-    tD          = _mm_sub_pd(tD,tF);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrC,t3);
-    _mm256_storeu_pd(ptrD,t4);
-    _mm_storeu_pd(ptrA+4,tA);
-    _mm_storeu_pd(ptrB+4,tB);
-    _mm_storeu_pd(ptrC+4,tC);
-    _mm_storeu_pd(ptrD+4,tD);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128d _tA,_tB,_tC,_tD,_tE;\
+    _t1          = _mm256_loadu_pd(ptrA);\
+    _t2          = _mm256_loadu_pd(ptrB);\
+    _t3          = _mm256_loadu_pd(ptrC);\
+    _t4          = _mm256_loadu_pd(ptrD);\
+    _t5          = _mm256_loadu_pd(ptrA+4);\
+    _t6          = _mm256_loadu_pd(ptrB+4);\
+    _t7          = _mm256_loadu_pd(ptrC+4);\
+    _t8          = _mm256_loadu_pd(ptrD+4);\
+    _tA          = _mm_load_sd(ptrA+8);\
+    _tB          = _mm_load_sd(ptrB+8);\
+    _tC          = _mm_load_sd(ptrC+8);\
+    _tD          = _mm_load_sd(ptrD+8);\
+    _t9          = _mm256_unpacklo_pd(_x1,_y1);\
+    _x1          = _mm256_unpackhi_pd(_x1,_y1);\
+    _y1          = _mm256_unpacklo_pd(_z1,_x2);\
+    _z1          = _mm256_unpackhi_pd(_z1,_x2);\
+    _x2          = _mm256_unpacklo_pd(_y2,_z2);\
+    _y2          = _mm256_unpackhi_pd(_y2,_z2);\
+    _z2          = _mm256_unpacklo_pd(_x3,_y3);\
+    _x3          = _mm256_unpackhi_pd(_x3,_y3);\
+    _t10         = gmx_mm256_unpack128lo_pd(_t9,_y1);\
+    _y3          = gmx_mm256_unpack128hi_pd(_t9,_y1);\
+    _t9          = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+    _y1          = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+    _x1          = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+    _z1          = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+    _x2          = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+    _z2          = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+    _t1          = _mm256_sub_pd(_t1,_t10);\
+    _t2          = _mm256_sub_pd(_t2,_t9);\
+    _t3          = _mm256_sub_pd(_t3,_y3);\
+    _t4          = _mm256_sub_pd(_t4,_y1);\
+    _t5          = _mm256_sub_pd(_t5,_x1);\
+    _t6          = _mm256_sub_pd(_t6,_x2);\
+    _t7          = _mm256_sub_pd(_t7,_z1);\
+    _t8          = _mm256_sub_pd(_t8,_z2);\
+    _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3));\
+    _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3),_GMX_MM_PERMUTE128D(1,1)));\
+    _tE          = _mm256_extractf128_pd(_z3,0x1);\
+    _tC          = _mm_sub_sd(_tC, _tE);\
+    _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE,_GMX_MM_PERMUTE128D(1,1)));\
+    _mm256_storeu_pd(ptrA,_t1);\
+    _mm256_storeu_pd(ptrB,_t2);\
+    _mm256_storeu_pd(ptrC,_t3);\
+    _mm256_storeu_pd(ptrD,_t4);\
+    _mm256_storeu_pd(ptrA+4,_t5);\
+    _mm256_storeu_pd(ptrB+4,_t6);\
+    _mm256_storeu_pd(ptrC+4,_t7);\
+    _mm256_storeu_pd(ptrD+4,_t8);\
+    _mm_store_sd(ptrA+8,_tA);\
+    _mm_store_sd(ptrB+8,_tB);\
+    _mm_store_sd(ptrC+8,_tC);\
+    _mm_store_sd(ptrD+8,_tD);\
 }
-
-
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1,
+        __m256d x2, __m256d y2, __m256d z2,
+        __m256d x3, __m256d y3, __m256d z3)
 {
     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128d tA,tB,tC,tD,tE;
@@ -1235,15 +711,85 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
     _mm_store_sd(ptrC+8,tC);
     _mm_store_sd(ptrD+8,tD);
 }
-
-
+#endif
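/* Editor's note -- background sketch, not part of this patch. 32-bit MSVC
 * cannot guarantee 32-byte stack alignment for by-value __m256d arguments,
 * so declaring more than three xmm/ymm parameters fails to compile
 * (typically error C2719, "formal parameter with requested alignment ...
 * won't be aligned"). Expanding the body as a macro removes the call
 * boundary entirely, and the leading-underscore temporaries make clashes
 * with identifiers at the expansion site unlikely. The shape of the
 * pattern, on a hypothetical four-argument helper:
 */
#include <immintrin.h>

#if defined(_MSC_VER) && defined(_M_IX86)
#define sum4_pd(out, a, b, c, d)                  \
    {                                             \
        __m256d _s01 = _mm256_add_pd(a, b);       \
        __m256d _s23 = _mm256_add_pd(c, d);       \
        (out)        = _mm256_add_pd(_s01, _s23); \
    }
#else
static void sum4_pd_fn(__m256d *out, __m256d a, __m256d b, __m256d c, __m256d d)
{
    *out = _mm256_add_pd(_mm256_add_pd(a, b), _mm256_add_pd(c, d));
}
#define sum4_pd(out, a, b, c, d) sum4_pd_fn(&(out), (a), (b), (c), (d))
#endif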
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{ \
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14;\
+    __m128d _tA,_tB,_tC,_tD,_tE;\
+    _t1          = _mm256_loadu_pd(ptrA);\
+    _t2          = _mm256_loadu_pd(ptrB);\
+    _t3          = _mm256_loadu_pd(ptrC);\
+    _t4          = _mm256_loadu_pd(ptrD);\
+    _t5          = _mm256_loadu_pd(ptrA+4);\
+    _t6          = _mm256_loadu_pd(ptrB+4);\
+    _t7          = _mm256_loadu_pd(ptrC+4);\
+    _t8          = _mm256_loadu_pd(ptrD+4);\
+    _t9          = _mm256_loadu_pd(ptrA+8);\
+    _t10         = _mm256_loadu_pd(ptrB+8);\
+    _t11         = _mm256_loadu_pd(ptrC+8);\
+    _t12         = _mm256_loadu_pd(ptrD+8);\
+    _t13         = _mm256_unpacklo_pd(_x1,_y1);\
+    _x1          = _mm256_unpackhi_pd(_x1,_y1);\
+    _y1          = _mm256_unpacklo_pd(_z1,_x2);\
+    _z1          = _mm256_unpackhi_pd(_z1,_x2);\
+    _x2          = _mm256_unpacklo_pd(_y2,_z2);\
+    _y2          = _mm256_unpackhi_pd(_y2,_z2);\
+    _z2          = _mm256_unpacklo_pd(_x3,_y3);\
+    _x3          = _mm256_unpackhi_pd(_x3,_y3);\
+    _y3          = _mm256_unpacklo_pd(_z3,_x4);\
+    _z3          = _mm256_unpackhi_pd(_z3,_x4);\
+    _x4          = _mm256_unpacklo_pd(_y4,_z4);\
+    _y4          = _mm256_unpackhi_pd(_y4,_z4);\
+    _z4          = gmx_mm256_unpack128lo_pd(_t13,_y1);\
+    _t13         = gmx_mm256_unpack128hi_pd(_t13,_y1);\
+    _y1          = gmx_mm256_unpack128lo_pd(_x1,_z1);\
+    _x1          = gmx_mm256_unpack128hi_pd(_x1,_z1);\
+    _z1          = gmx_mm256_unpack128lo_pd(_x2,_z2);\
+    _x2          = gmx_mm256_unpack128hi_pd(_x2,_z2);\
+    _z2          = gmx_mm256_unpack128lo_pd(_y2,_x3);\
+    _y2          = gmx_mm256_unpack128hi_pd(_y2,_x3);\
+    _x3          = gmx_mm256_unpack128lo_pd(_y3,_x4);\
+    _y3          = gmx_mm256_unpack128hi_pd(_y3,_x4);\
+    _x4          = gmx_mm256_unpack128lo_pd(_z3,_y4);\
+    _z3          = gmx_mm256_unpack128hi_pd(_z3,_y4);\
+    _t1          = _mm256_sub_pd(_t1,_z4);\
+    _t2          = _mm256_sub_pd(_t2,_y1);\
+    _t3          = _mm256_sub_pd(_t3,_t13);\
+    _t4          = _mm256_sub_pd(_t4,_x1);\
+    _t5          = _mm256_sub_pd(_t5,_z1);\
+    _t6          = _mm256_sub_pd(_t6,_z2);\
+    _t7          = _mm256_sub_pd(_t7,_x2);\
+    _t8          = _mm256_sub_pd(_t8,_y2);\
+    _t9          = _mm256_sub_pd(_t9,_x3);\
+    _t10         = _mm256_sub_pd(_t10,_x4);\
+    _t11         = _mm256_sub_pd(_t11,_y3);\
+    _t12         = _mm256_sub_pd(_t12,_z3);\
+    _mm256_storeu_pd(ptrA,_t1);\
+    _mm256_storeu_pd(ptrB,_t2);\
+    _mm256_storeu_pd(ptrC,_t3);\
+    _mm256_storeu_pd(ptrD,_t4);\
+    _mm256_storeu_pd(ptrA+4,_t5);\
+    _mm256_storeu_pd(ptrB+4,_t6);\
+    _mm256_storeu_pd(ptrC+4,_t7);\
+    _mm256_storeu_pd(ptrD+4,_t8);\
+    _mm256_storeu_pd(ptrA+8,_t9);\
+    _mm256_storeu_pd(ptrB+8,_t10);\
+    _mm256_storeu_pd(ptrC+8,_t11);\
+    _mm256_storeu_pd(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1,
+        __m256d x2, __m256d y2, __m256d z2,
+        __m256d x3, __m256d y3, __m256d z3,
+        __m256d x4, __m256d y4, __m256d z4)
 {
     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
     __m128d tA,tB,tC,tD,tE;
@@ -1314,6 +860,7 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
     _mm256_storeu_pd(ptrC+8,t11);
     _mm256_storeu_pd(ptrD+8,t12);
 }
+#endif
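/* Editor's note -- illustrative sketch, not part of this patch. The
 * unpacklo/unpackhi plus 128-bit-lane unpack sequences in these decrement
 * functions are instances of the standard AVX 4x4 double transpose, which
 * converts four per-particle xyz registers (AoS) into per-coordinate
 * registers (SoA) and back. The bare idiom, using the raw intrinsic that
 * the gmx_mm256_unpack128{lo,hi}_pd helpers presumably wrap:
 */
#include <immintrin.h>

static void transpose4x4_pd(__m256d r[4])
{
    __m256d t0 = _mm256_unpacklo_pd(r[0], r[1]); /* r1[2] r0[2] | r1[0] r0[0] */
    __m256d t1 = _mm256_unpackhi_pd(r[0], r[1]); /* r1[3] r0[3] | r1[1] r0[1] */
    __m256d t2 = _mm256_unpacklo_pd(r[2], r[3]); /* r3[2] r2[2] | r3[0] r2[0] */
    __m256d t3 = _mm256_unpackhi_pd(r[2], r[3]); /* r3[3] r2[3] | r3[1] r2[1] */
    r[0] = _mm256_permute2f128_pd(t0, t2, 0x20); /* column 0 */
    r[1] = _mm256_permute2f128_pd(t1, t3, 0x20); /* column 1 */
    r[2] = _mm256_permute2f128_pd(t0, t2, 0x31); /* column 2 */
    r[3] = _mm256_permute2f128_pd(t1, t3, 0x31); /* column 3 */
}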
 
 
 
@@ -1321,8 +868,8 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
 
 static gmx_inline void
 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
 {
     __m256d t1,t2;
     __m128d tA,tB;
@@ -1333,7 +880,7 @@ gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1));
     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1));
 
-    fix1 = gmx_mm256_set_m128(tB,tA); /* 0 fiz fiy fix */
+    fix1 = gmx_mm256_set_m128d(tB,tA); /* 0 fiz fiy fix */
 
     t1   = _mm256_loadu_pd(fptr);
     t2   = _mm256_loadu_pd(fshiftptr);
@@ -1345,63 +892,59 @@ gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     _mm256_storeu_pd(fshiftptr,t2);
 }
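/* Editor's note -- illustrative sketch, not part of this patch.
 * _mm256_hadd_pd adds pairs within each 128-bit lane only, which is why
 * the reduction above finishes with an extractf128 plus an add across the
 * low and high halves (tA, tB). A minimal full horizontal sum of a
 * __m256d using the same two steps:
 */
#include <immintrin.h>

static double hsum4_pd(__m256d v)
{
    __m128d lo = _mm256_castpd256_pd128(v);     /* v1 v0 */
    __m128d hi = _mm256_extractf128_pd(v, 0x1); /* v3 v2 */
    lo = _mm_add_pd(lo, hi);                    /* v1+v3  v0+v2 */
    lo = _mm_hadd_pd(lo, lo);                   /* total in both elements */
    return _mm_cvtsd_f64(lo);
}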
 
-static gmx_inline void
-gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
-{
-    __m256d t1,t2,t3;
-    __m128d tA,tB,tC,tD,tE;
 
-    fix1 = _mm256_hadd_pd(fix1,fiy1);
-    fiz1 = _mm256_hadd_pd(fiz1,fix2);
-    fiy2 = _mm256_hadd_pd(fiy2,fiz2);
 
-    /* Add across the two lanes by swapping and adding back */
-    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1)); /* fiy1 fix1 */
-    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
-    tC   = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
-    
-    t1   = gmx_mm256_set_m128(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
-
-    t2   = _mm256_loadu_pd(fptr);
-    tD   = _mm_loadu_pd(fptr+4);
-
-    t2   = _mm256_add_pd(t2,t1);
-    tD   = _mm_add_pd(tD,tC);
-    _mm256_storeu_pd(fptr,t2);
-    _mm_storeu_pd(fptr+4,tD);
-
-    /* Add up shift force */
-    /* t1:  fix2 fiz1 | fiy1 fix1 */
-    /* tC:              fiz2 fiy2 */
-
-    tA   = _mm256_extractf128_pd(t1,0x1); /* fix2 fiz1 */
-    tB   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1));   /* fiy2 fix2 */
-    tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1));      /*  -   fiz2 */
-    
-    tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));
-    tC   = _mm_add_sd(tC,tA);
-
-    tD   = _mm_loadu_pd(fshiftptr);
-    tE   = _mm_load_sd(fshiftptr+2);
-
-    tD   = _mm_add_pd(tD,tB);
-    tE   = _mm_add_pd(tE,tC);
-
-    _mm_storeu_pd(fshiftptr,tD);
-    _mm_store_sd(fshiftptr+2,tE);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{ \
+    __m256d _t1,_t2,_t3,_t4;\
+    __m128d _tz3,_tA,_tB,_tC,_tD;\
+    fix1 = _mm256_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm256_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd());\
+    _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+    _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+    _t1   = _mm256_add_pd(_t1,_t2);\
+    _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+    _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+    _t3   = _mm256_add_pd(_t3,_t4);\
+    _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1));\
+    _t2   = _mm256_loadu_pd(fptr);\
+    _t4   = _mm256_loadu_pd(fptr+4);\
+    _tA   = _mm_load_sd(fptr+8);\
+    _t2   = _mm256_add_pd(_t2,_t1);\
+    _t4   = _mm256_add_pd(_t4,_t3);\
+    _tA   = _mm_add_sd(_tA,_tz3);\
+    _mm256_storeu_pd(fptr,_t2);\
+    _mm256_storeu_pd(fptr+4,_t4);\
+    _mm_store_sd(fptr+8,_tA);\
+    _tB   = _mm256_extractf128_pd(_t1,0x1);\
+    _tC   = _mm256_extractf128_pd(_t3,0x1);\
+    _tz3  = _mm_add_sd(_tz3,_tB);\
+    _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3),_GMX_MM_PERMUTE128D(1,1));\
+    _tz3  = _mm_add_sd(_tz3,_tD);\
+    _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t1));\
+    _tD   = _mm_shuffle_pd(_tB,_mm256_castpd256_pd128(_t3),_MM_SHUFFLE2(0,1));\
+    _tC   = _mm_add_pd(_tC,_tD);\
+    _tA   = _mm_loadu_pd(fshiftptr);\
+    _tB   = _mm_load_sd(fshiftptr+2);\
+    _tA   = _mm_add_pd(_tA,_tC);\
+    _tB   = _mm_add_sd(_tB,_tz3);\
+    _mm_storeu_pd(fshiftptr,_tA);\
+    _mm_store_sd(fshiftptr+2,_tB);\
 }
-
-
-
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        __m256d fix2, __m256d fiy2, __m256d fiz2,
+        __m256d fix3, __m256d fiy3, __m256d fiz3,
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
 {
     __m256d t1,t2,t3,t4;
     __m128d tz3,tA,tB,tC,tD;
@@ -1459,15 +1002,66 @@ gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     _mm_storeu_pd(fshiftptr,tA);
     _mm_store_sd(fshiftptr+2,tB);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6;\
+    __m128d _tA,_tB,_tC,_tD;\
+    fix1 = _mm256_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm256_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm256_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm256_hadd_pd(fiz3,fix4);\
+    fiy4 = _mm256_hadd_pd(fiy4,fiz4);\
+    _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
+    _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
+    _t1   = _mm256_add_pd(_t1,_t2);\
+    _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+    _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+    _t3   = _mm256_add_pd(_t3,_t4);\
+    _t5   = gmx_mm256_unpack128lo_pd(fiz3,fiy4);\
+    _t6   = gmx_mm256_unpack128hi_pd(fiz3,fiy4);\
+    _t5   = _mm256_add_pd(_t5,_t6);\
+    _t2   = _mm256_loadu_pd(fptr);\
+    _t4   = _mm256_loadu_pd(fptr+4);\
+    _t6   = _mm256_loadu_pd(fptr+8);\
+    _t2   = _mm256_add_pd(_t2,_t1);\
+    _t4   = _mm256_add_pd(_t4,_t3);\
+    _t6   = _mm256_add_pd(_t6,_t5);\
+    _mm256_storeu_pd(fptr,_t2);\
+    _mm256_storeu_pd(fptr+4,_t4);\
+    _mm256_storeu_pd(fptr+8,_t6);\
+    _tA   = _mm256_extractf128_pd(_t1,0x1);\
+    _tB   = _mm256_extractf128_pd(_t3,0x1);\
+    _tC   = _mm256_extractf128_pd(_t5,0x1);\
+    _tB   = _mm_add_pd(_tB,_mm256_castpd256_pd128(_t1));\
+    _tA   = _mm_add_pd(_tA,_mm256_castpd256_pd128(_t5));\
+    _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t3));\
+    _tD   = _mm_shuffle_pd(_tA,_tC,_MM_SHUFFLE2(0,1));\
+    _tB   = _mm_add_pd(_tB,_tD);\
+    _tC   = _mm_permute_pd(_tC,_GMX_MM_PERMUTE128D(1,1));\
+    _tC   = _mm_add_sd(_tC,_tA);\
+    _tA   = _mm_loadu_pd(fshiftptr);\
+    _tD   = _mm_load_sd(fshiftptr+2);\
+    _tA   = _mm_add_pd(_tA,_tB);\
+    _tD   = _mm_add_sd(_tD,_tC);\
+    _mm_storeu_pd(fshiftptr,_tA);\
+    _mm_store_sd(fshiftptr+2,_tD);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
-                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        __m256d fix2, __m256d fiy2, __m256d fiz2,
+        __m256d fix3, __m256d fiy3, __m256d fiz3,
+        __m256d fix4, __m256d fiy4, __m256d fiz4,
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
 {
     __m256d t1,t2,t3,t4,t5,t6;
     __m128d tA,tB,tC,tD;
@@ -1530,6 +1124,7 @@ gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
     _mm_storeu_pd(fshiftptr,tA);
     _mm_store_sd(fshiftptr+2,tD);
 }
+#endif
 
 
 
@@ -1547,7 +1142,7 @@ gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
 
 static void
 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
-                      __m256d pot2, double * gmx_restrict ptrB)
+                         __m256d pot2, double * gmx_restrict ptrB)
 {
     __m128d t1,t2;
 
@@ -1561,49 +1156,4 @@ gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
 }
 
 
-static void
-gmx_mm256_update_4pot_pd(__m256d pot1, double * gmx_restrict ptrA,
-                         __m256d pot2, double * gmx_restrict ptrB,
-                         __m256d pot3, double * gmx_restrict ptrC,
-                         __m256d pot4, double * gmx_restrict ptrD)
-{
-    __m256d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF,tG,tH;
-
-    tA   = _mm_load_sd(ptrA);
-    tB   = _mm_load_sd(ptrB);
-    tC   = _mm_load_sd(ptrC);
-    tD   = _mm_load_sd(ptrD);
-
-    /* do a transpose */
-    t1   = _mm256_unpacklo_pd(pot1, pot2);   /* p2c p1c | p2a p1a */
-    t2   = _mm256_unpackhi_pd(pot1, pot2);   /* p2d p1d | p2b p1b */
-    t3   = _mm256_unpacklo_pd(pot3, pot4);   /* p4c p3c | p4a p3a */
-    t4   = _mm256_unpackhi_pd(pot3, pot4);   /* p4d p3d | p4b p3b */
-    pot1 = _mm256_permute2f128_pd(t1, t3, 0x20);   /* p4a p3a | p2a p1a */
-    pot2 = _mm256_permute2f128_pd(t2, t4, 0x20);   /* p4b p3b | p2b p1b */
-    pot3 = _mm256_permute2f128_pd(t1, t3, 0x31);   /* p4c p3c | p2c p1c */
-    pot4 = _mm256_permute2f128_pd(t2, t4, 0x31);   /* p4d p3d | p2d p1d */
-
-    pot1 = _mm256_add_pd(pot1,pot2);
-    pot3 = _mm256_add_pd(pot3,pot4);
-    pot1 = _mm256_add_pd(pot1,pot3);  /* Sum in the four elements */
-
-    tE   = _mm256_castpd256_pd128(pot1);
-    tF   = _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1));
-    tG   = _mm256_extractf128_pd(pot1,0x1);
-    tH   = _mm_permute_pd(tG,_GMX_MM_PERMUTE128D(1,1));
-
-    tA   = _mm_add_sd(tA,tE);
-    tB   = _mm_add_sd(tB,tF);
-    tC   = _mm_add_sd(tC,tG);
-    tD   = _mm_add_sd(tD,tH);
-
-       _mm_store_sd(ptrA,tA);
-       _mm_store_sd(ptrB,tB);
-       _mm_store_sd(ptrC,tC);
-       _mm_store_sd(ptrD,tD);
-}
-
-
 #endif /* _kernelutil_x86_avx_256_double_h_ */
index 2f277f714ca757e3aa189e02112625496ec93d8e..b7dd00c1bec521811bfe5ecb12cde95a37e650fa 100644 (file)
@@ -290,7 +290,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
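/* Editor's note -- illustrative sketch, not part of this patch. The
 * one-line change recurring throughout these kernel files is a type fix:
 * tmpmask0 is consumed by _mm_permute_ps, which takes a __m128, so the
 * integer compare result must be reinterpreted as packed singles (ps),
 * not packed doubles (pd). The underlying padding convention, written
 * with the raw intrinsic that gmx_mm_castsi128_ps presumably wraps:
 */
#include <immintrin.h>

static __m128 dummy_entry_mask(const int *jjnr, int jidx)
{
    /* Neighbor lists are padded to the SIMD width with negative j
     * indices, so a signed less-than-zero compare flags exactly the
     * padding slots with all-ones. */
    __m128i jnr = _mm_loadu_si128((const __m128i *)(jjnr + jidx));
    return _mm_castsi128_ps(_mm_cmplt_epi32(jnr, _mm_setzero_si128()));
}
/* Use as val = _mm_andnot_ps(mask, val) to zero the dummy entries. */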
@@ -686,7 +686,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f8ca813efd47ab603f9439bc7749caf043145b39..a17e85869b34918dc4b7e2877e84a7a6b0fb45a0 100644 (file)
@@ -419,7 +419,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1055,7 +1055,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index a390f709b266e16ffe651c8443889dea9894eccb..d3b59098411455b992f4d52a5b4e67a19b43077f 100644 (file)
@@ -724,7 +724,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1938,7 +1938,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f7ff5591fc27260cc845f713ce166431c05618d0..1dc792a0dd0f67b598bd8e127140bd1af03f39a0 100644 (file)
@@ -462,7 +462,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1179,7 +1179,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 71cbe5c1da9a359df50c559ad0ec6cbe49472c15..6cd3f6eba2fe1caa3e2d148dde2d235a6d41e8a5 100644 (file)
@@ -772,7 +772,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2075,7 +2075,7 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 79c8cb40536904a082d01961588e506fefa6ae51..ee84ab14bfd3cc240537295c50145b3b0888809e 100644 (file)
@@ -270,7 +270,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -628,7 +628,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 2c89d947447f1d09214f2813252a5a7b04689c00..c2fd96e7b96068fc9ffaac5d4365bd4e617e7a81 100644 (file)
@@ -399,7 +399,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -997,7 +997,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 576abdd9eeed4ee6eb033fb30c8b0c61a58e440d..61771c8dcb890be72fdc9cb7a784dd99f58be283 100644 (file)
@@ -704,7 +704,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1880,7 +1880,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e2aacee8cf2543708cfd65d65d1756430a0e8e11..c6846406394de297b298dd20095a425256757ff8 100644 (file)
@@ -433,7 +433,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1093,7 +1093,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d132597bf164bc3a5c0c5e7c364b18704ca4a9ed..fb78598a585f2127d2bc777dddcbc931ed10963b 100644 (file)
@@ -743,7 +743,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1989,7 +1989,7 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 78d8e130a37a6380be4e8b973521129941762937..0d1fce994cdd80ab59ebf4bb3b728c19ba3f1f7a 100644 (file)
@@ -239,7 +239,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -549,7 +549,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ff49f23a3e72c5fdd45351c97895acf115d93662..4ac6dada4978cc86042b3f5fe4b146f043e6dfab 100644 (file)
@@ -368,7 +368,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -918,7 +918,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 24cb460f5a5f88ca23716b8ab0996a5e73eea34f..6fe4a91917d78ba374261beb51d8a43c3c9cd010 100644 (file)
@@ -679,7 +679,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1822,7 +1822,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f62b5f4f40cbc31ae282837328d9141d166e2b94..f43db1f0e42a51bc56c6dabbebcbbd077338aa5f 100644 (file)
@@ -368,7 +368,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -918,7 +918,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e1d8e1edbb79906014d8d6735119d5434d5137b5..1e412c13e1bf64122968ee5bfed9c149c60e4d30 100644 (file)
@@ -679,7 +679,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1822,7 +1822,7 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f8174446f62198f3e05830c14c3b07933b025a71..647e8e72cfc57378190fbdcf17c9fd2668a57036 100644 (file)
@@ -282,7 +282,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -664,7 +664,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 663ebb50b7022702530a34f3c7e9980cd907c287..019c9b72b9de522a184191b169325ba9d3a278ff 100644 (file)
@@ -379,7 +379,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -939,7 +939,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 3b61a715684bf547e0d631063c8a9630e03614ed..fd6a4d41d6d0b39780747b193895dfe4bdec5201 100644 (file)
@@ -588,7 +588,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1540,7 +1540,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f4d7f9857421c6154e3d2f925270bd10cfa2d8b9..245acdafc588ddc42439d2f50e2eb8e4a9ef8963 100644 (file)
@@ -414,7 +414,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1038,7 +1038,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 8266aa2222b7373f7794596c8c113175ca811c9a..746f41666a0926be9b05600deb2dd07f5258841c 100644 (file)
@@ -628,7 +628,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1652,7 +1652,7 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 6b5b157302968f23c61f1d19fadd7db2ec74bbfd..570d4d09900467ebef2a151f2a10a865600efa8d 100644 (file)
@@ -246,7 +246,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -564,7 +564,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b04ff6f96177d7269edd20fd21a5f7dc435ba3a2..17e34b724ca128caed567f0b7eae58eb4fffafba 100644 (file)
@@ -343,7 +343,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -839,7 +839,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 172dc67d0b3c39eea520e9190e309dbc60fc038a..902e678fb0805fccbd2a1a6282e635606cc0abb6 100644 (file)
@@ -552,7 +552,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1440,7 +1440,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f2cc1ef6c2a9c6a5031e80f789f7b35f80589acc..390301235e7b2d1b804fb8bb75388f0d730d58dc 100644 (file)
@@ -378,7 +378,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -938,7 +938,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 456e3ed4bff28f50afe94c48f8557653a48f74c9..fed2a2f7e51942c5972887ca2a64b4cb1d13b4a9 100644 (file)
@@ -592,7 +592,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1552,7 +1552,7 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 567e96c2896b299900d4c5284a4f869d855a8b60..f51a1c6b8bfe6383be5938460d1eb97471f84867 100644 (file)
@@ -217,7 +217,7 @@ nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -491,7 +491,7 @@ nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 61e29dc4a22e536e5afeb3c78068bf1ca94005fa..a8b9b2e174d21f3d31f012c238f4fb448c7a99e6 100644 (file)
@@ -314,7 +314,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -766,7 +766,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d7eb1753ef5ed056bb6c72143a13670061db2f33..2d422b4a1e29f2b28030981e32a181cb546dd67c 100644 (file)
@@ -529,7 +529,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1388,7 +1388,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 90f00e1743c5cd013f68786ae64460392878ab70..6c1d769084a6af72121dcf6cba5db895b0d3876a 100644 (file)
@@ -314,7 +314,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -766,7 +766,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4e4bf4fcf6afbbd0e8dd31c5c46c55aebc2d28e6..b77d518579ac85d8302958df2af0fa76d23c902d 100644 (file)
@@ -529,7 +529,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1388,7 +1388,7 @@ nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 395cb97ba1141e5df7f3e6fecb859bcc414b9542..a93cd0f04bc021ae268b8dabacf9d90c8c9a3641 100644 (file)
@@ -294,7 +294,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -680,7 +680,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 8335c885fe7dd3bb166b7ad3a13b43a228b36e60..84f79ddcbc4b28d146f4a94358529087cb006df3 100644 (file)
@@ -441,7 +441,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1095,7 +1095,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 2744788517163d18cef42d70fe77c0924f593d9b..5c397f9057961d2039368f18ee1212457f5fa6be 100644 (file)
@@ -800,7 +800,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2116,7 +2116,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index bc124ee30cfbb8cd99dcd840b63b981d53c43ad2..2ada318bd1f2741cbc35aee35588fd3333344434 100644 (file)
@@ -485,7 +485,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1221,7 +1221,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 7ac56e43419ad78fd970132d8024e8b801c4763e..c2f9de398d9c3269177835d74d76d25cefa948ca 100644 (file)
@@ -849,7 +849,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2255,7 +2255,7 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 265055c7d3518d121da2b9c0c3269e0fb5e14166..b509fdf1d5c40320432a531917db33efd0f3e9ad 100644 (file)
@@ -260,7 +260,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -597,7 +597,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e16610b3cabd392db83225be42776853e34808fe..a10d4d46344701c5df4c42678426c59e8dd71ff8 100644 (file)
@@ -407,7 +407,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1012,7 +1012,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index feae0c1c6d2d010e66970dbc77802d98dc0f28ff..8a7f278ffc7b698e56bcf80d39e4026886af5973 100644 (file)
@@ -772,7 +772,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2054,7 +2054,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 31eb2b991d233f8aeaa6531001304de4001247d1..a9d75d641a0af67a6ba9a90e509591970c57db3e 100644 (file)
@@ -407,7 +407,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1012,7 +1012,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 286bdf8e55d9badb1b0e3e3cd6c8f31c590b8463..b839694452f8ba780f7e33fb5f4ebff96280158f 100644 (file)
@@ -772,7 +772,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2054,7 +2054,7 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 41afd21a60f8099c38981aaa780f0528d5d44f9c..136b320ac703b918ef51effcbe92bf77c543b81f 100644 (file)
@@ -317,7 +317,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -745,7 +745,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 13c115c3560230139846795ecb7d4ae3e0b1d159..2ed0f22d52df796783077c8672817f80efb2171d 100644 (file)
@@ -486,7 +486,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1234,7 +1234,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index a62cac646a2eda67c03144e3d039730c7cf576fa..7d0b8dc1278712f59be6bae3fa18b1d21dfcb9b9 100644 (file)
@@ -911,7 +911,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2477,7 +2477,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 0d4b027b8a6e25a548aae1791860ea4947de2153..fce1559cc0c7deb4750dd037c76a483e2adf8f06 100644 (file)
@@ -542,7 +542,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1397,7 +1397,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 81a843dd58259751612b78934801d3d572ec9590..c615fc1ee14f028ca7384e7f2a36179b2bb29ecf 100644 (file)
@@ -972,7 +972,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2653,7 +2653,7 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d823d26276e5f8df8752186dd3f22ea267b25a19..56c73d84fde8ce51f77cecf07c437af5b672c00c 100644 (file)
@@ -285,7 +285,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -662,7 +662,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 066de6a24ae39d089bc3db61326d0983504cefa6..b62d9f8193532c3ec7b09df8d986ed6ce7d163ef 100644 (file)
@@ -454,7 +454,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1151,7 +1151,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f5c09be1c17f7977ddbe5f1c784ce435d52231c4..aaa795111b5770df54775f1a73fe739b09cd5db7 100644 (file)
@@ -885,7 +885,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2415,7 +2415,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c4b8a1d02fdcd43bb3d07a2b1524403871ad7751..d47e204cab6ed015694e4c07ae4efe0273f5a200 100644 (file)
@@ -454,7 +454,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1151,7 +1151,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 6187e5654f18d7bb62c521d5e0c68bce882d3c24..9119c8adb302ea02a7ba3953942b8d11f6308bb7 100644 (file)
@@ -885,7 +885,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2415,7 +2415,7 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 107d6ee8df5e697403cf34fdbf947eaf13a82703..cf98382304774bf2572e397a6191a3a3f1951ffd 100644 (file)
@@ -308,7 +308,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -724,7 +724,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d2f97f0066b7bfe9704edbc67cb0a2d847a3202a..f49f54923a25ff9a3813f9b6a07a620e9d5c7f6b 100644 (file)
@@ -435,7 +435,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1081,7 +1081,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 7f94be251d32943b006ba4b0dd6fa682032696be..1e69cc9f835eed5f3bf76676dfa42af4e235c56d 100644 (file)
@@ -734,7 +734,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1928,7 +1928,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 3c1888a21ceb313dc04f67c30c6bedd9890b8748..e0a99dcd7c98854546ccf4bc1adc8ce8eafcd36b 100644 (file)
@@ -472,7 +472,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1187,7 +1187,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 0e846bf839ba54d6528fd3761038c689845d7349..52497e021d28de90eef28aa58db66e2692fe156b 100644 (file)
@@ -776,7 +776,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -2047,7 +2047,7 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d18c19ca1a75d73e21fe5b5cb79c4e27bfb2700d..fe651a461a346fccd672d0d0666fa7ccc9d49211 100644 (file)
@@ -274,7 +274,7 @@ nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -631,7 +631,7 @@ nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 5dac58b7137c9320288aed35c455564f11af585c..0d88493ded5973e053c072b0da6f47d7c4032b7c 100644 (file)
@@ -401,7 +401,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -988,7 +988,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 34ff8a2fe52a336e5b8fb60bc0a955c5b6bfc331..08e08e91429e5722c88ef2eea3f141331be43456 100644 (file)
@@ -700,7 +700,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1835,7 +1835,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index aad46e99e55a7af1f98eaf86628bf567d844d013..b4aaf0ab0f3932b64e8a3106f83d9da3e3bdaebd 100644 (file)
@@ -436,7 +436,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1087,7 +1087,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 500c263a6202dc13bb412824a86cb67f2f94f6b3..9378bb338760e9ca6201e3ec0b862ffd9989645f 100644 (file)
@@ -740,7 +740,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1947,7 +1947,7 @@ nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4bd38f8a371bf6bdfe386522b268506edcc193db..2fbb38d2a380bfece0643285e4801422b05442ef 100644 (file)
@@ -245,7 +245,7 @@ nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -558,7 +558,7 @@ nb_kernel_ElecEw_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 862e4d39eda2b0d229df59d29619416f515e6645..e1536bbb597b431fc51245c7c2f6675bc51a9ddf 100644 (file)
@@ -372,7 +372,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -915,7 +915,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 296f26c87f26cd9ddc7e5edc52b0d5d5f4522f5c..ef4dea7cd8bd965308665294642cf83a0f8bcd80 100644 (file)
@@ -677,7 +677,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1783,7 +1783,7 @@ nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index ccc91c007bf0553b5bb45f3d4b1967d1ce9243eb..db46c0416a8aba75429ce0924ed7bbff66d13660 100644 (file)
@@ -372,7 +372,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -915,7 +915,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 7ee0e48366ef70bb66602a6f636af83b55730841..b40eaed3b3d6ae01302ea347708c2f600f844328 100644 (file)
@@ -677,7 +677,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1783,7 +1783,7 @@ nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b3f1d800172fd3590571ca132337d144cf3a9ce9..3ab68aaedb747e5e5ea77473180ea329aa93a606 100644 (file)
@@ -327,7 +327,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -790,7 +790,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e1ca1e33786677ca487746b80810516a6d5a3a99..abb078218264047f347d7ae3230a47d36d410667 100644 (file)
@@ -299,7 +299,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -711,7 +711,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4976a5da811df01ddc5927f14d1d68b8c1db6382..27c0bb96b2ece012f609ed02305ac24fcadfe699 100644 (file)
@@ -268,7 +268,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -632,7 +632,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 621a3c6a2ea3b505d1e4f7b95ebbf82b82efedd3..61b04cc81df57fb3ed7cf7402f4e57c206156ca7 100644 (file)
@@ -266,7 +266,7 @@ nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -622,7 +622,7 @@ nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d49028ca98e44f6e4a95582b293d9575f1a1aee6..a1f2f961494d709d0123399ff86114162615ed74 100644 (file)
@@ -248,7 +248,7 @@ nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -567,7 +567,7 @@ nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 13a6a80bc1607a3f43198f01029706fd5eb480bc..cfbcdad64cfdcfe5622bea6b66192f89398568b6 100644 (file)
@@ -273,7 +273,7 @@ nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -635,7 +635,7 @@ nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 979835935eebeb3f85846452260838c44e203f33..f4395ddcaa3757fcdaa54e20c263c4e07bf14f89 100644 (file)
@@ -230,7 +230,7 @@ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -522,7 +522,7 @@ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 3d72e2377f328dd4d07037568eabd9cfed0f8f49..1e41148ecc85edf90421304cc2c3d798b1f8af10 100644 (file)
@@ -301,7 +301,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -710,7 +710,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 2692ae11bbd623bbcdb40727f04ba547c0cdfd79..dd9712d5b757957030c3324031a3e8fb5c4b13b6 100644 (file)
@@ -418,7 +418,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1041,7 +1041,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f853fa8828b13f23823aa7b4990ea1ad367dac67..c9bc102ef9538507a6216d3353af986ccc614784 100644 (file)
@@ -687,7 +687,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1810,7 +1810,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 900f85d01bf149e5bf427d67007f2b8c1a5757f7..acfceb4014c20f5bd0e7a0e3dc7333da0447d13f 100644 (file)
@@ -452,7 +452,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1138,7 +1138,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index e1345c5ec5914abb5bccedfd338dfc1c730662e1..9d0f22f85a9002eaede57c20a0e02a6b1452508f 100644 (file)
@@ -736,7 +736,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1949,7 +1949,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 2c4c3457cd9f77edc07b075084ec4f7286e03573..5cc28cea2f8dcd0662595091c964c808b8237af8 100644 (file)
@@ -269,7 +269,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -618,7 +618,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index efc7adbdb0e0563e32b46aa8dc2fc376f86817c7..00bd89dd476c1a9a0fe47426c06969513e770598 100644 (file)
@@ -386,7 +386,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -949,7 +949,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 7f5ba1dd765fcff230ddca16ed29058090847f25..9a38a58cf7b561c28dbe6cd15f7d97f5acde347e 100644 (file)
@@ -655,7 +655,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1718,7 +1718,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 61320ae427bf37165c83d2471b12941ac705929a..bae5ae38d38287a18a109dc6d24d99e2d286a26e 100644 (file)
@@ -430,7 +430,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1075,7 +1075,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 245f282a05c915993f9da58e6d80bf50663d0760..8b559aaa018f25dd756de7eb6779e52fb6361f23 100644 (file)
@@ -704,7 +704,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1857,7 +1857,7 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 6bf8d2f90bdb64540014f6d80bd132a6bc21ae2f..1969fd42d5d975e29e0238b1d43590dff2ea16ce 100644 (file)
@@ -292,7 +292,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -680,7 +680,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 54621eea3570059239f2374576a5b90c3f93ce43..2ac4ec82cd6c64b18b1c5a0da1149eb75adb7d30 100644 (file)
@@ -409,7 +409,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1011,7 +1011,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 3f68985c13da9448f466560abe438f72e27fb834..3536c0c6d5cff9b2be596a75ab30a87c1be32e32 100644 (file)
@@ -678,7 +678,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1780,7 +1780,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index df3839a75910deda3a2708314c6437fab6de4fab..e8b030407eabbc3539612cd410ca6679bc5aaf23 100644 (file)
@@ -454,7 +454,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1140,7 +1140,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f2e5252c67378fba928e098304a114e35669cedc..9749bac10120c958af6749cc436f7dcfba239f3c 100644 (file)
@@ -728,7 +728,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1922,7 +1922,7 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d45bf31e96e7f669b78ae0e3111feee8f8ce2910..52ae8ab6b86bea9a2a6321c81eb82c3b8d509ee8 100644 (file)
@@ -235,7 +235,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -535,7 +535,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1e73352fc188467def2d3a6eeae1d71991fede8f..afd12cd60bf3bab62364fd6fd4687ea9cb304057 100644 (file)
@@ -352,7 +352,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -866,7 +866,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index cf489d8cc6c8d0e9095a5dd011480c53b1fcc82f..c282de4780fb6c911e3856cae59bd9c32a2b948d 100644 (file)
@@ -627,7 +627,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1656,7 +1656,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 206da669ba7837b919347371e85f0550ae4c7a89..ec6b9558977b80ab36a5bd652b0266f323dea2cf 100644 (file)
@@ -352,7 +352,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -866,7 +866,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index a05b13c7d5b52cfe9931c86a56db229b525bc011..dcd4ff00fa7a7f5cdee552f24890fa333fb6f353 100644 (file)
@@ -627,7 +627,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1656,7 +1656,7 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index f69ca59c46607ca930c87d40c4de6c00f5f571a1..eb111057e45049a4f10ec0527766866ea4b3a688 100644 (file)
@@ -285,7 +285,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -669,7 +669,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 29ea62eb789419436cf6db4973faa1c78f0086c3..c7a2e252c63ffe1ea317818ae315818e6aa911c0 100644 (file)
@@ -382,7 +382,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -942,7 +942,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 568d423a9d8e8f15e58b1f2bea356733f126cc76..d953fb1db4c54b24fd59eccc45826d92e1246471 100644 (file)
@@ -591,7 +591,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1537,7 +1537,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1aa1faf160fe4b86be94dff8c70123ae4a14f933..63c306cf8f962038a018acdafc9767eeec058377 100644 (file)
@@ -417,7 +417,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1041,7 +1041,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index d12e99437ab2edccb653d56cb1ccc4843f4a3fe6..48766184092cfbaae15eade499d73e79717e40c5 100644 (file)
@@ -631,7 +631,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1649,7 +1649,7 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 865021124078dd94523f8f4593b974ff92dd491a..f5c84df7574cb2ae61af00a1fa42e5ae02844c2f 100644 (file)
@@ -249,7 +249,7 @@ nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -569,7 +569,7 @@ nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 26c65e2b3a4ac9e0a52ef6b86a9ebd29e7bd792f..8db375d2ea89a499be138f3891ec4756f2316f06 100644 (file)
@@ -346,7 +346,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -842,7 +842,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 0628af28f06f321971a21ebadae1528d17e87ab5..444fdc956ee1712297466ecf1481e15e537efc4c 100644 (file)
@@ -555,7 +555,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1437,7 +1437,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index b3a18371c293ee3e20b4855cdf018dde3e5ec5ad..6a3c7eaaa02bec9dc7e9f0e2cd60b05e4cc43daa 100644 (file)
@@ -381,7 +381,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -941,7 +941,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 240e6319bb23576859621ca4f5e53f98268f8ce3..bf6e04679883b780843a14d22659a0342d53a203 100644 (file)
@@ -595,7 +595,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1549,7 +1549,7 @@ nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index fde9f635b9bf7482336f58175faf4e636bcf4222..37ae1aa703d3c19b3303faf12c1a8af2b2403d07 100644 (file)
@@ -220,7 +220,7 @@ nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -496,7 +496,7 @@ nb_kernel_ElecRF_VdwNone_GeomP1P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 188ae83b91e8833edf6c34082c0d71b1701748bf..9b99483ca29d46dd95e0f66e4e15ddf358bb5210 100644 (file)
@@ -317,7 +317,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -769,7 +769,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index c8028edda0b5aae8a127e12e857ff50ac9ad1638..2c3a8b8f08d47562bcfb45a88f0fb4a87b61becf 100644 (file)
@@ -532,7 +532,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1385,7 +1385,7 @@ nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 244b59311e01113ed4c0319f26dda920fbaa9163..424e22d8b5fb440e3e18dd9a9c3bff651e99a265 100644 (file)
@@ -317,7 +317,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -769,7 +769,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 77221d582a3279e07156aa2fd034f28527d10077..6c7b730d64351f457006ff3bfab7be018112f8f9 100644 (file)
@@ -532,7 +532,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
@@ -1385,7 +1385,7 @@ nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_double
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 4e8d50428620a9bae44359ffaeddeaedbedc3dfb..9f36946353161cecea954211e171a9aae4b784f2 100644 (file)
@@ -382,7 +382,7 @@ void
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
              */
-            tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 
             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
index 1f2ab31be04f996cd47402a9cfed44e24fe12270..f3a1f6740a1b5f7d512fde1006d0b8f0005290e7 100644 (file)
@@ -199,10 +199,10 @@ gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx
 
 static gmx_inline void
 gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1,
-                                            __m256 * gmx_restrict y1,
-                                            __m256 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1,
+        __m256 * gmx_restrict y1,
+        __m256 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
 
@@ -225,10 +225,10 @@ gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift
 
 static gmx_inline void
 gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
-                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+        __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+        __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9;
@@ -273,11 +273,11 @@ gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift
 
 static gmx_inline void
 gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
-                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
-                                            __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+        __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+        __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
+        __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
@@ -334,10 +334,10 @@ gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
 {
     __m128 t1,t2,t3,t4;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
-    t1             = _mm_maskload_ps(ptrA,mask);
-    t2             = _mm_maskload_ps(ptrB,mask);
-    t3             = _mm_maskload_ps(ptrC,mask);
-    t4             = _mm_maskload_ps(ptrD,mask);
+    t1             = gmx_mm_maskload_ps(ptrA,mask);
+    t2             = gmx_mm_maskload_ps(ptrB,mask);
+    t3             = gmx_mm_maskload_ps(ptrC,mask);
+    t4             = gmx_mm_maskload_ps(ptrD,mask);
     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
     *x1           = _mm256_castps128_ps256(t1);
     *y1           = _mm256_castps128_ps256(t2);
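
Annotation: this hunk replaces the raw _mm_maskload_ps calls with a gmx_mm_maskload_ps wrapper whose definition is not part of this diff, presumably to absorb a compiler quirk in the intrinsic's mask argument type (some GCC versions declared the mask as __m128 rather than the documented __m128i). A hedged sketch of what such a wrapper could look like; the GMX_X86_AVX_GCC_MASKLOAD_BUG macro name is an assumption, not taken from this patch:

    #include <immintrin.h>

    /* A configure-time check can select a casting wrapper for compilers
     * whose _mm_maskload_ps/_mm_maskstore_ps prototypes take a __m128 mask: */
    #ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG   /* assumed config macro */
    #    define gmx_mm_maskload_ps(mem, mask)      _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
    #    define gmx_mm_maskstore_ps(mem, mask, x)  _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
    #else
    #    define gmx_mm_maskload_ps(mem, mask)      _mm_maskload_ps((mem), (mask))
    #    define gmx_mm_maskstore_ps(mem, mask, x)  _mm_maskstore_ps((mem), (mask), (x))
    #endif
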
@@ -431,10 +431,10 @@ gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
     __m256 t1,t2,t3,t4,t5,t6,t7,t8;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
 
-    t1             = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask)); /*  - zE yE xE |  - zA yA xA */
-    t2             = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask)); /*  - zF yF xF |  - zB yB xB */
-    t3             = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask)); /*  - zG yG xG |  - zC yC xC */
-    t4             = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask)); /*  - zH yH xH |  - zD yD xD */
+    t1             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask)); /*  - zE yE xE |  - zA yA xA */
+    t2             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask)); /*  - zF yF xF |  - zB yB xB */
+    t3             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask)); /*  - zG yG xG |  - zC yC xC */
+    t4             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask)); /*  - zH yH xH |  - zD yD xD */
 
     t5            = _mm256_unpacklo_ps(t1,t2); /* yF yE xF xE | yB yA xB xA */
     t6            = _mm256_unpacklo_ps(t3,t4); /* yH yG xH xG | yD yC xD xC */
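
Annotation: gmx_mm256_set_m128(hi,lo), used throughout these kernels to glue two 128-bit halves into one 256-bit register, is defined outside this diff. The conventional construction is a cast plus a lane insert; a hedged sketch under that assumption (hypothetical name, to avoid clashing with the real helper):

    #include <immintrin.h>

    /* lo goes into the lower 128-bit lane via a free cast (the upper bits
     * are undefined at that point), then hi overwrites the upper lane */
    static inline __m256
    set_m128_sketch(__m128 hi, __m128 lo)
    {
        return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
    }
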
@@ -503,7 +503,7 @@ gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
 
     t1           = _mm256_unpacklo_ps(t1,t3);  /*  -   -  z3g z3e |  -   -  z3c z3a */
     t2           = _mm256_unpacklo_ps(t2,t4);  /*  -   -  z3h z3f |  -   -  z3d z3b */
-    
+
     *z3          = _mm256_unpacklo_ps(t1,t2);
 }
 
@@ -567,7 +567,7 @@ gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
     t6           = _mm256_unpackhi_ps(t1,t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
     t7           = _mm256_unpacklo_ps(t3,t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
     t8           = _mm256_unpackhi_ps(t3,t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */
-    
+
     *z3          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
     *x4          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
     *y4          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
@@ -577,8 +577,8 @@ gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
 
 static gmx_inline void
 gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC,float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1)
+        float * gmx_restrict ptrC,float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8;
     __m128i mask;
@@ -594,30 +594,79 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     t3          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0)); /*  -  z1c y1c x1c */
     t4          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2)); /*  -  z1d y1d x1d */
 
-    t5          = _mm_maskload_ps(ptrA,mask);
-    t6          = _mm_maskload_ps(ptrB,mask);
-    t7          = _mm_maskload_ps(ptrC,mask);
-    t8          = _mm_maskload_ps(ptrD,mask);
+    t5          = gmx_mm_maskload_ps(ptrA,mask);
+    t6          = gmx_mm_maskload_ps(ptrB,mask);
+    t7          = gmx_mm_maskload_ps(ptrC,mask);
+    t8          = gmx_mm_maskload_ps(ptrD,mask);
 
     t5          = _mm_sub_ps(t5,t1);
     t6          = _mm_sub_ps(t6,t2);
     t7          = _mm_sub_ps(t7,t3);
     t8          = _mm_sub_ps(t8,t4);
 
-    _mm_maskstore_ps(ptrA,mask,t5);
-    _mm_maskstore_ps(ptrB,mask,t6);
-    _mm_maskstore_ps(ptrC,mask,t7);
-    _mm_maskstore_ps(ptrD,mask,t8);
+    gmx_mm_maskstore_ps(ptrA,mask,t5);
+    gmx_mm_maskstore_ps(ptrB,mask,t6);
+    gmx_mm_maskstore_ps(ptrC,mask,t7);
+    gmx_mm_maskstore_ps(ptrD,mask,t8);
 }
 
-
-
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                                  x1,y1,z1,x2,y2,z2,x3,y3,z3) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6;\
+    __m128 _tA,_tB,_tC,_tD;\
+\
+    _t1         = _mm256_loadu_ps(ptrA);\
+    _t2         = _mm256_loadu_ps(ptrB);\
+    _t3         = _mm256_loadu_ps(ptrC);\
+    _t4         = _mm256_loadu_ps(ptrD);\
+    _tA         = _mm_load_ss(ptrA+8);\
+    _tB         = _mm_load_ss(ptrB+8);\
+    _tC         = _mm_load_ss(ptrC+8);\
+    _tD         = _mm_load_ss(ptrD+8);\
+    _t5         = _mm256_unpacklo_ps(x1,y1);\
+    x1          = _mm256_unpackhi_ps(x1,y1);\
+    y1          = _mm256_unpacklo_ps(z1,x2);\
+    z1          = _mm256_unpackhi_ps(z1,x2);\
+    x2          = _mm256_unpacklo_ps(y2,z2);\
+    y2          = _mm256_unpackhi_ps(y2,z2);\
+    _t6         = _mm256_unpacklo_ps(x3,y3);\
+    x3          = _mm256_unpackhi_ps(x3,y3);\
+    _t5         = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+    x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+    y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1);\
+    z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+    z2          = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(1,0,1,0));\
+    _t5         = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(3,2,3,2));\
+    y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+    x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_sub_ps(_t1,z2);\
+    _t2         = _mm256_sub_ps(_t2,_t5);\
+    _t3         = _mm256_sub_ps(_t3,y1);\
+    _t4         = _mm256_sub_ps(_t4,x1);\
+    _tA         = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3));\
+    _tB         = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(1,1,1,1)));\
+    _tC         = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(2,2,2,2)));\
+    _tD         = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(3,3,3,3)));\
+    _mm256_storeu_ps(ptrA,_t1);\
+    _mm256_storeu_ps(ptrB,_t2);\
+    _mm256_storeu_ps(ptrC,_t3);\
+    _mm256_storeu_ps(ptrD,_t4);\
+    _mm_store_ss(ptrA+8,_tA);\
+    _mm_store_ss(ptrB+8,_tB);\
+    _mm_store_ss(ptrC+8,_tC);\
+    _mm_store_ss(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3)
 {
     __m256 t1,t2,t3,t4,t5,t6;
     __m128 tA,tB,tC,tD;
@@ -672,15 +721,76 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_store_ss(ptrC+8,tC);
     _mm_store_ss(ptrD+8,tD);
 }
-
-
+#endif
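
Annotation: all of the #if defined(_MSC_VER) && defined(_M_IX86) blocks introduced in this file follow one pattern: 32-bit MSVC cannot pass more than three __m128/__m256 arguments by value, so each wide-argument helper gains a statement-macro twin with call syntax identical to the function it shadows. A stripped-down illustration of the pattern (the decr4_ps helper is hypothetical, not code from this patch):

    #include <immintrin.h>

    #if defined(_MSC_VER) && defined(_M_IX86)
    /* Statement macro: same call syntax, no vector parameters to pass */
    #define decr4_ps(ptr, a, b, c, d)                                   \
    {                                                                   \
        __m128 _sum = _mm_add_ps(_mm_add_ps((a), (b)),                  \
                                 _mm_add_ps((c), (d)));                 \
        _mm_storeu_ps((ptr), _mm_sub_ps(_mm_loadu_ps(ptr), _sum));      \
    }
    #else
    /* Real function for sane compilers */
    static inline void
    decr4_ps(float *ptr, __m128 a, __m128 b, __m128 c, __m128 d)
    {
        __m128 sum = _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d));
        _mm_storeu_ps(ptr, _mm_sub_ps(_mm_loadu_ps(ptr), sum));
    }
    #endif

    /* Either way the call site reads the same: decr4_ps(f, x1, x2, x3, x4); */
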
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                                  x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5;\
+    __m128 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH;\
+\
+    _t1         = _mm256_loadu_ps(ptrA);\
+    _t2         = _mm256_loadu_ps(ptrB);\
+    _t3         = _mm256_loadu_ps(ptrC);\
+    _t4         = _mm256_loadu_ps(ptrD);\
+    _tA         = _mm_loadu_ps(ptrA+8);\
+    _tB         = _mm_loadu_ps(ptrB+8);\
+    _tC         = _mm_loadu_ps(ptrC+8);\
+    _tD         = _mm_loadu_ps(ptrD+8);\
+    _t5         = _mm256_unpacklo_ps(x1,y1);\
+    x1          = _mm256_unpackhi_ps(x1,y1);\
+    y1          = _mm256_unpacklo_ps(z1,x2);\
+    z1          = _mm256_unpackhi_ps(z1,x2);\
+    x2          = _mm256_unpacklo_ps(y2,z2);\
+    y2          = _mm256_unpackhi_ps(y2,z2);\
+    z2          = _mm256_unpacklo_ps(x3,y3);\
+    x3          = _mm256_unpackhi_ps(x3,y3);\
+    y3          = _mm256_unpacklo_ps(z3,x4);\
+    z3          = _mm256_unpackhi_ps(z3,x4);\
+    x4          = _mm256_unpacklo_ps(y4,z4);\
+    y4          = _mm256_unpackhi_ps(y4,z4);\
+    x2          = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1);\
+    x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+    y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);\
+    z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+    z2          = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(1,0,1,0));\
+    _t5         = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(3,2,3,2));\
+    y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));\
+    x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));\
+    _tE         = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(1,0,1,0));\
+    _tF         = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(3,2,3,2));\
+    _tG         = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(1,0,1,0));\
+    _tH         = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_sub_ps(_t1,z2);\
+    _t2         = _mm256_sub_ps(_t2,_t5);\
+    _t3         = _mm256_sub_ps(_t3,y1);\
+    _t4         = _mm256_sub_ps(_t4,x1);\
+    _tA         = _mm_sub_ps(_tA,_tE);\
+    _tB         = _mm_sub_ps(_tB,_tF);\
+    _tC         = _mm_sub_ps(_tC,_tG);\
+    _tD         = _mm_sub_ps(_tD,_tH);\
+    _mm256_storeu_ps(ptrA,_t1);\
+    _mm256_storeu_ps(ptrB,_t2);\
+    _mm256_storeu_ps(ptrC,_t3);\
+    _mm256_storeu_ps(ptrD,_t4);\
+    _mm_storeu_ps(ptrA+8,_tA);\
+    _mm_storeu_ps(ptrB+8,_tB);\
+    _mm_storeu_ps(ptrC+8,_tC);\
+    _mm_storeu_ps(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3,
-                                          __m256 x4, __m256 y4, __m256 z4)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3,
+        __m256 x4, __m256 y4, __m256 z4)
 {
     __m256 t1,t2,t3,t4,t5;
     __m128 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -745,15 +855,15 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_storeu_ps(ptrC+8,tC);
     _mm_storeu_ps(ptrD+8,tD);
 }
-
+#endif
 
 
 static gmx_inline void
 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1)
 {
     __m256 t1,t2,t3,t4,t5,t6;
     __m256 tA,tB,tC,tD;
@@ -762,10 +872,10 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     /* Construct a mask without executing any data loads */
     mask        = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
 
-    tA          = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask));
-    tB          = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask));
-    tC          = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask));
-    tD          = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask));
+    tA          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
+    tB          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
+    tC          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
+    tD          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
     t1          = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
     t2          = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 
@@ -779,26 +889,103 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     tC          = _mm256_sub_ps(tC,t5);
     tD          = _mm256_sub_ps(tD,t6);
 
-    _mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
-    _mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
-    _mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
-    _mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
-    _mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
-    _mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
-    _mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
-    _mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
+    gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
+    gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
+    gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
+    gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
+    gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
+    gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
+    gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
+    gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
 }
 
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+    _tA         = _mm256_loadu_ps(ptrA);\
+    _tB         = _mm256_loadu_ps(ptrB);\
+    _tC         = _mm256_loadu_ps(ptrC);\
+    _tD         = _mm256_loadu_ps(ptrD);\
+    _tE         = _mm256_loadu_ps(ptrE);\
+    _tF         = _mm256_loadu_ps(ptrF);\
+    _tG         = _mm256_loadu_ps(ptrG);\
+    _tH         = _mm256_loadu_ps(ptrH);\
+    _t1         = _mm256_unpacklo_ps(_x1,_y1);\
+    _t2         = _mm256_unpackhi_ps(_x1,_y1);\
+    _t3         = _mm256_unpacklo_ps(_z1,_x2);\
+    _t4         = _mm256_unpackhi_ps(_z1,_x2);\
+    _t5         = _mm256_unpacklo_ps(_y2,_z2);\
+    _t6         = _mm256_unpackhi_ps(_y2,_z2);\
+    _t7         = _mm256_unpacklo_ps(_x3,_y3);\
+    _t8         = _mm256_unpackhi_ps(_x3,_y3);\
+    _t9         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+    _t10        = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+    _t11        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+    _t12        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+    _t2         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+    _t3         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+    _t4         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+    _t5         = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+    _t6         = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+    _t7         = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+    _t8         = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+    _t1         = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+    _t2         = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+    _t9         = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+    _t10        = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+    _tA         = _mm256_sub_ps(_tA,_t5);\
+    _tB         = _mm256_sub_ps(_tB,_t7);\
+    _tC         = _mm256_sub_ps(_tC,_t1);\
+    _tD         = _mm256_sub_ps(_tD,_t9);\
+    _tE         = _mm256_sub_ps(_tE,_t6);\
+    _tF         = _mm256_sub_ps(_tF,_t8);\
+    _tG         = _mm256_sub_ps(_tG,_t2);\
+    _tH         = _mm256_sub_ps(_tH,_t10);\
+    _mm256_storeu_ps(ptrA,_tA);\
+    _mm256_storeu_ps(ptrB,_tB);\
+    _mm256_storeu_ps(ptrC,_tC);\
+    _mm256_storeu_ps(ptrD,_tD);\
+    _mm256_storeu_ps(ptrE,_tE);\
+    _mm256_storeu_ps(ptrF,_tF);\
+    _mm256_storeu_ps(ptrG,_tG);\
+    _mm256_storeu_ps(ptrH,_tH);\
+    _tI         = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));\
+    _tJ         = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));\
+    _tK         = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));\
+    _tL         = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));\
+    _tI         = _mm256_unpacklo_ps(_tI,_tK);\
+    _tJ         = _mm256_unpacklo_ps(_tJ,_tL);\
+    _tI         = _mm256_unpacklo_ps(_tI,_tJ);\
+    _tI         = _mm256_sub_ps(_tI,_z3);\
+    _tJ         = _mm256_permute_ps(_tI,_MM_SHUFFLE(1,1,1,1));\
+    _tK         = _mm256_permute_ps(_tI,_MM_SHUFFLE(2,2,2,2));\
+    _tL         = _mm256_permute_ps(_tI,_MM_SHUFFLE(3,3,3,3));\
+    _mm_store_ss(ptrA+8,_mm256_castps256_ps128(_tI));\
+    _mm_store_ss(ptrB+8,_mm256_castps256_ps128(_tJ));\
+    _mm_store_ss(ptrC+8,_mm256_castps256_ps128(_tK));\
+    _mm_store_ss(ptrD+8,_mm256_castps256_ps128(_tL));\
+    _mm_store_ss(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+    _mm_store_ss(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+    _mm_store_ss(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+    _mm_store_ss(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3)
 {
     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
     __m256 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -859,12 +1046,12 @@ gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm256_storeu_ps(ptrF,tF);
     _mm256_storeu_ps(ptrG,tG);
     _mm256_storeu_ps(ptrH,tH);
-    
+
     tI          = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
     tJ          = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
     tK          = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
     tL          = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));
-    
+
     tI          = _mm256_unpacklo_ps(tI,tK);  /*  -  - zG zE |  -  - zC zA */
     tJ          = _mm256_unpacklo_ps(tJ,tL);  /*  -  - zH zF |  -  - zD zB */
     tI          = _mm256_unpacklo_ps(tI,tJ);  /* zH zG zF zE | zD zC zB zA */
@@ -883,17 +1070,102 @@ gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_store_ss(ptrG+8,_mm256_extractf128_ps(tK,0x1));
     _mm_store_ss(ptrH+8,_mm256_extractf128_ps(tL,0x1));
 }
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+    _tA         = _mm256_loadu_ps(ptrA);\
+    _tB         = _mm256_loadu_ps(ptrB);\
+    _tC         = _mm256_loadu_ps(ptrC);\
+    _tD         = _mm256_loadu_ps(ptrD);\
+    _tE         = _mm256_loadu_ps(ptrE);\
+    _tF         = _mm256_loadu_ps(ptrF);\
+    _tG         = _mm256_loadu_ps(ptrG);\
+    _tH         = _mm256_loadu_ps(ptrH);\
+    _t1         = _mm256_unpacklo_ps(_x1,_y1);\
+    _t2         = _mm256_unpackhi_ps(_x1,_y1);\
+    _t3         = _mm256_unpacklo_ps(_z1,_x2);\
+    _t4         = _mm256_unpackhi_ps(_z1,_x2);\
+    _t5         = _mm256_unpacklo_ps(_y2,_z2);\
+    _t6         = _mm256_unpackhi_ps(_y2,_z2);\
+    _t7         = _mm256_unpacklo_ps(_x3,_y3);\
+    _t8         = _mm256_unpackhi_ps(_x3,_y3);\
+    _t9         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+    _t10        = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+    _t11        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+    _t12        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+    _t1         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0));\
+    _t2         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2));\
+    _t3         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0));\
+    _t4         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2));\
+    _t5         = gmx_mm256_unpack128lo_ps(_t9,_t1);\
+    _t6         = gmx_mm256_unpack128hi_ps(_t9,_t1);\
+    _t7         = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+    _t8         = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+    _t1         = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+    _t2         = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+    _t9         = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+    _t10        = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+    _tA         = _mm256_sub_ps(_tA,_t5);\
+    _tB         = _mm256_sub_ps(_tB,_t7);\
+    _tC         = _mm256_sub_ps(_tC,_t1);\
+    _tD         = _mm256_sub_ps(_tD,_t9);\
+    _tE         = _mm256_sub_ps(_tE,_t6);\
+    _tF         = _mm256_sub_ps(_tF,_t8);\
+    _tG         = _mm256_sub_ps(_tG,_t2);\
+    _tH         = _mm256_sub_ps(_tH,_t10);\
+    _mm256_storeu_ps(ptrA,_tA);\
+    _mm256_storeu_ps(ptrB,_tB);\
+    _mm256_storeu_ps(ptrC,_tC);\
+    _mm256_storeu_ps(ptrD,_tD);\
+    _mm256_storeu_ps(ptrE,_tE);\
+    _mm256_storeu_ps(ptrF,_tF);\
+    _mm256_storeu_ps(ptrG,_tG);\
+    _mm256_storeu_ps(ptrH,_tH);\
+    _tI         = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8));\
+    _tJ         = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));\
+    _tK         = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));\
+    _tL         = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));\
+    _t1         = _mm256_unpacklo_ps(_z3,_x4);\
+    _t2         = _mm256_unpackhi_ps(_z3,_x4);\
+    _t3         = _mm256_unpacklo_ps(_y4,_z4);\
+    _t4         = _mm256_unpackhi_ps(_y4,_z4);\
+    _t5         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0));\
+    _t6         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2));\
+    _t7         = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0));\
+    _t8         = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2));\
+    _tI         = _mm256_sub_ps(_tI,_t5);\
+    _tJ         = _mm256_sub_ps(_tJ,_t6);\
+    _tK         = _mm256_sub_ps(_tK,_t7);\
+    _tL         = _mm256_sub_ps(_tL,_t8);\
+    _mm_storeu_ps(ptrA+8,_mm256_castps256_ps128(_tI));\
+    _mm_storeu_ps(ptrB+8,_mm256_castps256_ps128(_tJ));\
+    _mm_storeu_ps(ptrC+8,_mm256_castps256_ps128(_tK));\
+    _mm_storeu_ps(ptrD+8,_mm256_castps256_ps128(_tL));\
+    _mm_storeu_ps(ptrE+8,_mm256_extractf128_ps(_tI,0x1));\
+    _mm_storeu_ps(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+    _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+    _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3,
-                                          __m256 x4, __m256 y4, __m256 z4)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3,
+        __m256 x4, __m256 y4, __m256 z4)
 {
     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
     __m256 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -959,7 +1231,7 @@ gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     tJ          = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));
     tK          = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));
     tL          = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));
-    
+
     t1          = _mm256_unpacklo_ps(z3,x4); /* x4f z3f x4e z3e | x4b z3b x4a z3a */
     t2          = _mm256_unpackhi_ps(z3,x4); /* x4h z3h x4g z3g | x4d z3d x4c z3c */
     t3          = _mm256_unpacklo_ps(y4,z4); /* z4f y4f z4e y4e | z4b y4b z4a y4a */
@@ -984,13 +1256,13 @@ gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(tK,0x1));
     _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(tL,0x1));
 }
-
+#endif
 
 
 static gmx_inline void
 gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                      float * gmx_restrict fptr,
-                                      float * gmx_restrict fshiftptr)
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
 {
     __m128 t1,t2,t3;
 
@@ -1000,7 +1272,7 @@ gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
 
     /* Add across the two lanes */
     t1   = _mm_add_ps(_mm256_castps256_ps128(fix1),_mm256_extractf128_ps(fix1,0x1));
-    
+
     t2 = _mm_load_ss(fptr);
     t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
     t3 = _mm_load_ss(fshiftptr);
@@ -1015,12 +1287,53 @@ gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                                 fptr,fshiftptr) \
+{ \
+    __m256 _t1,_t2,_t3;\
+    __m128 _tA,_tB,_tC;\
+\
+    fix1 = _mm256_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm256_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+    fix1 = _mm256_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+\
+    _t1  = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+    _t2  = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+    _t1  = _mm256_add_ps(_t1,_t2);\
+    _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+    _t3  = _mm256_loadu_ps(fptr);\
+    _t3  = _mm256_add_ps(_t3,_t1);\
+    _mm256_storeu_ps(fptr,_t3);\
+    _tB  = _mm_load_ss(fptr+8);\
+    _tB  = _mm_add_ss(_tB,_tA);\
+    _mm_store_ss(fptr+8,_tB);\
+\
+    _tB  = _mm256_extractf128_ps(_t1,0x1);\
+    _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+    _tB  = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+    _tC  = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+    _tB  = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+    _tA  = _mm_add_ps(_tB,_tC);\
+    _tA  = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+    _tC  = _mm_loadu_ps(fshiftptr);\
+    _tC  = _mm_add_ps(_tC,_tA);\
+    _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
-                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
-                                         float * gmx_restrict fptr,
-                                         float * gmx_restrict fshiftptr)
+        __m256 fix2, __m256 fiy2, __m256 fiz2,
+        __m256 fix3, __m256 fiy3, __m256 fiz3,
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
 {
     __m256 t1,t2,t3;
     __m128 tA,tB,tC;
@@ -1057,22 +1370,68 @@ gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
 
     tB   = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
     tA   = _mm_add_ps(tB,tC); /*  - z y x */
-    
+
     tA   = _mm_blend_ps(_mm_setzero_ps(),tA,0x7); /* 0 z y x */
 
     tC   = _mm_loadu_ps(fshiftptr);
     tC   = _mm_add_ps(tC,tA);
     _mm_storeu_ps(fshiftptr,tC);
 }
-
-
+#endif
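
Annotation: the 3-atom and 4-atom i-force updates reduce their per-lane force accumulators with chains of _mm256_hadd_ps followed by one cross-lane add. The core idiom, boiled down to a single register, looks like this (sketch with a hypothetical reduce8_ps helper, not code from this patch):

    #include <immintrin.h>
    #include <stdio.h>

    /* two hadd passes form lane-local totals, one add folds the two lanes */
    static inline float
    reduce8_ps(__m256 v)
    {
        v = _mm256_hadd_ps(v, v);
        v = _mm256_hadd_ps(v, v);
        __m128 s = _mm_add_ps(_mm256_castps256_ps128(v),
                              _mm256_extractf128_ps(v, 0x1));
        return _mm_cvtss_f32(s);
    }

    int main(void)
    {
        __m256 v = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);
        printf("%g\n", reduce8_ps(v));   /* 36 */
        return 0;
    }
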
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                                fptr,fshiftptr) \
+{ \
+    __m256 _t1,_t2,_t3; \
+    __m128 _tA,_tB,_tC; \
+\
+    fix1 = _mm256_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm256_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm256_hadd_ps(fiz3,fix4);\
+    fiy4 = _mm256_hadd_ps(fiy4,fiz4);\
+\
+    fix1 = _mm256_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm256_hadd_ps(fiz3,fiy4);\
+\
+    _t1  = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+    _t2  = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+    _t1  = _mm256_add_ps(_t1,_t2);\
+    _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+    _t3  = _mm256_loadu_ps(fptr);\
+    _t3  = _mm256_add_ps(_t3,_t1);\
+    _mm256_storeu_ps(fptr,_t3);\
+    _tB  = _mm_loadu_ps(fptr+8);\
+    _tB  = _mm_add_ps(_tB,_tA);\
+    _mm_storeu_ps(fptr+8,_tB);\
+\
+    _tB  = _mm256_extractf128_ps(_t1,0x1);\
+    _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+    _tB  = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+    _tC  = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+    _tA  = _mm_permute_ps(_tA,_MM_SHUFFLE(0,3,2,1));\
+    _tB  = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+    _tA  = _mm_add_ps(_tA,_tC);\
+    _tA  = _mm_add_ps(_tA,_tB);\
+    _tA  = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+    _tC  = _mm_loadu_ps(fshiftptr);\
+    _tC  = _mm_add_ps(_tC,_tA);\
+    _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
-                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
-                                         __m256 fix4, __m256 fiy4, __m256 fiz4,
-                                         float * gmx_restrict fptr,
-                                         float * gmx_restrict fshiftptr)
+        __m256 fix2, __m256 fiy2, __m256 fiz2,
+        __m256 fix3, __m256 fiy3, __m256 fiz3,
+        __m256 fix4, __m256 fiy4, __m256 fiz4,
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
 {
     __m256 t1,t2,t3;
     __m128 tA,tB,tC;
@@ -1120,6 +1479,7 @@ gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
     tC   = _mm_add_ps(tC,tA);
     _mm_storeu_ps(fshiftptr,tC);
 }
+#endif
 
 
 
@@ -1153,26 +1513,4 @@ gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
 }
 
 
-static gmx_inline void
-gmx_mm256_update_4pot_ps(__m256 pot1, float * gmx_restrict ptrA,
-                         __m256 pot2, float * gmx_restrict ptrB,
-                         __m256 pot3, float * gmx_restrict ptrC,
-                         __m256 pot4, float * gmx_restrict ptrD)
-{
-    __m128 t1,t2,t3,t4;
-
-    pot1 = _mm256_hadd_ps(pot1,pot2);
-    pot3 = _mm256_hadd_ps(pot3,pot4);
-    pot1 = _mm256_hadd_ps(pot1,pot3);
-    t1   = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
-    t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
-    t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
-    t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
-    _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
-    _mm_store_ss(ptrC,_mm_add_ss(_mm_load_ss(ptrC),t3));
-    _mm_store_ss(ptrD,_mm_add_ss(_mm_load_ss(ptrD),t4));
-}
-
-
 #endif /* _kernelutil_x86_avx_256_single_h_ */
index 006439173d4e8011d395e3195bbdd33580354c0a..35fb80eafc4936c869567813feb2868270545f5b 100644 (file)
@@ -138,10 +138,10 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz;
 
@@ -161,10 +161,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
 
@@ -199,11 +199,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
 
@@ -247,9 +247,9 @@ static gmx_inline void
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
 }
 
 static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
 }
 
 static gmx_inline void
@@ -385,7 +385,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 /* Routines to decrement rvec in memory, typically used for j particle force updates */
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
+        __m128d xy, __m128d z)
 {
     __m128d t1,t2;
 
@@ -399,77 +399,6 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
     _mm_store_sd(ptrA+2,t2);
 }
 
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
@@ -490,6 +419,33 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_load_sd(ptrA+8);\
+    _x1          = _mm_unpacklo_pd(_x1,_y1);\
+    _z1          = _mm_unpacklo_pd(_z1,_x2);\
+    _y2          = _mm_unpacklo_pd(_y2,_z2);\
+    _x3          = _mm_unpacklo_pd(_x3,_y3);\
+    _t1          = _mm_sub_pd(_t1,_x1);\
+    _t2          = _mm_sub_pd(_t2,_z1);\
+    _t3          = _mm_sub_pd(_t3,_y2);\
+    _t4          = _mm_sub_pd(_t4,_x3);\
+    _t5          = _mm_sub_sd(_t5,_z3);\
+    _mm_storeu_pd(ptrA,_t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -521,8 +477,35 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6,t4);
     _mm_store_sd(ptrA+8,t5);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_loadu_pd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrA+10);\
+    _x1          = _mm_unpacklo_pd(_x1,_y1);\
+    _z1          = _mm_unpacklo_pd(_z1,_x2);\
+    _y2          = _mm_unpacklo_pd(_y2,_z2);\
+    _x3          = _mm_unpacklo_pd(_x3,_y3);\
+    _z3          = _mm_unpacklo_pd(_z3,_x4);\
+    _y4          = _mm_unpacklo_pd(_y4,_z4);\
+    _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+    _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -553,6 +536,8 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
 }
+#endif
+
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
@@ -581,6 +566,54 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_store_sd(ptrB+2,t4);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_load_sd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrB);\
+    _t7          = _mm_loadu_pd(ptrB+2);\
+    _t8          = _mm_loadu_pd(ptrB+4);\
+    _t9          = _mm_loadu_pd(ptrB+6);\
+    _t10         = _mm_load_sd(ptrB+8);\
+    _tA          = _mm_unpacklo_pd(_x1,_y1);\
+    _tB          = _mm_unpackhi_pd(_x1,_y1);\
+    _tC          = _mm_unpacklo_pd(_z1,_x2);\
+    _tD          = _mm_unpackhi_pd(_z1,_x2);\
+    _tE          = _mm_unpacklo_pd(_y2,_z2);\
+    _tF          = _mm_unpackhi_pd(_y2,_z2);\
+    _tG          = _mm_unpacklo_pd(_x3,_y3);\
+    _tH          = _mm_unpackhi_pd(_x3,_y3);\
+    _tI          = _mm_unpackhi_pd(_z3,_z3);\
+    _t1          = _mm_sub_pd(_t1,_tA);\
+    _t2          = _mm_sub_pd(_t2,_tC);\
+    _t3          = _mm_sub_pd(_t3,_tE);\
+    _t4          = _mm_sub_pd(_t4,_tG);\
+    _t5          = _mm_sub_sd(_t5,_z3);\
+    _t6          = _mm_sub_pd(_t6,_tB);\
+    _t7          = _mm_sub_pd(_t7,_tD);\
+    _t8          = _mm_sub_pd(_t8,_tF);\
+    _t9          = _mm_sub_pd(_t9,_tH);\
+    _t10         = _mm_sub_sd(_t10,_tI);\
+    _mm_storeu_pd(ptrA,_t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_store_sd(ptrA+8,_t5);\
+    _mm_storeu_pd(ptrB,_t6);\
+    _mm_storeu_pd(ptrB+2,_t7);\
+    _mm_storeu_pd(ptrB+4,_t8);\
+    _mm_storeu_pd(ptrB+6,_t9);\
+    _mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -634,8 +667,66 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6,t9);
     _mm_store_sd(ptrB+8,t10);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_loadu_pd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrA+10);\
+    _t7          = _mm_loadu_pd(ptrB);\
+    _t8          = _mm_loadu_pd(ptrB+2);\
+    _t9          = _mm_loadu_pd(ptrB+4);\
+    _t10         = _mm_loadu_pd(ptrB+6);\
+    _t11         = _mm_loadu_pd(ptrB+8);\
+    _t12         = _mm_loadu_pd(ptrB+10);\
+    _tA          = _mm_unpacklo_pd(_x1,_y1);\
+    _tB          = _mm_unpackhi_pd(_x1,_y1);\
+    _tC          = _mm_unpacklo_pd(_z1,_x2);\
+    _tD          = _mm_unpackhi_pd(_z1,_x2);\
+    _tE          = _mm_unpacklo_pd(_y2,_z2);\
+    _tF          = _mm_unpackhi_pd(_y2,_z2);\
+    _tG          = _mm_unpacklo_pd(_x3,_y3);\
+    _tH          = _mm_unpackhi_pd(_x3,_y3);\
+    _tI          = _mm_unpacklo_pd(_z3,_x4);\
+    _tJ          = _mm_unpackhi_pd(_z3,_x4);\
+    _tK          = _mm_unpacklo_pd(_y4,_z4);\
+    _tL          = _mm_unpackhi_pd(_y4,_z4);\
+    _t1          = _mm_sub_pd(_t1,_tA);\
+    _t2          = _mm_sub_pd(_t2,_tC);\
+    _t3          = _mm_sub_pd(_t3,_tE);\
+    _t4          = _mm_sub_pd(_t4,_tG);\
+    _t5          = _mm_sub_pd(_t5,_tI);\
+    _t6          = _mm_sub_pd(_t6,_tK);\
+    _t7          = _mm_sub_pd(_t7,_tB);\
+    _t8          = _mm_sub_pd(_t8,_tD);\
+    _t9          = _mm_sub_pd(_t9,_tF);\
+    _t10         = _mm_sub_pd(_t10,_tH);\
+    _t11         = _mm_sub_pd(_t11,_tJ);\
+    _t12         = _mm_sub_pd(_t12,_tL);\
+    _mm_storeu_pd(ptrA,  _t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_storeu_pd(ptrA+8,_t5);\
+    _mm_storeu_pd(ptrA+10,_t6);\
+    _mm_storeu_pd(ptrB,  _t7);\
+    _mm_storeu_pd(ptrB+2,_t8);\
+    _mm_storeu_pd(ptrB+4,_t9);\
+    _mm_storeu_pd(ptrB+6,_t10);\
+    _mm_storeu_pd(ptrB+8,_t11);\
+    _mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -699,6 +790,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8,t11);
     _mm_storeu_pd(ptrB+10,t12);
 }
+#endif
 
 
 
@@ -726,6 +818,39 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+    _t1 = fix3;\
+    fix3 = _mm_unpacklo_pd(fix3,fiy3);\
+    fiy3 = _mm_unpackhi_pd(_t1,fiy3);\
+    fix1 = _mm_add_pd(fix1,fiy1);\
+    fiz1 = _mm_add_pd(fiz1,fix2);\
+    fiy2 = _mm_add_pd(fiy2,fiz2);\
+    fix3 = _mm_add_pd(fix3,fiy3);\
+    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3));\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -767,8 +892,46 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
+
+
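The GMX_MM_TRANSPOSE2_PD calls in the 3-atom macro above perform in-register
2x2 transposes of double pairs; the _t1/fix3/fiy3 lines open-code the same
operation because fiz3 has no partner register. The equivalent step as a
standalone sketch (example_transpose2_pd is illustrative only):

static gmx_inline void
example_transpose2_pd(__m128d * gmx_restrict r0, __m128d * gmx_restrict r1)
{
    __m128d tmp = *r0;
    *r0 = _mm_unpacklo_pd(*r0,*r1);    /* { r0[0], r1[0] } */
    *r1 = _mm_unpackhi_pd(tmp,*r1);    /* { r0[1], r1[1] } */
}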
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+    GMX_MM_TRANSPOSE2_PD(fix3,fiy3);\
+    GMX_MM_TRANSPOSE2_PD(fiz3,fix4);\
+    GMX_MM_TRANSPOSE2_PD(fiy4,fiz4);\
+    fix1 = _mm_add_pd(fix1,fiy1);\
+    fiz1 = _mm_add_pd(fiz1,fix2);\
+    fiy2 = _mm_add_pd(fiy2,fiz2);\
+    fix3 = _mm_add_pd(fix3,fiy3);\
+    fiz3 = _mm_add_pd(fiz3,fix4);\
+    fiy4 = _mm_add_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -793,7 +956,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     fix3 = _mm_add_pd(fix3,fiy3);
     fiz3 = _mm_add_pd(fiz3,fix4);
     fiy4 = _mm_add_pd(fiy4,fiz4);
-    
+
     _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
     _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
     _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
@@ -814,7 +977,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
+#endif
 
 
 static gmx_inline void
index e0b324f582de5960fbdd5e4d624954808e9edde2..278312182055bc47f5848cedc9de2c41b7c46191 100644
@@ -31,7 +31,7 @@
 
 /* We require SSE2 now! */
 
-#include <math.h> 
+#include <math.h>
 
 #include "gmx_x86_sse2.h"
 
@@ -128,20 +128,20 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
-    
+
     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
     t3   = _mm_load_ss(xyz_shift+2);
     t4   = _mm_load_ss(xyz+2);
     t1   = _mm_add_ps(t1,t2);
     t3   = _mm_add_ss(t3,t4);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
@@ -150,30 +150,30 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     tB   = _mm_load_ss(xyz_shift+2);
-    
+
     t1   = _mm_loadu_ps(xyz);
     t2   = _mm_loadu_ps(xyz+4);
     t3   = _mm_load_ss(xyz+8);
-    
+
     tA   = _mm_movelh_ps(tA,tB);
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ss(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -188,31 +188,31 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
     tB   = _mm_load_ss(xyz_shift+2);
-    
+
     t1   = _mm_loadu_ps(xyz);
     t2   = _mm_loadu_ps(xyz+4);
     t3   = _mm_loadu_ps(xyz+8);
-    
+
     tA   = _mm_movelh_ps(tA,tB);
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ps(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -263,7 +263,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   const float * gmx_restrict ptrD,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3) 
+                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 t1,t2,t3,t4;
     t1            = _mm_loadu_ps(ptrA);
@@ -302,7 +302,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                   __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4) 
+                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 t1,t2,t3,t4;
     t1            = _mm_loadu_ps(ptrA);
@@ -373,12 +373,78 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
 
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18         = _mm_movehl_ps(_z3,_z3);\
+_t19         = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20         = _mm_movelh_ps(_x1,_z1);\
+_t21         = _mm_movehl_ps(_z1,_x1);\
+_t22         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t23         = _mm_movelh_ps(_y2,_x3);\
+_t24         = _mm_movehl_ps(_x3,_y2);\
+_t25         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_load_ss(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t20);\
+_t2          = _mm_sub_ps(_t2,_t23);\
+_t3          = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_load_ss(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_t21);\
+_t5          = _mm_sub_ps(_t5,_t24);\
+_t6          = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_load_ss(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t22);\
+_t8          = _mm_sub_ps(_t8,_t25);\
+_t9          = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_load_ss(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
-                                       __m128 x3, __m128 y3, __m128 z3) 
+                                       __m128 x3, __m128 y3, __m128 z3)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
@@ -440,15 +506,87 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_store_ss(ptrD+8,t12);
 }
-
-
+#endif
+
+
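The unpacklo/unpackhi plus movelh/movehl sequences in the macro above form a
4x4 single-precision transpose, turning component-major registers (all x, all
y, ...) into one particle-major row per pointer before the subtract-and-store.
The core step as a standalone sketch, with lanes A-D corresponding to
ptrA-ptrD (example_transpose4_ps is illustrative; the kernels inline it so
rows that are not needed can be skipped):

static gmx_inline void
example_transpose4_ps(__m128 *x, __m128 *y, __m128 *z, __m128 *w)
{
    __m128 t0 = _mm_unpacklo_ps(*x,*y);    /* xA yA xB yB */
    __m128 t1 = _mm_unpackhi_ps(*x,*y);    /* xC yC xD yD */
    __m128 t2 = _mm_unpacklo_ps(*z,*w);    /* zA wA zB wB */
    __m128 t3 = _mm_unpackhi_ps(*z,*w);    /* zC wC zD wD */
    *x = _mm_movelh_ps(t0,t2);             /* xA yA zA wA */
    *y = _mm_movehl_ps(t2,t0);             /* xB yB zB wB */
    *z = _mm_movelh_ps(t1,t3);             /* xC yC zC wC */
    *w = _mm_movehl_ps(t3,t1);             /* xD yD zD wD */
}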
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_unpackhi_ps(_z3,_x4);\
+_z3          = _mm_unpacklo_ps(_z3,_x4);\
+_t18         = _mm_unpackhi_ps(_y4,_z4);\
+_y4          = _mm_unpacklo_ps(_y4,_z4);\
+_t19         = _mm_movelh_ps(_x1,_z1);\
+_z1          = _mm_movehl_ps(_z1,_x1);\
+_t20         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t21         = _mm_movelh_ps(_y2,_x3);\
+_x3          = _mm_movehl_ps(_x3,_y2);\
+_t22         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t23         = _mm_movelh_ps(_z3,_y4);\
+_y4          = _mm_movehl_ps(_y4,_z3);\
+_t24         = _mm_movelh_ps(_t17,_t18);\
+_t18         = _mm_movehl_ps(_t18,_t17);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_loadu_ps(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t19);\
+_t2          = _mm_sub_ps(_t2,_t21);\
+_t3          = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_loadu_ps(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_z1);\
+_t5          = _mm_sub_ps(_t5,_x3);\
+_t6          = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_loadu_ps(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t20);\
+_t8          = _mm_sub_ps(_t8,_t22);\
+_t9          = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_loadu_ps(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static void
 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
                                        __m128 x3, __m128 y3, __m128 z3,
-                                       __m128 x4, __m128 y4, __m128 z4) 
+                                       __m128 x4, __m128 y4, __m128 z4)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
     __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
@@ -514,7 +652,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_storeu_ps(ptrD+8,t12);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -543,6 +681,38 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4;\
+\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _t2   = _mm_movehl_ps(_mm_setzero_ps(),fiz3);\
+    _t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));\
+    _t3   = _mm_shuffle_ps(_t2,_t2,_MM_SHUFFLE(0,0,0,1));\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,_t1)  , _mm_add_ps(_t2,_t3));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+    _t4 = _mm_load_ss(fshiftptr+2);\
+    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+    _t3 = _mm_shuffle_ps(_t3  ,_t3  ,_MM_SHUFFLE(1,2,0,0));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _mm_store_ss(fshiftptr+2,_t1);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -582,8 +752,39 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t1);
     _mm_storeh_pi((__m64 *)(fshiftptr),t1);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5;\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+    _t5 = _mm_load_ss(fshiftptr+2);\
+    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+    _t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+    _t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+    _t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _t5 = _mm_add_ps(_t5,_t1);\
+    _mm_store_ss(fshiftptr+2,_t5);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -624,7 +825,7 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_store_ss(fshiftptr+2,t5);
     _mm_storeh_pi((__m64 *)(fshiftptr),t5);
 }
-
+#endif
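In the update_iforce macros above, _MM_TRANSPOSE4_PS lines up the four j-lane
partial sums of consecutive force components so that a tree of _mm_add_ps
yields four component totals in one register, already in memory order. The
reduction idea in isolation (example_sum_across_ps is illustrative only):

static gmx_inline __m128
example_sum_across_ps(__m128 a, __m128 b, __m128 c, __m128 d)
{
    /* the transpose gathers element k of every input into register k,
     * so the vector sum leaves each input's horizontal sum in its lane */
    _MM_TRANSPOSE4_PS(a,b,c,d);
    return _mm_add_ps(_mm_add_ps(a,b),_mm_add_ps(c,d));
}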
 
 
 static void
@@ -651,22 +852,4 @@ gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
 }
 
 
-static void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
 #endif /* _kernelutil_x86_sse2_single_h_ */
index e7bb484515c65505b2963a05b71e98c29613664f..f304aa5d222f9cbefedebd244abba4041f7974de 100644
@@ -138,10 +138,10 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz;
 
@@ -161,10 +161,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
 
@@ -199,11 +199,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
 
@@ -247,9 +247,9 @@ static gmx_inline void
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
 }
 
 static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
 }
 
 static gmx_inline void
@@ -385,7 +385,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 /* Routines to decrement rvec in memory, typically used for j particle force updates */
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
+        __m128d xy, __m128d z)
 {
     __m128d t1,t2;
 
@@ -399,77 +399,6 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
     _mm_store_sd(ptrA+2,t2);
 }
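The decrement helpers in this header all share one recipe: load the current
j-particle forces, swizzle the new contributions from component order into
each particle's x,y,z memory order, subtract, and store back. The smallest
case, one rvec decremented through two pointers, as a sketch under the same
conventions (example_decrement_1rvec_2ptr_pd is illustrative only; lane 0
belongs to ptrA, lane 1 to ptrB):

static gmx_inline void
example_decrement_1rvec_2ptr_pd(double * gmx_restrict ptrA,
                                double * gmx_restrict ptrB,
                                __m128d x1, __m128d y1, __m128d z1)
{
    __m128d txyA = _mm_unpacklo_pd(x1,y1);               /* { xA, yA } */
    __m128d txyB = _mm_unpackhi_pd(x1,y1);               /* { xB, yB } */
    _mm_storeu_pd(ptrA,  _mm_sub_pd(_mm_loadu_pd(ptrA),txyA));
    _mm_store_sd(ptrA+2, _mm_sub_sd(_mm_load_sd(ptrA+2),z1));
    _mm_storeu_pd(ptrB,  _mm_sub_pd(_mm_loadu_pd(ptrB),txyB));
    _mm_store_sd(ptrB+2, _mm_sub_sd(_mm_load_sd(ptrB+2),
                                    _mm_unpackhi_pd(z1,z1)));
}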
 
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
@@ -490,6 +419,33 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_t1          = _mm_sub_pd(_t1,_x1);\
+_t2          = _mm_sub_pd(_t2,_z1);\
+_t3          = _mm_sub_pd(_t3,_y2);\
+_t4          = _mm_sub_pd(_t4,_x3);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -521,8 +477,35 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6,t4);
     _mm_store_sd(ptrA+8,t5);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_z3          = _mm_unpacklo_pd(_z3,_x4);\
+_y4          = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -553,6 +536,8 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
 }
+#endif
+
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
@@ -581,6 +566,54 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_store_sd(ptrB+2,t4);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrB);\
+_t7          = _mm_loadu_pd(ptrB+2);\
+_t8          = _mm_loadu_pd(ptrB+4);\
+_t9          = _mm_loadu_pd(ptrB+6);\
+_t10         = _mm_load_sd(ptrB+8);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpackhi_pd(_z3,_z3);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_t6          = _mm_sub_pd(_t6,_tB);\
+_t7          = _mm_sub_pd(_t7,_tD);\
+_t8          = _mm_sub_pd(_t8,_tF);\
+_t9          = _mm_sub_pd(_t9,_tH);\
+_t10         = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -634,8 +667,66 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6,t9);
     _mm_store_sd(ptrB+8,t10);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_t7          = _mm_loadu_pd(ptrB);\
+_t8          = _mm_loadu_pd(ptrB+2);\
+_t9          = _mm_loadu_pd(ptrB+4);\
+_t10         = _mm_loadu_pd(ptrB+6);\
+_t11         = _mm_loadu_pd(ptrB+8);\
+_t12         = _mm_loadu_pd(ptrB+10);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpacklo_pd(_z3,_x4);\
+_tJ          = _mm_unpackhi_pd(_z3,_x4);\
+_tK          = _mm_unpacklo_pd(_y4,_z4);\
+_tL          = _mm_unpackhi_pd(_y4,_z4);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_pd(_t5,_tI);\
+_t6          = _mm_sub_pd(_t6,_tK);\
+_t7          = _mm_sub_pd(_t7,_tB);\
+_t8          = _mm_sub_pd(_t8,_tD);\
+_t9          = _mm_sub_pd(_t9,_tF);\
+_t10         = _mm_sub_pd(_t10,_tH);\
+_t11         = _mm_sub_pd(_t11,_tJ);\
+_t12         = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -699,7 +790,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8,t11);
     _mm_storeu_pd(ptrB+10,t12);
 }
-
+#endif
 
 
 
@@ -719,6 +810,34 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
 
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -751,8 +870,39 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fix4);\
+    fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -790,8 +940,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
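The double-precision update_iforce macros above lean on _mm_hadd_pd (SSE3,
which these kernels already assume) to fold the two j lanes of adjacent
component accumulators straight into memory order:
hadd(fix1,fiy1) = { fix1[0]+fix1[1], fiy1[0]+fiy1[1] }. One such step,
sketched (example_reduce2_pd is illustrative only):

static gmx_inline void
example_reduce2_pd(__m128d fx, __m128d fy, double * gmx_restrict f)
{
    __m128d sum = _mm_hadd_pd(fx,fy);   /* { Fx, Fy } */
    _mm_storeu_pd(f, _mm_add_pd(_mm_loadu_pd(f), sum));
}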
 
 static gmx_inline void
 gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
index cee331911b3f87a7f27ab30f37e4f801a817b4f7..0f19ea8bfb8d8baa4f375461a116b0bb308f69e6 100644
 #ifndef _kernelutil_x86_sse4_1_single_h_
 #define _kernelutil_x86_sse4_1_single_h_
 
-#include <math.h> 
+#include <math.h>
 
 #include "gmx_x86_sse4_1.h"
 
 #undef gmx_restrict
-#define gmx_restrict 
+#define gmx_restrict
 
 /* Normal sum of four xmm registers */
 #define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
@@ -60,7 +60,7 @@ gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                              const float * gmx_restrict ptrD)
 {
     __m128 t1,t2;
-    
+
     t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
     t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
     return _mm_unpacklo_ps(t1,t2);
@@ -74,14 +74,14 @@ gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                               __m128 xmm1)
 {
     __m128 t2,t3,t4;
-    
-    t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               
-    t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     
-    t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1)); 
-    _mm_store_ss(ptrA,xmm1);                                           
-    _mm_store_ss(ptrB,t2);                                         
-    _mm_store_ss(ptrC,t3);                                         
-    _mm_store_ss(ptrD,t4);                                         
+
+    t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
+    t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+    t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(ptrA,xmm1);
+    _mm_store_ss(ptrB,t2);
+    _mm_store_ss(ptrC,t3);
+    _mm_store_ss(ptrD,t4);
 }
 
 /* Similar to store, but increments value in memory */
@@ -92,7 +92,7 @@ gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                   float * gmx_restrict ptrD, __m128 xmm1)
 {
     __m128 tmp;
-    
+
     tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
     tmp = _mm_add_ps(tmp,xmm1);
     gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
@@ -108,7 +108,7 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                              __m128 * gmx_restrict c12)
 {
     __m128 t1,t2,t3,t4;
-    
+
     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a  c6a */
     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b  c6b */
     t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c  c6c */
@@ -122,20 +122,20 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
 {
     __m128 t1,t2,t3,t4;
-    
+
     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
     t3   = _mm_load_ss(xyz_shift+2);
     t4   = _mm_load_ss(xyz+2);
     t1   = _mm_add_ps(t1,t2);
     t3   = _mm_add_ss(t3,t4);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
@@ -144,14 +144,14 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     tB   = _mm_load_ss(xyz_shift+2);
 
@@ -163,11 +163,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ss(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -182,31 +182,31 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
-    
+
     tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
     tB   = _mm_load_ss(xyz_shift+2);
-    
+
     t1   = _mm_loadu_ps(xyz);
     t2   = _mm_loadu_ps(xyz+4);
     t3   = _mm_loadu_ps(xyz+8);
-    
+
     tA   = _mm_movelh_ps(tA,tB);
     t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
     t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
     t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
     t1   = _mm_add_ps(t1,t4);
     t2   = _mm_add_ps(t2,t5);
     t3   = _mm_add_ps(t3,t6);
-    
+
     *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
     *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
     *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -257,7 +257,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   const float * gmx_restrict ptrD,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3) 
+                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
 {
     __m128 t1,t2,t3,t4;
     t1            = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrA ) );
@@ -296,7 +296,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                   __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                   __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                   __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4) 
+                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
 {
     __m128 t1,t2,t3,t4;
     t1            = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA) ) );
@@ -368,12 +368,78 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
 
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18         = _mm_movehl_ps(_z3,_z3);\
+_t19         = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20         = _mm_movelh_ps(_x1,_z1);\
+_t21         = _mm_movehl_ps(_z1,_x1);\
+_t22         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t23         = _mm_movelh_ps(_y2,_x3);\
+_t24         = _mm_movehl_ps(_x3,_y2);\
+_t25         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_load_ss(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t20);\
+_t2          = _mm_sub_ps(_t2,_t23);\
+_t3          = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_load_ss(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_t21);\
+_t5          = _mm_sub_ps(_t5,_t24);\
+_t6          = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_load_ss(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t22);\
+_t8          = _mm_sub_ps(_t8,_t25);\
+_t9          = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_load_ss(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
-                                       __m128 x3, __m128 y3, __m128 z3) 
+                                       __m128 x3, __m128 y3, __m128 z3)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
@@ -410,7 +476,7 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     t10         = _mm_loadu_ps(ptrD);
     t11         = _mm_loadu_ps(ptrD+4);
     t12         = _mm_load_ss(ptrD+8);
-    
+
     t1          = _mm_sub_ps(t1,t20);
     t2          = _mm_sub_ps(t2,t23);
     t3          = _mm_sub_ss(t3,z3);
@@ -436,15 +502,86 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_store_ss(ptrD+8,t12);
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_unpackhi_ps(_z3,_x4);\
+_z3          = _mm_unpacklo_ps(_z3,_x4);\
+_t18         = _mm_unpackhi_ps(_y4,_z4);\
+_y4          = _mm_unpacklo_ps(_y4,_z4);\
+_t19         = _mm_movelh_ps(_x1,_z1);\
+_z1          = _mm_movehl_ps(_z1,_x1);\
+_t20         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t21         = _mm_movelh_ps(_y2,_x3);\
+_x3          = _mm_movehl_ps(_x3,_y2);\
+_t22         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t23         = _mm_movelh_ps(_z3,_y4);\
+_y4          = _mm_movehl_ps(_y4,_z3);\
+_t24         = _mm_movelh_ps(_t17,_t18);\
+_t18         = _mm_movehl_ps(_t18,_t17);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_loadu_ps(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t19);\
+_t2          = _mm_sub_ps(_t2,_t21);\
+_t3          = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_loadu_ps(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_z1);\
+_t5          = _mm_sub_ps(_t5,_x3);\
+_t6          = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_loadu_ps(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t20);\
+_t8          = _mm_sub_ps(_t8,_t22);\
+_t9          = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_loadu_ps(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                        __m128 x1, __m128 y1, __m128 z1,
                                        __m128 x2, __m128 y2, __m128 z2,
                                        __m128 x3, __m128 y3, __m128 z3,
-                                       __m128 x4, __m128 y4, __m128 z4) 
+                                       __m128 x4, __m128 y4, __m128 z4)
 {
     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
     __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
@@ -510,7 +647,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
     _mm_storeu_ps(ptrD+4,t11);
     _mm_storeu_ps(ptrD+8,t12);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -518,27 +655,59 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       float * gmx_restrict fptr,
                                       float * gmx_restrict fshiftptr)
 {
-       __m128 t2,t3;
-       
+    __m128 t2,t3;
+
     fix1 = _mm_hadd_ps(fix1,fix1);
-       fiy1 = _mm_hadd_ps(fiy1,fiz1);
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
-    
-       t2 = _mm_load_ss(fptr);
-       t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
-       t3 = _mm_load_ss(fshiftptr);
-       t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
-       
-       t2 = _mm_add_ps(t2,fix1);
-       t3 = _mm_add_ps(t3,fix1);
-       
-       _mm_store_ss(fptr,t2);
-       _mm_storeh_pi((__m64 *)(fptr+1),t2);
-       _mm_store_ss(fshiftptr,t3);
-       _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
+    fiy1 = _mm_hadd_ps(fiy1,fiz1);
+
+    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
+
+    t2 = _mm_load_ss(fptr);
+    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
+    t3 = _mm_load_ss(fshiftptr);
+    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
+
+    t2 = _mm_add_ps(t2,fix1);
+    t3 = _mm_add_ps(t3,fix1);
+
+    _mm_store_ss(fptr,t2);
+    _mm_storeh_pi((__m64 *)(fptr+1),t2);
+    _mm_store_ss(fshiftptr,t3);
+    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+_mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+_t4 = _mm_load_ss(fshiftptr+2);\
+_t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+_t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+_t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+_t3 = _mm_shuffle_ps(_t3,_t3,_MM_SHUFFLE(1,2,0,0));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_mm_store_ss(fshiftptr+2,_t1);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -546,39 +715,74 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       float * gmx_restrict fptr,
                                       float * gmx_restrict fshiftptr)
 {
-       __m128 t1,t2,t3,t4;
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1);
-       fiz1 = _mm_hadd_ps(fiz1,fix2);
-       fiy2 = _mm_hadd_ps(fiy2,fiz2);
-       fix3 = _mm_hadd_ps(fix3,fiy3);
-       fiz3 = _mm_hadd_ps(fiz3,fiz3);
-       
-       fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
-       fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
-       fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
-    
-       _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-       _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
-       _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
-       
-       t4 = _mm_load_ss(fshiftptr+2);
-       t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-       
-       t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
-       t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
-       t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
-       t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
-    
-       t1 = _mm_add_ps(t1,t2);
-       t3 = _mm_add_ps(t3,t4);
-       t1 = _mm_add_ps(t1,t3); /* y x - z */
-       
-       _mm_store_ss(fshiftptr+2,t1);
-       _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
+    __m128 t1,t2,t3,t4;
+
+    fix1 = _mm_hadd_ps(fix1,fiy1);
+    fiz1 = _mm_hadd_ps(fiz1,fix2);
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);
+    fix3 = _mm_hadd_ps(fix3,fiy3);
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);
+
+    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+    fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
+
+    t4 = _mm_load_ss(fshiftptr+2);
+    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
+    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
+    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
+    t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
 
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3); /* y x - z */
 
+    _mm_store_ss(fshiftptr+2,t1);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fix4);\
+fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+_mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+_t5 = _mm_load_ss(fshiftptr+2);\
+_t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+_t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+_t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+_t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+_t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_t5 = _mm_add_ps(_t5,_t1);\
+_mm_store_ss(fshiftptr+2,_t5);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -587,41 +791,41 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       float * gmx_restrict fptr,
                                       float * gmx_restrict fshiftptr)
 {
-       __m128 t1,t2,t3,t4,t5;
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1);
-       fiz1 = _mm_hadd_ps(fiz1,fix2);
-       fiy2 = _mm_hadd_ps(fiy2,fiz2);
-       fix3 = _mm_hadd_ps(fix3,fiy3);
-       fiz3 = _mm_hadd_ps(fiz3,fix4);
-       fiy4 = _mm_hadd_ps(fiy4,fiz4);
-       
-       fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
-       fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
-       fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
-    
-       _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-       _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
-       _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
-       
-       t5 = _mm_load_ss(fshiftptr+2);
-       t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
-       
-       t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
-       t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
-       t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
-       t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
-       t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
-       
-       t1 = _mm_add_ps(t1,t2);
-       t3 = _mm_add_ps(t3,t4);
-       t1 = _mm_add_ps(t1,t3);
-       t5 = _mm_add_ps(t5,t1);
-       
-       _mm_store_ss(fshiftptr+2,t5);
-       _mm_storeh_pi((__m64 *)(fshiftptr),t5);
+    __m128 t1,t2,t3,t4,t5;
+
+    fix1 = _mm_hadd_ps(fix1,fiy1);
+    fiz1 = _mm_hadd_ps(fiz1,fix2);
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);
+    fix3 = _mm_hadd_ps(fix3,fiy3);
+    fiz3 = _mm_hadd_ps(fiz3,fix4);
+    fiy4 = _mm_hadd_ps(fiy4,fiz4);
+
+    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
+
+    t5 = _mm_load_ss(fshiftptr+2);
+    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
+    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
+    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
+    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
+    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
+
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3);
+    t5 = _mm_add_ps(t5,t1);
+
+    _mm_store_ss(fshiftptr+2,t5);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
 }
-
+#endif
 
 
 static gmx_inline void
@@ -636,33 +840,15 @@ static gmx_inline void
 gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                       __m128 pot2, float * gmx_restrict ptrB)
 {
-       __m128 t1,t2;
-       t1   = _mm_movehl_ps(pot2,pot1); 
-       t2   = _mm_movelh_ps(pot1,pot2); 
-       t1   = _mm_add_ps(t1,t2);       
-       t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
-       pot1 = _mm_add_ps(t1,t2);       
-       pot2 = _mm_movehl_ps(t2,pot1);
-       _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-       _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-}
-
-
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
-       _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-       _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-       _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-       _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
+    __m128 t1,t2;
+    t1   = _mm_movehl_ps(pot2,pot1);
+    t2   = _mm_movelh_ps(pot1,pot2);
+    t1   = _mm_add_ps(t1,t2);
+    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
+    pot1 = _mm_add_ps(t1,t2);
+    pot2 = _mm_movehl_ps(t2,pot1);
+    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
 }
 
 
index 70c9dde017a9be9641de8e22515d5d6a223127f6..cad77484b5eacaf138049d9422083ec24ca4c989 100644 (file)
@@ -131,7 +131,6 @@ extern t_complex *** rc_tensor_allocation(int x, int y, int z)
   t_complex ***t;
   int i,j;
 
-  snew(t,x);
   t = (t_complex ***)calloc(x,sizeof(t_complex**));
   if(!t) exit(fprintf(stderr,"\nallocation error"));
   t[0] = (t_complex **)calloc(x*y,sizeof(t_complex*));
index ace7940aa5fc761e8aff0d773e076a787c232b05..3cbf07e6aa6e0d82d288ab37c1fd7057edfea37d 100644 (file)
@@ -331,7 +331,7 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
                     /* We tried again, and this time there was a copied buffer. 
                        We use that, and indicate that we're not reading from the
                        regular buf. This case should be pretty rare.  */
-                    tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount),-1);
+                    tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount),-1);
                     tMPI_Atomic_memory_barrier_acq();
                     srcbuf=try_again_srcbuf;
                 }
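/* For reference (illustration only, not part of this change): both
 * primitives atomically add their second argument to the counter; they
 * differ only in the return value, which these call sites discard.
 *
 *   old = tMPI_Atomic_fetch_add(&v, i);    value of v before the add
 *   cur = tMPI_Atomic_add_return(&v, i);   value of v after the add
 *
 * The switch to add_return follows the coherency rationale given in the
 * updated comment further below.
 */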
@@ -354,7 +354,7 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
         {
             /* we decrement the read count; potentially releasing the buffer. */
             tMPI_Atomic_memory_barrier_rel();
-            tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
+            tMPI_Atomic_add_return( &(cev->met[rank].buf_readcount), -1);
         }
 #endif
     }
@@ -481,7 +481,7 @@ void tMPI_Wait_for_others(struct coll_env *cev, int myrank)
     else
     {
         /* wait until everybody else is done copying the original buffer. 
-           We use fetch_add because we want to be sure of coherency.
+           We use atomic add-return because we want to be sure of coherency.
            This wait is bound to be very short (otherwise it wouldn't 
            be double-buffering) so we always spin here. */
         /*tMPI_Atomic_memory_barrier_rel();*/
@@ -490,7 +490,7 @@ void tMPI_Wait_for_others(struct coll_env *cev, int myrank)
                                     -100000))
 #endif
 #if 0
-        while (tMPI_Atomic_fetch_add( &(cev->met[myrank].buf_readcount), 0) 
+        while (tMPI_Atomic_add_return( &(cev->met[myrank].buf_readcount), 0) 
                != 0)
 #endif
 #if 1
index a25c6db243452b4dfb844f14f57cc380e18b7d97..bee468ceea3fe522f374dbf7a0f633609eb369b8 100644 (file)
@@ -137,7 +137,7 @@ void* tMPI_Once_wait(tMPI_Comm comm, void* (*function)(void*), void *param,
 
         tMPI_Atomic_memory_barrier_rel();
         /* signal that we're done */
-        tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
+        tMPI_Atomic_add_return(&(cev->coll.current_sync), 1);
         /* we need to keep being in sync */
         csync->syncs++;
     }
index 7e59a300baabf3d351e7f6e09d9416b838a86433..ee3a906a0ff296afd8647136880479a4c8afe02e 100644 (file)
@@ -463,7 +463,7 @@ void tMPI_Start_threads(tmpi_bool main_returns, int N,
                 tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
             }
         }
-        /* the main thread now also runs start_fn if we don't want
+        /* the main thread also runs start_fn if we don't want
            it to return */
         if (!main_returns)
             tMPI_Thread_starter((void*)&(threads[0]));
@@ -480,12 +480,11 @@ int tMPI_Init(int *argc, char ***argv,
     tMPI_Trace_print("tMPI_Init(%p, %p, %p)", argc, argv, start_function);
 #endif
 
-
     if (TMPI_COMM_WORLD==0) /* we're the main process */
     {
         int N=0;
         tMPI_Get_N(argc, argv, "-nt", &N);
-        tMPI_Start_threads(FALSE, N, TMPI_AFFINITY_ALL_CORES, argc, argv, 
+        tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv, 
                            NULL, NULL, start_function);
     }
     else
index 46d7c47f2fc508e7fd35ca1bf68f3ed94b242120..da20b6700cdffbc1eea3dc902f393f1119f5060e 100644 (file)
@@ -1619,7 +1619,7 @@ void get_ir(const char *mdparin,const char *mdparout,
   CTYPE ("a value of -1 means: use rlist");
   RTYPE("verlet-buffer-drift", ir->verletbuf_drift,    0.005);
   CTYPE ("nblist cut-off");
-  RTYPE ("rlist",      ir->rlist,      -1);
+  RTYPE ("rlist",      ir->rlist,      1.0);
   CTYPE ("long-range cut-off for switched potentials");
   RTYPE ("rlistlong",  ir->rlistlong,  -1);
   ITYPE ("nstcalclr",  ir->nstcalclr,  -1);
@@ -1631,7 +1631,7 @@ void get_ir(const char *mdparin,const char *mdparout,
   EETYPE("coulomb-modifier",   ir->coulomb_modifier,    eintmod_names);
   CTYPE ("cut-off lengths");
   RTYPE ("rcoulomb-switch",    ir->rcoulomb_switch,    0.0);
-  RTYPE ("rcoulomb",   ir->rcoulomb,   -1);
+  RTYPE ("rcoulomb",   ir->rcoulomb,   1.0);
   CTYPE ("Relative dielectric constant for the medium and the reaction field");
   RTYPE ("epsilon-r",   ir->epsilon_r,  1.0);
   RTYPE ("epsilon-rf",  ir->epsilon_rf, 0.0);
@@ -1640,12 +1640,12 @@ void get_ir(const char *mdparin,const char *mdparout,
   EETYPE("vdw-modifier",       ir->vdw_modifier,    eintmod_names);
   CTYPE ("cut-off lengths");
   RTYPE ("rvdw-switch",        ir->rvdw_switch,        0.0);
-  RTYPE ("rvdw",       ir->rvdw,       -1);
+  RTYPE ("rvdw",       ir->rvdw,       1.0);
   CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
   EETYPE("DispCorr",    ir->eDispCorr,  edispc_names);
   CTYPE ("Extension of the potential lookup tables beyond the cut-off");
   RTYPE ("table-extension", ir->tabext, 1.0);
-  CTYPE ("Seperate tables between energy group pairs");
+  CTYPE ("Separate tables between energy group pairs");
   STYPE ("energygrp-table", egptable,   NULL);
   CTYPE ("Spacing for the PME/PPPM FFT grid");
   RTYPE ("fourierspacing", ir->fourier_spacing,0.12);
index 564d61128b09d99daa6df36b9b0b0eb33bb2618b..c4f9c6e7fe96d198a8a215fcb9932a5a1757bc7d 100644 (file)
 extern "C" {
 #endif
 
-void do_edsam(t_inputrec *ir,gmx_large_int_t step,t_mdatoms *md,
+void do_edsam(t_inputrec *ir,gmx_large_int_t step,
                      t_commrec *cr,rvec xs[],rvec v[],matrix box,gmx_edsam_t ed);
 /* Essential dynamics constraints, called from constrain() */
 
-gmx_edsam_t ed_open(int nfile,const t_filenm fnm[],unsigned long Flags,t_commrec *cr);
-/* Sets the ED input/output filenames, opens output (.edo) file */
+gmx_edsam_t ed_open(int natoms, edsamstate_t *EDstate, int nfile,const t_filenm fnm[],
+        unsigned long Flags, const output_env_t oenv, t_commrec *cr);
+/* Sets the ED input/output filenames, opens output file */
 
 void init_edsam(gmx_mtop_t *mtop,t_inputrec *ir,t_commrec *cr,
-                       gmx_edsam_t ed, rvec x[], matrix box);
+                       gmx_edsam_t ed, rvec x[], matrix box, edsamstate_t *edsamstate);
 /* Init routine for ED and flooding. Calls init_edi in a loop for every .edi-cycle 
  * contained in the input file, creates a NULL terminated list of t_edpar structures */
 
@@ -58,7 +59,7 @@ void dd_make_local_ed_indices(gmx_domdec_t *dd, gmx_edsam_t ed);
 /* Make a selection of the home atoms for the ED groups. 
  * Should be called at every domain decomposition. */
  
-void do_flood(FILE *log, t_commrec *cr, rvec x[],rvec force[], gmx_edsam_t ed,
+void do_flood(t_commrec *cr, t_inputrec *ir, rvec x[],rvec force[], gmx_edsam_t ed,
         matrix box, gmx_large_int_t step, gmx_bool bNS);
 /* Flooding - called from do_force() */
 
index d8fc34f1988bf1bffcf0ff0267e83daf155a2215..c379843876cbdd1003d3e88eed88698fea05834f 100644 (file)
@@ -87,9 +87,6 @@ gmx_densorder(int argc,char *argv[]);
 int 
 gmx_dielectric(int argc,char *argv[]);
 
-int 
-gmx_dih(int argc,char *argv[]);
-
 int 
 gmx_dipoles(int argc,char *argv[]);
 
index 25c42e1db2c7a720a635e747fd515384110e5192..6fe60f824780b5edbda95cd865103d0b1fc24dcc 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <stdio.h>
 
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -173,6 +174,39 @@ gmx_cpuid_feature           (gmx_cpuid_t                cpuid,
                              enum gmx_cpuid_feature     feature);
 
 
+/* Return pointers to CPU topology information.
+ *
+ * Important: CPU topology requires more OS support than most other
+ * functions in this file, including support for thread pinning to hardware.
+ * This means it will not work on some platforms, including e.g. Mac OS X.
+ * Thus, it is IMPERATIVE that you check the return value from this routine
+ * before doing anything with the information; the data is valid only if
+ * the return value is zero.
+ *
+ * For the returned values we have:
+ * - nprocessors         Total number of logical processors reported by OS
+ * - npackages           Usually number of CPU sockets
+ * - ncores_per_package  Number of cores in each package
+ * - nhwthreads_per_core Number of hardware threads per core; 2 for hyperthreading.
+ * - package_id          Array with the package index for each logical cpu
+ * - core_id             Array with local core index for each logical cpu
+ * - hwthread_id         Array with local hwthread index for each logical cpu
+ * - locality_order      Array with logical cpu numbers, sorted in order
+ *                       of physical and logical locality in the system.
+ *
+ * All arrays are of length nprocessors.
+ */
+int
+gmx_cpuid_topology(gmx_cpuid_t        cpuid,
+                   int *              nprocessors,
+                   int *              npackages,
+                   int *              ncores_per_package,
+                   int *              nhwthreads_per_core,
+                   const int **       package_id,
+                   const int **       core_id,
+                   const int **       hwthread_id,
+                   const int **       locality_order);
+
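+/* A minimal usage sketch (illustration only, not part of this change;
+ * assumes a cpuid handle obtained elsewhere in this API):
+ *
+ *   int        nproc, npkg, ncores, nhwt;
+ *   const int *pkg, *core, *hwt, *order;
+ *
+ *   if (gmx_cpuid_topology(cpuid, &nproc, &npkg, &ncores, &nhwt,
+ *                          &pkg, &core, &hwt, &order) == 0)
+ *   {
+ *       use the data, e.g. pkg[i] is the package of logical cpu i;
+ *   }
+ *   on a nonzero return none of the output arrays may be used.
+ */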
 /* Enumerated values for x86 SMT enabled-status. Note that this does not refer
  * to Hyper-Threading support (that is the flag GMX_CPUID_FEATURE_X86_HTT), but
  * whether Hyper-Threading is _enabled_ and _used_ in bios right now.
@@ -211,12 +245,14 @@ enum gmx_cpuid_x86_smt
  * in order not to give the impression we can detect any SMT. We haven't
  * even tested the performance on other SMT implementations, so it is not
  * obvious we shouldn't use SMT there.
+ *
+ * Note that you can get more complete topology information from
+ * gmx_cpuid_topology(), although that requires slightly more OS support.
  */
 enum gmx_cpuid_x86_smt
 gmx_cpuid_x86_smt(gmx_cpuid_t cpuid);
 
 
-
 /* Formats a text string (up to n characters) from the data structure.
  * The output will have max 80 chars between newline characters.
  */
index b47f577be08ea8cdcfc4b85cac49a84aed430e60..02390c1fcc2c942705e6228d2a76561195151b97 100644 (file)
 #ifndef _gmx_math_x86_avx_128_fma_double_h_
 #define _gmx_math_x86_avx_128_fma_double_h_
 
+#include <immintrin.h> /* AVX */
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h> /* FMA */
+#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
 #include <math.h>
 
 #include "gmx_x86_avx_128_fma.h"
index 2cd5ed91cb987e38ea40b554b22f042ace3c7c7c..074b1bad06f592700d45ea8d0591dc316d448c20 100644 (file)
  * with different settings from the same source file.
  */
 
-/* NOTE: floor and blend are NOT available with SSE2 only acceleration */
+/* NOTE: floor and blendv are NOT available with SSE2 only acceleration */
 
 #undef GMX_SIMD_WIDTH_HERE
 
 #undef gmx_epi32
 
+/* float/double SIMD register type */
 #undef gmx_mm_pr
 
 #undef gmx_load_pr
@@ -58,6 +59,7 @@
 #undef gmx_set1_pr
 #undef gmx_setzero_pr
 #undef gmx_store_pr
+/* Only used for debugging */
 #undef gmx_storeu_pr
 
 #undef gmx_add_pr
 #undef gmx_or_pr
 #undef gmx_andnot_pr
 
+/* Only used to speed up the nbnxn tabulated PME kernels */
 #undef gmx_floor_pr
+/* Only used with x86 when blendv is faster than comparison */
 #undef gmx_blendv_pr
 
 #undef gmx_movemask_pr
 
+/* Integer casts are only used for nbnxn x86 exclusion masks */
 #undef gmx_mm_castsi128_pr
+#undef gmx_mm_castsi256_pr
 
+/* Conversions only used for nbnxn x86 exclusion masks and PME table lookup */
 #undef gmx_cvttpr_epi32
 #undef gmx_cvtepi32_pr
 
 #undef gmx_calc_rsq_pr
 #undef gmx_sum4_pr
 
+/* Only required for nbnxn analytical PME kernels */
 #undef gmx_pmecorrF_pr
 #undef gmx_pmecorrV_pr
 
 
+/* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
+#undef gmx_mm_hpr
+
+#undef gmx_load_hpr
+#undef gmx_load1_hpr
+#undef gmx_store_hpr
+#undef gmx_add_hpr
+#undef gmx_sub_hpr
+
+#undef gmx_sum4_hpr
+
+#undef gmx_2hpr_to_pr
+
+
 /* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
  * the same intrinsics, with defines, can be compiled for either 128 or 256
  * bit wide SSE or AVX instructions.
 #error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
 #endif
 
+
+#ifdef GMX_X86_SSE2
+
 #ifdef GMX_MM128_HERE
 
 #define gmx_epi32  __m128i
 #endif
 
 #endif /* GMX_MM256_HERE */
+
+#endif /* GMX_X86_SSE2 */
index 13a02078b02ca95efa2b8540efaa82110d34adae..60bfd71a8530337500b643757ced6fc997d56518 100644 (file)
 #ifdef HAVE_X86INTRIN_H
 #include <x86intrin.h> /* FMA */
 #endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
 
 #include <stdio.h>
 
@@ -192,6 +196,19 @@ static int gmx_mm_check_and_reset_overflow(void)
     return sse_overflow;
 }
 
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
+
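+/* Minimal usage sketch for the wrappers above (illustration only, not part
+ * of this change). Callers always pass the mask as __m128i, matching the
+ * documented intrinsic signature; on gcc versions with the bug the macro
+ * inserts the cast to the (incorrect) __m128 parameter type:
+ *
+ *   static void masked_add(float *mem, __m128i mask, __m128 x)
+ *   {
+ *       __m128 v = gmx_mm_maskload_ps(mem, mask);
+ *       v        = _mm_add_ps(v, x);
+ *       gmx_mm_maskstore_ps(mem, mask, v);
+ *   }
+ */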
 
 
 #endif /* _gmx_x86_avx_128_fma_h_ */
index ed8ec05d1005ed4d4beb771cd86ab1aa9a630e04..360c941f6e734c62f18bea1bdd9b35d97a332f2b 100644 (file)
@@ -272,6 +272,18 @@ static int gmx_mm_check_and_reset_overflow(void)
     return sse_overflow;
 }
 
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
 
 
 #endif /* _gmx_x86_avx_256_h_ */
index c6a08cc1d90fe92a1b503f788c11e4491fb0db5a..8b9ed28563c02c7a8febea5cb18d41738c221ef0 100644 (file)
@@ -67,7 +67,6 @@ void check_multi_large_int(FILE *log,const gmx_multisim_t *ms,
  * The string name is used to print to the log file and in a fatal error
  * if the val's don't match.
  */
-
 void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
                       int nfile, const t_filenm fnm[], gmx_bool bParFn);
 /* Splits the communication into nsim separate simulations
index 0c9ed96b3ede79d75c5a860281539feab937cff8..321ed27373db13a8438ab865be853cd19100f896 100644 (file)
@@ -47,6 +47,8 @@ t_graph *mk_graph(FILE *fplog,
                         gmx_bool bShakeOnly,gmx_bool bSettle);
 /* Build a graph from an idef description. The graph can be used
  * to generate mol-shift indices.
+ * at_start and at_end should coincide with molecule boundaries,
+ * for the whole system this is simply 0 and natoms.
  * If bShakeOnly, only the connections in the shake list are used.
  * If bSettle && bShakeOnly the settles are used too.
  */
index e5415afea9cfb98cb831cff4825b25c90b64f48a..9eb6679ca9b635f209a902ef47d6cef00e78dafa 100644 (file)
@@ -96,9 +96,12 @@ typedef struct tMPI_Spinlock
    as the 486, and gcc on some Linux versions still target 80386 by default). 
   
    We also specifically check for icc, because intrinsics are not always
-   supported there. */
-#if ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
-     !defined(__INTEL_COMPILER) ) 
+   supported there.
+
+   llvm has issues with inline assembly, and (unlike gcc) it supports
+   the gcc intrinsics even in 32-bit mode */
+#if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
+      !defined(__INTEL_COMPILER) )  || defined(__llvm__) )
 #include "gcc_intrinsics.h"
 
 #else
@@ -115,7 +118,7 @@ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
     __asm__ __volatile__("lock ; xaddl %0, %1;"
                          :"=r"(i) :"m"(a->value), "0"(i) : "memory");
     return i + __i;
-}  
+}
 
 static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 {
index 221060f3b9463299382c27fb9a09b1bb839e9161..45f2ab5961582580c05f3cd93d97315a2ab6712a 100644 (file)
@@ -53,7 +53,7 @@ enum {
   efDAT, efDLG, 
   efMAP, efEPS, efMAT, efM2P,
   efMTX,
-  efEDI, efEDO, 
+  efEDI, 
   efHAT,
   efCUB,
   efXPM,
index 24ecf8e2f1fa81c0e13f0a4949fdfc2dd32cdcc5..5bcc17b72f4b0335779d69da443844c4157abb1c 100644 (file)
@@ -46,9 +46,10 @@ extern "C" {
 typedef enum { egcolWhite, egcolGrey, egcolBlack, egcolNR } egCol;
 
 typedef struct {
+  int      at0;     /* The first atom the graph was constructed for */
+  int      at1;     /* The last atom the graph was constructed for */
   int      nnodes;     /* The number of nodes, nnodes=at_end-at_start  */
   int      nbound;     /* The number of nodes with edges               */
-  int      natoms;      /* Total range for this graph: 0 to natoms      */
   int      at_start;   /* The first connected atom in this graph       */
   int      at_end;     /* The last+1 connected atom in this graph      */
   int      *nedge;     /* For each node the number of edges            */
index f1029da4150692bd371eab620528bc3df8406d18..1b447439dfa1397cef0cac83efafe150d9c24b4c 100644 (file)
@@ -48,15 +48,16 @@ extern "C" {
 #define GMX_NBNXN_SIMD
 
 #ifdef GMX_X86_AVX_256
-/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */
+/* Note that setting this to 128 will also work with AVX-256, but slower */
 #define GMX_NBNXN_SIMD_BITWIDTH  256
 #else
 #define GMX_NBNXN_SIMD_BITWIDTH  128
 #endif
 
 /* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
- * Currently the 2xNN SIMD kernels only make sense and are only implemented
- * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8.
+ * Currently the 2xNN SIMD kernels only make sense with:
+ *  8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
  */
 #define GMX_NBNXN_SIMD_4XN
 #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
@@ -78,31 +79,8 @@ typedef enum
     nbnxnkNR
 } nbnxn_kernel_type;
 
-/* Note that _mm_... intrinsics can be converted to either SSE or AVX
- * depending on compiler flags.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
-static const char *nbnxn_kernel_name[nbnxnkNR] =
-  { "not set", "plain C",
-#if !(defined GMX_X86_SSE2)
-    "not available", "not available",
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
-#ifndef GMX_X86_SSE4_1
-    "SSE2", "SSE2",
-#else
-    "SSE4.1", "SSE4.1",
-#endif
-#else
-    "AVX-128", "AVX-128",
-#endif
-#else
-    "AVX-256",  "AVX-256",
-#endif
-#endif
-    "CUDA", "plain C" };
+/*! Return a string identifying the kernel type */
+const char *lookup_nbnxn_kernel_name(int kernel_type);
 
 enum { ewaldexclTable, ewaldexclAnalytical };
 
index ed9df8553a2e9e333245663cd5d21b50bb1c504f..502d661e2fed6d8fc84e5064ccb1f408ff9b5af6 100644 (file)
@@ -68,6 +68,14 @@ typedef struct {
     unsigned excl;  /* The exclusion (interaction) bits */
 } nbnxn_cj_t;
 
+/* In nbnxn_ci_t the integer shift contains the shift vector index in the
+ * lower 7 bits; the upper bits carry information for non-bonded kernel
+ * optimization. Simply calculating LJ and Coulomb for all pairs in a
+ * cluster pair is fine, but three flags can be used to skip interactions,
+ * currently only for subc=0:
+ * !(shift & NBNXN_CI_DO_LJ(subc))   => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc)    => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
 #define NBNXN_CI_SHIFT          127
 #define NBNXN_CI_DO_LJ(subc)    (1<<(7+3*(subc)))
 #define NBNXN_CI_HALF_LJ(subc)  (1<<(8+3*(subc)))
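/* A hypothetical decoding sketch for the flag layout documented above
 * (illustration only; the NBNXN_CI_DO_COUL define sits with the macros
 * above, just outside this hunk's context):
 *
 *   int      shift_index = ci->shift & NBNXN_CI_SHIFT;
 *   gmx_bool do_lj       = (ci->shift & NBNXN_CI_DO_LJ(0))   != 0;
 *   gmx_bool half_lj     = (ci->shift & NBNXN_CI_HALF_LJ(0)) != 0;
 *   gmx_bool do_coul     = (ci->shift & NBNXN_CI_DO_COUL(0)) != 0;
 */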
@@ -76,7 +84,7 @@ typedef struct {
 /* Simple pair-list i-unit */
 typedef struct {
     int ci;             /* i-cluster             */
-    int shift;          /* Shift vector index plus possible flags */
+    int shift;          /* Shift vector index plus possible flags, see above */
     int cj_ind_start;   /* Start index into cj   */
     int cj_ind_end;     /* End index into cj     */
 } nbnxn_ci_t;
@@ -221,6 +229,8 @@ typedef struct {
     int  xstride;    /* stride for a coordinate in x (usually 3 or 4)      */
     int  fstride;    /* stride for a coordinate in f (usually 3 or 4)      */
     real *x;         /* x and possibly q, size natoms*xstride              */
+    real *simd_4xn_diag;  /* indices to set the SIMD 4xN diagonal masks    */
+    real *simd_2xnn_diag; /* indices to set the SIMD 2x(N+N) diagonal masks */
     int  nout;       /* The number of force arrays                         */
     nbnxn_atomdata_output_t *out;  /* Output data structures               */
     int  nalloc;     /* Allocation size of all arrays (for x/f *x/fstride) */
index b4256a1206358a7896607de233d16c2731587450..b444598e847d205fddbf66f995d40fdad24813db 100644 (file)
@@ -153,6 +153,28 @@ typedef struct
 }
 energyhistory_t;
 
+typedef struct
+{
+    /* If one uses essential dynamics or flooding on a group of atoms from
+     * more than one molecule, we cannot make this group whole with
+     * do_pbc_first_mtop(). We assume that the ED group has the correct PBC
+     * representation at the beginning of the simulation and keep track
+     * of the shifts to always get it into that representation.
+     * For proper restarts from a checkpoint we store the positions of the
+     * reference group at the time of checkpoint writing */
+    gmx_bool    bFromCpt;       /* Did we start from a checkpoint file?       */
+    int         nED;            /* No. of ED/Flooding data sets, if <1 no ED  */
+    int         *nref;          /* No. of atoms in i'th reference structure   */
+    int         *nav;           /* Same for average structure                 */
+    rvec        **old_sref;     /* Positions of the reference atoms
+                                   at the last time step (with correct PBC
+                                   representation)                            */
+    rvec        **old_sref_p;   /* Pointer to these positions                 */
+    rvec        **old_sav;      /* Same for the average positions             */
+    rvec        **old_sav_p;
+}
+edsamstate_t;
+
 typedef struct
 {
   int           natoms;
@@ -196,6 +218,7 @@ typedef struct
 
   energyhistory_t  enerhist; /* Energy history for statistics           */
   df_history_t  dfhist; /*Free energy history for free energy analysis  */
+  edsamstate_t  edsamstate;    /* Essential dynamics / flooding history */
 
   int           ddp_count; /* The DD partitioning count for this state  */
   int           ddp_count_cg_gl; /* The DD part. count for index_gl     */
index 0660994ddc46eba9b80f301a130672d67b7925f0..02f8bd41a9e2584723cf276ce71ab50b46a4032a 100644 (file)
@@ -176,40 +176,6 @@ static real gmx_software_invsqrt(real x)
 #define INVSQRT_DONE 
 #endif /* gmx_invsqrt */
 
-#ifdef GMX_POWERPC_SQRT
-static real gmx_powerpc_invsqrt(real x)
-{
-  const real  half=0.5;
-  const real  three=3.0;
-  t_convert   result,bit_pattern;
-  unsigned int exp,fract;
-  real        lu;
-  real        y;
-#ifdef GMX_DOUBLE
-  real        y2;
-#endif
-
-  lu = __frsqrte((double)x);
-
-  y=(half*lu*(three-((x*lu)*lu)));
-
-#if (GMX_POWERPC_SQRT==2)
-  /* Extra iteration required */
-  y=(half*y*(three-((x*y)*y)));
-#endif
-
-#ifdef GMX_DOUBLE
-  y2=(half*y*(three-((x*y)*y)));
-
-  return y2;                    /* 10 Flops */
-#else
-  return y;                     /* 5  Flops */
-#endif
-}
-#define gmx_invsqrt(x) gmx_powerpc_invsqrt(x)
-#define INVSQRT_DONE
-#endif /* powerpc_invsqrt */
-
 #ifndef INVSQRT_DONE
 #    ifdef GMX_DOUBLE
 #        ifdef HAVE_RSQRT
index 366dda2d545d1f8251bbfc40fdf480c7399d9b28..d059a34767e004920a3d2a799b1a20f22863dfd9 100644 (file)
@@ -6,7 +6,7 @@ Description: Gromacs library
 URL: http://www.gromacs.org
 Version: @PROJECT_VERSION@
 Requires: @PKG_FFT@ @PKG_XML@
-Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@
+Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
 Libs: -L${libdir} -lgromacs@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@ -lm
 Cflags: -I${includedir} @PKG_CFLAGS@
 
index 519f5353251c4d2e003f76ada13a70171a4b4807..d25b81ec362d35f831ffab2c0e3d8cabe5af547a 100644 (file)
@@ -641,7 +641,7 @@ gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner,
         if (constr->ed && delta_step > 0)
         {
             /* apply the essential dynamics constraints here */
-            do_edsam(ir,step,md,cr,xprime,v,box,constr->ed);
+            do_edsam(ir,step,cr,xprime,v,box,constr->ed);
         }
     }
     
@@ -1251,9 +1251,9 @@ gmx_constr_t init_constraints(FILE *fplog,
     /* Initialize the essential dynamics sampling.
      * Put the pointer to the ED struct in constr */
     constr->ed = ed;
-    if (ed != NULL)
+    if (ed != NULL || state->edsamstate.nED > 0)
     {
-        init_edsam(mtop,ir,cr,ed,state->x,state->box);
+        init_edsam(mtop,ir,cr,ed,state->x,state->box,&state->edsamstate);
     }
     
     constr->warn_mtop = mtop;
index 19a7e61364c9b4ea4dc5e6e63e940168592f220d..b0fe96ea04934c74952588f8ffdcd4dae964295e 100644 (file)
@@ -206,7 +206,7 @@ static void NHC_trotter(t_grpopts *opts,int nvar, gmx_ekindata_t *ekind,real dtf
 }
 
 static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box, 
-                         gmx_ekindata_t *ekind, tensor vir, real pcorr, real ecorr, t_extmass *MassQ)
+                         gmx_ekindata_t *ekind, tensor vir, real pcorr, t_extmass *MassQ)
 {
 
     real  pscal;
@@ -246,7 +246,7 @@ static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box,
     /* for now, we use Elr = 0, because if you want to get it right, you
        really should be using PME. Maybe print a warning? */
 
-    pscal   = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres);
+    pscal   = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres)+pcorr;
 
     vol = det(box);
     GW = (vol*(MassQ->Winv/PRESFAC))*(DIM*pscal - trace(ir->ref_p));   /* W is in ps^2 * bar * nm^3 */
@@ -933,7 +933,7 @@ void trotter_update(t_inputrec *ir,gmx_large_int_t step, gmx_ekindata_t *ekind,
         case etrtBAROV:
         case etrtBAROV2:
             boxv_trotter(ir,&(state->veta),dt,state->box,ekind,vir,
-                         enerd->term[F_PDISPCORR],enerd->term[F_DISPCORR],MassQ);
+                         enerd->term[F_PDISPCORR],MassQ);
             break;
         case etrtBARONHC:
         case etrtBARONHC2:
index e6095ebf549d90cdc485eb92f2870dc9926e43ed..e834fec52d3ab4136d37729a7eb85b5f8f240a52 100644 (file)
@@ -5328,7 +5328,7 @@ static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
         if (lossf >= DD_PERF_LOSS)
         {
             sprintf(buf,
-                    "NOTE: %.1f %% performance was lost due to load imbalance\n"
+                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                     "      in the domain decomposition.\n",lossf*100);
             if (!comm->bDynLoadBal)
             {
@@ -9334,7 +9334,7 @@ void dd_partition_system(FILE            *fplog,
                               comm->zones.dens_zone0,
                               fr->cginfo,
                               state_local->x,
-                              ncg_moved,comm->moved,
+                              ncg_moved,bRedist ? comm->moved : NULL,
                               fr->nbv->grp[eintLocal].kernel_type,
                               fr->nbv->grp[eintLocal].nbat);
 
index ae2c31c1ec590315783bdf074562db5365edc9e2..f3c063264ec19fcf13e2780dda6024bff20222f5 100644 (file)
@@ -56,6 +56,7 @@
 #include "mtop_util.h"
 #include "edsam.h"
 #include "gmxfio.h"
+#include "xvgr.h"
 #include "groupcoord.h"
 
 
 #define nblock_bc(cr,nr,d) gmx_bcast((nr)*sizeof((d)[0]), (d),(cr))
 #define   snew_bc(cr,d,nr) { if (!MASTER(cr)) snew((d),(nr)); }
 
+/* These macros determine the column width in the output file */
+#define EDcol_sfmt "%17s"
+#define EDcol_efmt "%17.5e"
+#define EDcol_ffmt "%17f"
 
 /* enum to identify the type of ED: none, normal ED, flooding */
 enum {eEDnone, eEDedsam, eEDflood, eEDnr};
@@ -114,7 +119,6 @@ typedef struct
     real dt;
     real constEfl;
     real alpha2;
-    int flood_id;
     rvec *forces_cartesian;
     t_eigvec vecs;         /* use flooding for these */
 } t_edflood;
@@ -134,9 +138,11 @@ typedef struct gmx_edx
                                    * with respect to the collective
                                    * anrs[0...nr-1] array                     */
     rvec          *x;             /* positions for this structure             */
-    rvec          *x_old;         /* used to keep track of the shift vectors
-                                     such that the ED molecule can always be
-                                     made whole in the parallel case          */
+    rvec          *x_old;         /* Last positions which have the correct PBC
+                                     representation of the ED group. In
+                                     combination with keeping track of the
+                                     shift vectors, the ED group can always
+                                     be made whole                            */
     real          *m;             /* masses                                   */
     real          mtot;           /* total mass (only used in sref)           */
     real          *sqrtm;         /* sqrt of the masses used for mass-
@@ -170,19 +176,16 @@ typedef struct edpar
                                     * is used (i.e. apart from flooding)   */
     t_edflood      flood;          /* parameters especially for flooding   */
     struct t_ed_buffer *buf;       /* handle to local buffers              */
-    struct edpar   *next_edi;      /* Pointer to another ed dataset        */
+    struct edpar   *next_edi;      /* Pointer to another ED group          */
 } t_edpar;
 
 
 typedef struct gmx_edsam
 {
     int           eEDtype;        /* Type of ED: see enums above          */
-    const char    *edinam;        /* name of ED sampling input file       */
-    const char    *edonam;        /*                     output           */
     FILE          *edo;           /* output file pointer                  */
     t_edpar       *edpar;
     gmx_bool      bFirst;
-    gmx_bool      bStartFromCpt;
 } t_gmx_edsam;
 
 
@@ -201,7 +204,7 @@ struct t_do_edsam
     ivec *shifts_xc_ref;       /* Shifts for xc_ref */
     ivec *extra_shifts_xc_ref; /* xc_ref shift changes since last NS step */
     gmx_bool bUpdateShifts;    /* TRUE in NS steps to indicate that the
-                                  ED shifts for this ED dataset need to
+                                  ED shifts for this ED group need to
                                   be updated */
 };
 
@@ -218,11 +221,31 @@ struct t_ed_buffer
 
 /* Function declarations */
 static void fit_to_reference(rvec *xcoll,rvec transvec,matrix rotmat,t_edpar *edi);
-
 static void translate_and_rotate(rvec *x,int nat,rvec transvec,matrix rotmat);
+static real rmsd_from_structure(rvec *x, struct gmx_edx *s);
+static int read_edi_file(const char *fn, t_edpar *edi, int nr_mdatoms);
+static void crosscheck_edi_file_vs_checkpoint(gmx_edsam_t ed, edsamstate_t *EDstate);
+static void init_edsamstate(gmx_edsam_t ed, edsamstate_t *EDstate);
+static void write_edo_legend(gmx_edsam_t ed, int nED, const output_env_t oenv);
 /* End function declarations */
 
 
+/* Multiple ED groups will be labeled with letters instead of numbers 
+ * to avoid confusion with eigenvector indices */
+static char get_EDgroupChar(int nr_edi, int nED)
+{
+    if (nED == 1)
+    {
+        return ' ';
+    }
+
+    /* nr_edi = 1 -> A
+     * nr_edi = 2 -> B ...
+     */
+    return 'A' + nr_edi - 1;
+}
+
+
 /* Does not subtract average positions; the projection on a single eigenvector is returned.
  * Used by: do_linfix, do_linacc, do_radfix, do_radacc, do_radcon.
  * The average position is subtracted in ed_apply_constraints prior to calling projectx
@@ -234,7 +257,9 @@ static real projectx(t_edpar *edi, rvec *xcoll, rvec *vec)
 
 
     for (i=0; i<edi->sav.nr; i++)
+    {
         proj += edi->sav.sqrtm[i]*iprod(vec[i], xcoll[i]);
+    }
 
     return proj;
 }
@@ -243,14 +268,16 @@ static real projectx(t_edpar *edi, rvec *xcoll, rvec *vec)
 /* Specialized: projection is stored in vec->refproj
  * -> used for radacc, radfix, radcon  and center of flooding potential
  * subtracts average positions, projects vector x */
-static void rad_project(t_edpar *edi, rvec *x, t_eigvec *vec, t_commrec *cr)
+static void rad_project(t_edpar *edi, rvec *x, t_eigvec *vec)
 {
     int i;
     real rad=0.0;
 
     /* Subtract average positions */
     for (i = 0; i < edi->sav.nr; i++)
+    {
         rvec_dec(x[i], edi->sav.x[i]);
+    }
 
     for (i = 0; i < vec->neig; i++)
     {
@@ -261,7 +288,9 @@ static void rad_project(t_edpar *edi, rvec *x, t_eigvec *vec, t_commrec *cr)
 
     /* Add average positions */
     for (i = 0; i < edi->sav.nr; i++)
+    {
         rvec_inc(x[i], edi->sav.x[i]);
+    }
 }
 
 
@@ -279,14 +308,20 @@ static void project_to_eigvectors(rvec       *x,    /* The positions to project
 
     /* Subtract average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_dec(x[i], edi->sav.x[i]);
+    }
 
     for (i=0; i<vec->neig; i++)
+    {
         vec->xproj[i] = projectx(edi, x, vec->vec[i]);
+    }
 
     /* Add average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_inc(x[i], edi->sav.x[i]);
+    }
 }
 
 
@@ -312,7 +347,9 @@ static real calc_radius(t_eigvec *vec)
 
 
     for (i=0; i<vec->neig; i++)
+    {
         rad += pow((vec->refproj[i]-vec->xproj[i]),2);
+    }
 
     return rad=sqrt(rad);
 }
@@ -341,11 +378,13 @@ static void dump_xcoll(t_edpar *edi, struct t_do_edsam *buf, t_commrec *cr,
     fp = fopen(fn, "w");
 
     for (i=0; i<edi->sav.nr; i++)
+    {
         fprintf(fp, "%d %9.5f %9.5f %9.5f   %d %d %d   %d %d %d\n",
                 edi->sav.anrs[i]+1,
                 xcoll[i][XX]  , xcoll[i][YY]  , xcoll[i][ZZ],
                 shifts[i][XX] , shifts[i][YY] , shifts[i][ZZ],
                 eshifts[i][XX], eshifts[i][YY], eshifts[i][ZZ]);
+    }
 
     fclose(fp);
 }
@@ -359,16 +398,22 @@ static void dump_edi_positions(FILE *out, struct gmx_edx *s, const char name[])
 
     fprintf(out, "#%s positions:\n%d\n", name, s->nr);
     if (s->nr == 0)
+    {
         return;
+    }
 
     fprintf(out, "#index, x, y, z");
     if (s->sqrtm)
+    {
         fprintf(out, ", sqrt(m)");
+    }
     for (i=0; i<s->nr; i++)
     {
         fprintf(out, "\n%6d  %11.6f %11.6f %11.6f",s->anrs[i], s->x[i][XX], s->x[i][YY], s->x[i][ZZ]);
         if (s->sqrtm)
+        {
             fprintf(out,"%9.3f",s->sqrtm[i]);
+        }
     }
     fprintf(out, "\n");
 }
@@ -388,7 +433,9 @@ static void dump_edi_eigenvecs(FILE *out, t_eigvec *ev,
         fprintf(out, "EV %4d\ncomponents %d\nstepsize %f\nxproj %f\nfproj %f\nrefproj %f\nradius %f\nComponents:\n",
                 ev->ieig[i], length, ev->stpsz[i], ev->xproj[i], ev->fproj[i], ev->refproj[i], ev->radius);
         for (j=0; j<length; j++)
+        {
             fprintf(out, "%11.6f %11.6f %11.6f\n", ev->vec[i][j][XX], ev->vec[i][j][YY], ev->vec[i][j][ZZ]);
+        }
     }
 }
 
@@ -453,7 +500,9 @@ static void dump_rvec(FILE *out, int dim, rvec *x)
 
 
     for (i=0; i<dim; i++)
+    {
         fprintf(out,"%4d   %f %f %f\n",i,x[i][XX],x[i][YY],x[i][ZZ]);
+    }
 }
 
 
@@ -467,7 +516,9 @@ static void dump_mat(FILE* out, int dim, double** mat)
     for (i=0;i<dim;i++)
     {
         for (j=0;j<dim;j++)
+        {
             fprintf(out,"%f ",mat[i][j]);
+        }
         fprintf(out,"\n");
     }
 }
@@ -492,7 +543,9 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
     gmx_bool bFirst;
 
     if(edi->buf->do_edfit != NULL)
+    {
         bFirst = FALSE;
+    }
     else
     {
         bFirst = TRUE;
@@ -539,7 +592,9 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
     /* construct loc->omega */
     /* loc->omega is symmetric -> loc->omega==loc->omega' */
     for(r=0;(r<6);r++)
+    {
         for(c=0;(c<=r);c++)
+        {
             if ((r>=3) && (c<3))
             {
                 loc->omega[r][c]=u[r-3][c];
@@ -550,6 +605,8 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
                 loc->omega[r][c]=0;
                 loc->omega[c][r]=0;
             }
+        }
+    }
 
     /* determine h and k */
 #ifdef DEBUG
@@ -557,13 +614,17 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
         int i;
         dump_mat(stderr,2*DIM,loc->omega);
         for (i=0; i<6; i++)
+        {
             fprintf(stderr,"d[%d] = %f\n",i,d[i]);
+        }
     }
 #endif
     jacobi(loc->omega,6,d,loc->om,&irot);
 
     if (irot==0)
+    {
         fprintf(stderr,"IROT=0\n");
+    }
 
     index=0; /* For the compiler only */
 
@@ -571,11 +632,13 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
     {
         max_d=-1000;
         for(i=0;(i<6);i++)
+        {
             if (d[i]>max_d)
             {
                 max_d=d[i];
                 index=i;
             }
+        }
         d[index]=-10000;
         for(i=0;(i<3);i++)
         {
@@ -586,16 +649,26 @@ static void do_edfit(int natoms,rvec *xp,rvec *x,matrix R,t_edpar *edi)
 
     /* determine R */
     for(c=0;(c<3);c++)
+    {
         for(r=0;(r<3);r++)
+        {
             R[c][r]=vk[0][r]*vh[0][c]+
-            vk[1][r]*vh[1][c]+
-            vk[2][r]*vh[2][c];
+                    vk[1][r]*vh[1][c]+
+                    vk[2][r]*vh[2][c];
+        }
+    }
     if (det(R) < 0)
+    {
         for(c=0;(c<3);c++)
+        {
             for(r=0;(r<3);r++)
+            {
                 R[c][r]=vk[0][r]*vh[0][c]+
-                vk[1][r]*vh[1][c]-
-                vk[2][r]*vh[2][c];
+                        vk[1][r]*vh[1][c]-
+                        vk[2][r]*vh[2][c];
+            }
+        }
+    }
 }
 
 
@@ -672,44 +745,40 @@ and call
   two edsam files from two peptide chains
 */
 
-static void write_edo_flood(t_edpar *edi, FILE *fp, gmx_large_int_t step)
+static void write_edo_flood(t_edpar *edi, FILE *fp, real rmsd)
 {
     int i;
-    char buf[22];
-    gmx_bool bOutputRef=FALSE;
 
 
-    fprintf(fp,"%d.th FL: %s %12.5e %12.5e %12.5e\n",
-            edi->flood.flood_id, gmx_step_str(step,buf),
-            edi->flood.Efl, edi->flood.Vfl, edi->flood.deltaF);
+    /* Output how well we fit to the reference structure */
+    fprintf(fp, EDcol_ffmt, rmsd);
 
-
-    /* Check whether any of the references changes with time (this can happen
-     * in case flooding is used as harmonic restraint). If so, output all the
-     * current reference projections. */
-    if (edi->flood.bHarmonic)
+    for (i=0; i<edi->flood.vecs.neig; i++)
     {
-        for (i = 0; i < edi->flood.vecs.neig; i++)
+        fprintf(fp, EDcol_efmt, edi->flood.vecs.xproj[i]);
+
+        /* Check whether the reference projection changes with time (this can happen
+         * in case flooding is used as harmonic restraint). If so, output the
+         * current reference projection */
+        if (edi->flood.bHarmonic && edi->flood.vecs.refprojslope[i] != 0.0)
         {
-            if (edi->flood.vecs.refprojslope[i] != 0.0)
-                bOutputRef=TRUE;
+            fprintf(fp, EDcol_efmt, edi->flood.vecs.refproj[i]);
         }
-        if (bOutputRef)
+
+        /* Output Efl if we are doing adaptive flooding */
+        if (0 != edi->flood.tau)
         {
-            fprintf(fp, "Ref. projs.: ");
-            for (i = 0; i < edi->flood.vecs.neig; i++)
-            {
-                fprintf(fp, "%12.5e ", edi->flood.vecs.refproj[i]);
-            }
-            fprintf(fp, "\n");
+            fprintf(fp, EDcol_efmt, edi->flood.Efl);
         }
-    }
-    fprintf(fp,"FL_FORCES: ");
-
-    for (i=0; i<edi->flood.vecs.neig; i++)
-        fprintf(fp," %12.5e",edi->flood.vecs.fproj[i]);
+        fprintf(fp, EDcol_efmt, edi->flood.Vfl);
 
-    fprintf(fp,"\n");
+        /* Output deltaF if we are doing adaptive flooding */
+        if (0 != edi->flood.tau)
+        {
+            fprintf(fp, EDcol_efmt, edi->flood.deltaF);
+        }
+        fprintf(fp, EDcol_efmt, edi->flood.vecs.fproj[i]);
+    }
 }
 
 
@@ -773,16 +842,20 @@ static void flood_forces(t_edpar *edi)
 
 
     if (edi->flood.bHarmonic)
+    {
         for (i=0; i<edi->flood.vecs.neig; i++)
         {
             edi->flood.vecs.fproj[i] = edi->flood.Efl* edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]);
         }
+    }
     else
+    {
         for (i=0; i<edi->flood.vecs.neig; i++)
         {
             /* if Efl is zero the forces are zero; otherwise use the formula */
             edi->flood.vecs.fproj[i] = edi->flood.Efl!=0 ? edi->flood.kT/edi->flood.Efl/edi->flood.alpha2*energy*edi->flood.vecs.stpsz[i]*(edi->flood.vecs.xproj[i]-edi->flood.vecs.refproj[i]) : 0;
         }
+    }
 }
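/* For reference, the two branches of flood_forces above implement, with
 * s_i = stpsz[i], p_i = xproj[i], p_i^ref = refproj[i], and E the flooding
 * energy held in the code's "energy" variable (a reading of the context
 * code, not part of this change):
 *
 *   harmonic restraint:  fproj_i = Efl * s_i * (p_i - p_i^ref)
 *   flooding:            fproj_i = (kT / (Efl * alpha2)) * E * s_i
 *                                  * (p_i - p_i^ref),  zero when Efl == 0
 */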
 
 
@@ -813,7 +886,9 @@ static void flood_blowup(t_edpar *edi, rvec *forces_cart)
 
     /* Clear forces first */
     for (j=0; j<edi->sav.nr_loc; j++)
+    {
         clear_rvec(forces_cart[j]);
+    }
 
     /* Now compute atomwise */
     for (j=0; j<edi->sav.nr_loc; j++)
@@ -842,7 +917,9 @@ static void update_adaption(t_edpar *edi)
         edi->flood.Efl = edi->flood.Efl+edi->flood.dt/edi->flood.tau*(edi->flood.deltaF0-edi->flood.deltaF);
         /* check if restrain (inverted flooding) -> don't let EFL become positive */
         if (edi->flood.alpha2<0 && edi->flood.Efl>-0.00000001)
+        {
             edi->flood.Efl = 0;
+        }
 
         edi->flood.deltaF = (1-edi->flood.dt/edi->flood.tau)*edi->flood.deltaF+edi->flood.dt/edi->flood.tau*edi->flood.Vfl;
     }
@@ -863,6 +940,7 @@ static void do_single_flood(
     matrix  rotmat;         /* rotation matrix */
     matrix  tmat;           /* inverse rotation */
     rvec    transvec;       /* translation vector */
+    real    rmsdev;
     struct t_do_edsam *buf;
 
 
@@ -877,8 +955,10 @@ static void do_single_flood(
 
     /* Only assemble REFERENCE positions if their indices differ from the average ones */
     if (!edi->bRefEqAv)
+    {
         communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, bNS, x,
                 edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+    }
 
     /* If bUpdateShifts was TRUE, the shifts have just been updated in get_positions.
      * We do not need to update the shifts until the next NS step */
@@ -889,9 +969,13 @@ static void do_single_flood(
 
     /* Fit the reference indices to the reference structure */
     if (edi->bRefEqAv)
+    {
         fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+    }
     else
+    {
         fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+    }
 
     /* Now apply the translation and rotation to the ED structure */
     translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
@@ -920,21 +1004,39 @@ static void do_single_flood(
 
     /* Finally add forces to the main force variable */
     for (i=0; i<edi->sav.nr_loc; i++)
+    {
         rvec_inc(force[edi->sav.anrs_loc[i]],edi->flood.forces_cartesian[i]);
+    }
 
     /* Output is written by the master process */
     if (do_per_step(step,edi->outfrq) && MASTER(cr))
-        write_edo_flood(edi,edo,step);
+    {
+        /* Output how well we fit to the reference */
+        if (edi->bRefEqAv)
+        {
+            /* Indices of reference and average structures are identical,
+             * thus we can calculate the rmsd to SREF using xcoll */
+            rmsdev = rmsd_from_structure(buf->xcoll,&edi->sref);
+        }
+        else
+        {
+            /* We have to translate & rotate the reference atoms first */
+            translate_and_rotate(buf->xc_ref, edi->sref.nr, transvec, rotmat);
+            rmsdev = rmsd_from_structure(buf->xc_ref,&edi->sref);
+        }
+
+        write_edo_flood(edi,edo,rmsdev);
+    }
 }
 
 
 /* Main flooding routine, called from do_force */
 extern void do_flood(
-        FILE            *log,    /* md.log file */
         t_commrec       *cr,     /* Communication record */
+        t_inputrec      *ir,     /* Input record */
         rvec            x[],     /* Positions on the local processor */
         rvec            force[], /* forcefield forces, to these the flooding forces are added */
-        gmx_edsam_t     ed,      /* ed data structure contains all ED and flooding datasets */
+        gmx_edsam_t     ed,      /* ed data structure contains all ED and flooding groups */
         matrix          box,     /* the box */
         gmx_large_int_t step,    /* The relative time step since ir->init_step is already subtracted */
         gmx_bool        bNS)     /* Are we in a neighbor searching step? */
@@ -942,15 +1044,27 @@ extern void do_flood(
     t_edpar *edi;
 
 
+    edi = ed->edpar;
+
+    /* Write the time to edo, when required. This is done even for runs with
+     * ED constraints only (no flooding), since the constraint output written
+     * in do_edsam() needs the time column as well. */
+    if (MASTER(cr) && do_per_step(step,edi->outfrq))
+    {
+        fprintf(ed->edo, "\n%12f", ir->init_t + step*ir->delta_t);
+    }
+
     if (ed->eEDtype != eEDflood)
+    {
         return;
+    }
 
-    edi = ed->edpar;
     while (edi)
     {
         /* Call flooding for one matrix */
         if (edi->flood.vecs.neig)
+        {
             do_single_flood(ed->edo,x,force,edi,step,box,cr,bNS);
+        }
         edi = edi->next_edi;
     }
 }
@@ -958,7 +1072,7 @@ extern void do_flood(
 
 /* Called by init_edi, configures some flooding-related variables
  * and structures */
-static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr)
+static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt)
 {
     int i;
 
@@ -969,10 +1083,10 @@ static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr)
 
     if (edi->flood.vecs.neig)
     {
-        /* If in any of the datasets we find a flooding vector, flooding is turned on */
+        /* If in any of the ED groups we find a flooding vector, flooding is turned on */
         ed->eEDtype = eEDflood;
 
-        fprintf(stderr,"ED: Flooding of matrix %d is switched on.\n", edi->flood.flood_id);
+        fprintf(stderr,"ED: Flooding %d eigenvector%s.\n", edi->flood.vecs.neig, edi->flood.vecs.neig > 1 ? "s":"");
 
         if (edi->flood.bConstForce)
         {
@@ -987,9 +1101,6 @@ static void init_flood(t_edpar *edi, gmx_edsam_t ed, real dt, t_commrec *cr)
                         edi->flood.vecs.ieig[i], edi->flood.vecs.fproj[i]);
             }
         }
-        fprintf(ed->edo,"FL_HEADER: Flooding of matrix %d is switched on! The flooding output will have the following format:\n",
-                edi->flood.flood_id);
-        fprintf(ed->edo,"FL_HEADER: Step     Efl          Vfl       deltaF\n");
     }
 }
 
@@ -1031,15 +1142,18 @@ static void get_flood_energies(t_edpar *edi, real Vfl[],int nnames)
         count++;
     }
     if (nnames!=count-1)
+    {
         gmx_fatal(FARGS,"Number of energies is not consistent with t_edi structure");
+    }
 }
 /************* END of FLOODING IMPLEMENTATION ****************************/
 #endif
 
 
-gmx_edsam_t ed_open(int nfile,const t_filenm fnm[],unsigned long Flags,t_commrec *cr)
+gmx_edsam_t ed_open(int natoms, edsamstate_t *EDstate, int nfile, const t_filenm fnm[], unsigned long Flags, const output_env_t oenv, t_commrec *cr)
 {
     gmx_edsam_t ed;
+    int         nED;
 
 
     /* Allocate space for the ED data structure */
@@ -1050,13 +1164,38 @@ gmx_edsam_t ed_open(int nfile,const t_filenm fnm[],unsigned long Flags,t_commrec
 
     if (MASTER(cr))
     {
-        /* Open .edi input file: */
-        ed->edinam=ftp2fn(efEDI,nfile,fnm);
-        /* The master opens the .edo output file */
         fprintf(stderr,"ED sampling will be performed!\n");
-        ed->edonam = ftp2fn(efEDO,nfile,fnm);
-        ed->edo    = gmx_fio_fopen(ed->edonam,(Flags & MD_APPENDFILES)? "a+" : "w+");
-        ed->bStartFromCpt = Flags & MD_STARTFROMCPT;
+        snew(ed->edpar,1);
+
+        /* Read the edi input file: */
+        nED = read_edi_file(ftp2fn(efEDI,nfile,fnm),ed->edpar,natoms);
+
+        /* Make sure the checkpoint was produced in a run using this .edi file */
+        if (EDstate->bFromCpt)
+        {
+            crosscheck_edi_file_vs_checkpoint(ed, EDstate);
+        }
+        else
+        {
+            EDstate->nED = nED;
+        }
+        init_edsamstate(ed, EDstate);
+
+        /* The master opens the ED output file */
+        if (Flags & MD_APPENDFILES)
+        {
+            ed->edo = gmx_fio_fopen(opt2fn("-eo",nfile,fnm),"a+");
+        }
+        else
+        {
+            ed->edo = xvgropen(opt2fn("-eo",nfile,fnm), 
+                    "Essential dynamics / flooding output", 
+                    "Time (ps)", 
+                    "RMSDs (nm), projections on EVs (nm), ...", oenv);
+
+            /* Make a descriptive legend */
+            write_edo_legend(ed, EDstate->nED, oenv);
+        }
     }
     return ed;
 }
@@ -1171,7 +1310,7 @@ static void broadcast_ed_data(t_commrec *cr, gmx_edsam_t ed, int numedis)
         /* Broadcast flooding eigenvectors and, if needed, values for the moving reference */
         bc_ed_vecs(cr, &edi->flood.vecs,  edi->sav.nr, edi->flood.bHarmonic);
 
-        /* Set the pointer to the next ED dataset */
+        /* Set the pointer to the next ED group */
         if (edi->next_edi)
         {
           snew_bc(cr, edi->next_edi, 1);
@@ -1182,8 +1321,7 @@ static void broadcast_ed_data(t_commrec *cr, gmx_edsam_t ed, int numedis)
 
 
 /* init-routine called for every *.edi-cycle, initialises t_edpar structure */
-static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir,
-                     t_commrec *cr,gmx_edsam_t ed,t_edpar *edi)
+static void init_edi(gmx_mtop_t *mtop,t_edpar *edi)
 {
     int  i;
     real totalmass = 0.0;
@@ -1277,7 +1415,9 @@ static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir,
 static void check(const char *line, const char *label)
 {
     if (!strstr(line,label))
+    {
         gmx_fatal(FARGS,"Could not find input parameter %s at expected position in edsam input-file (.edi)\nline read instead is %s",label,line);
+    }
 }
 
 
@@ -1347,7 +1487,9 @@ static void read_edx(FILE *file,int number,int *anrs,rvec *x)
         sscanf (line,"%d%lf%lf%lf",&anrs[i],&d[0],&d[1],&d[2]);
         anrs[i]--; /* we are reading FORTRAN indices */
         for(j=0; j<3; j++)
+        {
             x[i][j]=d[j]; /* always read as double and convert to single */
+        }
     }
 }
 
@@ -1428,7 +1570,9 @@ static void read_edvec(FILE *in,int nr,t_eigvec *tvec,gmx_bool bReadRefproj, gmx
             {
                 nscan = sscanf(line,"%d%lf",&idum,&rdum);
                 if (nscan != 2)
+                {
                     gmx_fatal(FARGS,"Expected 2 values for flooding vec: <nr> <stpsz>\n");
+                }
             }
             tvec->ieig[i]=idum;
             tvec->stpsz[i]=rdum;
@@ -1467,14 +1611,18 @@ static gmx_bool check_if_same(struct gmx_edx sref, struct gmx_edx sav)
     /* If the number of atoms differs between the two structures,
      * they cannot be identical */
     if (sref.nr != sav.nr)
+    {
         return FALSE;
+    }
 
     /* Now that we know that both structures have the same number of atoms,
      * check if also the indices are identical */
     for (i=0; i < sav.nr; i++)
     {
         if (sref.anrs[i] != sav.anrs[i])
+        {
             return FALSE;
+        }
     }
     fprintf(stderr, "ED: Note: Reference and average structure are composed of the same atom indices.\n");
 
@@ -1482,7 +1630,7 @@ static gmx_bool check_if_same(struct gmx_edx sref, struct gmx_edx sav)
 }
 
 
-static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int edi_nr, t_commrec *cr)
+static int read_edi(FILE* in,t_edpar *edi,int nr_mdatoms, const char *fn)
 {
     int readmagic;
     const int magic=670;
@@ -1498,21 +1646,28 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
     readmagic=read_edint(in,&bEOF);
     /* Check whether we have reached the end of the input file */
     if (bEOF)
+    {
         return 0;
+    }
 
     if (readmagic != magic)
     {
         if (readmagic==666 || readmagic==667 || readmagic==668)
+        {
             gmx_fatal(FARGS,"Wrong magic number: Use newest version of make_edi to produce edi file");
+        }
         else if (readmagic != 669)
-            gmx_fatal(FARGS,"Wrong magic number %d in %s",readmagic,ed->edinam);
+        {
+            gmx_fatal(FARGS,"Wrong magic number %d in %s",readmagic,fn);
+        }
     }
 
     /* check the number of atoms */
     edi->nini=read_edint(in,&bEOF);
     if (edi->nini != nr_mdatoms)
-        gmx_fatal(FARGS,"Nr of atoms in %s (%d) does not match nr of md atoms (%d)",
-                ed->edinam,edi->nini,nr_mdatoms);
+    {
+        gmx_fatal(FARGS,"Nr of atoms in %s (%d) does not match nr of md atoms (%d)", fn,edi->nini,nr_mdatoms);
+    }
 
     /* Done checking. For the rest we blindly trust the input */
     edi->fitmas          = read_checked_edint(in,"FITMAS");
@@ -1530,10 +1685,13 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
     edi->flood.kT        = read_checked_edreal(in,"KT");
     edi->flood.bHarmonic = read_checked_edint(in,"HARMONIC");
     if (readmagic > 669)
+    {
         edi->flood.bConstForce = read_checked_edint(in,"CONST_FORCE_FLOODING");
+    }
     else
+    {
         edi->flood.bConstForce = FALSE;
-    edi->flood.flood_id  = edi_nr;
+    }
     edi->sref.nr         = read_checked_edint(in,"NREF");
 
     /* allocate space for reference positions and read them */
@@ -1571,13 +1729,13 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
     edi->sori.nr=read_edint(in,&bEOF);
     if (edi->sori.nr > 0)
     {
-       if (bHaveReference)
-       {
-               /* Both an -ori structure and a at least one manual reference point have been
-                * specified. That's ambiguous and probably not intentional. */
-               gmx_fatal(FARGS, "ED: An origin structure has been provided and a at least one (moving) reference\n"
-                                "    point was manually specified in the edi file. That is ambiguous. Aborting.\n");
-       }
+        if (bHaveReference)
+        {
+            /* Both an -ori structure and at least one manual reference point have been
+             * specified. That's ambiguous and probably not intentional. */
+            gmx_fatal(FARGS, "ED: An origin structure has been provided and at least one (moving) reference\n"
+                             "    point was manually specified in the edi file. That is ambiguous. Aborting.\n");
+        }
         snew(edi->sori.anrs,edi->sori.nr);
         snew(edi->sori.x   ,edi->sori.nr);
         edi->sori.sqrtm    =NULL;
@@ -1593,7 +1751,7 @@ static int read_edi(FILE* in, gmx_edsam_t ed,t_edpar *edi,int nr_mdatoms, int ed
 /* Read in the edi input file. Note that it may contain several ED data sets which were
  * achieved by concatenating multiple edi files. The standard case would be a single ED
  * data set, though. */
-static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commrec *cr)
+static int read_edi_file(const char *fn, t_edpar *edi, int nr_mdatoms)
 {
     FILE    *in;
     t_edpar *curr_edi,*last_edi;
@@ -1604,39 +1762,40 @@ static void read_edi_file(gmx_edsam_t ed, t_edpar *edi, int nr_mdatoms, t_commre
     /* This routine is executed on the master only */
 
     /* Open the .edi parameter input file */
-    in = gmx_fio_fopen(ed->edinam,"r");
-    fprintf(stderr, "ED: Reading edi file %s\n", ed->edinam);
+    in = gmx_fio_fopen(fn,"r");
+    fprintf(stderr, "ED: Reading edi file %s\n", fn);
 
     /* Now read a sequence of ED input parameter sets from the edi file */
     curr_edi=edi;
     last_edi=edi;
-    while( read_edi(in, ed, curr_edi, nr_mdatoms, edi_nr, cr) )
+    while( read_edi(in, curr_edi, nr_mdatoms, fn) )
     {
         edi_nr++;
-        /* Make shure that the number of atoms in each dataset is the same as in the tpr file */
-        if (edi->nini != nr_mdatoms)
-            gmx_fatal(FARGS,"edi file %s (dataset #%d) was made for %d atoms, but the simulation contains %d atoms.",
-                    ed->edinam, edi_nr, edi->nini, nr_mdatoms);
+
         /* Since we arrived within this while loop we know that there is still another data set to be read in */
         /* We need to allocate space for the data: */
         snew(edi_read,1);
         /* Point the 'next_edi' entry to the next edi: */
         curr_edi->next_edi=edi_read;
-        /* Keep the curr_edi pointer for the case that the next dataset is empty: */
+        /* Keep the curr_edi pointer for the case that the next group is empty: */
         last_edi = curr_edi;
         /* Let's prepare to read in the next edi data set: */
         curr_edi = edi_read;
     }
     if (edi_nr == 0)
-        gmx_fatal(FARGS, "No complete ED data set found in edi file %s.", ed->edinam);
+    {
+        gmx_fatal(FARGS, "No complete ED data set found in edi file %s.", fn);
+    }
 
-    /* Terminate the edi dataset list with a NULL pointer: */
+    /* Terminate the edi group list with a NULL pointer: */
     last_edi->next_edi = NULL;
 
-    fprintf(stderr, "ED: Found %d ED dataset%s.\n", edi_nr, edi_nr>1? "s" : "");
+    fprintf(stderr, "ED: Found %d ED group%s.\n", edi_nr, edi_nr>1? "s" : "");
 
     /* Close the .edi file again */
     gmx_fio_fclose(in);
+
+    return edi_nr;
 }
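+
+/* Note (illustrative): the ED groups read in above form a NULL-terminated,
+ * singly linked list that all later routines traverse via edi->next_edi:
+ *
+ *   ed->edpar -> edi_1 -> edi_2 -> ... -> edi_n -> NULL
+ */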
 
 
@@ -1658,7 +1817,7 @@ static void fit_to_reference(rvec      *xcoll,    /* The positions to be fitted
     struct t_fit_to_ref *loc;
 
 
-    /* Allocate memory the first time this routine is called for each edi dataset */
+    /* Allocate memory the first time this routine is called for each edi group */
     if (NULL == edi->buf->fit_to_ref)
     {
         snew(edi->buf->fit_to_ref, 1);
@@ -1668,7 +1827,9 @@ static void fit_to_reference(rvec      *xcoll,    /* The positions to be fitted
 
     /* We do not touch the original positions but work on a copy. */
     for (i=0; i<edi->sref.nr; i++)
+    {
         copy_rvec(xcoll[i], loc->xcopy[i]);
+    }
 
     /* Calculate the center of mass */
     get_center(loc->xcopy, edi->sref.m, edi->sref.nr, com);
@@ -1708,7 +1869,9 @@ static real rmsd_from_structure(rvec           *x,  /* The positions under consi
 
 
     for (i=0; i < s->nr; i++)
+    {
         rmsd += distance2(s->x[i], x[i]);
+    }
 
     rmsd /= (real) s->nr;
     rmsd = sqrt(rmsd);
@@ -1724,15 +1887,17 @@ void dd_make_local_ed_indices(gmx_domdec_t *dd, struct gmx_edsam *ed)
 
     if (ed->eEDtype != eEDnone)
     {
-        /* Loop over ED datasets (usually there is just one dataset, though) */
+        /* Loop over ED groups */
         edi=ed->edpar;
         while (edi)
         {
             /* Local atoms of the reference structure (for fitting), need only be assembled
              * if their indices differ from the average ones */
             if (!edi->bRefEqAv)
+            {
                 dd_make_local_group_indices(dd->ga2la, edi->sref.nr, edi->sref.anrs,
                         &edi->sref.nr_loc, &edi->sref.anrs_loc, &edi->sref.nalloc_loc, edi->sref.c_ind);
+            }
 
             /* Local atoms of the average structure (on these ED will be performed) */
             dd_make_local_group_indices(dd->ga2la, edi->sav.nr, edi->sav.anrs,
@@ -1742,7 +1907,7 @@ void dd_make_local_ed_indices(gmx_domdec_t *dd, struct gmx_edsam *ed)
              * at the next call to communicate_group_positions, since obviously we are in a NS step */
             edi->buf->do_edsam->bUpdateShifts = TRUE;
 
-            /* Set the pointer to the next ED dataset (if any) */
+            /* Set the pointer to the next ED group (if any) */
             edi=edi->next_edi;
         }
     }
@@ -1763,7 +1928,8 @@ static inline void ed_unshift_single_coord(matrix box, const rvec x, const ivec
         xu[XX] = x[XX]-tx*box[XX][XX]-ty*box[YY][XX]-tz*box[ZZ][XX];
         xu[YY] = x[YY]-ty*box[YY][YY]-tz*box[ZZ][YY];
         xu[ZZ] = x[ZZ]-tz*box[ZZ][ZZ];
-    } else
+    }
+    else
     {
         xu[XX] = x[XX]-tx*box[XX][XX];
         xu[YY] = x[YY]-ty*box[YY][YY];
@@ -1772,7 +1938,7 @@ static inline void ed_unshift_single_coord(matrix box, const rvec x, const ivec
 }
 
 
-static void do_linfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
+static void do_linfix(rvec *xcoll, t_edpar *edi, gmx_large_int_t step)
 {
     int  i, j;
     real proj, add;
@@ -1799,7 +1965,7 @@ static void do_linfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
 }
 
 
-static void do_linacc(rvec *xcoll, t_edpar *edi, t_commrec *cr)
+static void do_linacc(rvec *xcoll, t_edpar *edi)
 {
     int  i, j;
     real proj, add;
@@ -1817,12 +1983,16 @@ static void do_linacc(rvec *xcoll, t_edpar *edi, t_commrec *cr)
         if (edi->vecs.linacc.stpsz[i] > 0.0)
         {
             if ((proj-edi->vecs.linacc.refproj[i]) < 0.0)
+            {
                 add = edi->vecs.linacc.refproj[i] - proj;
+            }
         }
         if (edi->vecs.linacc.stpsz[i] < 0.0)
         {
             if ((proj-edi->vecs.linacc.refproj[i]) > 0.0)
+            {
                 add = edi->vecs.linacc.refproj[i] - proj;
+            }
         }
 
         /* apply the correction */
@@ -1839,7 +2009,7 @@ static void do_linacc(rvec *xcoll, t_edpar *edi, t_commrec *cr)
 }
 
 
-static void do_radfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
+static void do_radfix(rvec *xcoll, t_edpar *edi)
 {
     int  i,j;
     real *proj, rad=0.0, ratio;
@@ -1871,7 +2041,8 @@ static void do_radfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
         /* apply the correction */
         proj[i] /= edi->sav.sqrtm[i];
         proj[i] *= ratio;
-        for (j=0; j<edi->sav.nr; j++) {
+        for (j=0; j<edi->sav.nr; j++)
+        {
             svmul(proj[i], edi->vecs.radfix.vec[i][j], vec_dum);
             rvec_inc(xcoll[j], vec_dum);
         }
@@ -1881,7 +2052,7 @@ static void do_radfix(rvec *xcoll, t_edpar *edi, int step, t_commrec *cr)
 }
 
 
-static void do_radacc(rvec *xcoll, t_edpar *edi, t_commrec *cr)
+static void do_radacc(rvec *xcoll, t_edpar *edi)
 {
     int  i,j;
     real *proj, rad=0.0, ratio=0.0;
@@ -1933,7 +2104,7 @@ struct t_do_radcon {
     real *proj;
 };
 
-static void do_radcon(rvec *xcoll, t_edpar *edi, t_commrec *cr)
+static void do_radcon(rvec *xcoll, t_edpar *edi)
 {
     int  i,j;
     real rad=0.0, ratio=0.0;
@@ -1955,10 +2126,14 @@ static void do_radcon(rvec *xcoll, t_edpar *edi, t_commrec *cr)
     loc = edi->buf->do_radcon;
 
     if (edi->vecs.radcon.neig == 0)
+    {
         return;
-
+    }
+    
     if (bFirst)
+    {
         snew(loc->proj, edi->vecs.radcon.neig);
+    }
 
     /* loop over radcon vectors */
     for (i=0; i<edi->vecs.radcon.neig; i++)
@@ -2005,92 +2180,88 @@ static void do_radcon(rvec *xcoll, t_edpar *edi, t_commrec *cr)
 }
 
 
-static void ed_apply_constraints(rvec *xcoll, t_edpar *edi, gmx_large_int_t step, t_commrec *cr)
+static void ed_apply_constraints(rvec *xcoll, t_edpar *edi, gmx_large_int_t step)
 {
     int i;
 
 
     /* subtract the average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_dec(xcoll[i], edi->sav.x[i]);
+    }
 
     /* apply the constraints */
     if (step >= 0)
-        do_linfix(xcoll, edi, step, cr);
-    do_linacc(xcoll, edi, cr);
+    {
+        do_linfix(xcoll, edi, step);
+    }
+    do_linacc(xcoll, edi);
     if (step >= 0)
-        do_radfix(xcoll, edi, step, cr);
-    do_radacc(xcoll, edi, cr);
-    do_radcon(xcoll, edi, cr);
+    {
+        do_radfix(xcoll, edi);
+    }
+    do_radacc(xcoll, edi);
+    do_radcon(xcoll, edi);
 
     /* add back the average positions */
     for (i=0; i<edi->sav.nr; i++)
+    {
         rvec_inc(xcoll[i], edi->sav.x[i]);
+    }
 }
 
 
-/* Write out the projections onto the eigenvectors */
-static void write_edo(int nr_edi, t_edpar *edi, gmx_edsam_t ed, gmx_large_int_t step,real rmsd)
+/* Write out the projections onto the eigenvectors. The order of output
+ * corresponds to write_edo_legend() */
+static void write_edo(t_edpar *edi, FILE *fp, real rmsd)
 {
     int i;
-    char buf[22];
 
 
-    if (edi->bNeedDoEdsam)
+    /* Output how well we fit to the reference structure */
+    fprintf(fp, EDcol_ffmt, rmsd);
+
+    for (i=0; i<edi->vecs.mon.neig; i++)
     {
-        if (step == -1)
-            fprintf(ed->edo, "Initial projections:\n");
-        else
-        {
-            fprintf(ed->edo,"Step %s, ED #%d  ", gmx_step_str(step, buf), nr_edi);
-            fprintf(ed->edo,"  RMSD %f nm\n",rmsd);
-        }
+        fprintf(fp, EDcol_efmt, edi->vecs.mon.xproj[i]);
+    }
 
-        if (edi->vecs.mon.neig)
-        {
-            fprintf(ed->edo,"  Monitor eigenvectors");
-            for (i=0; i<edi->vecs.mon.neig; i++)
-                fprintf(ed->edo," %d: %12.5e ",edi->vecs.mon.ieig[i],edi->vecs.mon.xproj[i]);
-            fprintf(ed->edo,"\n");
-        }
-        if (edi->vecs.linfix.neig)
-        {
-            fprintf(ed->edo,"  Linfix  eigenvectors");
-            for (i=0; i<edi->vecs.linfix.neig; i++)
-                fprintf(ed->edo," %d: %12.5e ",edi->vecs.linfix.ieig[i],edi->vecs.linfix.xproj[i]);
-            fprintf(ed->edo,"\n");
-        }
-        if (edi->vecs.linacc.neig)
-        {
-            fprintf(ed->edo,"  Linacc  eigenvectors");
-            for (i=0; i<edi->vecs.linacc.neig; i++)
-                fprintf(ed->edo," %d: %12.5e ",edi->vecs.linacc.ieig[i],edi->vecs.linacc.xproj[i]);
-            fprintf(ed->edo,"\n");
-        }
-        if (edi->vecs.radfix.neig)
-        {
-            fprintf(ed->edo,"  Radfix  eigenvectors");
-            for (i=0; i<edi->vecs.radfix.neig; i++)
-                fprintf(ed->edo," %d: %12.5e ",edi->vecs.radfix.ieig[i],edi->vecs.radfix.xproj[i]);
-            fprintf(ed->edo,"\n");
-            fprintf(ed->edo,"  fixed increment radius = %f\n", calc_radius(&edi->vecs.radfix));
-        }
-        if (edi->vecs.radacc.neig)
-        {
-            fprintf(ed->edo,"  Radacc  eigenvectors");
-            for (i=0; i<edi->vecs.radacc.neig; i++)
-                fprintf(ed->edo," %d: %12.5e ",edi->vecs.radacc.ieig[i],edi->vecs.radacc.xproj[i]);
-            fprintf(ed->edo,"\n");
-            fprintf(ed->edo,"  acceptance radius      = %f\n", calc_radius(&edi->vecs.radacc));
-        }
-        if (edi->vecs.radcon.neig)
-        {
-            fprintf(ed->edo,"  Radcon  eigenvectors");
-            for (i=0; i<edi->vecs.radcon.neig; i++)
-                fprintf(ed->edo," %d: %12.5e ",edi->vecs.radcon.ieig[i],edi->vecs.radcon.xproj[i]);
-            fprintf(ed->edo,"\n");
-            fprintf(ed->edo,"  contracting radius     = %f\n", calc_radius(&edi->vecs.radcon));
-        }
+    for (i=0; i<edi->vecs.linfix.neig; i++)
+    {
+        fprintf(fp, EDcol_efmt, edi->vecs.linfix.xproj[i]);
+    }
+
+    for (i=0; i<edi->vecs.linacc.neig; i++)
+    {
+        fprintf(fp, EDcol_efmt, edi->vecs.linacc.xproj[i]);
+    }
+
+    for (i=0; i<edi->vecs.radfix.neig; i++)
+    {
+        fprintf(fp, EDcol_efmt, edi->vecs.radfix.xproj[i]);
+    }
+    if (edi->vecs.radfix.neig)
+    {
+        fprintf(fp, EDcol_ffmt, calc_radius(&edi->vecs.radfix)); /* fixed increment radius */
+    }
+
+    for (i=0; i<edi->vecs.radacc.neig; i++)
+    {
+        fprintf(fp, EDcol_efmt, edi->vecs.radacc.xproj[i]);
+    }
+    if (edi->vecs.radacc.neig)
+    {
+        fprintf(fp, EDcol_ffmt, calc_radius(&edi->vecs.radacc)); /* acceptance radius */
+    }
+
+    for (i=0; i<edi->vecs.radcon.neig; i++)
+    {
+        fprintf(fp, EDcol_efmt, edi->vecs.radcon.xproj[i]);
+    }
+    if (edi->vecs.radcon.neig)
+    {
+        fprintf(fp, EDcol_ffmt, calc_radius(&edi->vecs.radcon)); /* contracting radius */
     }
 }
 
@@ -2115,7 +2286,9 @@ static void copyEvecReference(t_eigvec* floodvecs)
 
 
     if (NULL==floodvecs->refproj0)
+    {
         snew(floodvecs->refproj0, floodvecs->neig);
+    }
 
     for (i=0; i<floodvecs->neig; i++)
     {
@@ -2124,55 +2297,382 @@ static void copyEvecReference(t_eigvec* floodvecs)
 }
 
 
+/* Call on MASTER only. Check whether the essential dynamics / flooding
+ * groups of the checkpoint file are consistent with the provided .edi file. */
+static void crosscheck_edi_file_vs_checkpoint(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+    t_edpar *edi = NULL;    /* points to a single edi data set */
+    int edinum;
+
+
+    if (NULL == EDstate->nref || NULL == EDstate->nav)
+    {
+        gmx_fatal(FARGS, "Essential dynamics and flooding can only be switched on (or off) at the\n"
+                         "start of a new simulation. If a simulation runs with/without ED constraints,\n"
+                         "it must also continue with/without ED constraints when checkpointing.\n"
+                         "To switch on (or off) ED constraints, please prepare a new .tpr to start\n"
+                         "from without a checkpoint.\n");
+    }
+
+    edi=ed->edpar;
+    edinum = 0;
+    while(edi != NULL)
+    {
+        /* Check number of atoms in the reference and average structures */
+        if (EDstate->nref[edinum] != edi->sref.nr)
+        {
+            gmx_fatal(FARGS, "The number of reference structure atoms in ED group %c is\n"
+                             "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+                    get_EDgroupChar(edinum+1, 0), EDstate->nref[edinum], edi->sref.nr);
+        }
+        if (EDstate->nav[edinum] != edi->sav.nr)
+        {
+            gmx_fatal(FARGS, "The number of average structure atoms in ED group %c is\n"
+                             "not the same in .cpt (NREF=%d) and .edi (NREF=%d) files!\n",
+                    get_EDgroupChar(edinum+1, 0), EDstate->nav[edinum], edi->sav.nr);
+        }
+        edi=edi->next_edi;
+        edinum++;
+    }
+
+    if (edinum != EDstate->nED)
+    {
+        gmx_fatal(FARGS, "The number of essential dynamics / flooding groups is not consistent.\n"
+                         "There are %d ED groups in the .cpt file, but %d in the .edi file!\n"
+                         "Are you sure this is the correct .edi file?\n", EDstate->nED, edinum);
+    }
+}
+
+
+/* The edsamstate struct stores the information we need to make the ED group
+ * whole again after restarts from a checkpoint file. Here we do the following:
+ * a) If we did not start from .cpt, we prepare the struct for proper .cpt writing,
+ * b) if we did start from .cpt, we copy over the last whole structures from .cpt,
+ * c) in any case, for subsequent checkpoint writing, we set the pointers in
+ * edsamstate to the x_old arrays, which contain the correct PBC representation of
+ * all ED structures at the last time step. */
+static void init_edsamstate(gmx_edsam_t ed, edsamstate_t *EDstate)
+{
+    int     i, nr_edi;
+    t_edpar *edi;
+
+
+    snew(EDstate->old_sref_p, EDstate->nED);
+    snew(EDstate->old_sav_p , EDstate->nED);
+
+    /* If we did not read in a .cpt file, these arrays are not yet allocated */
+    if (!EDstate->bFromCpt)
+    {
+        snew(EDstate->nref, EDstate->nED);
+        snew(EDstate->nav , EDstate->nED);
+    }
+
+    /* Loop over all ED/flooding data sets (usually only one, though) */
+    edi = ed->edpar;
+    for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
+    {
+        /* We always need the last reference and average positions such that
+         * in the next time step we can make the ED group whole again
+         * if the atoms do not have the correct PBC representation */
+        if (EDstate->bFromCpt)
+        {
+            /* Copy the last whole positions of reference and average group from .cpt */
+            for (i=0; i<edi->sref.nr; i++)
+            {
+                copy_rvec(EDstate->old_sref[nr_edi-1][i], edi->sref.x_old[i]);
+            }
+            for (i=0; i<edi->sav.nr ; i++)
+            {
+                copy_rvec(EDstate->old_sav [nr_edi-1][i], edi->sav.x_old [i]);
+            }
+        }
+        else
+        {
+            EDstate->nref[nr_edi-1] = edi->sref.nr;
+            EDstate->nav [nr_edi-1] = edi->sav.nr;
+        }
+
+        /* For subsequent checkpoint writing, set the edsamstate pointers to the edi arrays: */
+        EDstate->old_sref_p[nr_edi-1] = edi->sref.x_old;
+        EDstate->old_sav_p [nr_edi-1] = edi->sav.x_old ;
+
+        edi = edi->next_edi;
+    }
+}
+
+
+/* Adds 'buf' to 'str' */
+static void add_to_string(char **str, char *buf)
+{
+    int len;
+
+
+    len = strlen(*str) + strlen(buf) + 1;
+    srenew(*str, len);
+    strcat(*str, buf);
+}
+
+
+static void add_to_string_aligned(char **str, char *buf)
+{
+    char buf_aligned[STRLEN];
+
+    sprintf(buf_aligned, EDcol_sfmt, buf);
+    add_to_string(str, buf_aligned);
+}
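+
+/* Illustrative (hypothetical) usage of the two string helpers above, mirroring
+ * how write_edo_legend() below assembles the legend line:
+ *
+ *   char *legend;
+ *   snew(legend, 1);
+ *   legend[0] = '\0';
+ *   add_to_string(&legend, "#     time");      (unpadded prefix)
+ *   add_to_string_aligned(&legend, "A RMSD");  (padded via EDcol_sfmt)
+ */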
+
+
+static void nice_legend(const char ***setname, int *nsets, char **LegendStr, char *value, char *unit, char EDgroupchar)
+{
+    char tmp[STRLEN], tmp2[STRLEN];
+
+
+    sprintf(tmp, "%c %s", EDgroupchar, value);
+    add_to_string_aligned(LegendStr, tmp);
+    sprintf(tmp2, "%s (%s)", tmp, unit);
+    (*setname)[*nsets] = strdup(tmp2);
+    (*nsets)++;
+}
+
+
+static void nice_legend_evec(const char ***setname, int *nsets, char **LegendStr, t_eigvec *evec, char EDgroupChar, const char *EDtype)
+{
+    int i;
+    char tmp[STRLEN];
+
+
+    for (i=0; i<evec->neig; i++)
+    {
+        sprintf(tmp, "EV%dprj%s", evec->ieig[i], EDtype);
+        nice_legend(setname, nsets, LegendStr, tmp, "nm", EDgroupChar);
+    }
+}
+
+
+/* Makes a legend for the xvg output file. Call on MASTER only! */
+static void write_edo_legend(gmx_edsam_t ed, int nED, const output_env_t oenv)
+{
+    t_edpar    *edi = NULL;
+    int        i;
+    int        nr_edi, nsets, n_flood, n_edsam;
+    const char **setname;
+    char       buf[STRLEN];
+    char       *LegendStr=NULL;
+
+
+    edi         = ed->edpar;
+
+    fprintf(ed->edo, "# Output will be written every %d step%s\n", ed->edpar->outfrq, ed->edpar->outfrq != 1 ? "s":"");
+
+    for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+    {
+        fprintf(ed->edo, "#\n");
+        fprintf(ed->edo, "# Summary of applied con/restraints for the ED group %c\n", get_EDgroupChar(nr_edi, nED));
+        fprintf(ed->edo, "# Atoms in average structure: %d\n", edi->sav.nr);
+        fprintf(ed->edo, "#    monitor  : %d vec%s\n" , edi->vecs.mon.neig   , edi->vecs.mon.neig    != 1 ? "s":"");
+        fprintf(ed->edo, "#    LINFIX   : %d vec%s\n" , edi->vecs.linfix.neig, edi->vecs.linfix.neig != 1 ? "s":"");
+        fprintf(ed->edo, "#    LINACC   : %d vec%s\n" , edi->vecs.linacc.neig, edi->vecs.linacc.neig != 1 ? "s":"");
+        fprintf(ed->edo, "#    RADFIX   : %d vec%s\n" , edi->vecs.radfix.neig, edi->vecs.radfix.neig != 1 ? "s":"");
+        fprintf(ed->edo, "#    RADACC   : %d vec%s\n" , edi->vecs.radacc.neig, edi->vecs.radacc.neig != 1 ? "s":"");
+        fprintf(ed->edo, "#    RADCON   : %d vec%s\n" , edi->vecs.radcon.neig, edi->vecs.radcon.neig != 1 ? "s":"");
+        fprintf(ed->edo, "#    FLOODING : %d vec%s  " , edi->flood.vecs.neig , edi->flood.vecs.neig  != 1 ? "s":"");
+
+        if (edi->flood.vecs.neig)
+        {
+            /* If in any of the groups we find a flooding vector, flooding is turned on */
+            ed->eEDtype = eEDflood;
+
+            /* Print what flavor of flooding we will do */
+            if (0 == edi->flood.tau) /* constant flooding strength */
+            {
+                fprintf(ed->edo, "Efl_null = %g", edi->flood.constEfl);
+                if (edi->flood.bHarmonic)
+                {
+                    fprintf(ed->edo, ", harmonic");
+                }
+            }
+            else /* adaptive flooding */
+            {
+                fprintf(ed->edo, ", adaptive");
+            }
+        }
+        fprintf(ed->edo, "\n");
+
+        edi = edi->next_edi;
+    }
+
+    /* Print a nice legend */
+    snew(LegendStr, 1);
+    LegendStr[0] = '\0';
+    sprintf(buf, "#     %6s", "time");
+    add_to_string(&LegendStr, buf);
+
+    /* Calculate the maximum number of columns we could end up with */
+    edi     = ed->edpar;
+    nsets = 0;
+    for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+    {
+        nsets += 5 +edi->vecs.mon.neig
+                   +edi->vecs.linfix.neig
+                   +edi->vecs.linacc.neig
+                   +edi->vecs.radfix.neig
+                   +edi->vecs.radacc.neig
+                   +edi->vecs.radcon.neig
+                + 6*edi->flood.vecs.neig;
+        edi = edi->next_edi;
+    }
+    snew(setname, nsets);
+
+    /* Within one mdrun time step, the flooding forces are calculated in a
+     * first function call (do_flood()) and the ED constraints are applied in
+     * a second one (do_edsam()). To get a matching legend, we therefore loop
+     * twice over the edi groups, outputting first the flooding entries and
+     * then the ED part */
+    
+    /* The flooding-related legend entries, if flooding is done */
+    nsets = 0;
+    if (eEDflood == ed->eEDtype)
+    {
+        edi   = ed->edpar;
+        for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+        {
+            /* Always write out the projection on the flooding EVs. Of course, this can also
+             * be achieved with the monitoring option in do_edsam() (if switched on by the
+             * user), but in that case the positions need to be communicated in do_edsam(),
+             * which is not necessary when doing flooding only. */
+            nice_legend(&setname, &nsets, &LegendStr, "RMSD to ref", "nm", get_EDgroupChar(nr_edi, nED) );
+
+            for (i=0; i<edi->flood.vecs.neig; i++)
+            {
+                sprintf(buf, "EV%dprjFLOOD", edi->flood.vecs.ieig[i]);
+                nice_legend(&setname, &nsets, &LegendStr, buf, "nm", get_EDgroupChar(nr_edi, nED));
+
+                /* Output the current reference projection if it changes with time;
+                 * this can happen when flooding is used as harmonic restraint */
+                if (edi->flood.bHarmonic && edi->flood.vecs.refprojslope[i] != 0.0)
+                {
+                    sprintf(buf, "EV%d ref.prj.", edi->flood.vecs.ieig[i]);
+                    nice_legend(&setname, &nsets, &LegendStr, buf, "nm", get_EDgroupChar(nr_edi, nED));
+                }
+
+                /* For flooding we also output Efl, Vfl, deltaF, and the flooding forces */
+                if (0 != edi->flood.tau) /* only output Efl for adaptive flooding (constant otherwise) */
+                {
+                    sprintf(buf, "EV%d-Efl", edi->flood.vecs.ieig[i]);
+                    nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol", get_EDgroupChar(nr_edi, nED));
+                }
+
+                sprintf(buf, "EV%d-Vfl", edi->flood.vecs.ieig[i]);
+                nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol", get_EDgroupChar(nr_edi, nED));
+
+                if (0 != edi->flood.tau) /* only output deltaF for adaptive flooding (zero otherwise) */
+                {
+                    sprintf(buf, "EV%d-deltaF", edi->flood.vecs.ieig[i]);
+                    nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol", get_EDgroupChar(nr_edi, nED));
+                }
+
+                sprintf(buf, "EV%d-FLforces", edi->flood.vecs.ieig[i]);
+                nice_legend(&setname, &nsets, &LegendStr, buf, "kJ/mol/nm", get_EDgroupChar(nr_edi, nED));
+            }
+
+            edi = edi->next_edi;
+        } /* End of flooding-related legend entries */
+    }
+    n_flood = nsets;
+
+    /* Now the ED-related entries, if essential dynamics is done */
+    edi         = ed->edpar;
+    for (nr_edi = 1; nr_edi <= nED; nr_edi++)
+    {
+        nice_legend(&setname, &nsets, &LegendStr, "RMSD to ref", "nm", get_EDgroupChar(nr_edi, nED) );
+
+        /* Essential dynamics, projections on eigenvectors */
+        nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.mon   , get_EDgroupChar(nr_edi, nED), "MON"   );
+        nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linfix, get_EDgroupChar(nr_edi, nED), "LINFIX");
+        nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linacc, get_EDgroupChar(nr_edi, nED), "LINACC");
+        nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radfix, get_EDgroupChar(nr_edi, nED), "RADFIX");
+        if (edi->vecs.radfix.neig)
+        {
+            nice_legend(&setname, &nsets, &LegendStr, "RADFIX radius", "nm", get_EDgroupChar(nr_edi, nED));
+        }
+        nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radacc, get_EDgroupChar(nr_edi, nED), "RADACC");
+        if (edi->vecs.radacc.neig)
+        {
+            nice_legend(&setname, &nsets, &LegendStr, "RADACC radius", "nm", get_EDgroupChar(nr_edi, nED));
+        }
+        nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radcon, get_EDgroupChar(nr_edi, nED), "RADCON");
+        if (edi->vecs.radcon.neig)
+        {
+            nice_legend(&setname, &nsets, &LegendStr, "RADCON radius", "nm", get_EDgroupChar(nr_edi, nED));
+        }
+
+        edi = edi->next_edi;
+    } /* end of 'pure' essential dynamics legend entries */
+    n_edsam = nsets - n_flood;
+
+    xvgr_legend(ed->edo, nsets, setname, oenv);
+    sfree(setname);
+
+    fprintf(ed->edo, "#\n"
+                     "# Legend for %d column%s of flooding plus %d column%s of essential dynamics data:\n",
+                     n_flood, 1 == n_flood ? "":"s", 
+                     n_edsam, 1 == n_edsam ? "":"s");
+    fprintf(ed->edo, "%s", LegendStr);
+    sfree(LegendStr);
+    
+    fflush(ed->edo);
+}
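+
+/* Purely illustrative example of the legend block written above, for a single
+ * ED group 'A' doing essential dynamics on two LINFIX vectors (actual column
+ * widths depend on EDcol_sfmt):
+ *
+ *   # Legend for 0 columns of flooding plus 3 columns of essential dynamics data:
+ *   #       time    A RMSD to ref  A EV1prjLINFIX  A EV2prjLINFIX
+ */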
+
+
 void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                 t_inputrec  *ir,     /* input record                       */
                 t_commrec   *cr,     /* communication record               */
                 gmx_edsam_t ed,      /* contains all ED data               */
                 rvec        x[],     /* positions of the whole MD system   */
-                matrix      box)     /* the box                            */
+                matrix      box,     /* the box                            */
+                edsamstate_t *EDstate)
 {
     t_edpar *edi = NULL;    /* points to a single edi data set */
-    int     numedis=0;      /* keep track of the number of ED data sets in edi file */
     int     i,nr_edi,avindex;
     rvec    *x_pbc  = NULL; /* positions of the whole MD system with pbc removed  */
-    rvec    *xfit   = NULL; /* the positions which will be fitted to the reference structure  */
-    rvec    *xstart = NULL; /* the positions which are subject to ED sampling */
+    rvec    *xfit=NULL, *xstart=NULL; /* dummy arrays to determine initial RMSDs  */
     rvec    fit_transvec;   /* translation ... */
     matrix  fit_rotmat;     /* ... and rotation from fit to reference structure */
 
 
     if (!DOMAINDECOMP(cr) && PAR(cr) && MASTER(cr))
+    {
         gmx_fatal(FARGS, "Please switch on domain decomposition to use essential dynamics in parallel.");
+    }
 
     if (MASTER(cr))
+    {
         fprintf(stderr, "ED: Initializing essential dynamics constraints.\n");
 
+        if (NULL == ed)
+        {
+            gmx_fatal(FARGS, "The checkpoint file you provided is from an essential dynamics or\n"
+                             "flooding simulation. Please also provide the correct .edi file with -ei.\n");
+        }
+    }
+
     /* Needed for initializing radacc radius in do_edsam */
-    ed->bFirst = 1;
+    ed->bFirst = TRUE;
 
     /* The input file is read by the master and the edi structures are
      * initialized here. Input is stored in ed->edpar. Then the edi
      * structures are transferred to the other nodes */
     if (MASTER(cr))
     {
-        snew(ed->edpar,1);
-        /* Read the whole edi file at once: */
-        read_edi_file(ed,ed->edpar,mtop->natoms,cr);
-
-        /* Initialization for every ED/flooding dataset. Flooding uses one edi dataset per
+        /* Initialization for every ED/flooding group. Flooding uses one edi group per
          * flooding vector, Essential dynamics can be applied to more than one structure
          * as well, but will be done in the order given in the edi file, so
          * expect different results for different order of edi file concatenation! */
         edi=ed->edpar;
         while(edi != NULL)
         {
-            init_edi(mtop,ir,cr,ed,edi);
-
-            /* Init flooding parameters if needed */
-            init_flood(edi,ed,ir->delta_t,cr);
-
+            init_edi(mtop,edi);
+            init_flood(edi,ed,ir->delta_t);
             edi=edi->next_edi;
-            numedis++;
         }
     }
 
@@ -2189,42 +2689,55 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
 
         /* Reset pointer to first ED data set which contains the actual ED data */
         edi=ed->edpar;
-
         /* Loop over all ED/flooding data sets (usually only one, though) */
-        for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+        for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
         {
-            /* We use srenew to allocate memory since the size of the buffers
-             * is likely to change with every ED dataset */
-            srenew(xfit  , edi->sref.nr );
-            srenew(xstart, edi->sav.nr  );
-
-            /* Extract the positions of the atoms to which will be fitted */
-            for (i=0; i < edi->sref.nr; i++)
+            /* For multiple ED groups we use the output frequency that was specified
+             * in the first group */
+            if (nr_edi > 1)
             {
-                copy_rvec(x_pbc[edi->sref.anrs[i]], xfit[i]);
-
-                /* Save the sref positions such that in the next time step we can make the ED group whole
-                 * in case any of the atoms do not have the correct PBC representation */
-                copy_rvec(xfit[i], edi->sref.x_old[i]);
+                edi->outfrq = ed->edpar->outfrq;
             }
 
-            /* Extract the positions of the atoms subject to ED sampling */
-            for (i=0; i < edi->sav.nr; i++)
+            /* Extract the initial reference and average positions. When starting
+             * from .cpt, these have already been read into sref.x_old
+             * in init_edsamstate() */
+            if (!EDstate->bFromCpt)
             {
-                copy_rvec(x_pbc[edi->sav.anrs[i]], xstart[i]);
+                /* If this is the first run (i.e. no checkpoint present) we assume
+                 * that the starting positions give us the correct PBC representation */
+                for (i=0; i < edi->sref.nr; i++)
+                {
+                    copy_rvec(x_pbc[edi->sref.anrs[i]], edi->sref.x_old[i]);
+                }
 
-                /* Save the sav positions such that in the next time step we can make the ED group whole
-                 * in case any of the atoms do not have the correct PBC representation */
-                copy_rvec(xstart[i], edi->sav.x_old[i]);
+                for (i=0; i < edi->sav.nr; i++)
+                {
+                    copy_rvec(x_pbc[edi->sav.anrs[i]], edi->sav.x_old[i]);
+                }
             }
 
+            /* Now we have the PBC-correct start positions of the reference and
+               average structure. We copy that over to dummy arrays on which we
+               can apply fitting to print out the RMSD. We srenew the memory since
+               the size of the buffers is likely different for every ED group */
+            srenew(xfit  , edi->sref.nr );
+            srenew(xstart, edi->sav.nr  );
+            copy_rvecn(edi->sref.x_old, xfit, 0, edi->sref.nr);
+            copy_rvecn(edi->sav.x_old, xstart, 0, edi->sav.nr);
+
             /* Make the fit to the REFERENCE structure, get translation and rotation */
             fit_to_reference(xfit, fit_transvec, fit_rotmat, edi);
 
             /* Output how well we fit to the reference at the start */
             translate_and_rotate(xfit, edi->sref.nr, fit_transvec, fit_rotmat);
-            fprintf(stderr, "ED: Initial RMSD from reference after fit = %f nm (dataset #%d)\n",
-                    rmsd_from_structure(xfit, &edi->sref), nr_edi);
+            fprintf(stderr, "ED: Initial RMSD from reference after fit = %f nm",
+                    rmsd_from_structure(xfit, &edi->sref));
+            if (EDstate->nED > 1)
+            {
+                fprintf(stderr, " (ED group %c)", get_EDgroupChar(nr_edi, EDstate->nED));
+            }
+            fprintf(stderr, "\n");
 
             /* Now apply the translation and rotation to the atoms on which ED sampling will be performed */
             translate_and_rotate(xstart, edi->sav.nr, fit_transvec, fit_rotmat);
@@ -2259,13 +2772,18 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                      * the average structure, which must be projected */
                     avindex = edi->star.nr - edi->sav.nr;
                 }
-                rad_project(edi, &edi->star.x[avindex], &edi->vecs.radcon, cr);
-            } else
-                rad_project(edi, xstart, &edi->vecs.radcon, cr);
+                rad_project(edi, &edi->star.x[avindex], &edi->vecs.radcon);
+            }
+            else
+            {
+                rad_project(edi, xstart, &edi->vecs.radcon);
+            }
 
             /* process structure that will serve as origin of expansion circle */
             if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
+            {
                 fprintf(stderr, "ED: Setting center of flooding potential (0 = average structure)\n");
+            }
 
             if (edi->sori.nr > 0)
             {
@@ -2285,13 +2803,13 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                     avindex = edi->sori.nr - edi->sav.nr;
                 }
 
-                rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radacc, cr);
-                rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radfix, cr);
+                rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radacc);
+                rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radfix);
                 if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
                 {
                     fprintf(stderr, "ED: The ORIGIN structure will define the flooding potential center.\n");
                     /* Set center of flooding potential to the ORIGIN structure */
-                    rad_project(edi, &edi->sori.x[avindex], &edi->flood.vecs, cr);
+                    rad_project(edi, &edi->sori.x[avindex], &edi->flood.vecs);
                     /* We already know that no (moving) reference position was provided,
                      * therefore we can overwrite refproj[0]*/
                     copyEvecReference(&edi->flood.vecs);
@@ -2299,15 +2817,17 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             }
             else /* No origin structure given */
             {
-                rad_project(edi, xstart, &edi->vecs.radacc, cr);
-                rad_project(edi, xstart, &edi->vecs.radfix, cr);
+                rad_project(edi, xstart, &edi->vecs.radacc);
+                rad_project(edi, xstart, &edi->vecs.radfix);
                 if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
                 {
                     if (edi->flood.bHarmonic)
                     {
                         fprintf(stderr, "ED: A (possibly changing) ref. projection will define the flooding potential center.\n");
                         for (i=0; i<edi->flood.vecs.neig; i++)
+                        {
                             edi->flood.vecs.refproj[i] = edi->flood.vecs.refproj0[i];
+                        }
                     }
                     else
                     {
@@ -2315,7 +2835,9 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
                         /* Set center of flooding potential to the center of the covariance matrix,
                          * i.e. the average structure, i.e. zero in the projected system */
                         for (i=0; i<edi->flood.vecs.neig; i++)
+                        {
                             edi->flood.vecs.refproj[i] = 0.0;
+                        }
                     }
                 }
             }
@@ -2324,20 +2846,18 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             {
                 for (i=0; i<edi->flood.vecs.neig; i++)
                 {
-                    fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", i, edi->flood.vecs.refproj[i]);
+                    fprintf(stdout, "ED: EV %d flooding potential center: %11.4e", edi->flood.vecs.ieig[i], edi->flood.vecs.refproj[i]);
                     if (edi->flood.bHarmonic)
+                    {
                         fprintf(stdout, " (adding %11.4e/timestep)", edi->flood.vecs.refprojslope[i]);
+                    }
                     fprintf(stdout, "\n");
                 }
             }
 
             /* set starting projections for linsam */
-            rad_project(edi, xstart, &edi->vecs.linacc, cr);
-            rad_project(edi, xstart, &edi->vecs.linfix, cr);
-
-            /* Output to file, set the step to -1 so that write_edo knows it was called from init_edsam */
-            if (ed->edo && !(ed->bStartFromCpt))
-                write_edo(nr_edi, edi, ed, -1, 0);
+            rad_project(edi, xstart, &edi->vecs.linacc);
+            rad_project(edi, xstart, &edi->vecs.linfix);
 
             /* Prepare for the next edi data set: */
             edi=edi->next_edi;
@@ -2352,9 +2872,9 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
     if (PAR(cr))
     {
         /* First let everybody know how many ED data sets to expect */
-        gmx_bcast(sizeof(numedis), &numedis, cr);
+        gmx_bcast(sizeof(EDstate->nED), &EDstate->nED, cr);
         /* Broadcast the essential dynamics / flooding data to all nodes */
-        broadcast_ed_data(cr, ed, numedis);
+        broadcast_ed_data(cr, ed, EDstate->nED);
     }
     else
     {
@@ -2363,7 +2883,7 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
 
         /* Loop over all ED data sets (usually only one, though) */
         edi=ed->edpar;
-        for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+        for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
         {
             edi->sref.anrs_loc = edi->sref.anrs;
             edi->sav.anrs_loc  = edi->sav.anrs;
@@ -2373,13 +2893,17 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             snew(edi->sav.c_ind, edi->sav.nr);
             /* Initialize the array */
             for (i=0; i<edi->sav.nr; i++)
+            {
                 edi->sav.c_ind[i] = i;
+            }
             /* In the general case we will need a different-sized array for the reference indices: */
             if (!edi->bRefEqAv)
             {
                 snew(edi->sref.c_ind, edi->sref.nr);
                 for (i=0; i<edi->sref.nr; i++)
+                {
                     edi->sref.c_ind[i] = i;
+                }
             }
             /* Point to the very same array in case of other structures: */
             edi->star.c_ind = edi->sav.c_ind;
@@ -2390,7 +2914,7 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
             edi->star.nr_loc = edi->star.nr;
             edi->sori.nr_loc = edi->sori.nr;
 
-            /* An on we go to the next edi dataset */
+            /* And on we go to the next ED group */
             edi=edi->next_edi;
         }
     }
@@ -2398,7 +2922,7 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
     /* Allocate space for ED buffer variables */
     /* Again, loop over ED data sets */
     edi=ed->edpar;
-    for (nr_edi = 1; nr_edi <= numedis; nr_edi++)
+    for (nr_edi = 1; nr_edi <= EDstate->nED; nr_edi++)
     {
         /* Allocate space for ED buffer */
         snew(edi->buf, 1);
@@ -2426,20 +2950,21 @@ void init_edsam(gmx_mtop_t  *mtop,   /* global topology                    */
         dump_edi(edi, cr, nr_edi);
 #endif
 
-        /* An on we go to the next edi dataset */
+        /* Next ED group */
         edi=edi->next_edi;
     }
 
     /* Flush the edo file so that the user can check some things
      * when the simulation has started */
     if (ed->edo)
+    {
         fflush(ed->edo);
+    }
 }
 
 
 void do_edsam(t_inputrec  *ir,
               gmx_large_int_t step,
-              t_mdatoms   *md,
               t_commrec   *cr,
               rvec        xs[],   /* The local current positions on this processor */
               rvec        v[],    /* The velocities */
@@ -2454,21 +2979,25 @@ void do_edsam(t_inputrec  *ir,
     struct t_do_edsam *buf;
     t_edpar *edi;
     real    rmsdev=-1;      /* RMSD from reference structure prior to applying the constraints */
-    gmx_bool bSuppress=FALSE; /* Write .edo file on master? */
+    gmx_bool bSuppress=FALSE; /* Write .xvg output file on master? */
 
 
     /* Check if ED sampling has to be performed */
     if ( ed->eEDtype==eEDnone )
+    {
         return;
+    }
 
     /* Suppress output on first call of do_edsam if
      * two-step sd2 integrator is used */
     if ( (ir->eI==eiSD2) && (v != NULL) )
+    {
         bSuppress = TRUE;
+    }
 
     dt_1 = 1.0/ir->delta_t;
 
-    /* Loop over all ED datasets (usually one) */
+    /* Loop over all ED groups (usually one) */
     edi  = ed->edpar;
     edinr = 0;
     while (edi != NULL)
@@ -2480,8 +3009,10 @@ void do_edsam(t_inputrec  *ir,
             buf=edi->buf->do_edsam;
 
             if (ed->bFirst)
+            {
                 /* initialise radacc radius for slope criterion */
                 buf->oldrad=calc_radius(&edi->vecs.radacc);
+            }
 
             /* Copy the positions into buf->xc* arrays and after ED
              * feed back corrections to the official positions */
@@ -2494,13 +3025,12 @@ void do_edsam(t_inputrec  *ir,
             communicate_group_positions(cr, buf->xcoll, buf->shifts_xcoll, buf->extra_shifts_xcoll, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
                     edi->sav.nr, edi->sav.nr_loc, edi->sav.anrs_loc, edi->sav.c_ind, edi->sav.x_old,  box);
 
-#ifdef DEBUG_ED
-            dump_xcoll(edi, buf, cr, step);
-#endif
             /* Only assemble reference positions if their indices differ from the average ones */
             if (!edi->bRefEqAv)
+            {
                 communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
                         edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
+            }
 
             /* If bUpdateShifts was TRUE then the shifts have just been updated in communicate_group_positions.
              * We do not need to update the shifts until the next NS step. Note that dd_make_local_ed_indices
@@ -2512,9 +3042,13 @@ void do_edsam(t_inputrec  *ir,
 
             /* Fit the reference indices to the reference structure */
             if (edi->bRefEqAv)
+            {
                 fit_to_reference(buf->xcoll , transvec, rotmat, edi);
+            }
             else
+            {
                 fit_to_reference(buf->xc_ref, transvec, rotmat, edi);
+            }
 
             /* Now apply the translation and rotation to the ED structure */
             translate_and_rotate(buf->xcoll, edi->sav.nr, transvec, rotmat);
@@ -2540,8 +3074,8 @@ void do_edsam(t_inputrec  *ir,
             if (do_per_step(step,edi->maxedsteps) && step >= edi->presteps)
             {
                 project(buf->xcoll, edi);
-                rad_project(edi, buf->xcoll, &edi->vecs.radacc, cr);
-                rad_project(edi, buf->xcoll, &edi->vecs.radfix, cr);
+                rad_project(edi, buf->xcoll, &edi->vecs.radacc);
+                rad_project(edi, buf->xcoll, &edi->vecs.radfix);
                 buf->oldrad=-1.e5;
             }
 
@@ -2552,10 +3086,13 @@ void do_edsam(t_inputrec  *ir,
                 if (edi->vecs.radacc.radius - buf->oldrad < edi->slope)
                 {
                     project(buf->xcoll, edi);
-                    rad_project(edi, buf->xcoll, &edi->vecs.radacc, cr);
+                    rad_project(edi, buf->xcoll, &edi->vecs.radacc);
                     buf->oldrad = 0.0;
-                } else
+                }
+                else
+                {
                     buf->oldrad = edi->vecs.radacc.radius;
+                }
             }
 
             /* apply the constraints */
@@ -2563,7 +3100,7 @@ void do_edsam(t_inputrec  *ir,
             {
                 /* ED constraints should be applied already in the first MD step
                  * (which is step 0), therefore we pass step+1 to the routine */
-                ed_apply_constraints(buf->xcoll, edi, step+1 - ir->init_step, cr);
+                ed_apply_constraints(buf->xcoll, edi, step+1 - ir->init_step);
             }
 
             /* write to edo, when required */
@@ -2571,7 +3108,9 @@ void do_edsam(t_inputrec  *ir,
             {
                 project(buf->xcoll, edi);
                 if (MASTER(cr) && !bSuppress)
-                    write_edo(edinr, edi, ed, step, rmsdev);
+                {
+                    write_edo(edi, ed->edo, rmsdev);
+                }
             }
 
             /* Copy back the positions unless monitoring only */
@@ -2605,10 +3144,10 @@ void do_edsam(t_inputrec  *ir,
             }
         } /* END of if (edi->bNeedDoEdsam) */
 
-        /* Prepare for the next ED dataset */
+        /* Prepare for the next ED group */
         edi = edi->next_edi;
 
-    } /* END of loop over ED datasets */
+    } /* END of loop over ED groups */
 
     ed->bFirst = FALSE;
 }
index 38bf30cf32a8d5ca229bc17fa5f0ba38e6a50e54..05ccd0eda5f8f5810efeef0980c6aca1d085fc21 100644 (file)
@@ -53,7 +53,7 @@ FILE* debug;
 #include "gmxcomplex.h"
 #include "gmx_fft.h"
 
-#ifndef GMX_LIB_MPI
+#ifndef GMX_MPI
 double MPI_Wtime();
 #endif
 
index 934a8d939c922b27f819439794b603ee93599d87..8ed1a583a0c5ebc7c6febd83816b4d57c36691eb 100644 (file)
@@ -1164,7 +1164,7 @@ static void make_nbf_tables(FILE *fp,const output_env_t oenv,
     nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
     nbl->table_elec.ninteractions = 1;
     nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
-    snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),16);
+    snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),32);
 
     nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
     nbl->table_vdw.format = nbl->table_elec_vdw.format;
@@ -1175,7 +1175,7 @@ static void make_nbf_tables(FILE *fp,const output_env_t oenv,
     nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
     nbl->table_vdw.ninteractions = 2;
     nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
-    snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),16);
+    snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),32);
 
     for(i=0; i<=nbl->table_elec_vdw.n; i++)
     {
@@ -1431,7 +1431,7 @@ static void pick_nbnxn_kernel_cpu(FILE *fp,
 #endif
         if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
         {
-#ifdef GMX_NBNXN_SIMD_2XNN
+#ifdef GMX_NBNXN_SIMD_4XN
             *kernel_type = nbnxnk4xN_SIMD_4xN;
 #else
             gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
@@ -1467,40 +1467,58 @@ static void pick_nbnxn_kernel_cpu(FILE *fp,
 }
 
 
-/* Note that _mm_... intrinsics can be converted to either SSE or AVX
- * depending on compiler flags.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
-static const char *nbk_name[] =
-  { "not set", "plain C 4x4",
-#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
+const char *lookup_nbnxn_kernel_name(int kernel_type)
+{
+    const char *returnvalue = NULL;
+    switch(kernel_type)
+    {
+    case nbnxnkNotSet: returnvalue = "not set"; break;
+    case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
+#ifndef GMX_NBNXN_SIMD
+    case nbnxnk4xN_SIMD_4xN: returnvalue = "not available"; break;
+    case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
+#else
+#ifdef GMX_X86_SSE2
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+        /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
+         * on compiler flags. As we use nearly identical intrinsics, using an AVX
+         * compiler flag without an AVX macro effectively results in AVX kernels.
+         * For gcc we check for __AVX__
+         * At least a check for icc should be added (if there is a macro)
+         */
+#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
 #ifndef GMX_X86_SSE4_1
-#ifndef GMX_DOUBLE
-    "SSE2 4x4",
+    case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE2"; break;
+    case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
 #else
-    "SSE2 4x2",
+    case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE4.1"; break;
+    case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
 #endif
 #else
-#ifndef GMX_DOUBLE
-    "SSE4.1 4x4",
-#else
-    "SSE4.1 4x2",
+    case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-128"; break;
+    case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
 #endif
 #endif
-#else
-#ifndef GMX_DOUBLE
-    "AVX-128 4x4",
-#else
-    "AVX-128 4x2",
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+    case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-256"; break;
+    case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
 #endif
+#else /* not GMX_X86_SSE2 */
+    case nbnxnk4xN_SIMD_4xN: returnvalue = "SIMD"; break;
+    case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
 #endif
-#ifndef GMX_DOUBLE
-    "AVX-256 4x8",
-#else
-    "AVX-256 4x4",
 #endif
-    "CUDA 8x8x8", "plain C 8x8x8" };
+    case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
+    case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
+
+    case nbnxnkNR:
+    default:
+        gmx_fatal(FARGS, "Illegal kernel type selected");
+        returnvalue = NULL;
+        break;
+    }
+    return returnvalue;
+}
 
 static void pick_nbnxn_kernel(FILE *fp,
                               const t_commrec *cr,
@@ -1591,7 +1609,7 @@ static void pick_nbnxn_kernel(FILE *fp,
     if (bDoNonbonded && fp != NULL)
     {
         fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
-                nbnxn_kernel_name[*kernel_type],
+                lookup_nbnxn_kernel_name(*kernel_type),
                 nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
                 nbnxn_kernel_to_cj_size(*kernel_type));
     }
@@ -1649,9 +1667,9 @@ static void init_ewald_f_table(interaction_const_t *ic,
     sfree_aligned(ic->tabq_coul_V);
 
     /* Create the original table data in FDV0 */
-    snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16);
-    snew_aligned(ic->tabq_coul_F,ic->tabq_size,16);
-    snew_aligned(ic->tabq_coul_V,ic->tabq_size,16);
+    snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,32);
+    snew_aligned(ic->tabq_coul_F,ic->tabq_size,32);
+    snew_aligned(ic->tabq_coul_V,ic->tabq_size,32);
     table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,ic->tabq_coul_FDV0,
                                 ic->tabq_size,1/ic->tabq_scale,ic->ewaldcoeff);
 }
@@ -1686,9 +1704,9 @@ void init_interaction_const(FILE *fp,
     snew(ic, 1);
 
     /* Just allocate something so we can free it */
-    snew_aligned(ic->tabq_coul_FDV0,16,16);
-    snew_aligned(ic->tabq_coul_F,16,16);
-    snew_aligned(ic->tabq_coul_V,16,16);
+    snew_aligned(ic->tabq_coul_FDV0,16,32);
+    snew_aligned(ic->tabq_coul_F,16,32);
+    snew_aligned(ic->tabq_coul_V,16,32);
 
     ic->rlist       = fr->rlist;
     ic->rlistlong   = fr->rlistlong;
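
The alignment bumps from 16 to 32 bytes in the hunks above are what allow these tables to be read with 256-bit AVX aligned loads; 16-byte alignment is only sufficient for 128-bit SSE. A minimal sketch of the constraint, using C11 aligned_alloc as a stand-in for snew_aligned/save_malloc_aligned:

    #include <immintrin.h>
    #include <stdlib.h>

    /* Stand-in for snew_aligned: C11 aligned_alloc requires the size to be
     * a multiple of the alignment, so round the byte count up to 32. */
    static float *alloc_table(size_t n)
    {
        size_t bytes = ((n * sizeof(float) + 31) / 32) * 32;
        return (float *)aligned_alloc(32, bytes);
    }

    static __m256 load_first_lane(const float *tab)
    {
        /* _mm256_load_ps faults if tab is not 32-byte aligned; with only
         * 16-byte alignment one would be forced to the slower loadu form. */
        return _mm256_load_ps(tab);
    }
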
index 230ead9eb47414763061f07ec5ee27e9bff88ad5..cebefc09c5e1eeb69a408f52250957fe00e987ad 100644 (file)
@@ -722,7 +722,7 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
         fprintf(fplog, "%s\n", hline);
 
         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
-        fprintf(fplog, "\n Force evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
+        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                 tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                 gpu_cpu_ratio);
 
@@ -744,9 +744,9 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                          * but we currently can't check that here.
                          */
                         md_print_warn(NULL,fplog,
-                                      "NOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
+                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                       "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
-                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.\n");
+                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                     }
                     else
                     {
@@ -754,15 +754,15 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                          * too small for increasing the cut-off for PME tuning.
                          */
                         md_print_warn(NULL,fplog,
-                                      "NOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
-                                      "      performance loss.\n");
+                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
+                                      "      performance loss.");
                     }
                 }
                 if (gpu_cpu_ratio > 1.2)
                 {
                     md_print_warn(NULL,fplog,
-                                  "NOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
-                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.\n");
+                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
+                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                 }
             }
         }
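
The two thresholds in these notes are asymmetric on purpose: an under-loaded and an over-loaded GPU call for different remedies. A schematic of the check; the >1.2 bound is visible in the diff, while the under-load condition (presumably a ratio below 0.75, matching the ">25% less load" wording) is an assumption since that hunk does not show it:

    /* Sketch of the imbalance checks above; the < 0.75 bound is inferred
     * from the ">25% less load" message, the > 1.2 bound is in the diff. */
    static void check_gpu_cpu_balance(double tot_gpu, double tot_cpu_overlap)
    {
        double gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;

        if (gpu_cpu_ratio < 0.75)
        {
            /* GPU idles while the CPU works: e.g. relax DD limits on PME tuning */
        }
        else if (gpu_cpu_ratio > 1.2)
        {
            /* CPU waits for the GPU: a shorter cut-off and finer PME grid help */
        }
    }
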
index 2219580e0e055b48f42b0fc0b2ab177d0996768f..73aab70dfbf0f3698ca4785931435f0562c24ecd 100644 (file)
@@ -109,10 +109,10 @@ void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes)
             snew(state->cg_p,state->nalloc);
         }
     }
-  if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE) {
+    if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc)) {
     state->nrng  = gmx_rng_n();
     state->nrngi = 1;
-    if (EI_SD(ir->eI) || ir->eI == eiBD) {
+    if (EI_SD(ir->eI) || ir->eI == eiBD || ETC_ANDERSEN(ir->etc)) {
       /* This will be correct later with DD */
       state->nrng  *= nnodes;
       state->nrngi *= nnodes;
@@ -182,7 +182,7 @@ void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec,
 {
     bcast_ir_mtop(cr,inputrec,mtop);
 
-    if (inputrec->eI == eiBD || EI_SD(inputrec->eI)) {
+    if (inputrec->eI == eiBD || EI_SD(inputrec->eI) || ETC_ANDERSEN(inputrec->etc)) {
         /* Make sure the random seeds are different on each node */
         inputrec->ld_seed += cr->nodeid;
     }
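
Adding the node id to the seed is the usual way of decorrelating the per-rank random streams that SD/BD integrators, and now also Andersen coupling, consume. In one line (a trivial model, not the gmx_rng API):

    /* Each rank gets its own stream; mirrors inputrec->ld_seed += cr->nodeid */
    static unsigned int seed_for_rank(unsigned int ld_seed, int nodeid)
    {
        return ld_seed + (unsigned int)nodeid;
    }
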
index f6268d41a9c93c5d53cd4364e141095a2f745548..4db41010c3c56e8865864f959c0a8e261597e256 100644 (file)
 #include "nbnxn_atomdata.h"
 #include "gmx_omp_nthreads.h"
 
-/* Default nbnxn allocation routine, allocates 32 byte aligned,
- * which works for plain C and aligned SSE and AVX loads/stores.
- */
+/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN-byte-aligned memory */
 void nbnxn_alloc_aligned(void **ptr,size_t nbytes)
 {
-    *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,32);
+    *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,NBNXN_MEM_ALIGN);
 }
 
 /* Free function for memory allocated with nbnxn_alloc_aligned */
@@ -650,6 +648,38 @@ void nbnxn_atomdata_init(FILE *fp,
     nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
     nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
     nbat->x       = NULL;
+
+#ifdef GMX_NBNXN_SIMD
+    if (simple)
+    {
+        /* Set the diagonal cluster pair exclusion mask setup data.
+         * In the kernel we check 0 < j - i to generate the masks.
+         * Here we store j - i for generating the mask for the first i,
+         * we subtract 0.5 to avoid rounding issues.
+         * In the kernel we can subtract 1 to generate the subsequent mask.
+         */
+        const int simd_width=GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+        int simd_4xn_diag_size,j;
+
+        simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE,simd_width);
+        snew_aligned(nbat->simd_4xn_diag,simd_4xn_diag_size,NBNXN_MEM_ALIGN);
+        for(j=0; j<simd_4xn_diag_size; j++)
+        {
+            nbat->simd_4xn_diag[j] = j - 0.5;
+        }
+
+        snew_aligned(nbat->simd_2xnn_diag,simd_width,NBNXN_MEM_ALIGN);
+        for(j=0; j<simd_width/2; j++)
+        {
+            /* The j-cluster size is half the SIMD width */
+            nbat->simd_2xnn_diag[j]              = j - 0.5;
+            /* The next half of the SIMD width is for i + 1 */
+            nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
+        }
+    }
+#endif
+
+    /* Initialize the output data structures */
     nbat->nout    = nout;
     snew(nbat->out,nbat->nout);
     nbat->nalloc  = 0;
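
The simd_4xn_diag/simd_2xnn_diag arrays initialized above make diagonal-mask generation branch-free: a lane interacts when 0 < j - i, subtracting 1 moves the mask down to the next i row, and the 0.5 offset keeps the comparison away from exact zero. A scalar model of the same scheme (plain C, not the SIMD code):

    /* Scalar model of the diagonal-mask data: d[j] = j - i - 0.5 for i = 0.
     * Assumes width <= 16 for the local buffer. */
    static void diag_masks_model(int width)
    {
        float d[16];
        int   i, j;

        for (j = 0; j < width; j++)
        {
            d[j] = j - 0.5f;                   /* j - i - 0.5 with i = 0 */
        }
        for (i = 0; i < 4; i++)                /* 4 i rows per cluster */
        {
            for (j = 0; j < width; j++)
            {
                int interact = (0.0f < d[j]);  /* mask: j strictly above i */
                (void)interact;
            }
            for (j = 0; j < width; j++)
            {
                d[j] -= 1.0f;                  /* advance to row i + 1 */
            }
        }
    }
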
@@ -1021,21 +1051,22 @@ nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
 }
 
 static void
-nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
-                                     gmx_bool bDestSet,
-                                     real ** gmx_restrict src,
-                                     int nsrc,
-                                     int i0, int i1)
+nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
+                                 gmx_bool bDestSet,
+                                 real ** gmx_restrict src,
+                                 int nsrc,
+                                 int i0, int i1)
 {
-#ifdef NBNXN_SEARCH_SSE
-/* We can use AVX256 here, but not when AVX128 kernels are selected.
- * As this reduction is not faster with AVX256 anyway, we use 128-bit SIMD.
+#ifdef GMX_NBNXN_SIMD
+/* The SIMD width here is actually independent of that in the kernels,
+ * but we use the same width for simplicity (usually optimal anyhow).
  */
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
 #define GMX_MM128_HERE
 #endif
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#endif
 #include "gmx_simd_macros.h"
 
     int       i,s;
@@ -1252,8 +1283,8 @@ void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
                 }
                 if (nfptr > 0)
                 {
-#ifdef NBNXN_SEARCH_SSE
-                    nbnxn_atomdata_reduce_reals_x86_simd
+#ifdef GMX_NBNXN_SIMD
+                    nbnxn_atomdata_reduce_reals_simd
 #else
                     nbnxn_atomdata_reduce_reals
 #endif
index fe863769edb2fc04301c3ccb485eec5d3a148e6c..bf9e92b7758722e545df17c7441bf0615d7118c6 100644 (file)
@@ -62,8 +62,11 @@ extern "C" {
 /* With GPU kernels the cluster size is 8 atoms */
 #define NBNXN_GPU_CLUSTER_SIZE         8
 
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage */
-#define NBNXN_GPU_JGROUP_SIZE  4
+/* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
+ * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ */
+#define NBNXN_GPU_JGROUP_SIZE       4
+#define NBNXN_GPU_JGROUP_SIZE_2LOG  2
 
 /* To avoid NaN when excluded atoms are at zero distance, we add a small
  * number to r^2. NBNXN_AVOID_SING_R2_INC^-3 should fit in real.
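
The companion NBNXN_GPU_JGROUP_SIZE_2LOG constant exists so that kernels can split a cj index with a shift and a mask instead of a division; a minimal sketch:

    /* With NBNXN_GPU_JGROUP_SIZE = 4 and its base-2 log = 2, a shift and a
     * mask replace division and modulo when locating a cj in its cj4 group. */
    #define JGROUP_SIZE       4
    #define JGROUP_SIZE_2LOG  2

    static void split_cj(int cj_ind, int *cj4, int *within)
    {
        *cj4    = cj_ind >> JGROUP_SIZE_2LOG;  /* == cj_ind / 4 */
        *within = cj_ind & (JGROUP_SIZE - 1);  /* == cj_ind % 4 */
    }
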
index 11ab258d9eccd9dcee739bf623079dd7df5231b9..22c6bb931801c58072e04aafe569ac8c8bb14b26 100644 (file)
@@ -227,9 +227,9 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #if !defined PRUNE_NBL && __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
 #pragma unroll 4
 #endif
-            for (jm = 0; jm < 4; jm++)
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
             {
-                if (imask & (255U << (jm * NCL_PER_SUPERCL)))
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
                 {
                     mask_ji = (1U << (jm * NCL_PER_SUPERCL));
 
index 6891d13bb769999c31c23e295de6f74318f593d4..be1c7a778c95b6258fcb0ba80111d9b06822841c 100644 (file)
@@ -206,9 +206,9 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
 #if CUDA_VERSION >= 4010
             #pragma unroll 4
 #endif
-            for (jm = 0; jm < 4; jm++)
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
             {
-                imask_j = (imask >> (jm * 8)) & 255U;
+                imask_j = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
                 if (imask_j)
                 {
                     nsubi = __popc(imask_j);
index 610be4571d4a6504d9991f9761cdadd8c783f33c..d057fc550a158c4fec3640a668bf026df5c78798 100644 (file)
@@ -48,6 +48,9 @@
 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
 #define FBUF_STRIDE                 (CL_SIZE_SQ)
 
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+
 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  *  Original idea: OpenMM
  */
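
With NCL_PER_SUPERCL at its default of 8 (the 8x8x8 GPU setup), the new constant evaluates to (1U << 8) - 1U = 255U, exactly the literal it replaces in the two kernel hunks above; a standalone check:

    #include <assert.h>

    #define NCL_PER_SUPERCL 8   /* assumed default for the 8x8x8 GPU kernels */

    int main(void)
    {
        unsigned mask = (1U << NCL_PER_SUPERCL) - 1U;
        assert(mask == 255U);   /* the literal previously hard-coded */
        return 0;
    }
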
index a9b8e2d4419283b62dc1de822ab2e6428e0f1e75..c54e068a6da224593416774f60ebd525e4fb2094 100644 (file)
@@ -46,7 +46,17 @@ extern "C" {
 
 
 #ifdef GMX_X86_SSE2
-#define NBNXN_SEARCH_SSE
+/* Use 4-way SIMD, always in single precision, for bounding box calculations */
+#define NBNXN_SEARCH_BB_SSE
+#endif
+
+
+#ifdef GMX_NBNXN_SIMD
+/* Memory alignment in bytes as required by SIMD aligned loads/stores */
+#define NBNXN_MEM_ALIGN  (GMX_NBNXN_SIMD_BITWIDTH/8)
+#else
+/* No alignment required, but set it so we can call the same routines */
+#define NBNXN_MEM_ALIGN  32
 #endif
 
 
index f915050caf92cbe35a2ebd0e607138cc18cd8a9e..dc43b39f05858802f2307de0961f95e0e932ca99 100644 (file)
@@ -186,7 +186,7 @@ nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t     *nbl,
             excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
             excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
 
-            for(jm=0; jm<4; jm++)
+            for(jm=0; jm<NBNXN_GPU_JGROUP_SIZE; jm++)
             {
                 cj               = nbl->cj4[cj4_ind].cj[jm];
 
index f248d3c121680ff695c40af6390343a26905589e..08d8bc640ed1dfc4f79abeb357f63e53b9141fa3 100644 (file)
@@ -103,7 +103,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
     real       *nbfp_i;
     int        n,ci,ci_sh;
     int        ish,ishf;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
 
@@ -208,8 +208,15 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef CALC_ENERGIES
 #ifndef ENERGY_GROUPS
@@ -232,8 +239,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
             }
         }
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
 #ifdef CALC_ENERGIES
             real Vc_sub_self;
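
The new do_LJ/half_LJ logic above folds five LJ/Coulomb combinations into three inner loops. Spelled out as a plain dispatch (a sketch of the mapping, not kernel code):

    /* The five LJ/C combinations and the three inner loops serving them. */
    typedef enum { LOOP_HALF_LJ_C, LOOP_LJ_C, LOOP_LJ_ONLY } inner_loop_t;

    static inner_loop_t pick_inner_loop(int do_LJ, int do_coul, int half_LJ_ci)
    {
        /* half-LJ + C and no-LJ + C share one loop */
        if (do_coul && (half_LJ_ci || !do_LJ))
        {
            return LOOP_HALF_LJ_C;
        }
        if (do_coul)
        {
            return LOOP_LJ_C;   /* full-LJ + C */
        }
        return LOOP_LJ_ONLY;    /* full-LJ or half-LJ without Coulomb */
    }
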
index cab66c3e346310461148ab9489e42a5cde9eab5b..fa50cbeb4b7e6346ec0541bf142a6028a6fd4eac 100644 (file)
 #ifdef CALC_LJ
 
 #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
-            load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0);
+            load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE0,c12_SSE0);
 #ifndef HALF_LJ
-            load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2);
+            load_lj_pair_params2(nbfp2,nbfp3,type,aj,c6_SSE2,c12_SSE2);
 #endif
 #endif /* not defined any LJ rule */
 
index faa445efbfb1fb2ef9d67becdbc88ed1a9b46de2..f43c9e5eac877ef5ec67ffca6559ef2889d38370 100644 (file)
@@ -35,7 +35,7 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+/* GMX_MM256_HERE should be set before including this file */
 #include "gmx_simd_macros.h"
 
 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
 #define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
 
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE     4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#if defined GMX_MM256_HERE
 #define STRIDE     4
 #endif 
 
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
-#else
-/* SSE double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
-#endif
-#endif
-
 #ifdef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 2x(4+4) kernel */
 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
 #define TAB_FDV0
 #else
-/* AVX double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+#error "unsupported kernel configuration"
 #endif
 #endif
 
@@ -167,7 +152,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     int        nbfp_stride;
     int        n,ci,ci_sh;
     int        ish,ish3;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        sci,scix,sciy,sciz,sci2;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
@@ -203,15 +188,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
     gmx_mm_pr  mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
 
-    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
-    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
-
-#ifndef GMX_MM256_HERE
-    __m128i    zeroi_SSE = _mm_setzero_si128();
+    gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+    gmx_mm_pr diag_SSE0,diag_SSE2;
+#else
+    gmx_mm_pr diag0_SSE0,diag0_SSE2;
+    gmx_mm_pr diag1_SSE0,diag1_SSE2;
 #endif
-#ifdef GMX_X86_SSE4_1
+
     gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
-#endif
 
     gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
     gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
@@ -229,8 +214,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     const real *tab_coul_V;
 #endif
 #ifdef GMX_MM256_HERE
-    int        ti0_array[2*UNROLLJ-1],*ti0;
-    int        ti2_array[2*UNROLLJ-1],*ti2;
+    int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+    int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
 #endif
 #ifdef CALC_ENERGIES
     gmx_mm_pr  mhalfsp_SSE;
@@ -308,11 +293,34 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     nbfp_stride = NBFP_STRIDE;
 #endif
 
+    /* Load j-i for the first i */
+    diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
+    /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+    diag_SSE0    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE2    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if 2*UNROLLI == UNROLLJ
+    diag0_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag0_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag1_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag1_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+#endif
+#endif
+
 #ifdef CALC_COUL_TAB
 #ifdef GMX_MM256_HERE
-    /* Generate aligned table pointers */
-    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+    /* Generate aligned table index pointers */
+    ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 #endif
 
     invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
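
These aligned table-index pointers use the standard round-up-and-mask idiom over an oversized local array; note that the mask now correctly uses sizeof(int), since the ti arrays hold integer indices, not reals. The idiom in isolation (a generic sketch, width assumed):

    #include <stdint.h>

    #define SIMD_WIDTH 8   /* assumed: 8 ints for a 256-bit single-precision build */

    /* Round raw up to the next SIMD_WIDTH*sizeof(int) boundary. The backing
     * array needs 2*SIMD_WIDTH-1 elements so that SIMD_WIDTH valid slots
     * always remain after rounding. */
    static int *align_ti(int *raw)
    {
        uintptr_t a = (uintptr_t)(raw + SIMD_WIDTH - 1);
        return (int *)(a & ~(uintptr_t)(SIMD_WIDTH * sizeof(int) - 1));
    }
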
@@ -407,7 +415,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
     egps_jshift  = 2*nbat->neg_2log;
     egps_jmask   = (1<<egps_jshift) - 1;
     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
-    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    /* Major division is over i-particle energy groups; determine the stride */
     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 #endif
 
@@ -420,9 +428,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
 
         ish              = (nbln->shift & NBNXN_CI_SHIFT);
         ish3             = ish*3;
-        cjind0           = nbln->cj_ind_start;      
-        cjind1           = nbln->cj_ind_end;    
-        /* Currently only works super-cells equal to sub-cells */
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
@@ -441,8 +448,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
         sci             += (ci & 1)*(STRIDE>>1);
 #endif
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef ENERGY_GROUPS
         egps_i = nbat->energrp[ci];
@@ -513,8 +527,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
         iz_SSE0          = gmx_add_pr(gmx_load2_hpr(x+sciz)  ,shZ_SSE);
         iz_SSE2          = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
             gmx_mm_pr facel_SSE;
 
index 1676f1f43dc38c85901a37d8744d2afe45bdfee7..602922c9457fc77596061eb896a2f7035846d95a 100644 (file)
             ajz           = ajy + STRIDE;
 
 #ifdef CHECK_EXCLS
-#ifndef GMX_MM256_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
             {
                 /* Load integer interaction mask */
                 __m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
 
-                /* The is no unequal sse instruction, so we need a not here */
                 int_SSE0  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask0),zeroi_SSE));
                 int_SSE1  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask1),zeroi_SSE));
                 int_SSE2  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask2),zeroi_SSE));
                 int_SSE3  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask3),zeroi_SSE));
             }
-#else
+#endif
+#if defined GMX_X86_SSE2 && defined GMX_MM256_HERE
             {
 #ifndef GMX_DOUBLE
                 /* Load integer interaction mask */
index 1ab915deaecc3e5670ac0de2fffbd6b7da69bf78..db8602c7c668ac393ed077bd1b1cb919c6e72618 100644 (file)
 
 #ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
 #define SUM_SIMD(x) SUM_SIMD4(x)
 #define TAB_FDV0
 #else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
 #define SUM_SIMD(x) (x[0]+x[1])
 #endif
 #endif
 
 #ifdef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
 #define TAB_FDV0
 #else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
 #define SUM_SIMD(x) SUM_SIMD4(x)
 #endif
 #endif
@@ -167,7 +167,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     int        nbfp_stride;
     int        n,ci,ci_sh;
     int        ish,ish3;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
     int        sci,scix,sciy,sciz,sci2;
     int        cjind0,cjind1,cjind;
     int        ip,jp;
@@ -203,7 +203,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
 #endif
 
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
 #ifndef GMX_DOUBLE
     __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
     __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
@@ -216,7 +216,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
     __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
 #endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
     /* AVX: use floating point masks, as there are no integer instructions */
 #ifndef GMX_DOUBLE
     gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
@@ -230,46 +231,18 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 #endif
 #endif
 
-#ifndef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
-    __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
-    __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    __m128     diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128     diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
+    gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+    gmx_mm_pr diag_SSE0,diag_SSE1,diag_SSE2,diag_SSE3;
 #else
-    __m128d    diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    __m128d    diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128d    diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128d    diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128d    diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
-    __m128d    diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
-    __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
-#else /* GMX_MM256_HERE */
-#ifndef GMX_DOUBLE
-    gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
-    gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#else
-    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
+    gmx_mm_pr diag0_SSE0,diag0_SSE1,diag0_SSE2,diag0_SSE3;
+    gmx_mm_pr diag1_SSE0,diag1_SSE1,diag1_SSE2,diag1_SSE3;
 #endif
 
-#ifndef GMX_MM256_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
     __m128i    zeroi_SSE = _mm_setzero_si128();
 #endif
-#ifdef GMX_X86_SSE4_1
     gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
-#endif
 
     gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
     gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
@@ -289,10 +262,10 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     const real *tab_coul_V;
 #endif
 #ifdef GMX_MM256_HERE
-    int        ti0_array[2*UNROLLJ-1],*ti0;
-    int        ti1_array[2*UNROLLJ-1],*ti1;
-    int        ti2_array[2*UNROLLJ-1],*ti2;
-    int        ti3_array[2*UNROLLJ-1],*ti3;
+    int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+    int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1],*ti1;
+    int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
+    int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1],*ti3;
 #endif
 #ifdef CALC_ENERGIES
     gmx_mm_pr  mhalfsp_SSE;
@@ -374,13 +347,50 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     nbfp_stride = NBFP_STRIDE;
 #endif
 
+    /* Load j-i for the first i */
+    diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+    /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+    diag_SSE0    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE1    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE2    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE3    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
+    diag0_SSE0   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag0_SSE1   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag0_SSE2   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag0_SSE3   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+
+#if UNROLLI == 2*UNROLLJ
+    /* Load j-i for the second half of the j-cluster */
+    diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+#endif
+
+    diag1_SSE0   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag1_SSE1   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag1_SSE2   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag1_SSE3   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#endif
+#endif
+
 #ifdef CALC_COUL_TAB
 #ifdef GMX_MM256_HERE
-    /* Generate aligned table pointers */
-    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
-    ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+    /* Generate aligned table index pointers */
+    ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+    ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 #endif
 
     invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
@@ -475,7 +485,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
     egps_jshift  = 2*nbat->neg_2log;
     egps_jmask   = (1<<egps_jshift) - 1;
     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
-    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    /* Major division is over i-particle energy groups; determine the stride */
     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 #endif
 
@@ -488,9 +498,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
 
         ish              = (nbln->shift & NBNXN_CI_SHIFT);
         ish3             = ish*3;
-        cjind0           = nbln->cj_ind_start;      
-        cjind1           = nbln->cj_ind_end;    
-        /* Currently only works super-cells equal to sub-cells */
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
         ci               = nbln->ci;
         ci_sh            = (ish == CENTRAL ? ci : -1);
 
@@ -509,8 +518,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
         sci             += (ci & 1)*(STRIDE>>1);
 #endif
 
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 
 #ifdef ENERGY_GROUPS
         egps_i = nbat->energrp[ci];
@@ -585,8 +601,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
         iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
         iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
 
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
         {
             iq_SSE0      = gmx_set1_pr(facel*q[sci]);
             iq_SSE1      = gmx_set1_pr(facel*q[sci+1]);
index 45ab2aedcc9205f3d7d86d41cb3fbef186d09ef2..4e904dc61d7687ea18c1df9d8a1f63e589709fee 100644 (file)
 #ifndef _nbnxn_kernel_sse_utils_h_
 #define _nbnxn_kernel_sse_utils_h_
 
-/* This files contains all functions/macros for the SSE/AVX kernels
- * which have explicit dependencies on the j-size / SIMD-width, which
- * can be 2 (SSE-double), 4 (SSE-single,AVX-double) or 8 (AVX-single).
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
  * The functionality which depends on the j-cluster size is:
  *   LJ-parameter lookup
  *   force table lookup
  *   energy group pair energy storage
  */
 
+#ifdef GMX_X86_SSE2
+
+/* Transpose 2 double precision registers */
 #define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1)                      \
 {                                                                       \
-    out0 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(0,0));                   \
-    out1 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(1,1));                   \
+    out0 = _mm_unpacklo_pd(in0,in1);                                    \
+    out1 = _mm_unpackhi_pd(in0,in1);                                    \
 }
 
 #if defined GMX_MM128_HERE || !defined GMX_DOUBLE
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
 #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1)    \
 {                                                                       \
     __m128 _c01,_c23;                                                   \
-    _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(1,0,1,0));                \
-    _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(1,0,1,0));                \
+    _c01 = _mm_movelh_ps(in0,in1);                                      \
+    _c23 = _mm_movelh_ps(in2,in3);                                      \
     out0 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0));              \
     out1 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(3,1,3,1));              \
 }
 #else
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
 #define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1)    \
 {                                                                       \
     __m256d _c01,_c23;                                                  \
@@ -72,6 +76,7 @@
 }
 #endif
 
+/* Collect element 2 of the 4 inputs to out */
 #define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0,in1,in2,in3,out)           \
 {                                                                       \
     __m128 _c01,_c23;                                                   \
 
 #ifndef GMX_MM256_HERE
 #ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out)                   \
 {                                                                       \
-    _MM_TRANSPOSE4_PS(i_SSE0,i_SSE1,i_SSE2,i_SSE3);                     \
-    i_SSE0 = _mm_add_ps(i_SSE0,i_SSE1);                                 \
-    i_SSE2 = _mm_add_ps(i_SSE2,i_SSE3);                                 \
-    o_SSE  = _mm_add_ps(i_SSE0,i_SSE2);                                 \
+    _MM_TRANSPOSE4_PS(in0,in1,in2,in3);                                 \
+    in0 = _mm_add_ps(in0,in1);                                          \
+    in2 = _mm_add_ps(in2,in3);                                          \
+    out  = _mm_add_ps(in0,in2);                                         \
 }
 #else
-#define GMX_MM_TRANSPOSE_SUM2_PD(i_SSE0,i_SSE1,o_SSE)                   \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM2_PD(in0,in1,out)                           \
 {                                                                       \
-    GMX_MM_TRANSPOSE2_PD(i_SSE0,i_SSE1);                                \
-    o_SSE  = _mm_add_pd(i_SSE0,i_SSE1);                                 \
+    GMX_MM_TRANSPOSE2_PD(in0,in1);                                      \
+    out  = _mm_add_pd(in0,in1);                                         \
 }
 #endif
 #else
 #ifndef GMX_DOUBLE
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out)                   \
 {                                                                       \
-    i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE1);                             \
-    i_SSE2 = _mm256_hadd_ps(i_SSE2,i_SSE3);                             \
-    i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
-    o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
+    in0 = _mm256_hadd_ps(in0,in1);                                      \
+    in2 = _mm256_hadd_ps(in2,in3);                                      \
+    in1 = _mm256_hadd_ps(in0,in2);                                      \
+    out = _mm_add_ps(_mm256_castps256_ps128(in1),_mm256_extractf128_ps(in1,1)); \
 }
-#define GMX_MM_TRANSPOSE_SUM4H_PR(i_SSE0,i_SSE2,o_SSE)                  \
+/* Sum the elements of the halves of each input register and store sums in out */
+#define GMX_MM_TRANSPOSE_SUM4H_PR(in0,in2,out)                          \
 {                                                                       \
-    i_SSE0 = _mm256_hadd_ps(i_SSE0,_mm256_setzero_ps());                \
-    i_SSE2 = _mm256_hadd_ps(i_SSE2,_mm256_setzero_ps());                \
-    i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
-    i_SSE2 = _mm256_permute_ps(i_SSE0,0b10110001);                      \
-    o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE0),_mm256_extractf128_ps(i_SSE2,1)); \
+    in0 = _mm256_hadd_ps(in0,_mm256_setzero_ps());                      \
+    in2 = _mm256_hadd_ps(in2,_mm256_setzero_ps());                      \
+    in0 = _mm256_hadd_ps(in0,in2);                                      \
+    in2 = _mm256_permute_ps(in0,_MM_SHUFFLE(2,3,0,1));                  \
+    out = _mm_add_ps(_mm256_castps256_ps128(in0),_mm256_extractf128_ps(in2,1)); \
 }
 #else
-#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
+/* Sum the elements within each input register and store the sums in out */
+#define GMX_MM_TRANSPOSE_SUM4_PR(in0,in1,in2,in3,out)                   \
 {                                                                       \
-    i_SSE0 = _mm256_hadd_pd(i_SSE0,i_SSE1);                             \
-    i_SSE2 = _mm256_hadd_pd(i_SSE2,i_SSE3);                             \
-    o_SSE  = _mm256_add_pd(_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x20),_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x31)); \
+    in0 = _mm256_hadd_pd(in0,in1);                                      \
+    in2 = _mm256_hadd_pd(in2,in3);                                      \
+    out = _mm256_add_pd(_mm256_permute2f128_pd(in0,in2,0x20),_mm256_permute2f128_pd(in0,in2,0x31)); \
 }
 #endif
 #endif
@@ -136,24 +146,24 @@ gmx_mm128_invsqrt_ps_single(__m128 x)
     return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
 }
 
-/* Do 2/4 double precision invsqrt operations.
- * Doing the SSE rsqrt and the first Newton Raphson iteration
+/* Do 2 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
  * in single precision gives full double precision accuracy.
- * The speed is more than twice as fast as two gmx_mm_invsqrt_pd calls.
+ * The speed is more than double that of two gmx_mm_invsqrt_pd calls.
  */
-#define GMX_MM128_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1)              \
+#define GMX_MM128_INVSQRT2_PD(in0,in1,out0,out1)                        \
 {                                                                       \
     const __m128d half  = _mm_set1_pd(0.5);                             \
     const __m128d three = _mm_set1_pd(3.0);                             \
-    __m128  s_SSE,ir_SSE;                                               \
+    __m128  s,ir;                                                       \
     __m128d lu0,lu1;                                                    \
                                                                         \
-    s_SSE  = _mm_movelh_ps(_mm_cvtpd_ps(i_SSE0),_mm_cvtpd_ps(i_SSE1));  \
-    ir_SSE = gmx_mm128_invsqrt_ps_single(s_SSE);                        \
-    lu0    = _mm_cvtps_pd(ir_SSE);                                      \
-    lu1    = _mm_cvtps_pd(_mm_movehl_ps(ir_SSE,ir_SSE));                \
-    o_SSE0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
-    o_SSE1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+    s    = _mm_movelh_ps(_mm_cvtpd_ps(in0),_mm_cvtpd_ps(in1));          \
+    ir   = gmx_mm128_invsqrt_ps_single(s);                              \
+    lu0  = _mm_cvtps_pd(ir);                                            \
+    lu1  = _mm_cvtps_pd(_mm_movehl_ps(ir,ir));                          \
+    out0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),in0)),lu0)); \
+    out1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),in1)),lu1)); \
 }
 
 #define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
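
The refinement behind this macro is plain Newton-Raphson for y ~ 1/sqrt(x): each step y' = 0.5*y*(3 - x*y*y) roughly doubles the number of correct bits, so a single-precision estimate plus one double-precision step lands close to full double accuracy. A scalar model, with 1.0f/sqrtf standing in for the hardware rsqrt estimate and its single-precision refinement:

    #include <math.h>

    /* Scalar model of the precision scheme in GMX_MM128_INVSQRT2_PD. */
    static double invsqrt_refined(double x)
    {
        double y = (double)(1.0f / sqrtf((float)x)); /* ~24 correct bits */

        y = 0.5 * y * (3.0 - x * y * y);  /* one double NR step: ~48 bits */
        return y;
    }
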
@@ -173,19 +183,23 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
     return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
 }
 
-#define GMX_MM256_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1)              \
+/* Do 4 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
+ * in single precision gives full double precision accuracy.
+ */
+#define GMX_MM256_INVSQRT2_PD(in0,in1,out0,out1)                        \
 {                                                                       \
     const __m256d half  = _mm256_set1_pd(0.5);                          \
     const __m256d three = _mm256_set1_pd(3.0);                          \
-    __m256  s_SSE,ir_SSE;                                               \
+    __m256  s,ir;                                                       \
     __m256d lu0,lu1;                                                    \
                                                                         \
-    s_SSE  = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(i_SSE0)),_mm256_cvtpd_ps(i_SSE1),1); \
-    ir_SSE = gmx_mm256_invsqrt_ps_single(s_SSE);                        \
-    lu0    = _mm256_cvtps_pd(_mm256_castps256_ps128(ir_SSE));           \
-    lu1    = _mm256_cvtps_pd(_mm256_extractf128_ps(ir_SSE,1));          \
-    o_SSE0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
-    o_SSE1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+    s    = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(in0)),_mm256_cvtpd_ps(in1),1); \
+    ir   = gmx_mm256_invsqrt_ps_single(s);                              \
+    lu0  = _mm256_cvtps_pd(_mm256_castps256_ps128(ir));                 \
+    lu1  = _mm256_cvtps_pd(_mm256_extractf128_ps(ir,1));                \
+    out0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),in0)),lu0)); \
+    out1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),in1)),lu1)); \
 }
 
 #define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
@@ -236,18 +250,23 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
     GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
 }
 
-#define load_lj_pair_params2(nbfp,type,aj,c6_SSE,c12_SSE)                \
+#define load_lj_pair_params2(nbfp0,nbfp1,type,aj,c6_SSE,c12_SSE)        \
 {                                                                       \
-    __m128 clj_SSE[2*UNROLLJ],c6t_SSE[2],c12t_SSE[2];                     \
+    __m128 clj_SSE0[UNROLLJ],clj_SSE1[UNROLLJ],c6t_SSE[2],c12t_SSE[2];  \
     int p;                                                              \
                                                                         \
-    for(p=0; p<2*UNROLLJ; p++)                                            \
+    for(p=0; p<UNROLLJ; p++)                                            \
     {                                                                   \
         /* Here we load 4 aligned floats, but we need just 2 */         \
-        clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);          \
+        clj_SSE0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE);        \
     }                                                                   \
-    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
-    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
+    for(p=0; p<UNROLLJ; p++)                                            \
+    {                                                                   \
+        /* Here we load 4 aligned floats, but we need just 2 */         \
+        clj_SSE1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE);        \
+    }                                                                   \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE0[0],clj_SSE0[1],clj_SSE0[2],clj_SSE0[3],c6t_SSE[0],c12t_SSE[0]); \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE1[0],clj_SSE1[1],clj_SSE1[2],clj_SSE1[3],c6t_SSE[1],c12t_SSE[1]); \
                                                                         \
     GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE);                     \
     GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
@@ -298,7 +317,9 @@ gmx_mm256_invsqrt_ps_single(__m256 x)
  * But AMD CPUs perform significantly worse with gcc than with icc.
  * Performance is improved a bit by using the extract function UNROLLJ times,
  * instead of doing an _mm_store_si128 for every i-particle.
- * With AVX this significantly deteriorates performance (8 extracts iso 4).
+ * This is only faster when we use FDV0 formatted tables, where we also need
+ * to multiply the index by 4, which can be done by a SIMD bit shift.
+ * With single precision AVX, 8 extracts are much slower than 1 store.
  * Because of this, the load_table_f macro always takes the ti parameter,
  * but it is only used with AVX.
  */
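As a sketch of the index scaling mentioned above (an assumption for illustration; ti here is an SSE2 integer vector of table indices): with FDV0-format tables each entry holds 4 floats, so the scaling by 4 is a left shift rather than a multiply:

    __m128i ti4 = _mm_slli_epi32(ti, 2);  /* ti*4: element offset for 4-float FDV0 entries */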
@@ -546,4 +567,6 @@ static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
 }
 #endif
 
+#endif /* GMX_X86_SSE2 */
+
 #endif /* _nbnxn_kernel_sse_utils_h_ */
index c0f08bd5614733664b035f7cce225a61265f2cac..5b16fc06a7326e90b870e86a9aee0ee89260229b 100644 (file)
 #define BBU_Z  6
 
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
+/* We use SSE or AVX-128bit for bounding box calculations */
 
 #ifndef GMX_DOUBLE
+/* Single precision BBs + coordinates, we can also load coordinates using SSE */
 #define NBNXN_SEARCH_SSE_SINGLE
 #endif
 
 /* Include basic SSE2 stuff */
 #include <emmintrin.h>
 
-#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
-#define NBNXN_8BB_SSE
+#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+#define NBNXN_PBB_SSE
 #endif
 
 /* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
  * Here AVX-256 turns out to be slightly slower than AVX-128.
  */
-#define STRIDE_8BB        4
-#define STRIDE_8BB_2LOG   2
+#define STRIDE_PBB        4
+#define STRIDE_PBB_2LOG   2
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
 
 #ifdef GMX_NBNXN_SIMD
 
 #define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
 
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
 #define NBNXN_BBXXXX
 /* Size of bounding box corners quadruplet */
-#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
+#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
 #endif
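To make the quadruplet layout behind NBNXN_BBXXXX concrete (a sketch, with STRIDE_PBB = 4): the corners of four bounding boxes are stored component-wise, so component d of box i within a pack sits at a fixed stride, and six 4-wide loads fetch four whole boxes:

    /* bb[0*STRIDE_PBB + i] = x_lo of box i     bb[3*STRIDE_PBB + i] = x_hi of box i
     * bb[1*STRIDE_PBB + i] = y_lo of box i     bb[4*STRIDE_PBB + i] = y_hi of box i
     * bb[2*STRIDE_PBB + i] = z_lo of box i     bb[5*STRIDE_PBB + i] = z_hi of box i
     */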
 
 /* We shift the i-particles backward for PBC.
@@ -418,6 +421,7 @@ static real grid_atom_density(int n,rvec corner0,rvec corner1)
 
 static int set_grid_size_xy(const nbnxn_search_t nbs,
                             nbnxn_grid_t *grid,
+                            int dd_zone,
                             int n,rvec corner0,rvec corner1,
                             real atom_density,
                             int XFormat)
@@ -464,6 +468,23 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
         grid->ncy = 1;
     }
 
+    grid->sx = size[XX]/grid->ncx;
+    grid->sy = size[YY]/grid->ncy;
+    grid->inv_sx = 1/grid->sx;
+    grid->inv_sy = 1/grid->sy;
+
+    if (dd_zone > 0)
+    {
+        /* This is a non-home zone, add an extra row of cells
+         * for particles communicated for bonded interactions.
+         * These can be beyond the cut-off. It doesn't matter where
+         * they end up on the grid, but for performance it's better
+         * if they don't end up in cells that can be within cut-off range.
+         */
+        grid->ncx++;
+        grid->ncy++;
+    }
+
     /* We need one additional cell entry for particles moved by DD */
     if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
     {
@@ -497,8 +518,8 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
         grid->nc_nalloc = over_alloc_large(nc_max);
         srenew(grid->nsubc,grid->nc_nalloc);
         srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
-#ifdef NBNXN_8BB_SSE
-        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
+#ifdef NBNXN_PBB_SSE
+        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
 #else
         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
 #endif
@@ -526,23 +547,39 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
 
     copy_rvec(corner0,grid->c0);
     copy_rvec(corner1,grid->c1);
-    grid->sx = size[XX]/grid->ncx;
-    grid->sy = size[YY]/grid->ncy;
-    grid->inv_sx = 1/grid->sx;
-    grid->inv_sy = 1/grid->sy;
 
     return nc_max;
 }
 
-#define SORT_GRID_OVERSIZE 2
+/* We need to sort particles in grid columns on z-coordinate.
+ * As particles are very often distributed homogeneously, we use a sorting
+ * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ * by a factor, cast to an int and try to store in that hole. If the hole
+ * is full, we move this or another particle. A second pass is needed to make
+ * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
+ * 4 is the optimal value for homogeneous particle distribution and allows
+ * for an O(#particles) sort up to distributions where all particles are
+ * concentrated in 1/4 of the space. No NlogN fallback is implemented,
+ * as it can be expensive to detect inhomogeneous particle distributions.
+ * SGSF is the maximum ratio of holes used: in the worst case all particles
+ * end up in the last hole and we need #particles extra holes at the end.
+ */
+#define SORT_GRID_OVERSIZE 4
 #define SGSF (SORT_GRID_OVERSIZE + 1)
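
A standalone sketch of this pigeonhole-style scheme (illustrative only, much simplified from sort_atoms below: ascending order only, plain linear probing on collisions, so it does not preserve ordering under collisions the way the real code does; the hole array follows the same all -1 convention):

    static void pigeonhole_sort_sketch(int *a, int n, const real *z,
                                       real h0, real invh, int *hole, int nhole)
    {
        int i, zi, c = 0;

        for (i = 0; i < n; i++)
        {
            zi = (int)((z[a[i]] - h0)*invh); /* map the coordinate to a hole */
            while (hole[zi] >= 0)
            {
                zi++;                        /* hole occupied: probe the next one */
            }
            hole[zi] = a[i];
        }
        for (zi = 0; zi < nhole; zi++)       /* second pass: compact back into a */
        {
            if (hole[zi] >= 0)
            {
                a[c++] = hole[zi];
                hole[zi] = -1;               /* restore the all -1 invariant */
            }
        }
    }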
 
+/* Sort particle index a on coordinates x along dim.
+ * Backwards tells if we want decreasing instead of increasing coordinates.
+ * h0 is the minimum of the coordinate range.
+ * invh is the inverse hole spacing.
+ * nsort, the theoretical hole limit, is only used for debugging.
+ * sort is the sorting work array.
+ */
 static void sort_atoms(int dim,gmx_bool Backwards,
                        int *a,int n,rvec *x,
                        real h0,real invh,int nsort,int *sort)
 {
     int i,c;
-    int zi,zim;
+    int zi,zim,zi_min,zi_max;
     int cp,tmp;
 
     if (n <= 1)
@@ -551,13 +588,10 @@ static void sort_atoms(int dim,gmx_bool Backwards,
         return;
     }
 
-    /* For small oversize factors clearing the whole area is fastest.
-     * For large oversize we should clear the used elements after use.
-     */
-    for(i=0; i<nsort; i++)
-    {
-        sort[i] = -1;
-    }
+    /* Determine the index range used, so we can limit it for the second pass */
+    zi_min = INT_MAX;
+    zi_max = -1;
+
     /* Sort the particles using a simple index sort */
     for(i=0; i<n; i++)
     {
@@ -582,6 +616,8 @@ static void sort_atoms(int dim,gmx_bool Backwards,
         if (sort[zi] < 0)
         {
             sort[zi] = a[i];
+            zi_min = min(zi_min,zi);
+            zi_max = max(zi_max,zi);
         }
         else
         {
@@ -611,8 +647,10 @@ static void sort_atoms(int dim,gmx_bool Backwards,
                     zim++;
                 }
                 sort[zim] = cp;
+                zi_max = max(zi_max,zim);
             }
             sort[zi] = a[i];
+            zi_max = max(zi_max,zi);
         }
     }
 
@@ -624,16 +662,18 @@ static void sort_atoms(int dim,gmx_bool Backwards,
             if (sort[zi] >= 0)
             {
                 a[c++] = sort[zi];
+                sort[zi] = -1;
             }
         }
     }
     else
     {
-        for(zi=nsort-1; zi>=0; zi--)
+        for(zi=zi_max; zi>=zi_min; zi--)
         {
             if (sort[zi] >= 0)
             {
                 a[c++] = sort[zi];
+                sort[zi] = -1;
             }
         }
     }
@@ -744,7 +784,7 @@ static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
     bb[BBU_Z] = R2F_U(zh);
 }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 
 /* Packed coordinates, bb order xyz0 */
 static void calc_bounding_box_x_x4_halves(int na,const real *x,
@@ -796,15 +836,15 @@ static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
         i += stride;
     }
     /* Note: possible double to float conversion here */
-    bb[0*STRIDE_8BB] = R2F_D(xl);
-    bb[1*STRIDE_8BB] = R2F_D(yl);
-    bb[2*STRIDE_8BB] = R2F_D(zl);
-    bb[3*STRIDE_8BB] = R2F_U(xh);
-    bb[4*STRIDE_8BB] = R2F_U(yh);
-    bb[5*STRIDE_8BB] = R2F_U(zh);
+    bb[0*STRIDE_PBB] = R2F_D(xl);
+    bb[1*STRIDE_PBB] = R2F_D(yl);
+    bb[2*STRIDE_PBB] = R2F_D(zl);
+    bb[3*STRIDE_PBB] = R2F_U(xh);
+    bb[4*STRIDE_PBB] = R2F_U(yh);
+    bb[5*STRIDE_PBB] = R2F_U(zh);
 }
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
 
 #ifdef NBNXN_SEARCH_SSE_SINGLE
 
@@ -837,17 +877,17 @@ static void calc_bounding_box_xxxx_sse(int na,const float *x,
 {
     calc_bounding_box_sse(na,x,bb_work);
 
-    bb[0*STRIDE_8BB] = bb_work[BBL_X];
-    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
-    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
-    bb[3*STRIDE_8BB] = bb_work[BBU_X];
-    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
-    bb[5*STRIDE_8BB] = bb_work[BBU_Z];
+    bb[0*STRIDE_PBB] = bb_work[BBL_X];
+    bb[1*STRIDE_PBB] = bb_work[BBL_Y];
+    bb[2*STRIDE_PBB] = bb_work[BBL_Z];
+    bb[3*STRIDE_PBB] = bb_work[BBU_X];
+    bb[4*STRIDE_PBB] = bb_work[BBU_Y];
+    bb[5*STRIDE_PBB] = bb_work[BBU_Z];
 }
 
 #endif /* NBNXN_SEARCH_SSE_SINGLE */
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 
 /* Combines pairs of consecutive bounding boxes */
 static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
@@ -926,18 +966,18 @@ static void print_bbsizes_supersub(FILE *fp,
     for(c=0; c<grid->nc; c++)
     {
 #ifdef NBNXN_BBXXXX
-        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
+        for(s=0; s<grid->nsubc[c]; s+=STRIDE_PBB)
         {
             int cs_w,i,d;
 
-            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
-            for(i=0; i<STRIDE_8BB; i++)
+            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+            for(i=0; i<STRIDE_PBB; i++)
             {
                 for(d=0; d<DIM; d++)
                 {
                     ba[d] +=
-                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
-                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
+                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
                 }
             }
         }
@@ -1081,7 +1121,7 @@ void fill_cell(const nbnxn_search_t nbs,
         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
         bb_ptr = grid->bb + offset;
 
-#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
+#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
         if (2*grid->na_cj == grid->na_c)
         {
             calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
@@ -1109,8 +1149,8 @@ void fill_cell(const nbnxn_search_t nbs,
                              */
         bb_ptr =
             grid->bb +
-            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
-            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
+            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
 
 #ifdef NBNXN_SEARCH_SSE_SINGLE
         if (nbat->XFormat == nbatXYZQ)
@@ -1128,9 +1168,9 @@ void fill_cell(const nbnxn_search_t nbs,
         {
             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                     sx,sy,sz,
-                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
-                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
-                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
+                    bb_ptr[0*STRIDE_PBB],bb_ptr[3*STRIDE_PBB],
+                    bb_ptr[1*STRIDE_PBB],bb_ptr[4*STRIDE_PBB],
+                    bb_ptr[2*STRIDE_PBB],bb_ptr[5*STRIDE_PBB]);
         }
     }
 #endif
@@ -1353,7 +1393,8 @@ static void sort_columns_supersub(const nbnxn_search_t nbs,
 /* Determine in which grid column atoms should go */
 static void calc_column_indices(nbnxn_grid_t *grid,
                                 int a0,int a1,
-                                rvec *x,const int *move,
+                                rvec *x,
+                                int dd_zone,const int *move,
                                 int thread,int nthread,
                                 int *cell,
                                 int *cxy_na)
@@ -1369,50 +1410,78 @@ static void calc_column_indices(nbnxn_grid_t *grid,
 
     n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
     n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
-    for(i=n0; i<n1; i++)
+    if (dd_zone == 0)
     {
-        if (move == NULL || move[i] >= 0)
+        /* Home zone */
+        for(i=n0; i<n1; i++)
         {
-            /* We need to be careful with rounding,
-             * particles might be a few bits outside the local box.
-             * The int cast takes care of the lower bound,
-             * we need to explicitly take care of the upper bound.
-             */
-            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
-            if (cx == grid->ncx)
-            {
-                cx = grid->ncx - 1;
-            }
-            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
-            if (cy == grid->ncy)
+            if (move == NULL || move[i] >= 0)
             {
-                cy = grid->ncy - 1;
-            }
-            /* For the moment cell contains only the, grid local,
-             * x and y indices, not z.
-             */
-            cell[i] = cx*grid->ncy + cy;
+                /* We need to be careful with rounding,
+                 * particles might be a few bits outside the local zone.
+                 * The int cast takes care of the lower bound;
+                 * we will explicitly take care of the upper bound.
+                 */
+                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 
 #ifdef DEBUG_NBNXN_GRIDDING
-            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
+                if (cx < 0 || cx >= grid->ncx ||
+                    cy < 0 || cy >= grid->ncy)
+                {
+                    gmx_fatal(FARGS,
+                              "grid cell cx %d cy %d out of range (max %d %d)\n"
+                              "atom %f %f %f, grid->c0 %f %f",
+                              cx,cy,grid->ncx,grid->ncy,
+                              x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+                }
+#endif
+                /* Take care of potential rounding issues */
+                cx = min(cx,grid->ncx - 1);
+                cy = min(cy,grid->ncy - 1);
+
+                /* For the moment cell will contain only the grid-local
+                 * x and y indices, not z.
+                 */
+                cell[i] = cx*grid->ncy + cy;
+            }
+            else
             {
-                gmx_fatal(FARGS,
-                          "grid cell cx %d cy %d out of range (max %d %d)\n"
-                          "atom %f %f %f, grid->c0 %f %f",
-                          cx,cy,grid->ncx,grid->ncy,
-                          x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+                /* Put this moved particle after the end of the grid,
+                 * so we can process it later without using conditionals.
+                 */
+                cell[i] = grid->ncx*grid->ncy;
             }
-#endif
+
+            cxy_na[cell[i]]++;
         }
-        else
+    }
+    else
+    {
+        /* Non-home zone */
+        for(i=n0; i<n1; i++)
         {
-            /* Put this moved particle after the end of the grid,
-             * so we can process it later without using conditionals.
+            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+            /* For non-home zones there could be particles outside
+             * the non-bonded cut-off range, which have been communicated
+             * for bonded interactions only. For the result it doesn't
+             * matter where these end up on the grid. For performance
+             * we put them in an extra row at the border.
              */
-            cell[i] = grid->ncx*grid->ncy;
-        }
+            cx = max(cx,0);
+            cx = min(cx,grid->ncx - 1);
+            cy = max(cy,0);
+            cy = min(cy,grid->ncy - 1);
 
-        cxy_na[cell[i]]++;
+            /* For the moment cell will contain only the grid-local
+             * x and y indices, not z.
+             */
+            cell[i] = cx*grid->ncy + cy;
+
+            cxy_na[cell[i]]++;
+        }
     }
 }
 
@@ -1436,7 +1505,7 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
 #pragma omp parallel for num_threads(nthread) schedule(static)
     for(thread=0; thread<nthread; thread++)
     {
-        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
+        calc_column_indices(grid,a0,a1,x,dd_zone,move,thread,nthread,
                             nbs->cell,nbs->work[thread].cxy_na);
     }
 
@@ -1503,6 +1572,11 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
                 over_alloc_large(ncz_max*grid->na_sc*SGSF);
             srenew(nbs->work[thread].sort_work,
                    nbs->work[thread].sort_work_nalloc);
+            /* When not in use, all elements should be -1 */
+            for(i=0; i<nbs->work[thread].sort_work_nalloc; i++)
+            {
+                nbs->work[thread].sort_work[i] = -1;
+            }
         }
     }
 
@@ -1516,12 +1590,18 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
     }
 
-    /* Set the cell indices for the moved particles */
-    n0 = grid->nc*grid->na_sc;
-    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
-    for(i=n0; i<n1; i++)
+    if (dd_zone == 0)
     {
-        nbs->cell[nbs->a[i]] = i;
+        /* Set the cell indices for the moved particles */
+        n0 = grid->nc*grid->na_sc;
+        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+        for(i=n0; i<n1; i++)
+        {
+            nbs->cell[nbs->a[i]] = i;
+        }
     }
 
     /* Sort the super-cell columns along z into the sub-cells. */
@@ -1544,7 +1624,7 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
         }
     }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
     if (grid->bSimple && nbat->XFormat == nbatX8)
     {
         combine_bounding_box_pairs(grid,grid->bb);
@@ -1666,7 +1746,8 @@ void nbnxn_put_on_grid(nbnxn_search_t nbs,
         nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
     }
 
-    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
+    nc_max_grid = set_grid_size_xy(nbs,grid,
+                                   dd_zone,n-nmoved,corner0,corner1,
                                    nbs->grid[0].atom_density,
                                    nbat->XFormat);
 
@@ -1817,7 +1898,7 @@ void nbnxn_grid_add_simple(nbnxn_search_t nbs,
         }
     }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
     if (grid->bSimple && nbat->XFormat == nbatX8)
     {
         combine_bounding_box_pairs(grid,grid->bb_simple);
@@ -1955,7 +2036,7 @@ static float subc_bb_dist2(int si,const float *bb_i_ci,
     return d2;
 }
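For reference, subc_bb_dist2 and the SSE variants below all compute the standard axis-aligned bounding-box separation; per dimension the gap is zero when the boxes overlap (a scalar sketch of the rule the SSE macro vectorizes):

    /* for each dimension d:
     *     dd  = max(lo_i[d] - hi_j[d], lo_j[d] - hi_i[d], 0);
     *     d2 += dd*dd;
     */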
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 
 /* SSE code for bb distance for bb format xyz0 */
 static float subc_bb_dist2_sse(int na_c,
@@ -2024,12 +2105,12 @@ static float subc_bb_dist2_sse(int na_c,
                                                  \
     shi = si*NNBSBB_D*DIM;                       \
                                                  \
-    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
-    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
-    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
-    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
-    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
-    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
+    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
+    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
+    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
+    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
+    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
+    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
                                                  \
     dx_0 = _mm_sub_ps(xi_l,xj_h);                \
     dy_0 = _mm_sub_ps(yi_l,yj_h);                \
@@ -2071,24 +2152,24 @@ static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 
     zero = _mm_setzero_ps();
 
-    xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
-    yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
-    zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
-    xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
-    yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
-    zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
+    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
 
-    /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
+    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
      * But as we know the number of iterations is 1 or 2, we unroll manually.
      */
     SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
-    if (STRIDE_8BB < nsi)
+    if (STRIDE_PBB < nsi)
     {
-        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
+        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB,bb_i,d2);
     }
 }
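The manual unroll above is equivalent to this loop (a sketch), with nsi known to be at most 2*STRIDE_PBB:

    for (si = 0; si < nsi; si += STRIDE_PBB)
    {
        SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2);
    }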
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
 
 /* Plain C function which determines if any atom pair between two cells
  * is within distance sqrt(rl2).
@@ -2141,13 +2222,13 @@ static gmx_bool subc_in_range_sse8(int na_c,
 
     rc2_SSE   = _mm_set1_ps(rl2);
 
-    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
-    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
-    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
-    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
-    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
-    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
-    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
+    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
 
     /* We loop from the outer to the inner particles to maximize
      * the chance that we find a pair in range quickly and return.
@@ -2233,13 +2314,13 @@ static gmx_bool subc_in_range_sse8(int na_c,
 /* Returns the j sub-cell for index cj_ind */
 static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
 {
-    return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
+    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
 }
 
 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
 {
-    return nbl->cj4[cj_ind>>2].imei[0].imask;
+    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
 }
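A worked example of the indexing above (assuming NBNXN_GPU_JGROUP_SIZE = 4, so NBNXN_GPU_JGROUP_SIZE_2LOG = 2): for cj_ind = 13,

    int group = cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG;    /* 13 >> 2 == 3 */
    int slot  = cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1);    /* 13 &  3 == 1 */

i.e. the second j-cluster slot of the fourth cj4 struct.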
 
 /* Ensures there is enough space for extra exclusion masks */
@@ -2286,7 +2367,7 @@ static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
     /* We can store 4 j-subcell - i-supercell pairs in one struct.
      * Since we round down, we need one extra entry.
      */
-    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
+    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 
     if (ncj4_max > nbl->cj4_nalloc)
     {
@@ -2376,16 +2457,16 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 
     snew(nbl->work,1);
 #ifdef NBNXN_BBXXXX
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX,NBNXN_MEM_ALIGN);
 #else
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,NBNXN_MEM_ALIGN);
 #endif
-    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,NBNXN_MEM_ALIGN);
 #ifdef GMX_NBNXN_SIMD
-    snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
-    snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
+    snew_aligned(nbl->work->x_ci_simd_4xn,1,NBNXN_MEM_ALIGN);
+    snew_aligned(nbl->work->x_ci_simd_2xnn,1,NBNXN_MEM_ALIGN);
 #endif
-    snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
+    snew_aligned(nbl->work->d2,GPU_NSUBCELL,NBNXN_MEM_ALIGN);
 }
 
 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
@@ -2501,7 +2582,7 @@ static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nb
     fprintf(fp,"nbl average j super cell list length %.1f\n",
             0.25*nbl->ncj4/(double)nbl->nsci);
     fprintf(fp,"nbl average i sub cell list length %.1f\n",
-            nbl->nci_tot/(0.25*nbl->ncj4));
+            nbl->nci_tot/((double)nbl->ncj4));
 
     for(si=0; si<=GPU_NSUBCELL; si++)
     {
@@ -2511,7 +2592,7 @@ static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nb
     {
         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
         {
-            for(j=0; j<4; j++)
+            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
             {
                 b = 0;
                 for(si=0; si<GPU_NSUBCELL; si++)
@@ -2628,8 +2709,8 @@ static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
         w = (ej>>2);
         for(ei=ej; ei<nbl->na_ci; ei++)
         {
-            excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
-                ~(1U << (sj_offset*GPU_NSUBCELL+si));
+            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+                ~(1U << (sj_offset*GPU_NSUBCELL + si));
         }
     }
 }
@@ -2840,7 +2921,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 
     for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
     {
-        cj4_ind   = (nbl->work->cj_ind >> 2);
+        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
         cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
         cj4       = &nbl->cj4[cj4_ind];
 
@@ -2863,7 +2944,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 
 #ifdef NBNXN_BBXXXX
         /* Determine all ci1 bb distances in one call with SSE */
-        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
+        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
                                ci1,bb_ci,d2l);
         *ndistc += na_c*2;
 #endif
@@ -2919,7 +3000,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
         {
             /* Avoid using function pointers here, as it's slower */
             if (
-#ifdef NBNXN_8BB_SSE
+#ifdef NBNXN_PBB_SSE
                 !subc_in_range_sse8
 #else
                 !subc_in_range_x
@@ -2957,7 +3038,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
             nbl->nci_tot += npair;
 
             /* Increase the closing index in i super-cell list */
-            nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
+            nbl->sci[nbl->nsci].cj4_ind_end =
+                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
         }
     }
 }
@@ -3015,7 +3097,7 @@ static void set_ci_top_excls(const nbnxn_search_t nbs,
             ndirect++;
         }
     }
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
     else
     {
         while (cj_ind_first + ndirect <= cj_ind_last &&
@@ -3229,22 +3311,25 @@ static void set_sci_top_excls(const nbnxn_search_t nbs,
                         inner_e = ge - se*na_c;
 
 /* Macro for getting the index of atom a within a cluster */
-#define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
+#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
 /* Macro for converting an atom number to a cluster number */
-#define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
+#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+/* Macro for getting the index of an i-atom within a warp */
+#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
 
-                        if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
+                        if (nbl_imask0(nbl,found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
                         {
                             w       = (inner_e >> 2);
 
-                            get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
+                            get_nbl_exclusions_1(nbl,A2CJ4(found),w,&nbl_excl);
 
-                            nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
-                                ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
+                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
                         }
 
-#undef AMODI
-#undef A2CI
+#undef AMODCJ4
+#undef A2CJ4
+#undef AMODWI
                     }
                 }
             }
@@ -3356,13 +3441,17 @@ static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
     {
         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
 
-        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+        /* The counts below are used for non-bonded pair/flop counts
+         * and should therefore match the available kernel setups.
+         */
+        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
         {
-            nbl->work->ncj_hlj += jlen;
+            nbl->work->ncj_noq += jlen;
         }
-        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
         {
-            nbl->work->ncj_noq += jlen;
+            nbl->work->ncj_hlj += jlen;
         }
 
         nbl->nci++;
@@ -3483,7 +3572,7 @@ static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
         /* We can only have complete blocks of 4 j-entries in a list,
          * so round the count up before closing.
          */
-        nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
+        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
         nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 
         nbl->nsci++;
@@ -3543,17 +3632,17 @@ static void set_icell_bb_supersub(const float *bb,int ci,
     int ia,m,i;
 
 #ifdef NBNXN_BBXXXX
-    ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
-    for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
+    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+    for(m=0; m<(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
     {
-        for(i=0; i<STRIDE_8BB; i++)
+        for(i=0; i<STRIDE_PBB; i++)
         {
-            bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
-            bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
-            bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
-            bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
-            bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
-            bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
+            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
         }
     }
 #else
@@ -3610,7 +3699,7 @@ static void icell_set_x_supersub(int ci,
     }
 }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 /* Copies PBC shifted super-cell packed atom coordinates to working array */
 static void icell_set_x_supersub_sse8(int ci,
                                       real shx,real shy,real shz,
@@ -3625,15 +3714,15 @@ static void icell_set_x_supersub_sse8(int ci,
 
     for(si=0; si<GPU_NSUBCELL; si++)
     {
-        for(i=0; i<na_c; i+=STRIDE_8BB)
+        for(i=0; i<na_c; i+=STRIDE_PBB)
         {
             io = si*na_c + i;
             ia = ci*GPU_NSUBCELL*na_c + io;
-            for(j=0; j<STRIDE_8BB; j++)
+            for(j=0; j<STRIDE_PBB; j++)
             {
-                x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
-                x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
-                x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
+                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
             }
         }
     }
@@ -3821,7 +3910,7 @@ static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 
         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
         {
-            for(j=0; j<4; j++)
+            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
             {
                 fprintf(fp,"  sj %5d  imask %x\n",
                         nbl->cj4[j4].cj[j],
@@ -4756,7 +4845,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs,
     }
     else
     {
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
         nbs->icell_set_x = icell_set_x_supersub_sse8;
 #else
         nbs->icell_set_x = icell_set_x_supersub;
index 7e1a7fc21b56e513fe66ec5f1796ac775e4d3088..c66086631a5851a488e3afc852996f0c1cf03f9e 100644 (file)
@@ -1221,7 +1221,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
     
     if (ed)
     {
-        do_flood(fplog,cr,x,f,ed,box,step,bNS);
+        do_flood(cr,inputrec,x,f,ed,box,step,bNS);
     }
 
     if (bUseOrEmulGPU && !bDiffKernels)
@@ -1306,7 +1306,10 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
             wallcycle_stop(wcycle,ewcWAIT_GPU_NB_L);
 
             /* now clear the GPU outputs while we finish the step on the CPU */
+
+            wallcycle_start_nocount(wcycle,ewcLAUNCH_GPU_NB);
             nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+            wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
         }
         else
         {            
@@ -1773,7 +1776,7 @@ void do_force_cutsGROUP(FILE *fplog,t_commrec *cr,
 
     if (ed)
     {
-        do_flood(fplog,cr,x,f,ed,box,step,bNS);
+        do_flood(cr,inputrec,x,f,ed,box,step,bNS);
     }
 
     if (DOMAINDECOMP(cr))
index 24044fb31f3b2dfbba97b66343207bf808f39422..47dc8c6a369085425ede79462b91a5e3743bcb23 100644 (file)
@@ -1084,7 +1084,7 @@ t_forcetable make_tables(FILE *out,const output_env_t oenv,
    * numbers per nx+1 data points. For performance reasons we want
    * the table data to be aligned to 16-byte.
    */
-  snew_aligned(table.data, 12*(nx+1)*sizeof(real),16);
+  snew_aligned(table.data, 12*(nx+1)*sizeof(real),32);
 
   for(k=0; (k<etiNR); k++) {
     if (tabsel[k] != etabUSER) {
@@ -1202,7 +1202,7 @@ t_forcetable make_gb_table(FILE *out,const output_env_t oenv,
         * to do this :-)
         */
        
-       snew_aligned(table.data,4*nx,16);
+       snew_aligned(table.data,4*nx,32);
        
        init_table(out,nx,nx0,table.scale,&(td[0]),!bReadTab);
        
@@ -1362,7 +1362,7 @@ t_forcetable make_atf_table(FILE *out,const output_env_t oenv,
         * to do this :-)
         */
        
-    snew_aligned(table.data,4*nx,16);
+    snew_aligned(table.data,4*nx,32);
 
        copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,1.0,table.data);
        
index ebf96cd12a6178fb899144b7691c25be0cf6e433..2c4e21f0b3a32af23cc793d719c7bce2a3d23556 100644 (file)
@@ -33,7 +33,6 @@ set(SYMLINK_NAMES
     g_densmap
     g_densorder
     g_dielectric
-    g_dih
     g_dipoles
     g_disre
     g_dist
index 292c56422027c90ac88f54e7086040d7b315a295..a25ebed8d462871cd342824fe9c5fbb84bc3240c 100644 (file)
@@ -193,8 +193,6 @@ void registerLegacyModules(gmx::CommandLineModuleManager *manager)
             "Calculate surface fluctuations");
     LegacyCmdLineWrapper::registerModule(manager, &gmx_dielectric, "dielectric",
             "Calculate frequency dependent dielectric constants");
-    LegacyCmdLineWrapper::registerModule(manager, &gmx_dih, "dih",
-            "Analyze dihedral transitions");
     LegacyCmdLineWrapper::registerModule(manager, &gmx_dipoles, "dipoles",
             "Compute the total dipole plus fluctuations");
     LegacyCmdLineWrapper::registerModule(manager, &gmx_disre, "disre",
index c6fd9cd960ac1aa736b142920b5a72dd243bbd12..4c2bf0350ebb04dc63375793c5c03f428f4310ae 100644 (file)
@@ -213,8 +213,10 @@ static void chk_bonds(t_idef *idef,int ePBC,rvec *x,matrix box,real tol)
        b0   = 0;    
        switch (ftype) {
        case F_BONDS:
-       case F_G96BONDS:
          b0 = idef->iparams[type].harmonic.rA;
+      break;
+       case F_G96BONDS:
+         b0 = sqrt(idef->iparams[type].harmonic.rA);
          break;
        case F_MORSE:
          b0 = idef->iparams[type].morse.b0A;
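For context on the F_G96BONDS change above: the GROMOS96 bond potential is quartic in r, V(r) = (1/4) kb (r^2 - b0^2)^2, so for G96 bonds harmonic.rA stores the squared reference length; taking sqrt(harmonic.rA) recovers the b0 that the distance check expects.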
index 0de30812b6ff0f01d657ce7e24be24d5313e5c26..a90da900756efebbace9a2697c481af4f1e509bc 100644 (file)
@@ -89,17 +89,38 @@ static void set_ljparams(int comb,double reppow,real v,real w,
   }
 }
 
-static void assign_param(t_functype ftype,t_iparams *newparam,
+/* A return value of 0 means parameters were assigned successfully,
+ * returning -1 means this is an all-zero interaction that should not be added.
+ */
+static int
+assign_param(t_functype ftype,t_iparams *newparam,
                         real old[MAXFORCEPARAM],int comb,double reppow)
 {
   int  i,j;
   real tmp;
+  gmx_bool all_param_zero=TRUE;
 
   /* Set to zero */
   for(j=0; (j<MAXFORCEPARAM); j++) 
-    {
+  {
       newparam->generic.buf[j]=0.0;
-    }
+      /* If all parameters are zero we might not add some interaction types (selected below).
+       * We cannot apply this to ALL interactions, since many have valid reasons for having
+       * zero parameters (e.g. an index to a Cmap interaction, or LJ parameters), but
+       * we use it for angles and torsions that are typically generated automatically.
+       */
+      all_param_zero = (all_param_zero==TRUE) && fabs(old[j])<GMX_REAL_MIN;
+  }
+
+  if(all_param_zero==TRUE)
+  {
+      if(IS_ANGLE(ftype) || IS_RESTRAINT_TYPE(ftype) || ftype==F_IDIHS ||
+         ftype==F_PDIHS || ftype==F_PIDIHS || ftype==F_RBDIHS || ftype==F_FOURDIHS)
+      {
+          return -1;
+      }
+  }
+
   switch (ftype) {
   case F_G96ANGLES:
     /* Post processing of input data: store cosine iso angle itself */
@@ -245,29 +266,23 @@ static void assign_param(t_functype ftype,t_iparams *newparam,
   case F_PIDIHS:
   case F_ANGRES:
   case F_ANGRESZ:
-    newparam->pdihs.phiA = old[0];
-    newparam->pdihs.cpA  = old[1];
-                 
-    /* Dont do any checks if all parameters are zero (such interactions will be removed).
-     * Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
-     * so I have changed the lower limit to -99 /EL
-     *
-     * Second, if the force constant is zero in both A and B states, we set the phase
-     * and multiplicity to zero too so the interaction gets removed during clean-up.
-     */        
-    newparam->pdihs.phiB = old[3];
-    newparam->pdihs.cpB  = old[4];
-          
-    if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
-    {
-        newparam->pdihs.phiA = 0.0; 
-        newparam->pdihs.phiB = 0.0; 
-        newparam->pdihs.mult = 0; 
-    } 
-    else
-    {
-        newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
-    }
+          newparam->pdihs.phiA = old[0];
+          newparam->pdihs.cpA  = old[1];
+
+          /* Change 20100720: Amber occasionally uses negative multiplicities (mathematically OK),
+           * so I have changed the lower limit to -99 /EL
+           */
+          newparam->pdihs.phiB = old[3];
+          newparam->pdihs.cpB  = old[4];
+          /* If both force constants are zero there is no interaction. Return -1 to signal
+           * this entry should NOT be added.
+           */
+          if( fabs(newparam->pdihs.cpA) < GMX_REAL_MIN && fabs(newparam->pdihs.cpB) < GMX_REAL_MIN )
+          {
+              return -1;
+          }
+    
+          newparam->pdihs.mult = round_check(old[2],-99,ftype,"multiplicity");
           
     break;
   case F_POSRES:
@@ -347,7 +362,7 @@ static void assign_param(t_functype ftype,t_iparams *newparam,
     newparam->rbdihs.rbcB[3]=-2.0*old[NR_FOURDIHS+2];
     newparam->rbdihs.rbcB[4]=-4.0*old[NR_FOURDIHS+3];
     newparam->rbdihs.rbcB[5]=0.0;
-    break;    
+    break;
   case F_CONSTR:
   case F_CONSTRNC:
     newparam->constr.dA = old[0];
@@ -399,6 +414,7 @@ static void assign_param(t_functype ftype,t_iparams *newparam,
     gmx_fatal(FARGS,"unknown function type %d in %s line %d",
              ftype,__FILE__,__LINE__);
   }
+    return 0;
 }
 
 static int enter_params(gmx_ffparams_t *ffparams, t_functype ftype,
@@ -407,8 +423,14 @@ static int enter_params(gmx_ffparams_t *ffparams, t_functype ftype,
 {
   t_iparams newparam;
   int       type;
-  
-  assign_param(ftype,&newparam,forceparams,comb,reppow);
+  int       rc;
+
+  if( (rc=assign_param(ftype,&newparam,forceparams,comb,reppow))<0 )
+  {
+      /* -1 means this interaction is all-zero and should not be added */
+      return rc;
+  }
+
   if (!bAppend) {
     for (type=start; (type<ffparams->ntypes); type++) {
       if (ffparams->functype[type]==ftype) {
@@ -478,7 +500,8 @@ static void enter_function(t_params *p,t_functype ftype,int comb,real reppow,
                __FILE__,__LINE__,*maxtypes);
     }
     type = enter_params(ffparams,ftype,p->param[k].c,comb,reppow,start,bAppend);
-    if (!bNB) {
+    /* Type==-1 is used as a signal that this interaction is all-zero and should not be added. */
+    if (!bNB && type>=0) {
       nral  = NRAL(ftype);
       delta = nr*(nral+1);
       srenew(il->iatoms,il->nr+delta);
index 386b897e322ff58a7c38adb53ed2bcb644c979b3..da949ade755c10c105406c0d5d0a9a81a2fb5a99 100644 (file)
@@ -570,7 +570,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
     {
         nstfep = ir->expandedvals->nstexpanded;
     }
-    if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
+    if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
     {
         nstfep = repl_ex_nst;
     }
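With the corrected comparison, the two statements together take the minimum (a sketch of the net effect):

    /* when replica exchange is active: nstfep = min(nstfep, repl_ex_nst); */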
@@ -1264,7 +1264,6 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
                                 top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
                                 cglo_flags 
                                 | CGLO_ENERGY 
-                                | (bStopCM ? CGLO_STOPCM : 0)
                                 | (bTemp ? CGLO_TEMPERATURE:0) 
                                 | (bPres ? CGLO_PRESSURE : 0) 
                                 | (bPres ? CGLO_CONSTRAINT : 0)
@@ -1285,6 +1284,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
                 {
                     if (bTrotter)
                     {
+                        m_add(force_vir,shake_vir,total_vir); /* we need the un-dispersion corrected total vir here */
                         trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
                     } 
                     else 
@@ -1301,9 +1301,6 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
                                             top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
                                             CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
                         }
-
-
-                        update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
                     }
                 }
                 
@@ -1350,7 +1347,10 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
             }
             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
-            sum_dhdl(enerd,state->lambda,ir->fepvals);
+            if (!bRerunMD)
+            {
+                sum_dhdl(enerd,state->lambda,ir->fepvals);
+            }
         }
         
         /* ########  END FIRST UPDATE STEP  ############## */
@@ -1557,20 +1557,26 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
             gs.sig[eglsCHKPT] = 1;
         }
   
-
-        /* at the start of step, randomize the velocities */
-        if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
+        /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+        if (EI_VV(ir->eI))
         {
-            gmx_bool bDoAndersenConstr;
-            bDoAndersenConstr = (constr && update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr));
-            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
-            if (bDoAndersenConstr)
+            if (!bInitStep)
             {
-                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
-                                   state,fr->bMolPBC,graph,f,
-                                   &top->idef,tmp_vir,NULL,
-                                   cr,nrnb,wcycle,upd,constr,
-                                   bInitStep,TRUE,bCalcVir,vetanew);
+                update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
+            }
+            if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+            {
+                gmx_bool bIfRandomize;
+                bIfRandomize = update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr);
+                /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+                if (constr && bIfRandomize)
+                {
+                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+                                       state,fr->bMolPBC,graph,f,
+                                       &top->idef,tmp_vir,NULL,
+                                       cr,nrnb,wcycle,upd,constr,
+                                       bInitStep,TRUE,bCalcVir,vetanew);
+                }
             }
         }
 
@@ -1764,7 +1770,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
                                 lastbox,
                                 top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
                                 cglo_flags 
-                                | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0)
+                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) 
                                 | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) 
@@ -1797,7 +1803,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
 
         /* only add constraint dvdl after constraints */
         enerd->term[F_DVDL_BONDED] += dvdl;
-        if (!bVV)
+        if (!bVV || bRerunMD)
         {
             /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
             sum_dhdl(enerd,state->lambda,ir->fepvals);
@@ -1940,7 +1946,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
             state->fep_state = lamnew;
             for (i=0;i<efptNR;i++)
             {
-                state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
             }
         }
         /* Remaining runtime */
index 0f24b8878afad8c850ab0414b749efe2a8f1fbfb..43f9d0112c5ccda13a191df842213a561f784b03 100644 (file)
@@ -317,7 +317,7 @@ int cmain(int argc,char *argv[])
     "ED (essential dynamics) sampling is switched on by using the [TT]-ei[tt]",
     "flag followed by an [TT].edi[tt] file.",
     "The [TT].edi[tt] file can be produced using options in the essdyn",
-    "menu of the WHAT IF program. [TT]mdrun[tt] produces a [TT].edo[tt] file that",
+    "menu of the WHAT IF program. [TT]mdrun[tt] produces a [TT].xvg[tt] output file that",
     "contains projections of positions, velocities and forces onto selected",
     "eigenvectors.[PAR]",
     "When user-defined potential functions have been selected in the",
@@ -449,7 +449,7 @@ int cmain(int argc,char *argv[])
     { efXVG, "-tpi",    "tpi",      ffOPTWR },
     { efXVG, "-tpid",   "tpidist",  ffOPTWR },
     { efEDI, "-ei",     "sam",      ffOPTRD },
-    { efEDO, "-eo",     "sam",      ffOPTWR },
+    { efXVG, "-eo",     "edsam",    ffOPTWR },
     { efGCT, "-j",      "wham",     ffOPTRD },
     { efGCT, "-jo",     "bam",      ffOPTWR },
     { efXVG, "-ffout",  "gct",      ffOPTWR },
index ab5e4bdaf566a0fce2713c1f6720b0b1b7022870..fbde22b6e9ac4819f6cd88d392362d13539692ad 100644 (file)
@@ -727,6 +727,17 @@ static void print_allswitchind(FILE *fplog,int n,int *ind,int *pind, int *allswa
     }
     fprintf(fplog,"\n");
 
+    /* the "Order After Exchange" is the state label corresponding to the configuration that
+       started in state listed in order, i.e.
+
+       3 0 1 2
+
+       means that the:
+       configuration starting in simulation 3 is now in simulation 0,
+       configuration starting in simulation 0 is now in simulation 1,
+       configuration starting in simulation 1 is now in simulation 2,
+       configuration starting in simulation 2 is now in simulation 3
+     */
     fprintf(fplog,"Order After Exchange: ");
     for (i=0;i<n;i++)
     {
@@ -795,6 +806,7 @@ static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int
                  =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
                  =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
                  =  de[b][a] + de[a][b] */
+
         /* permuted:
            ediff =  E_new - E_old
                  =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
@@ -802,6 +814,16 @@ static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int
                  =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
                  =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
                  =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+        /* but, in the current code implementation, we flip configurations, not indices...
+           So let's examine that.
+                 =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+                 =  [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_bp)]
+                 =  (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+                 So, if we exchange b <=> bp and a <=> ap, we return to the same result.
+                 So the simple solution is to flip the position of perturbed
+                 and original indices in the tests.
+        */
+
         ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
         delta = ediff*beta[a]; /* assume all same temperature in this case */
         break;
@@ -867,7 +889,7 @@ test_for_replica_exchange(FILE *fplog,
     gmx_bool bPrint,bMultiEx;
     gmx_bool *bEx = re->bEx;
     real *prob = re->prob;
-    int *pind = re->destinations;
+    int *pind = re->destinations; /* permuted index */
     gmx_bool bEpot=FALSE;
     gmx_bool bDLambda=FALSE;
     gmx_bool bVol=FALSE;
@@ -953,24 +975,32 @@ test_for_replica_exchange(FILE *fplog,
         for (i=0;i<re->nex;i++)
         {
             /* randomly select a pair  */
-            /* find out which state it is from, and what label that state currently has */
+            /* in theory, we could reduce this by identifying only the switches with a non-negligible
+               probability of occurring (log p > -100) and only operating on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than it is worth. */
             i0 = (int)(re->nrepl*rando(&(re->seed)));
             i1 = (int)(re->nrepl*rando(&(re->seed)));
             if (i0==i1)
             {
                 i--;
-                continue;  /* got the same pair, back up and do it again */
+                continue;  /* self-exchange, back up and do it again */
             }
 
-            a = re->ind[i0];
+            a = re->ind[i0]; /* what are the indices of these states? */
             b = re->ind[i1];
             ap = pind[i0];
             bp = pind[i1];
 
             bPrint = FALSE; /* too noisy */
-            delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); /* calculate the energy difference */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog,bPrint,re,ap,bp,a,b);
 
-            /* we actually only use the first space, since there are actually many switches between pairs. */
+            /* we only use the first entry in the prob and bEx arrays,
+               since there can be many switches between pairs. */
 
             if (delta <= 0)
             {
@@ -1065,6 +1095,7 @@ test_for_replica_exchange(FILE *fplog,
         re->nmoves[re->ind[i]][pind[i]] +=1;
         re->nmoves[pind[i]][re->ind[i]] +=1;
     }
+    fflush(fplog); /* make sure we can see what the last exchange was */
 }
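For context, the acceptance decision applied to each delta computed above follows the standard Metropolis rule; a sketch of that rule (not the literal code of this hunk, but rando() is the generator already used in this function, and exp() is from math.h):

    if (delta <= 0)
    {
        bEx[0] = TRUE;                                  /* always accept downhill moves */
    }
    else
    {
        bEx[0] = (rando(&(re->seed)) < exp(-delta));    /* uphill: accept with prob exp(-delta) */
    }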
 
 static void write_debug_x(t_state *state)
@@ -1304,6 +1335,7 @@ gmx_bool replica_exchange(FILE *fplog,const t_commrec *cr,struct gmx_repl_ex *re
             /* There will be only one swap cycle with standard replica
              * exchange, but there may be multiple swap cycles if we
              * allow multiple swaps. */
+
             for (j = 0; j < maxswap; j++)
             {
                 exchange_partner = re->order[replica_id][j];
index 61bab7e51a954cbb3bc5acba7f698257f68d92d0..86db8f722534ce8ec808b8ff59db8c6dde2a3f28 100644 (file)
@@ -264,8 +264,6 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
     mda->deviceOptions=deviceOptions;
     mda->Flags=Flags;
 
-    fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
-    fflush(stderr);
     /* now spawn new threads that start mdrunner_start_fn(), while 
        the main thread returns */
     ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
@@ -775,15 +773,18 @@ static void convert_to_verlet_scheme(FILE *fplog,
     gmx_mtop_remove_chargegroups(mtop);
 }
 
-/* Check the process affinity mask and if it is found to be non-zero,
- * will honor it and disable mdrun internal affinity setting.
- * This function should be called first before the OpenMP library gets
- * initialized with the last argument FALSE (which will detect affinity
- * set by external tools like taskset), and later, after the OpenMP
- * initialization, with the last argument TRUE to detect affinity changes
- * made by the OpenMP library.
+/* Check the process affinity mask. If it is non-zero, something
+ * else has set the affinity, and mdrun should honor that and
+ * not attempt to do its own thread pinning.
+ *
+ * This function should be called twice. Once before the OpenMP
+ * library gets initialized with bAfterOpenMPInit=FALSE (which will
+ * detect affinity set by external tools like taskset), and again
+ * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
+ * (which will detect affinity changes made by the OpenMP library).
  *
- * Note that this will only work on Linux as we use a GNU feature. */
+ * Note that this will only work on Linux, because we use a GNU
+ * feature. */
 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
                                    gmx_hw_opt_t *hw_opt, int ncpus,
                                    gmx_bool bAfterOpenmpInit)
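The call pattern described in the rewritten comment, as a sketch (hypothetical call site; the arguments follow the signature above):

    /* before OpenMP init: detect affinity set externally, e.g. by taskset */
    check_cpu_affinity_set(fplog, cr, hw_opt, ncpus, FALSE);
    /* ... OpenMP library initialization ... */
    /* after OpenMP init: detect affinity changes made by the OpenMP library */
    check_cpu_affinity_set(fplog, cr, hw_opt, ncpus, TRUE);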
@@ -837,19 +838,21 @@ static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
         if (!bAfterOpenmpInit)
         {
             md_print_warn(cr, fplog,
-                          "Non-default process affinity set, disabling internal affinity");
+                          "%s detected a non-default process affinity, "
+                          "so it will not attempt to pin its threads", ShortProgram());
         }
         else
         {
             md_print_warn(cr, fplog,
-                          "Non-default process affinity set probably by the OpenMP library, "
-                          "disabling internal affinity");
+                          "%s detected a non-default process affinity, "
+                          "probably set by the OpenMP library, "
+                          "so it will not attempt to pin its threads", ShortProgram());
         }
         hw_opt->bThreadPinning = FALSE;
 
         if (debug)
         {
-            fprintf(debug, "Non-default affinity mask found\n");
+            fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
         }
     }
     else
@@ -1044,23 +1047,32 @@ static void set_cpu_affinity(FILE *fplog,
         }
         else
         {
-            /* check if some threads failed to set their affinities */
+            /* check & warn if some threads failed to set their affinities */
             if (nth_affinity_set != nthread_local)
             {
-                char sbuf[STRLEN];
-                sbuf[0] = '\0';
+                char sbuf1[STRLEN], sbuf2[STRLEN];
+
+                /* sbuf1 contains rank info, while sbuf2 contains OpenMP thread info */
+                sbuf1[0] = sbuf2[0] = '\0';
 #ifdef GMX_MPI
 #ifdef GMX_THREAD_MPI
-                sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
+                sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
 #else /* GMX_LIB_MPI */
+                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
 #endif
-                sprintf(sbuf, "In MPI process #%d", cr->nodeid);
 #endif /* GMX_MPI */
+
+                if (nthread_local > 1)
+                {
+                    sprintf(sbuf2, "of %d/%d thread%s ",
+                            nthread_local - nth_affinity_set, nthread_local,
+                            (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+                }
+
                 md_print_warn(NULL, fplog,
-                              "%s%d/%d thread%s failed to set their affinities. "
-                              "This can cause performance degradation!",
-                              sbuf, nthread_local - nth_affinity_set, nthread_local,
-                              (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+                              "NOTE: %sAffinity setting %sfailed.\n"
+                              "      This can cause performance degradation!",
+                              sbuf1, sbuf2);
             }
         }
     }
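
Tracing the new format strings with illustrative values (cr->nodeid == 3, nthread_local == 4, nth_affinity_set == 2, library-MPI build), the composed warning reads:

    /*
     * NOTE: In MPI process #3: Affinity setting of 2/4 threads failed.
     *       This can cause performance degradation!
     */
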
@@ -1453,13 +1465,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     /* now make sure the state is initialized and propagated */
     set_state_entries(state,inputrec,cr->nnodes);
 
-    /* remove when vv and rerun works correctly! */
-    if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
-    {
-        gmx_fatal(FARGS,
-                  "Currently can't do velocity verlet with rerun in parallel.");
-    }
-
     /* A parallel command line option consistency check that we can
        only do after any threads have started. */
     if (!PAR(cr) &&
@@ -1622,7 +1627,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     if (opt2bSet("-ei",nfile,fnm))
     {
         /* Open input and output files, allocate space for ED data structure */
-        ed = ed_open(nfile,fnm,Flags,cr);
+        ed = ed_open(mtop->natoms,&state->edsamstate,nfile,fnm,Flags,oenv,cr);
     }
 
     if (PAR(cr) && !((Flags & MD_PARTDEC) ||
@@ -1681,6 +1686,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                   cr->nnodes==1 ? "process" : "processes"
 #endif
                   );
+    fflush(stderr);
 #endif
 
     gmx_omp_nthreads_init(fplog, cr,
index 3a27007d7000beddfd32c16f5bd4af968f1be9a8..ab8c9a5749daedcb7f8d403a0ebdb6d6d54238a0 100644 (file)
@@ -212,7 +212,7 @@ void set_histp(t_atoms *pdba,rvec *x,real angle,real dist){
   /* A histidine residue exists that requires automated assignment, so
    * doing the analysis of donors and acceptors is worthwhile. */
   fprintf(stderr,
-         "Analysing hydrogen-bonding network for automated assigment of histidine\n"
+         "Analysing hydrogen-bonding network for automated assignment of histidine\n"
          " protonation.");
 
   snew(donor,natom);
index 9bdf3666a1de51bfb05665b88875e101e7fe3265..18dde119a6eac6b122882330a3124fd9739c881f 100644 (file)
@@ -267,9 +267,10 @@ static char *search_resrename(int nrr,rtprename_t *rr,
         {
             nn = rr[i].main;
         }
+        
         if (nn[0] == '-')
         {
-            gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? " as a starting terminus" : (bEnd ? " as an ending terminus" : ""));
+            gmx_fatal(FARGS,"In the chosen force field there is no residue type for '%s'%s",name,bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
         }
     }
 
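Spelled out, the nested ternary above selects among four suffixes, derived directly from the condition:

    /* bStart && bEnd   -> " as a standalone (starting & ending) residue"
     * bStart && !bEnd  -> " as a starting terminus"
     * !bStart && bEnd  -> " as an ending terminus"
     * !bStart && !bEnd -> "" (no terminus qualifier)
     */
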
index efd43df29a27c579954203159516667d3b8ca731..578c640bb4f7a4ba0e621150bd28e909f3e52c77 100644 (file)
@@ -7,7 +7,7 @@ add_library(gmxana
             gmx_analyze.c   gmx_anaeig.c    gmx_angle.c     gmx_bond.c      
             gmx_bundle.c    gmx_chi.c       gmx_cluster.c   gmx_confrms.c   
             gmx_covar.c     gmx_current.c   
-            gmx_density.c   gmx_densmap.c   gmx_dih.c       
+            gmx_density.c   gmx_densmap.c       
             gmx_dielectric.c        
             gmx_kinetics.c  gmx_spatial.c   gmx_tune_pme.c
             gmx_dipoles.c   gmx_disre.c     gmx_dist.c      gmx_dyndom.c    
index 98bdebf9cf2bd96a8e96aef27e00531a567132a7..71a0d2d13b1c2a9b6d532b4b3f0e96fab85ae5b6 100644 (file)
@@ -94,20 +94,20 @@ int gmx_g_angle(int argc,char *argv[])
 {
   static const char *desc[] = {
     "[TT]g_angle[tt] computes the angle distribution for a number of angles",
-    "or dihedrals. This way you can check whether your simulation",
-    "is correct. With option [TT]-ov[tt] you can plot the average angle of",
-    "a group of angles as a function of time. With the [TT]-all[tt] option",
-    "the first graph is the average, the rest are the individual angles.[PAR]",
+    "or dihedrals.[PAR]",
+    "With option [TT]-ov[tt], you can plot the average angle of",
+    "a group of angles as a function of time. With the [TT]-all[tt] option,",
+    "the first graph is the average and the rest are the individual angles.[PAR]",
     "With the [TT]-of[tt] option, [TT]g_angle[tt] also calculates the fraction of trans",
     "dihedrals (only for dihedrals) as function of time, but this is",
-    "probably only fun for a selected few.[PAR]",
-    "With option [TT]-oc[tt] a dihedral correlation function is calculated.[PAR]",
-    "It should be noted that the index file should contain",
-    "atom-triples for angles or atom-quadruplets for dihedrals.",
+    "probably only fun for a select few.[PAR]",
+    "With option [TT]-oc[tt], a dihedral correlation function is calculated.[PAR]",
+    "It should be noted that the index file must contain",
+    "atom triplets for angles or atom quadruplets for dihedrals.",
     "If this is not the case, the program will crash.[PAR]",
-    "With option [TT]-or[tt] a trajectory file is dumped containing cos and",
-    "sin of selected dihedral angles which subsequently can be used as",
-    "input for a PCA analysis using [TT]g_covar[tt].[PAR]",
+    "With option [TT]-or[tt], a trajectory file is dumped containing cos and",
+    "sin of selected dihedral angles, which subsequently can be used as",
+    "input for a principal components analysis using [TT]g_covar[tt].[PAR]",
     "Option [TT]-ot[tt] plots when transitions occur between",
     "dihedral rotamers of multiplicity 3 and [TT]-oh[tt]",
     "records a histogram of the times between such transitions,",
diff --git a/src/tools/gmx_dih.c b/src/tools/gmx_dih.c
deleted file mode 100644 (file)
index 21c5eb3..0000000
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * 
- *                This source code is part of
- * 
- *                 G   R   O   M   A   C   S
- * 
- *          GROningen MAchine for Chemical Simulations
- * 
- *                        VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- * 
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
- * 
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
- * 
- * For more info, check our website at http://www.gromacs.org
- * 
- * And Hey:
- * Green Red Orange Magenta Azure Cyan Skyblue
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <math.h>
-
-#include "sysstuff.h"
-#include "string2.h"
-#include "copyrite.h"
-#include "futil.h"
-#include "smalloc.h"
-#include "statutil.h"
-#include "nrama.h"
-#include "physics.h"
-#include "macros.h"
-#include "xvgr.h"
-#include "vec.h"
-#include "gmx_ana.h"
-
-
-#define NOMIN 'X'
-
-static void ana_dih(FILE *out,char *index,int nframes,real dih[],t_dih *dd)
-{
-  int i;
-  real mind,maxd,sum,av,var,prev,width;
-  gmx_bool bTrans;
-  
-  mind=5400,maxd=-5400,sum=0,av=0,var=0;
-
-  prev=dih[0];
-  for(i=0; (i<nframes); i++) {
-    if ((dih[i]-prev) > 180) {
-      /* PBC.. */
-      dih[i]-=360;
-    }
-    else if ((dih[i]-prev) < -180)
-      dih[i]+=360;
-    prev=dih[i];
-      
-    sum+=dih[i];
-    mind=min(mind,dih[i]);
-    maxd=max(maxd,dih[i]);
-  }
-  av=sum/nframes;
-  for(i=0; (i<nframes); i++)
-    var+=sqr(dih[i]-av);
-  var/=nframes;
-  width=(360.0/dd->mult);
-  bTrans=((maxd - mind) > width);
-
-  fprintf(out,"%-10s %10.3f %10.3f %10.3f %10.3f %10.3f %-10s%3.0f\n",
-         index,mind,av,maxd,var,sqrt(var),
-         bTrans ? "Yep" : "",width);
-}
-
-static int find_min(real phi,int ntab,real phitab[])
-{
-  int  i,imin;
-  real mind,mm;
-  real width;
-  /* Set closest minimum to the first one */
-  width=360.0/ntab;
-  mind=fabs(phi-phitab[0]);
-  imin=0;
-  for(i=1; (i<ntab); i++) {
-    mm=fabs(phi-phitab[i]);
-    if (mm < mind) {
-      imin=i;
-      mind=mm;
-    }
-  }
-  if (mind < width*0.5 )
-    return imin;
-  else
-    return -1;
-}
-
-static int vphi(t_dih *dih,real phi,int mult)
-{
-  static real m2[] = { 90, 270 };
-  static real m3[] = { 60, 180, 300 };
-  static real m4[] = { 45, 135, 225, 315 };
-  static real m6[] = { 30, 90, 150, 210, 270, 330 };
-
-  real phiref;
-  int  vpp=0;
-  
-  phiref=RAD2DEG*(phi-dih->phi0);
-  while (phiref < 0)
-    phiref+=360;
-  while (phiref > 360)
-    phiref-=360;
-  
-  switch(mult) {
-  case 2:
-    vpp=find_min(phiref,2,m2);
-    break;
-  case 3:
-    vpp=find_min(phiref,3,m3);
-    break;
-  case 4:
-    vpp=find_min(phiref,4,m4);
-    break;
-  case 6:
-    vpp=find_min(phiref,6,m6);
-    break;
-  default:
-    gmx_fatal(FARGS,"No such multiplicity %d",dih->mult);
-  }
-
-  if (vpp == -1)
-    return NOMIN;
-  else
-    return vpp+'0';
-}
-
-typedef struct t_cluster {
-  int    ndih;
-  int    freq;
-  char   *minimum;
-  struct t_cluster *next;
-} t_cluster;
-
-static t_cluster *search_cluster(t_cluster *cl,char *minimum)
-{
-  t_cluster *ccl=cl;
-
-  while (ccl != NULL) {
-    if (strcmp(minimum,ccl->minimum)==0)
-      return ccl;
-    ccl=ccl->next;
-  }
-  return NULL;
-}
-
-static void add_cluster(t_cluster **cl,int ndih,char *minimum)
-{
-  t_cluster *loper;
-  t_cluster *ccl;
-
-  snew(ccl,1);
-  ccl->ndih=ndih;
-  ccl->freq=1;
-  ccl->minimum=strdup(minimum);
-  ccl->next=NULL;
-  
-  if (*cl == NULL)
-    *cl=ccl;
-  else {
-    loper=*cl;
-    while (loper->next != NULL) 
-      loper=loper->next;
-    loper->next=ccl;
-  }
-}
-
-static void p_cluster(FILE *out,t_cluster *cl)
-{
-  t_cluster *loper;
-
-  fprintf(out,"* * * C L U S T E R   A N A L Y S I S * * *\n\n");
-  fprintf(out," Frequency  Dihedral minima\n");
-  loper=cl;
-  while (loper != NULL) {
-    fprintf(out,"%10d  %s\n",loper->freq,loper->minimum);
-    loper=loper->next;
-  }
-}
-
-static void ana_cluster(FILE *out, t_xrama *xr,real **dih,real time[],
-                       t_topology *top,int nframes,int mult)
-{
-  t_cluster *cl=NULL,*scl;
-  char      *minimum;
-  int       i,j,nx;
-
-  /* Number of dihedrals + terminating NULL 
-   * this allows for using string routines
-   */
-  snew(minimum,xr->ndih+1);
-  
-  for(i=0; (i<nframes); i++) {
-    nx=0;
-    for(j=0; (j<xr->ndih); j++) {
-      minimum[j] = vphi(&xr->dih[j],dih[j][i],
-                       mult == -1 ? xr->dih[j].mult : mult);
-      if (minimum[j] == NOMIN)
-       nx++;
-    }
-    if (nx == 0) {
-      if ((scl=search_cluster(cl,minimum)) == NULL)
-       add_cluster(&cl,xr->ndih,minimum);
-      else
-       scl->freq++;
-    }
-  }
-  p_cluster(out,cl);
-
-  sfree(minimum);
-}
-
-static void ana_trans(FILE *out, t_xrama *xr,real **dih,real time[],
-                     t_topology *top,int nframes, const output_env_t oenv)
-{
-  FILE *outd;
-  real prev_phi,prev_psi;
-  int  i,j,phi,psi;
-  char buf[10];
-
-  fprintf(out,"\n\t* * * D I H E D R A L    S T A T I S T I C S * * *\n\n");
-  fprintf(out,"%-10s %10s %10s %10s %10s %10s %10s\n",
-         "index","minimum","average","maximum","variance","std.dev",
-         "transition");
-  for(i=0; (i<xr->ndih); i++) {
-    sprintf(buf,"dih-%d",i);
-    ana_dih(out,buf,nframes,dih[i],&(xr->dih[i]));
-  }
-  for(i=0; (i<xr->npp); i++) {
-    sprintf(buf,"%s",xr->pp[i].label);
-    outd=xvgropen(buf,"Dihedral Angles","Time (ps)","Degrees",oenv);
-
-    phi=xr->pp[i].iphi;
-    psi=xr->pp[i].ipsi;
-    prev_phi=dih[phi][0];
-    prev_psi=dih[psi][0];
-    for(j=0; (j<nframes); j++) {
-      /* PBC.. */
-      if ((dih[phi][j]-prev_phi) > 180) 
-       dih[phi][j]-=360;
-      else if ((dih[phi][j]-prev_phi) < -180)
-       dih[phi][j]+=360;
-      prev_phi=dih[phi][j];
-      if ((dih[psi][j]-prev_psi) > 180) 
-       dih[psi][j]-=360;
-      else if ((dih[psi][j]-prev_psi) < -180)
-       dih[psi][j]+=360;
-      prev_psi=dih[psi][j];
-      fprintf(outd,"%10g  %10g  %10g\n",time[j],prev_phi,prev_psi);
-    }
-    ffclose(outd);
-  }
-}
-
-int gmx_dih(int argc,char *argv[])
-{
-  const char *desc[] = {
-    "[TT]g_dih[tt] can do two things. The default is to analyze dihedral transitions",
-    "by merely computing all the dihedral angles defined in your topology",
-    "for the whole trajectory. When a dihedral flips over to another minimum",
-    "an angle/time plot is made.[PAR]",
-    "The opther option is to discretize the dihedral space into a number of",
-    "bins, and group each conformation in dihedral space in the",
-    "appropriate bin. The output is then given as a number of dihedral",
-    "conformations sorted according to occupancy."
-  };
-  static int  mult = -1;
-  static gmx_bool bSA  = FALSE;
-  t_pargs pa[] = {
-    { "-sa", FALSE, etBOOL, {&bSA},
-      "Perform cluster analysis in dihedral space instead of analysing dihedral transitions." },
-    { "-mult", FALSE, etINT, {&mult},
-      "mulitiplicity for dihedral angles (by default read from topology)" }
-  };
-  FILE       *out;
-  t_xrama    *xr;
-  t_topology *top;
-  real       **dih,*time;
-  real       dd;
-  int        i,nframes,maxframes=1000;
-  output_env_t oenv;
-  t_filenm   fnm[] = {
-    { efTRX, "-f", NULL, ffREAD },
-    { efTPX, NULL, NULL, ffREAD },
-    { efOUT, NULL, NULL, ffWRITE }
-  };
-#define NFILE asize(fnm)
-
-  parse_common_args(&argc,argv,PCA_CAN_VIEW | PCA_CAN_TIME | PCA_BE_NICE,
-                   NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL,&oenv);
-  
-  if (mult != -1)
-    fprintf(stderr,"Using %d for dihedral multiplicity rather than topology values\n",mult);
-    
-  snew(xr,1);
-  init_rama(oenv,ftp2fn(efTRX,NFILE,fnm),
-           ftp2fn(efTPX,NFILE,fnm),xr,3);
-  top=read_top(ftp2fn(efTPX,NFILE,fnm),NULL);
-              
-  /* Brute force malloc, may be too big... */
-  snew(dih,xr->ndih);
-  for(i=0; (i<xr->ndih); i++)
-    snew(dih[i],maxframes);
-  snew(time,maxframes);
-
-  fprintf(stderr,"\n");
-  nframes = 0;
-  while (new_data(xr)) {
-    for(i=0; (i<xr->ndih); i++) {
-      dd=xr->dih[i].ang*RAD2DEG;
-      while (dd < 0)
-       dd+=360;
-      while (dd > 360)
-       dd-=360;
-      dih[i][nframes]=dd;
-    }
-    time[nframes]=xr->t;
-    nframes++;
-    if (nframes > maxframes) {
-      maxframes += 1000;
-      for(i=0; (i<xr->ndih); i++)
-       srenew(dih[i],maxframes);
-      srenew(time,maxframes);
-    }
-  } 
-
-  fprintf(stderr,"\nCalculated all dihedrals, now analysing...\n");
-
-  out=ftp2FILE(efOUT,NFILE,fnm,"w");
-
-  if (bSA) {
-    /* Cluster and structure analysis */
-    ana_cluster(out,xr,dih,time,top,nframes,mult);
-  }
-  else {
-    /* Analyse transitions... */
-    ana_trans(out,xr,dih,time,top,nframes,oenv);
-  }
-  ffclose(out);
-    
-  thanx(stderr);
-    
-  return 0;
-}
index 8d5c000c0ffb106906ec671b2faf2cbd51c1c622..5906c0c4b85867ff87fcf762659d82e8333f04cd 100644 (file)
@@ -133,7 +133,7 @@ int gmx_genpr(int argc,char *argv[])
   nfn     = opt2fn_null("-n",NFILE,fnm);
   
   if (( nfn == NULL ) && ( xfn == NULL))
-    gmx_fatal(FARGS,"no index file and no structure file suplied");
+    gmx_fatal(FARGS,"no index file and no structure file supplied");
       
   if ((disre_frac < 0) || (disre_frac >= 1))
     gmx_fatal(FARGS,"disre_frac should be between 0 and 1");
index b805755e18512e5e37d5d3b2c0df2e2efce87ef1..ffbbf5522a9f1d15419b1c5cd7158549356f52e0 100644 (file)
@@ -115,14 +115,12 @@ int gmx_helix(int argc,char *argv[])
     "of the", 
     "helix in nm. This is simply the average rise (see above) times the",  
     "number of helical residues (see below).[BR]",
-    "[BB]5.[bb] Number of helical residues (file [TT]n-ahx.xvg[tt]). The title says",
-    "it all.[BR]",
-    "[BB]6.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
-    "[BB]7.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
+    "[BB]5.[bb] Helix dipole, backbone only (file [TT]dip-ahx.xvg[tt]).[BR]",
+    "[BB]6.[bb] RMS deviation from ideal helix, calculated for the C[GRK]alpha[grk]",
     "atoms only (file [TT]rms-ahx.xvg[tt]).[BR]",
-    "[BB]8.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
-    "[BB]9.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
-    "[BB]10.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
+    "[BB]7.[bb] Average C[GRK]alpha[grk] - C[GRK]alpha[grk] dihedral angle (file [TT]phi-ahx.xvg[tt]).[BR]",
+    "[BB]8.[bb] Average [GRK]phi[grk] and [GRK]psi[grk] angles (file [TT]phipsi.xvg[tt]).[BR]",
+    "[BB]9.[bb] Ellipticity at 222 nm according to Hirst and Brooks.",
     "[PAR]"
   };
   static const char *ppp[efhNR+2] = { 
index 03934e611eb066bb512b08b0ca962640f47416dc..30c9a86aab18d55631d74c447258e1ffc4249f32 100644 (file)
@@ -466,7 +466,7 @@ int gmx_make_edi(int argc,char *argv[])
   static const char *desc[] = {
       "[TT]make_edi[tt] generates an essential dynamics (ED) sampling input file to be used with [TT]mdrun[tt]",
       "based on eigenvectors of a covariance matrix ([TT]g_covar[tt]) or from a",
-      "normal modes anaysis ([TT]g_nmeig[tt]).",
+      "normal modes analysis ([TT]g_nmeig[tt]).",
       "ED sampling can be used to manipulate the position along collective coordinates",
       "(eigenvectors) of (biological) macromolecules during a simulation. Particularly,",
       "it may be used to enhance the sampling efficiency of MD simulations by stimulating",
@@ -503,7 +503,7 @@ int gmx_make_edi(int argc,char *argv[])
       "[TT]-radcon[tt]: perform acceptance radius contraction along selected eigenvectors",
       "towards a target structure specified with [TT]-tar[tt].[PAR]",
       "NOTE: each eigenvector can be selected only once. [PAR]",
-      "[TT]-outfrq[tt]: frequency (in steps) of writing out projections etc. to [TT].edo[tt] file[PAR]",
+      "[TT]-outfrq[tt]: frequency (in steps) of writing out projections etc. to [TT].xvg[tt] file[PAR]",
       "[TT]-slope[tt]: minimal slope in acceptance radius expansion. A new expansion",
       "cycle will be started if the spontaneous increase of the radius (in nm/step)",
       "is less than the value specified.[PAR]",
@@ -511,17 +511,23 @@ int gmx_make_edi(int argc,char *argv[])
       "before a new cycle is started.[PAR]",
       "Note on the parallel implementation: since ED sampling is a 'global' thing",
       "(collective coordinates etc.), at least on the 'protein' side, ED sampling",
-      "is not very parallel-friendly from an implentation point of view. Because",
+      "is not very parallel-friendly from an implementation point of view. Because",
       "parallel ED requires some extra communication, expect the performance to be",
-      "lower as in a free MD simulation, especially on a large number of nodes. [PAR]",
-      "All output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a .edo file. In the output",
-      "file, per OUTFRQ step the following information is present: [PAR]",
-      "[TT]*[tt] the step number[BR]",
-      "[TT]*[tt] the number of the ED dataset. ([BB]Note[bb] that you can impose multiple ED constraints in",
+      "lower than in a free MD simulation, especially on a large number of nodes and/or",
+      "when the ED group contains a lot of atoms. [PAR]",
+      "Please also note that if your ED group contains more than a single protein,",
+      "then the [TT].tpr[tt] file must contain the correct PBC representation of the ED group.",
+      "Take a look at the initial RMSD from the reference structure, which is printed",
+      "out at the start of the simulation; if this is much higher than expected, one",
+      "of the ED molecules might be shifted by a box vector. [PAR]",
+      "All ED-related output of [TT]mdrun[tt] (specify with [TT]-eo[tt]) is written to a [TT].xvg[tt] file",
+      "as a function of time in intervals of OUTFRQ steps.[PAR]",
+      "[BB]Note[bb] that you can impose multiple ED constraints and flooding potentials in",
       "a single simulation (on different molecules) if several [TT].edi[tt] files were concatenated",
-      "first. The constraints are applied in the order they appear in the [TT].edi[tt] file.) [BR]",
-      "[TT]*[tt] RMSD (for atoms involved in fitting prior to calculating the ED constraints)[BR]",
-      "* projections of the positions onto selected eigenvectors[BR]",
+      "first. The constraints are applied in the order they appear in the [TT].edi[tt] file. ",
+      "Depending on what was specified in the [TT].edi[tt] input file, the output file contains, for each ED dataset:[PAR]",
+      "[TT]*[tt] the RMSD of the fitted molecule to the reference structure (for atoms involved in fitting prior to calculating the ED constraints)[BR]",
+      "[TT]*[tt] projections of the positions onto selected eigenvectors[BR]",
       "[PAR][PAR]",
       "FLOODING:[PAR]",
       "with [TT]-flood[tt], you can specify which eigenvectors are used to compute a flooding potential,",
@@ -530,7 +536,7 @@ int gmx_make_edi(int argc,char *argv[])
       "is kept in that region.",
       "[PAR]",
       "The origin is normally the average structure stored in the [TT]eigvec.trr[tt] file.",
-      "It can be changed with [TT]-ori[tt] to an arbitrary position in configurational space.",
+      "It can be changed with [TT]-ori[tt] to an arbitrary position in configuration space.",
       "With [TT]-tau[tt], [TT]-deltaF0[tt], and [TT]-Eflnull[tt] you control the flooding behaviour.",
       "Efl is the flooding strength, it is updated according to the rule of adaptive flooding.",
       "Tau is the time constant of adaptive flooding, high [GRK]tau[grk] means slow adaption (i.e. growth). ",
@@ -589,7 +595,7 @@ int gmx_make_edi(int argc,char *argv[])
     { "-flood",  FALSE, etSTR, {&evSelections[2]},
         "Indices of eigenvectors for flooding"},
     { "-outfrq", FALSE, etINT, {&edi_params.outfrq},
-        "Freqency (in steps) of writing output in [TT].edo[tt] file" },
+        "Frequency (in steps) of writing output in [TT].xvg[tt] file" },
     { "-slope", FALSE, etREAL, { &edi_params.slope},
         "Minimal slope in acceptance radius expansion"},
     { "-linstep", FALSE, etSTR, {&evParams[0]},
index d648937c66b9e1444bdec3cf034003cebebfcea6..55631fc532299d7146225e36d795d21bc8eaf5a9 100644 (file)
@@ -422,7 +422,7 @@ int gmx_rmsf(int argc,char *argv[])
                                 *(top.atoms.atomname[index[i]]));
        
        fprintf(fp,"%5d  %10.5f  %10.5f\n",
-               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,rmsf[i]*bfac,
+               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,rmsf[i]*bfac,
                pdb_bfac);
       }
     }
@@ -433,7 +433,7 @@ int gmx_rmsf(int argc,char *argv[])
       if (!bRes || i+1==isize ||
          top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
        fprintf(fp,"%5d %8.4f\n",
-               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
     ffclose(fp);
   }
   
@@ -451,7 +451,7 @@ int gmx_rmsf(int argc,char *argv[])
       if (!bRes || i+1==isize ||
          top.atoms.atom[index[i]].resind!=top.atoms.atom[index[i+1]].resind)
        fprintf(fp,"%5d %8.4f\n",
-               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : i+1,sqrt(rmsf[i]));
+               bRes ? top.atoms.resinfo[top.atoms.atom[index[i]].resind].nr : index[i]+1,sqrt(rmsf[i]));
     ffclose(fp);
   }
 
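The change from i+1 to index[i]+1 matters whenever the selected index group is a subset of the system; with an invented group for illustration:

    /* For a zero-based index group index[] = {12, 47, 103}:
     *   old: i+1        labels the atoms 1, 2, 3
     *   new: index[i]+1 labels them 13, 48, 104,
     * matching the atom numbering used in the rest of the output. */
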
index 7f2cac7f9a0468c2623d3ffd102afc75d17ad940..2397b3beaf1bc44accd545030a0321b48b3c820c 100644 (file)
@@ -1564,15 +1564,14 @@ static void check_input(
 /* Returns TRUE when "opt" is needed at launch time */
 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
 {
-    /* Apart from the input .tpr we need all options that were set
+    /* Apart from the input .tpr and the error log we need all options that were set
      * on the command line and that do not start with -b */
-    if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2))
+    if (0 == strncmp(opt,"-b", 2) || 0 == strncmp(opt,"-s", 2) || 0 == strncmp(opt,"-err", 4))
+    {
         return FALSE;
+    }
 
-    if (bSet)
-        return TRUE;
-    else
-        return FALSE;
+    return bSet;
 }
 
 
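A quick behavioural sketch of the simplified filter; the test harness is invented, but the option names appear in the fnm tables below and TRUE/FALSE follow the surrounding code:

    #include <assert.h>

    static void check_launch_filter(void)
    {
        assert(is_launch_file("-ei",  TRUE)  == TRUE);   /* set, not filtered    */
        assert(is_launch_file("-ei",  FALSE) == FALSE);  /* not set              */
        assert(is_launch_file("-bg",  TRUE)  == FALSE);  /* benchmark-only (-b*) */
        assert(is_launch_file("-err", TRUE)  == FALSE);  /* error log excluded   */
        assert(is_launch_file("-s",   TRUE)  == FALSE);  /* input .tpr excluded  */
    }
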
@@ -1925,7 +1924,7 @@ int gmx_tune_pme(int argc,char *argv[])
       { efXVG, "-tpi",    "tpi",      ffOPTWR },
       { efXVG, "-tpid",   "tpidist",  ffOPTWR },
       { efEDI, "-ei",     "sam",      ffOPTRD },
-      { efEDO, "-eo",     "sam",      ffOPTWR },
+      { efXVG, "-eo",     "edsam",    ffOPTWR },
       { efGCT, "-j",      "wham",     ffOPTRD },
       { efGCT, "-jo",     "bam",      ffOPTWR },
       { efXVG, "-ffout",  "gct",      ffOPTWR },
@@ -1946,7 +1945,7 @@ int gmx_tune_pme(int argc,char *argv[])
       { efSTO, "-bc",     "bench",    ffWRITE },
       { efEDR, "-be",     "bench",    ffWRITE },
       { efLOG, "-bg",     "bench",    ffWRITE },
-      { efEDO, "-beo",    "bench",    ffOPTWR },
+      { efXVG, "-beo",    "benchedo", ffOPTWR },
       { efXVG, "-bdhdl",  "benchdhdl",ffOPTWR },
       { efXVG, "-bfield", "benchfld" ,ffOPTWR },
       { efXVG, "-btpi",   "benchtpi", ffOPTWR },